[关闭]
@windmelon 2019-05-22T20:13:25.000000Z 字数 4726 阅读 4737

C++实现简单的词法分析器

编译原理


简介

词法分析是编译器工作的第一步。词法分析负责将用户的代码从左至右依次读入,将用户的代码转换为token。

词法分析器可以分为两个步骤来实现,首先过滤掉源程序中的换行符、制表符和注释(空格在这里需要保留,因为删除空格会导致无法识别int iinti的现象),然后逐字符读入源程序,对程序进行分解,分解成五类,分别是标识符保留字常数运算符界符

实现步骤

种别码定义

种别码是用来表示扫描到的字符所属类型的索引,根据扫描函数返回的种别码,可以直接判断该字符是标识符保留字常数运算符或者界符

0 -> '\0' #种别码0代表读到文件末尾,结束标识
1~32 -> 保留字
33~68 -> 运算符界符
99 -> 常量
100 -> 标识符

定义保留字表和运算符界符

  1. //保留字表
  2. static char reserveWord[32][20] = {
  3. "auto", "break", "case", "char", "const", "continue",
  4. "default", "do", "double", "else", "enum", "extern",
  5. "float", "for", "goto", "if", "int", "long",
  6. "register", "return", "short", "signed", "sizeof", "static",
  7. "struct", "switch", "typedef", "union", "unsigned", "void",
  8. "volatile", "while"
  9. };
  10. //界符运算符
  11. static char operatorOrDelimiter[36][10] = {
  12. "+","-","*","/","<","<=",">",">=","=","==",
  13. "!=",";","(",")","^",",","\"","\'","#","&",
  14. "&&","|","||","%","~","<<",">>","[","]","{",
  15. "}","\\",".","\?",":","!"
  16. };
  17. static char IDentifierTbl[1000][50] = {""};//标识符表

这一步是根据所针对的程序类型,指定保留字和运算符界符,然后定义一个标识符表,用来保存程序中自定义的变量名

定义判断字符类型的函数

  1. bool isDigit(char ch)//判断是否为数字
  2. {
  3. if(ch>= '0' && ch <= '9')
  4. return true;
  5. return false;
  6. }
  7. bool isLetter(char ch)//判断是否为字母或下划线
  8. {
  9. if((ch>= 'a' && ch<='z') || ( ch <= 'Z' && ch >= 'A')|| ch == '_')
  10. return true;
  11. return false;
  12. }
  13. int isReserve(char *s)//判断是否为保留字
  14. {
  15. for(int i=0;i<32;++i)
  16. {
  17. if(strcmp(reserveWord[i],s) == 0)
  18. return i+1;//返回种别码
  19. }
  20. return -1;
  21. }

isDigit()isLetter()两个函数是在逐字符扫描时用来判断字符类型的函数,而isReserve()是在扫描出一个完整单词后用来判断该单词是否为保留字的,判断是否为界符或者运算符的在后面实现

定义过滤无用字符的函数

  1. void filter(char *s,int len)//源程序预处理,过滤注释和换行回车制表符
  2. {
  3. char tmp[10000];
  4. int p = 0;
  5. for(int i=0;i<len;++i)
  6. {
  7. if(s[i] == '/' && s[i+1] == '/')//单行注释
  8. {
  9. while(s[i]!='\n') ++i;//扫描到换行符为止
  10. }
  11. if(s[i] == '/' && s[i+1] == '*')//多行注释
  12. {
  13. i+=2;
  14. while(s[i] != '*' && s[i+1] != '/')
  15. {
  16. if(s[i] == '\0')
  17. {
  18. cout<<"annotation error!"<<endl;
  19. exit(0);
  20. }
  21. i++;
  22. }
  23. i+=2;
  24. }
  25. if(s[i] != '\n' && s[i] != '\t' && s[i] != '\v' && s[i] != '\r')
  26. {
  27. tmp[p] = s[i];
  28. ++p;
  29. }
  30. }
  31. tmp[p] = '\0';
  32. strcpy(s,tmp);
  33. }

该函数将对源程序进行过滤,删除注释,换行符,制表符和回车符

定义对程序进行扫描的函数

  1. void scanner(int &syn,char * project,char * token,int &p)//扫描源程序,syn是种别码,token是当前扫描的单词,p为扫描位置索引
  2. {
  3. int count = 0;
  4. char ch;
  5. ch = project[p];
  6. while(ch == ' ')//去掉空格
  7. {
  8. ++p;
  9. ch = project[p];
  10. }
  11. for(int i=0;i<20;i++)//清空token
  12. {
  13. token[i] = '\0';
  14. }
  15. if(isLetter(project[p]))
  16. {//以字母开头
  17. token[count++] = project[p++];
  18. while(isLetter(project[p])||isDigit(project[p]))//后面是字母或数字
  19. {
  20. token[count++] = project[p++];
  21. }
  22. token[count] = '\0';
  23. syn = isReserve(token);//查表找到种别码
  24. if(syn == -1)
  25. {//若不是保留字则是标识符
  26. syn = 100;//标识符种别码
  27. }
  28. return;
  29. }
  30. else if(isDigit(project[p]))
  31. {//以数字开头
  32. token[count++] = project[p++];
  33. while(isDigit(project[p]))//后面是数字
  34. {
  35. token[count++] = project[p++];
  36. }
  37. token[count] = '\0';
  38. syn = 99;
  39. return;
  40. }
  41. else if(ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == ';' || ch == '(' || ch == ')' || ch == '^'
  42. || ch == ',' || ch == '\"' || ch == '\'' || ch == '~' || ch == '#' || ch == '%' || ch == '['
  43. || ch == ']' || ch == '{' || ch == '}' || ch == '\\' || ch == '.' || ch == '\?' || ch == ':')
  44. {
  45. token[count++] = project[p++];
  46. token[count] = '\0';
  47. for(int i=0;i<36;++i)
  48. {
  49. if(strcmp(operatorOrDelimiter[i],token) == 0)
  50. {
  51. syn = 33+i;
  52. break;
  53. }
  54. }
  55. return;
  56. }
  57. else if(ch == '<')
  58. {//可能是< <= <<
  59. ++p;
  60. if(project[p] == '=')//<=
  61. {
  62. syn = 38;
  63. }
  64. if(project[p] == '<')//<<
  65. {
  66. syn = 58;
  67. }
  68. else//<
  69. {
  70. --p;
  71. syn = 37;
  72. }
  73. ++p;
  74. return;
  75. }
  76. else if(ch == '>')
  77. {//可能是> >= >>
  78. ++p;
  79. if(project[p] == '=')//>=
  80. {
  81. syn = 40;
  82. }
  83. if(project[p] == '>')//>>
  84. {
  85. syn = 59;
  86. }
  87. else//>
  88. {
  89. --p;
  90. syn = 39;
  91. }
  92. ++p;
  93. return;
  94. }
  95. else if(ch == '=')
  96. {//可能是= ==
  97. ++p;
  98. if(project[p] == '=')//==
  99. {
  100. syn = 42;
  101. }
  102. else
  103. {//=
  104. --p;
  105. syn = 41;
  106. }
  107. ++p;
  108. return;
  109. }
  110. else if(ch == '!')
  111. {//可能是! !=
  112. ++p;
  113. if(project[p] == '=')//==
  114. {
  115. syn = 43;
  116. }
  117. else
  118. {
  119. --p;
  120. syn = 68;
  121. }
  122. ++p;
  123. return;
  124. }
  125. else if(ch == '&')
  126. {//可能是& &&
  127. ++p;
  128. if(project[p] == '&')//&&
  129. {
  130. syn = 53;
  131. }
  132. else
  133. {//&
  134. --p;
  135. syn = 52;
  136. }
  137. ++p;
  138. return;
  139. }
  140. else if(ch == '|')
  141. {//可能是| ||
  142. ++p;
  143. if(project[p] == '|')//||
  144. {
  145. syn = 55;
  146. }
  147. else
  148. {
  149. --p;
  150. syn = 54;
  151. }
  152. ++p;
  153. return;
  154. }
  155. else if(ch == '\0')//文件结束
  156. {
  157. syn = 0;
  158. }
  159. else
  160. {
  161. cout<<"wrong letter:"<<ch<<endl;
  162. exit(0);
  163. }
  164. }

scanner()函数对过滤后的源文件逐字符扫描,直至文件结束(遇到'\0')

程序入口

  1. int main()
  2. {
  3. //打开一个文件,读取源程序
  4. char project[10000];
  5. char token[20] = {0};
  6. int syn = -1;
  7. int p = 0; //程序位置索引
  8. ifstream in("test.txt");
  9. ofstream out("out.txt");
  10. if(!in.is_open())
  11. {
  12. cout << "error opening file!"<<endl;
  13. exit(0);
  14. }
  15. while(!in.eof())
  16. {
  17. in.get(project[p++]);
  18. }
  19. project[p++] = '\0';
  20. in.close();
  21. cout<<"源程序为:\n"<<project<<endl;
  22. filter(project,p);
  23. cout<<"过滤后的源程序为:\n"<<project<<endl;
  24. p = 0;
  25. while(syn != 0)//开始扫描
  26. {
  27. scanner(syn,project,token,p);
  28. if(syn == 100)//标识符
  29. {
  30. for(int i = 0;i<1000;i++)
  31. {//插入标识符表
  32. if(strcmp(IDentifierTbl[i],token) == 0)
  33. {//已存在表中
  34. break;
  35. }
  36. else if(strcmp(IDentifierTbl[i],"") == 0)
  37. {
  38. strcpy(IDentifierTbl[i],token);
  39. break;
  40. }
  41. }
  42. cout<<"标识符:"<<token<<endl;
  43. out<<"标识符:"<<token<<endl;
  44. }
  45. else if(syn == 99)//常数
  46. {
  47. cout<<"常数:"<<token<<endl;
  48. out<<"常数:"<<token<<endl;
  49. }
  50. else if(syn <= 32 && syn >= 1)//保留字
  51. {
  52. cout<<reserveWord[syn - 1]<<":"<<syn<<endl;
  53. out<<reserveWord[syn - 1]<<":"<<syn<<endl;
  54. }
  55. else if(syn >= 33 && syn <= 68)//运算符或界符
  56. {
  57. cout<<operatorOrDelimiter[syn - 33]<<":"<<syn<<endl;
  58. out<<operatorOrDelimiter[syn - 33]<<":"<<syn<<endl;
  59. }
  60. }
  61. out.close();
  62. system("pause");
  63. }

使用内容如下的文件进行测试

int main()
{
//打开一个文件,读取源程序
char project[10000];
char token[20] = {0};
int syn = -1;
int p = 0; //程序位置索引
ifstream in("test.cpp");
//ofstream out("test_out.cpp");
if(!in.is_open())
{
cout << "error opening file!"< exit(0);
}
while(!in.eof())
{
in>>project[p++];
}
project[p++] = '\0';
in.close();
system("pause");
}

image.png-72.3kB

image.png-43.4kB

参考

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注