lex+yacc 构造语法树（一）

100 阅读 0 评论 66 点赞

我是靠谱客的博主鲤鱼奇异果，这篇文章主要介绍lex+yacc 构造语法树（一），现在分享给大家，希望可以做个参考。

本文基于一种名为 small C 的语言的语法规则，通过 lex 和 yacc 编写归约规则，使得输入一个合法程序时，输出基于归约规则和语法规则的语法树。该语法树以缩进的形式表示：同一列的元素互为兄弟节点，向右缩进一列的元素为子节点。

假定small C的语法规则如下：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
INT => /* integer 1 */
ID => /* identifier 2 */
SEMI => ;
COMMA => ,
DOT => .
BINARYOP => /* binary operators 3 */
UNARYOP => /* unary operators 4 */
ASSIGNOP => =
TYPE => int
LP => (
RP => )
LB => [
RB => ]
LC => {
RC => }
STRUCT => struct
RETURN => return
IF => if
ELSE => else
BREAK => break
CONT => continue
FOR => for

1 A sequence of digits, or hexadecimal digits preceded by "0x" (or "0X"), without spaces.
2 A character string consisting of alphabetic characters, digits and the underscore. In addition, a digit can't be the first character.

符号的优先级如下：

确定好这些规则后，我们便可以写出lex.l文件如下：

复制代码

%{
/*
 * Lexer for the small C language.
 *
 * Every recognised token is wrapped in a node created by newNode(), which
 * receives the matched text (yytext) and the current line number, and the
 * node is passed to the parser through yylval.token_p.
 */
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"

void yyerror(char *s);
int No_Line = 1;   /* current input line; incremented on every newline */
struct Node* newNode(char* nameIn, int line);

/* Build the node for the current lexeme and return token code `tok`. */
#define EMIT_TOKEN(tok)                                   \
    do {                                                  \
        yylval.token_p = newNode(yytext, No_Line);        \
        return (tok);                                     \
    } while (0)
%}
%%
\n                          { No_Line++; }
"int"                       { EMIT_TOKEN(TYPE); }
"struct"                    { EMIT_TOKEN(STRUCT); }
"return"                    { EMIT_TOKEN(RETURN); }
"if"                        { EMIT_TOKEN(IF); }
"else"                      { EMIT_TOKEN(ELSE); }
"break"                     { EMIT_TOKEN(BREAK); }
"continue"|"cont"           { /* grammar spells it "continue"; "cont" kept for compatibility */ EMIT_TOKEN(CONT); }
"for"                       { EMIT_TOKEN(FOR); }
"read"                      { EMIT_TOKEN(READ); }
"write"                     { EMIT_TOKEN(WRITE); }
0(x|X)([0-9A-F]{1,8})       { EMIT_TOKEN(INT); }
[0-9]{1,10}                 { EMIT_TOKEN(INT); }
[a-zA-Z_]([a-zA-Z_0-9]*)    { EMIT_TOKEN(ID); }
";"                         { EMIT_TOKEN(SEMI); }
","                         { EMIT_TOKEN(COMMA); }
"."                         { EMIT_TOKEN(DOT); }
"!"|"++"|"--"|"~"           { EMIT_TOKEN(UNARYOP); }
"-"                         { EMIT_TOKEN(SUB); }
"*"|"/"|"%"                 { EMIT_TOKEN(BINARYOP1); }
"+"                         { EMIT_TOKEN(BINARYOP2); }
"<<"|">>"                   { EMIT_TOKEN(BINARYOP3); }
">"|">="|"<"|"<="           { EMIT_TOKEN(BINARYOP4); }
"=="|"!="                   { EMIT_TOKEN(BINARYOP5); }
"&"                         { EMIT_TOKEN(BINARYOP6); }
"^"                         { EMIT_TOKEN(BINARYOP7); }
"|"                         { EMIT_TOKEN(BINARYOP8); }
"&&"                        { EMIT_TOKEN(BINARYOP9); }
"||"                        { EMIT_TOKEN(BINARYOP10); }
"+="|"-="|"*="|"/="|"&="|"^="|"|="|"<<="|">>="   { EMIT_TOKEN(BINARYOP11); }
"="                         { EMIT_TOKEN(ASSIGNOP); }
"("                         { EMIT_TOKEN(LP); }
")"                         { EMIT_TOKEN(RP); }
"["                         { EMIT_TOKEN(LB); }
"]"                         { EMIT_TOKEN(RB); }
"{"                         { EMIT_TOKEN(LC); }
"}"                         { EMIT_TOKEN(RC); }
[ \t]+                      ;
.                           { yyerror("Error:invalid input.\n"); }
%%
/* Returning 1 tells the scanner there is no further input after end of file. */
int yywrap(void) {
    return 1;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
%{
/*
 * Lexer for the small C language.
 *
 * Every recognised token is wrapped in a node created by newNode(), which
 * receives the matched text (yytext) and the current line number, and the
 * node is passed to the parser through yylval.token_p.
 */
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"

void yyerror(char *s);
int No_Line = 1;   /* current input line; incremented on every newline */
struct Node* newNode(char* nameIn, int line);

/* Build the node for the current lexeme and return token code `tok`. */
#define EMIT_TOKEN(tok)                                   \
    do {                                                  \
        yylval.token_p = newNode(yytext, No_Line);        \
        return (tok);                                     \
    } while (0)
%}
%%
\n                          { No_Line++; }
"int"                       { EMIT_TOKEN(TYPE); }
"struct"                    { EMIT_TOKEN(STRUCT); }
"return"                    { EMIT_TOKEN(RETURN); }
"if"                        { EMIT_TOKEN(IF); }
"else"                      { EMIT_TOKEN(ELSE); }
"break"                     { EMIT_TOKEN(BREAK); }
"continue"|"cont"           { /* grammar spells it "continue"; "cont" kept for compatibility */ EMIT_TOKEN(CONT); }
"for"                       { EMIT_TOKEN(FOR); }
"read"                      { EMIT_TOKEN(READ); }
"write"                     { EMIT_TOKEN(WRITE); }
0(x|X)([0-9A-F]{1,8})       { EMIT_TOKEN(INT); }
[0-9]{1,10}                 { EMIT_TOKEN(INT); }
[a-zA-Z_]([a-zA-Z_0-9]*)    { EMIT_TOKEN(ID); }
";"                         { EMIT_TOKEN(SEMI); }
","                         { EMIT_TOKEN(COMMA); }
"."                         { EMIT_TOKEN(DOT); }
"!"|"++"|"--"|"~"           { EMIT_TOKEN(UNARYOP); }
"-"                         { EMIT_TOKEN(SUB); }
"*"|"/"|"%"                 { EMIT_TOKEN(BINARYOP1); }
"+"                         { EMIT_TOKEN(BINARYOP2); }
"<<"|">>"                   { EMIT_TOKEN(BINARYOP3); }
">"|">="|"<"|"<="           { EMIT_TOKEN(BINARYOP4); }
"=="|"!="                   { EMIT_TOKEN(BINARYOP5); }
"&"                         { EMIT_TOKEN(BINARYOP6); }
"^"                         { EMIT_TOKEN(BINARYOP7); }
"|"                         { EMIT_TOKEN(BINARYOP8); }
"&&"                        { EMIT_TOKEN(BINARYOP9); }
"||"                        { EMIT_TOKEN(BINARYOP10); }
"+="|"-="|"*="|"/="|"&="|"^="|"|="|"<<="|">>="   { EMIT_TOKEN(BINARYOP11); }
"="                         { EMIT_TOKEN(ASSIGNOP); }
"("                         { EMIT_TOKEN(LP); }
")"                         { EMIT_TOKEN(RP); }
"["                         { EMIT_TOKEN(LB); }
"]"                         { EMIT_TOKEN(RB); }
"{"                         { EMIT_TOKEN(LC); }
"}"                         { EMIT_TOKEN(RC); }
[ \t]+                      ;
.                           { yyerror("Error:invalid input.\n"); }
%%
/* Returning 1 tells the scanner there is no further input after end of file. */
int yywrap(void) {
    return 1;
}

由以上的程序可以看出，我们根据语法规则给每个元素判定它所属的token，并新建一个记录了元素内容和它所属行数的node。node的新建代码如下：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Allocate and initialise a syntax-tree node.
 *
 * node_name: text copied into the node's name field; at most 19 characters
 *            are kept so the stored string is always NUL-terminated.
 * line:      value stored in the node's No_Line field.
 *
 * Returns the new node with brother and child set to NULL and No_Child, col
 * and IsBegin set to 0. If the allocation fails, an error message is printed
 * and the program exits with status 1.
 */
struct Node* newNode (char* node_name,int line)
{
    /*
     * Same 20-byte bound the copy has always used.
     * NOTE(review): assumes p->name is a char array of at least 20 bytes --
     * confirm against the struct Node definition.
     */
    enum { NAME_CAPACITY = 20 };

    struct Node *p = malloc(sizeof *p);
    if (p == NULL)
    {
        printf("Error:out of memory.\n");
        exit(1);
    }

    /* strncpy alone leaves the buffer unterminated when the source has
       NAME_CAPACITY or more characters, so copy one less and terminate. */
    strncpy(p->name, node_name, NAME_CAPACITY - 1);
    p->name[NAME_CAPACITY - 1] = '\0';

    p->brother = NULL;
    p->child = NULL;
    p->No_Line = line;
    p->No_Child = 0;
    p->col = 0;
    p->IsBegin = 0;
    return p;
}