c语言用自动机识别字符串,使用有限状态自动机实现C语言的宣言解析器

使用有限状态自动机实现C语言的声明解析器

摘要:在很多的游戏编程中,我们使用了有限状态自动机作为模型。有限状态自动机作为变成模型,具有通用性好,方便理解的特点。本文主要结合前一个系列的两篇文章(1)C语言声明解析器的实现和(2)用C语言实现有限状态自动机来说明如何用有限状态自动机模型实现一个C语言的声明解析器。

一、状态机的设计

形式定义

· 定义:有限状态自动机(FA—finite automaton)是一个五元组:

– M=(Q, Σ, δ, q0, F)

· 其中,

– Q——状态的非空有穷集合。∀q∈Q,q称为M的一个状态,我们常常用枚举类型来实现,定义在machine结构体之外。

– Σ——输入字母表。我们常常用枚举类型来实现,定义在machine结构体之外。

– δ——状态转移函数,有时又叫作状态转换函数或者移动函数,δ:Q×Σ→Q,δ(q,a)=p。可以用一个数组来实现,大概是一个statenum*inputnum的数组,元素是trasition,包含动作函数action和下一个状态。放在machine结构之内。

– q0——M的开始状态,也可叫作初始状态或启动状态。q0∈Q。

– F——M的终止状态集合。F被Q包含。任给q∈F,q称为M的终止状态。

在我们进行程序实现的时候,一个statemachine的定义大概是这样的:

typedef struct

{

state current;

trasition trasitions[STATENUM][CONDITIONS];

} StateMachine, * pStateMachine;

其中,附属的其他定义还有其他的相关。

我们在解析一个C语言声明的时候,大致过程是这样的:

1)读取声明到第一个变量名称标识符name,此为状态S0

2)读取name后面的下一个标识符,确定类型并采取相应的动作:

“[”:这是一个数组,我们要处理这个数组,然后进入状态S1

“(”:这是一个函数,我们要处理这个函数,然后进入状态S2

“)”:它的前面是一个指针,我们要处理这个指针,然后进入状态S3

“/0”:声明读取完毕,可以结束了,进入状态S4

”other“:出错

这个状态机如下:

105044272.jpg

注意:S1,S2,S3,S4的状态图没有从图中体现出来,实际上,S1~S4的出边和S0是几乎一样的,所不同的就是S1没有经过function到达S2的出边,因为数组的元素不能是函数(可以是函数指针)。这一部分可以参考前面的C语言声明规则。

S0:读完声明到第一个变量标识符

S1:读完一个数组

S2:读完后面的一个函数

S3:读完前面的一个指针

S4:解析完毕

二、数据结构的设计

我们仅仅谈与自动机相关的部分,其他的部分可以在 另外一篇文章 一个C语言声明解析器的设计与实现里找到。

typedef int (* actiontype)( );//函数指针,是一个动作

typedef struct{

state next;

actiontype action;

}trasition, *ptrasition;//转移动作:下一个状态+动作

typedef struct

{

state current;

int statenum;

int conditionnum;

trasition trasitions[STATENUM][CONDITIONS];

} StateMachine, * pStateMachine;//自动机:关键是 当前状态+转移函数集合

int initMachine(pStateMachine mymachine)//初始化状态机,设置初始状态和转移函数

state step(pStateMachine machine, condition mycondition)//状态机的运转函数

三、详细代码

#include

#include

#include

#include

#include

#define MAXTOKENLEN 64

#define MAXTOKENS 100

#define MAXDECLLEN 200

typedef int state;

typedef char condition;

#define STATENUM 5

#define STATE0 0

#define STATE1 1

#define STATE2 2

#define STATE3 3

#define STATE4 4

#define STATETRAP 6

#define CONDITIONS 5

struct token{

char type;

char string[MAXTOKENLEN];

};

struct decl{

char string[MAXDECLLEN];

int current_location;

};

struct decl mydecl;

int top=-1;

struct token this;//the current token

struct token stack[MAXTOKENS];//the tokens before the first identifer

#define IDENTIFIER 'I'

#define QUALIFIER 'Q'

#define TYPE 'T'

#define pop stack[top--]

#define push(s) stack[++top]=s

//#define debug

char classify_string(const char *string);

/*

* init the value of mydecl*/

int init_parse_decl(){

if( fgets(mydecl.string,MAXDECLLEN,stdin) == NULL){

perror("can not read from the stdin");

}

if(strlen(mydecl.string)==MAXDECLLEN-1)

printf("you input is too long, you may get an error!");

mydecl.current_location=0;

#ifdef debug

printf("init:we get last char of mydecl:%c,%d\n",mydecl.string[strlen(mydecl.string)],mydecl.string[strlen(mydecl.string)]);

#endif

}

/*

get a token from the current string,value the "this" and move the pointer

notice: we may get a '\0' at the end of the string

*/

int gettoken(){

char *ch_pointer=this.string;

//leave out the blank

while(mydecl.string[mydecl.current_location]==' '){

mydecl.current_location++;

}

*ch_pointer=mydecl.string[mydecl.current_location];

if(isalnum(*ch_pointer)){

while(isalnum(*ch_pointer)){

mydecl.current_location++;

ch_pointer++;

*ch_pointer=mydecl.string[mydecl.current_location];

}

*ch_pointer='\0';

this.type=classify_string(this.string );//indentifier,qualifier,type

#ifdef debug

printf("we get token:%s ",this.string );

#endif

return 1;

}

//else:(, ), [, ]

// printf("current location is:%d\n",mydecl.current_location);

// printf("(%d)",mydecl.string[mydecl.current_location]);

switch(*ch_pointer){

case '*':

case '(':

case ')':

case '[':

case ']':

this.type=*ch_pointer;

this.string[1]='\0';

mydecl.current_location++;

#ifdef debug

printf("we get token:%s ",this.string );

#endif

break;

case '\n':

case '\0':this.type='\0';

// printf("we come to the end\n");

strcpy(this.string,"then end");

return 0;

default :

printf("we can not read this indentifier %d\n",*ch_pointer);

parse_error();

}

return 1;

}

char classify_string(const char *string){

if(isspace(*string))

return '\0';

if(!strcmp(string,"const"))

return QUALIFIER;

if(!strcmp(string,"volatile"))

return QUALIFIER;

if(!strcmp(string,"void"))

return TYPE;

if(!strcmp(string,"char"))

return TYPE;

if(!strcmp(string,"signed"))

return TYPE;

if(!strcmp(string,"unsigned"))

return TYPE;

if(!strcmp(string,"short"))

return TYPE;

if(!strcmp(string,"int"))

return TYPE;

if(!strcmp(string,"long"))

return TYPE;

if(!strcmp(string,"float"))

return TYPE;

if(!strcmp(string,"struct"))

return TYPE;

if(!strcmp(string,"union"))

return TYPE;

if(!strcmp(string,"enum"))

return TYPE;

return IDENTIFIER ;

}

/*follow the dec string until the first identifier,push token to a stack*/

int read_to_first_identifier(){

if(gettoken()==0){

printf("readtofirst:missing the identifier,please put in your string...\n");

return 0;

}

//the token is a struct with member type and string

while(this.type!=IDENTIFIER)

{

push(this);//a stack with element of token

if(gettoken()==0){

printf("readtofirst:missing the identifier\n");

return 0;

}

}

printf("%s is ", this.string);

return 1;

}

/*

deal with the next token in diff ways according to the token type

we just go to right

1) "(":deal in a function way,deal_with_declaration,

2) "[":deal in a array way,deal_with_declaration,

3) ")":deal with a pointer way,deal_with_declaration

4) NULL:move_left,

*/

int deal_with_declaration(){

if(gettoken()==0){

move_left();

return 1;

}

//

switch(this.type){

case '(': deal_with_function();break;

case ')': deal_with_pointer();break;

case '[': deal_with_array();break;

default : printf("there should not be a %c after the identifier\n",this.type );parse_error();

}

deal_with_declaration();

}

/*the current token is [,the content in the '[]' may be digtal,al,and other*/

int deal_with_array(){

printf("array of ");

while(this.type!=']'){

gettoken();//we may get an "]" or digital

if(isdigit(this.string[0])){

printf("%d ", atoi(this.string));

//the next token must be ]

gettoken();

if(this.type!=']'){

printf("the array miss the ']'before %s",this.string);

parse_error();

}

}

}

return 0;

}

int parse_error(){

printf("press any key to exit\n");

getchar();

exit(0);

}

/*the current token is ')' */

int deal_with_pointer(){

while(stack[top].type=='*'){

printf("pointer pointing to \n");

pop;

}

if(stack[top].type!='('){

printf(" missing an '(' after %s\n",stack[top].string );

parse_error();

}

else{

pop;

return 1;

}

}

/*the current token is '('*/

int deal_with_function(){

while(this.type!=')'){

gettoken();

if(this.type=='\0'){

printf(" missing an ')' before %s",this.string);

parse_error();

}

}

printf(" function returning ");

return 1;

}

int move_left(){

while(top>=0){

switch (stack[top].type){

case '*': printf(" pointer pointing to ");break;

case QUALIFIER: printf(" %s ",stack[top].string );break;

case TYPE: printf(" %s ",stack[top].string );break;

default :printf("there is someting wrong about %s",stack[top].string);parse_error();break;

}

top--;

}

}

typedef int (* actiontype)( );

typedef struct{

state next;

actiontype action;

}trasition, *ptrasition;

typedef struct

{

state current;

int statenum;

int conditionnum;

trasition trasitions[STATENUM][CONDITIONS];

} StateMachine, * pStateMachine;

int initMachine(pStateMachine mymachine){

trasition array={

STATE1,deal_with_array

},

fun={

STATE2,deal_with_function

},

pointer={

STATE3,deal_with_pointer

},

left={

STATE4,move_left

},

acerr={

STATE4,parse_error

},

end={

STATE4,parse_error/*it will no be execed*/

};

trasition mytrasitions[STATENUM][CONDITIONS]={

/*[, (, ), \0,other*/

/*s0*/ array,fun,pointer,left,acerr,

/*s1*/ array,fun,pointer,left,acerr,

/*s2*/ array,fun,pointer,left,acerr,

/*s3*/ array,fun,pointer,left,acerr,

/*s4*/ end, end, end ,end , end

};

mymachine->statenum=STATENUM;

mymachine->conditionnum=CONDITIONS;

mymachine->current=STATE0;

int i,j;

for (i = 0; i < STATENUM ; ++i)

{

for (j = 0; j < CONDITIONS; ++j)

{

mymachine->trasitions[i][j]=mytrasitions[i][j];

}

}

}

state step(pStateMachine machine, condition mycondition)

{

int index;

switch (mycondition){

case '[': index=0;break;

case '(': index=1;break;

case ')': index=2;break;

case '\0': index=3;break;

default: index=4;

}

trasition t = machine->trasitions[machine->current][index];

(*(t.action))( );

if(machine->current==1&&index==1){

printf("\n\n\nerror:the element of array should not be function\n");

parse_error();

}

if(machine->current==2&&index==1){

printf("\n\n\nerror:the returning type of a funciton should not be function\n");

parse_error();

}

if(machine->current==2&&index==0){

printf("\n\n\nerror:the returning type of a funciton should not be array\n");

parse_error();

}

machine->current = t.next;

//printf("the current state is %d\n",t->next );

return machine->current;

}

int main(int argc, char *argv[])

{

StateMachine mymachine;

init_parse_decl();

read_to_first_identifier();

initMachine(&mymachine);

char mycon;

while(mymachine.current!=STATE4){

gettoken();

mycon=this.type;

step(&mymachine,mycon);

}

printf("\n");

return 0;

}

说明:注意line339~line350,这一部分逻辑是新加上去的,因为“函数的返回值不能是函数”等三条规则,所以有了这一部分if语句。这样,这个声明解析器在逻辑上就更完备了。