编译原理——词法分析C语言程序JAVA源代码

一、实验目的：
加深对词法分析器的工作过程的理解；加强对词法分析方法的掌握；能够采用一种编程语言实现简单的词法分析程序；能够使用自己编写的分析程序对简单的程序段进行词法分析。
二、实验内容：
1．要识别的词素：
(1)保留字或关键字：如：BEGIN、END、VAR、INTEGER、REAL、IF、THEN、READ、WRITE、WHILE。
(2)运算符：如：+、-、*、/、:=、=、>、<、>=、<=
(3)标识符：用户定义的变量名、常数名、过程名
(4)常数：如：10、25、100、2.3等整数或实数
(5)界符：如：‘,’、‘.’、‘;’、‘(’、‘)’、‘{’、‘}’、‘:’
2．词法分析过程所要完成的任务：
（1）给出源程序（要求一段完整的代码）
（2）滤空格
（3）识别保留字
（4）识别标识符
（5）拼数
（6）拼复合词（如:=）
（7）输出源程序的token(词法单元)序列。
需要进行词法分析的源程序：
识别后的输出序列：
源代码：
package cifaapp;

import java.io.*;
import java.util.*;


/**
 * One entry of the token sequence: a category code paired with the text
 * of the word it was recognised from.
 */
class token {
    /** Category code of this token. */
    Integer key;

    /** Text of the word this token was built from. */
    String value;

    /**
     * Creates a token.
     *
     * @param code category code stored in {@link #key}
     * @param text word text stored in {@link #value}
     */
    token(Integer code, String text) {
        key = code;
        value = text;
    }
}
public class app {
	
	
	// 单词种别码, 1-17为关键字种别码
	public static final int CHAR = 1;
	public static final int SHORT = 2;
	public static final int INT = 3;
	public static final int LONG = 4;
	public static final int FLOAT = 5;
	public static final int DOUBLE = 6;
	public static final int FINAL = 7;
	public static final int STATIC = 8;
	public static final int IF = 9;
	public static final int ELSE = 10;
	public static final int WHILE = 11;
	public static final int DO = 12;
	public static final int FOR = 13;
	public static final int BREAK = 14;
	public static final int CONTINUE = 15;
	public static final int VOID = 16;
	public static final int RETURN = 17;
	public static String key[]={"char","short","int","long","float","double","final","static","if","else","while","do","for","break","continue","void","return"}; 
	// 20为标识符种别码
	public static final int ID = 20;

	// 30为常量种别码
	public static final int NUM = 30;

	// 31-40为运算符种别码
	public static final int AS = 31; // =
	public static final int EQ = 32; // ==
	public static final int GT = 33; // >
	public static final int LT = 34; // <
	public static final int GE = 35; // >=
	public static final int LE = 36; // <=
	public static final int ADD = 37; // +
	public static final int SUB = 38; // -
	public static final int MUL = 39; // *
	public static final int DIV = 40; // /

	// 41-49为界限符种别码
	public static final int LP = 41; // (
	public static final int RP = 42; // )
	public static final int LBT = 43; // [
	public static final int RBT = 44; // ]
	public static final int LBS = 45; // {
	public static final int RBS = 46; // }
	public static final int COM = 47; // ,
	public static final int COL = 48; // :
	public static final int SEM = 49; // ;

	// -1为无法识别的字符标志码
	public static final int ERROR = -1;
	public static int errorNum = 0; // 记录词法分析错误的个数

	
	
	public static LinkedList<String> list = new LinkedList<String>();
	public static LinkedList<token> tokenlist = new LinkedList<token>();
	//以行为单位读取文件内容
	public static void readFileByLines() {
		File file = new File("code.txt");
		BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String tempString = null;
            // 一次读入一行，直到读入null为文件结束
            while ((tempString = reader.readLine()) != null) {
                //System.out.println(tempString);
                separate(tempString);
            }
            reader.close();
        } catch (IOException e) {} 
        
        finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {}
            }
        }
	}
	//正则表达式分离字符串，放入链表中
	public static void separate(String linestring) {
		String temp[]=linestring.split("\\s+|\\n|(?<=\\+)|(?=\\+)|(?<=-)|(?=-)|(?<=\\*)|(?=\\*)|(?<=/)|(?=/)|(?<=\\>)|(?=\\>)|(?<=\\<)|(?=\\<)|(?<==)|(?==)|(?<=\\()|(?=\\()|(?<=\\))|(?=\\))|(?<=\\[)|(?=\\[)|(?<=])|(?=])|(?<=\\{)|(?=\\{)|(?<=})|(?=})|(?<=,)|(?=,)|(?<=:)|(?=:)|(?<=;)|(?=;)");
		for(int i=0;i<temp.length;i++) {
			list.add(temp[i]);
		}
	}
	
	public static void analyse() {
	    for (int i = 0; i < list.size(); i++) {
	        if (list.get(i).length() == 1) {
	            if (list.get(i).equals("=")) { // 运算符"="
	                if (list.get(i+1).equals("=")) { // 若后面跟的是"="，则是运算符"=="
	                	tokenlist.add(new token(EQ, list.get(i) + list.get(++i)));
	                } else { // 否则是运算符"="
	                	tokenlist.add(new token(AS, list.get(i)));
	                }
	            } else if (list.get(i).equals(">")) { // 运算符">"
	                if (list.get(i+1).equals("=")) { // 若后面跟的是"="，则是运算符">="
	                	tokenlist.add(new token(GE, list.get(i) + list.get(++i)));
	                } else { // 否则是运算符">"
	                	tokenlist.add(new token(GT, list.get(i)));
	                }
	            } else if (list.get(i).equals("<")) { // 运算符"<"
	                if (list.get(i+1).equals("=")) { // 若后面跟的是"="，则是运算符"<="
	                    tokenlist.add(new token(LE, list.get(i) + list.get(++i)));
	                } else { // 否则是运算符"<"
	                    tokenlist.add(new token(LT, list.get(i)));
	                }
	            } else if (list.get(i).equals("+")) { // 运算符"+"
	                if ((list.get(i-1).equals("=") || list.get(i-1).equals("("))
	                    && isNum(list.get(i+1))) { // 判断是否是有符号常量（正数）
	                    tokenlist.add(new token(NUM, list.get(i) + list.get(++i)));
	                } else { // 否则是运算符"+"
	                    tokenlist.add(new token(ADD, list.get(i)));
	                }
	            } else if (list.get(i).equals("-")) { // 运算符"-"
	                if ((list.get(i-1).equals("=") || list.get(i-1).equals("("))
	                        && isNum(list.get(i+1))) { // 判断是否是有符号常量（负数）
	                    tokenlist.add(new token(NUM, list.get(i) + list.get(++i)));
	                } else { // 否则是运算符"-"
	                    tokenlist.add(new token(SUB, list.get(i)));
	                }
	            } else if (list.get(i).equals("*")) { // 运算符"*"
	                tokenlist.add(new token(MUL, list.get(i)));
	            } else if (list.get(i).equals("/")) { // 运算符"/"
	                tokenlist.add(new token(DIV, list.get(i)));
	            } else if (list.get(i).equals("(")) { // 界限符"("
	                tokenlist.add(new token(LP, list.get(i)));
	            } else if (list.get(i).equals(")")) { // 界限符")"
	                tokenlist.add(new token(RP, list.get(i)));
	            } else if (list.get(i).equals("[")) { // 界限符"["
	                tokenlist.add(new token(LBT, list.get(i)));
	            } else if (list.get(i).equals("]")) { // 界限符"]"
	                tokenlist.add(new token(RBT, list.get(i)));
	            } else if (list.get(i).equals("{")) { // 界限符"{"
	                tokenlist.add(new token(LBS, list.get(i)));
	            } else if (list.get(i).equals("}")) { // 界限符"}"
	                tokenlist.add(new token(RBS, list.get(i)));
	            } else if (list.get(i).equals(",")) { // 界限符","
	                tokenlist.add(new token(COM, list.get(i)));
	            } else if (list.get(i).equals(":")) { // 界限符":"
	                tokenlist.add(new token(COL, list.get(i)));
	            } else if (list.get(i).equals(";")) { // 界限符";"
	                tokenlist.add(new token(SEM, list.get(i)));
	            } else if (list.get(i).charAt(0) >= '0' && list.get(i).charAt(0) <= '9') { // 判断是否是一位数字常量
	                tokenlist.add(new token(NUM, list.get(i)));
	            } else if (isLetter(list.get(i).charAt(0))) { // 判断是否是一位字母标识符
	                tokenlist.add(new token(ID, list.get(i)));
	            } else { // 否则是无法识别的字符
	                tokenlist.add(new token(ERROR, list.get(i)));
	                errorNum++;
	            }
	        } else if ((list.get(i).charAt(0) >= '0' && list.get(i).charAt(0) <= '9')
	                    || list.get(i).charAt(0) == '.') { // 判断是否是正确的常量
	            if (!isNum(list.get(i))) { // 不是常量，则是无法识别的字符
	                tokenlist.add(new token(ERROR, list.get(i)));
	                errorNum++;
	            } else if ((list.get(i+1).charAt(0) == '+' || list.get(i+1).charAt(0) == '-')
	                    && isNum(list.get(i+2))) { // 判断是否是有符号的常量
	                tokenlist.add(new token(NUM, list.get(i) + list.get(++i) + list.get(++i)));
	            } else { // 否则是无符号的常量
	                tokenlist.add(new token(NUM, list.get(i)));
	            }
	        } else if (isKeyID(list.get(i)) != 0) { // 判断是否为关键字
	            tokenlist.add(new token(isKeyID(list.get(i)), list.get(i)));
	        } else if (isLetter(list.get(i).charAt(0)) || list.get(i).charAt(0) == '_') { // 判断是否为标识符（以字母或者下划线开头）
	            tokenlist.add(new token(ID, list.get(i)));
	        } else { // 否则是无法识别的单词
	            tokenlist.add(new token(ERROR, list.get(i)));
	            errorNum++;
	        	}
	    	}
	    }
	  //判断是否为数字
	    public static boolean isNum(String str){
	       for (int i = str.length();--i>=0;){
	           if (!Character.isDigit(str.charAt(i))){
	               return false;
	           }
	       }
	       return true;
	    }
	    //判断是否为关键字
	    static int isKeyID(String s){
	    	 int i;
	    	 for(i=0;i<6;i++){
	    		 if(s.equals(key[i])) {
	    			 return i;
	    		 }
	    	 }
	    		  return 0;
	    }
	    static boolean isLetter(char c)
	    {
	     if(c>='a' && c<='z')
	      return true;
	     return false;
	    }
	    
	public static void main(String args[]) {
		readFileByLines();
		analyse();
		
		System.out.println(list);
		System.out.println("词法分析结果如下：\n<单词种别码，单词>          //所属类别");
		for(int i=0;i<tokenlist.size();i++){
			System.out.print("<   " + tokenlist.get(i).key + "    " + tokenlist.get(i).value + "  >          ");
			if (tokenlist.get(i).key > 0 && tokenlist.get(i).key < 20) {
		        System.out.println("//关键字");
		    } else if (tokenlist.get(i).key == 20) {
		        System.out.println("//标识符");
		    } else if (tokenlist.get(i).key == 30) {
		        System.out.println("//常量");
		    } else if (tokenlist.get(i).key > 30 && tokenlist.get(i).key <= 40) {
		        System.out.println("//运算符");
		    } else if (tokenlist.get(i).key > 40 && tokenlist.get(i).key < 50) {
		        System.out.println("//界限符");
		    } else if (tokenlist.get(i).key == -1) {
		        System.out.println("//无法识别的符号");
		    }
		}
		System.out.println("词法分析结束！共" + errorNum + "个无法识别的符号");
	}
}
原文链接：https://blog.csdn.net/qq_57633498/article/details/121599762