Designing a lexical analyzer (compiler principles)

Steps:
1. Select the test code, i.e. the input to the lexical analyzer. Here the test code is a bubble sort program (with only the preprocessor directives removed), stored in the file a.txt, as shown in Figure 1.

2. Enumerate the word symbols (tokens) that appear in the test program and assign each one a category code, as shown in Figure 2.

3. Design the lexical analyzer
The code is as follows. (To run it, create a file named a.txt containing the code from Figure 1 and put it in the same directory as the .cpp file.)

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#define _KEY_WORD_END "waiting for your expanding"

//One word symbol produced by the lexical analyzer: a (category code, text) pair
typedef struct
{
	int typenum;		//Category code of the word symbol
	const char *word;	//Text of the word symbol. It is const because it is set to
						//string literals or to the shared token buffer, neither of
						//which is owned by (or may be modified through) the WORD.
}WORD;

char *str;		  //The whole test program as one NUL-terminated string
char token[255];  //Buffer in which the current word is assembled
int p_input;      //Index into str of the next character to read
int p_token;     //Index into token of the next free position
char ch;		  //The character most recently read from str
//Keyword table. A keyword's category code is its index in this table plus one.
//The last entry is a sentinel marking the end of the table, so new keywords
//must be inserted before it. The entries are string literals, hence const.
const char *rwtab[] = {"void","int","if","for","break",_KEY_WORD_END};

//Helper routines of the lexical analyzer (all of them work on the globals above)
char m_getch(void);     //Load the next character of str into ch and return it
void getbc(void);       //Skip over spaces, carriage returns, line feeds and tabs
void concat(void);      //Append ch to the word being built in token
int letter(void);       //1 if ch is an ASCII letter, otherwise 0
int digit(void);        //1 if ch is a decimal digit, otherwise 0
int reserve(void);      //Category code of token: its keyword code, or the identifier code
void retract(void);     //Step the read position back by one character
char *dtb(void);        //Replace the decimal number in token by its binary form
WORD *scaner(void);     //Scan the next word symbol and return a pointer to it

int main ()
{
	FILE *p;					//Test code file pointer
	FILE *q;					//File pointer output by lexical analyzer
	int filesize;				//Code string length
	char *fname_in = "a.txt";   //File where the source program is located
	char *fname_out = "b.txt";  //The file of word symbols from lexical analysis
	p_input = 0;
	int over = 1;				//Seed code
	char typenum_str[10];		//The string corresponding to the category code, which is used for fputs writing files
	WORD *oneword = new WORD;	//Word, a pointer to a binary structure
	
	//Open the file where the test program is located
    p = fopen(fname_in,"rb");		//p = fopen("D:\\a.txt","rb");
	if(p == NULL)
	{
		printf("Open file%s error\n",fname_in);
		exit(1);//exit(-1);
	}

	fseek(p,0,SEEK_END);
	filesize = ftell(p);
	str = (char*)malloc(filesize+1);//Dynamic memory allocation
	rewind(p);
	fread(str,sizeof(char),filesize,p);
	str[filesize] = '\0';
	fclose(p);
	//Code in output file
	printf("a.txt The code in the file is:\n");
	puts(str);
	fclose(p);


	//Create and open the file of lexical analysis target program
	q = fopen(fname_out,"wb");
	if(q == NULL)
	{
		printf("Open file%s error\n",fname_out);
		exit(1);
	}

	while(over<1000 && over!=-1)  //1000 is the category code of '\ 0' (will not be output, only as a sign of for loop exit), - 1-bit wrong word category code
	{
		oneword = scaner();
		if(oneword->typenum<1000)
		{
			itoa(oneword->typenum,typenum_str,10);//Type code integer converted to corresponding string
			fputs(typenum_str,q);				  //Write category code to b.txt
			fputs(", ",q);						  //Write, to b.txt
			fputs(oneword->word,q);				  //Write the value of the word to b.txt
			fputc('\n',q);						  //Write a word and wrap it
		}
		over = oneword->typenum;
	}
	printf("Word symbols from lexical analysis have been saved b.txt file\n");	
	fclose(q);
	return 0;
}




//******************************!! The following are custom functions****************************************
//******************************************************************************************
//Fetch the character at the current read position of str into ch,
//advance the read position by one and return the fetched character.
char m_getch()
{
	ch = str[p_input++];
	return ch;
}

//Skip blanks: as long as ch is a space, carriage return, line feed or tab,
//replace it by the next character of str. On return ch holds the first
//character that is none of these (Windows line ends are "\r\n").
void getbc()
{
	for(;;)
	{
		switch(ch)
		{
			case ' ':
			case '\r':
			case '\n':
			case '\t':
				ch = str[p_input++];
				continue;	//Re-examine the freshly read character
			default:
				return;
		}
	}
}

//Append ch to the word being assembled in token and keep token NUL-terminated.
//token has a fixed size, so once it is full (one slot is always reserved for
//the terminator) further characters are dropped instead of being written past
//the end of the buffer; an over-long word is therefore truncated.
void concat()
{
	if(p_token >= (int)sizeof(token) - 1)
		return;
	token[p_token] = ch;
	p_token++;
	token[p_token] = '\0';  //Terminate after every append so token is always a valid string
}

//Returns 1 when the current character ch is an ASCII letter (a-z or A-Z), else 0
int letter()
{
	const int is_lower = (ch >= 'a' && ch <= 'z');
	const int is_upper = (ch >= 'A' && ch <= 'Z');
	return (is_lower || is_upper) ? 1 : 0;
}

//Returns 1 when the current character ch is a decimal digit (0-9), else 0
int digit()
{
	return (ch < '0' || ch > '9') ? 0 : 1;
}

//Classify the word in token: if it matches an entry of the keyword table rwtab,
//return that keyword's category code (table index + 1); otherwise return 20,
//the category code of an identifier. The table is walked until its end marker.
int reserve()
{
	int idx;
	for(idx = 0; strcmp(rwtab[idx],_KEY_WORD_END) != 0; ++idx)
	{
		if(strcmp(rwtab[idx],token) == 0)
			return idx + 1;     //Category code of keyword
	}
	return 20;                  //Category code of identifier
}

//Move the read position one character back so that the character
//read last will be read again by the next m_getch().
void retract()
{
	p_input -= 1;
}

//Convert decimal string to corresponding binary string
char *dtb()
{
	int b = 0;
	int i = 0;
	int a = atoi(token);//String to integer
	while(a != 0)
	{
		b = b+a%2*(int)pow(10,i++);
		a = a/2;
	}
	itoa(b,token,10);//Integer to string
	return token;
}

//Lexical scan function: reads the next word symbol from str, starting at the
//current read position, and returns it.
//
//Returns a WORD allocated with new; the caller owns it and must delete it.
//Its word field points at a string literal or at the shared token buffer, so
//it must not be freed and, for identifiers, keywords and constants, is only
//valid until token is overwritten by the next call.
//
//Category codes produced here:
//  value of reserve()  word starting with a letter (keyword or identifier)
//  21                  constant (word holds its binary representation, see dtb)
//  22-29               delimiters [ ] ( ) { } , ;
//  40 <  41 >  42 ++  43 +  44 =  45 ==
//  1000                end of the code string
//  -1                  error: unknown character, or digits directly followed by a letter
//
//NOTE(review): "++" used to be reported with code 44, the same code as "=",
//which made the two operators indistinguishable by category code. 42 is the
//only unused code in the 40-45 operator range; confirm it against the
//category-code table (Figure 2).
WORD *scaner()
{
	WORD *myword = new WORD;
	myword->typenum = 20;	//Default: category code of identifier
	myword->word = "";
	p_token = 0;
	m_getch();
	getbc();				//Skip blanks; ch is now the first significant character

	if(letter())			//Starts with a letter: identifier or keyword
	{
		while(letter() || digit())
		{
			concat();
			m_getch();
		}
		retract();			//The character that ended the word belongs to the next word
		myword->typenum = reserve();
		myword->word = token;
		return myword;
	}

	if(digit())				//Starts with a digit: constant
	{
		while(digit())
		{
			concat();
			m_getch();
		}
		if(letter())		//Digits directly followed by a letter, e.g. 56x, are an error
		{
			myword->typenum = -1;
			myword->word = "ERROR";
			return myword;
		}
		retract();
		myword->typenum = 21;
		myword->word = dtb();
		return myword;
	}

	switch(ch)				//Neither letter nor digit: delimiter, operator, end or error
	{
		case '[':
			myword->typenum = 22;
			myword->word = "[";
			break;
		case ']':
			myword->typenum = 23;
			myword->word = "]";
			break;
		case '(':
			myword->typenum = 24;
			myword->word = "(";
			break;
		case ')':
			myword->typenum = 25;
			myword->word = ")";
			break;
		case '{':
			myword->typenum = 26;
			myword->word = "{";
			break;
		case '}':
			myword->typenum = 27;
			myword->word = "}";
			break;
		case ',':
			myword->typenum = 28;
			myword->word = ",";
			break;
		case ';':
			myword->typenum = 29;
			myword->word = ";";
			break;
		case '<':
			myword->typenum = 40;
			myword->word = "<";
			break;
		case '>':
			myword->typenum = 41;
			myword->word = ">";
			break;
		case '=':			//Look one character ahead to tell = from ==
			m_getch();
			if('=' == ch)
			{
				myword->typenum = 45;
				myword->word = "==";
			}
			else
			{
				retract();
				myword->typenum = 44;
				myword->word = "=";
			}
			break;
		case '+':			//Look one character ahead to tell + from ++
			m_getch();
			if('+' == ch)
			{
				myword->typenum = 42;	//Was 44, which collided with "="
				myword->word = "++";
			}
			else
			{
				retract();
				myword->typenum = 43;
				myword->word = "+";
			}
			break;
		case '\0':			//End of code string
			myword->typenum = 1000;
			myword->word = "OVER";
			break;
		default:			//Any other character is an error
			myword->typenum = -1;
			myword->word = "ERROR";
			break;
	}
	return myword;
}

4. View the output of the lexical analyzer, i.e. the b.txt file. Some screenshots are shown below.

Published 11 original articles, won praise 7, visited 1637
Private letter follow

Tags: Windows

Posted on Sat, 11 Jan 2020 07:56:26 -0800 by aliahmad