发现2006年3月写的COBOL语言解释程序在偶滴BLOG所有文章中人气最高,偶滴blog点击次数刚刚超过10万,为了祝贺。时隔2年半后再展示一个解释程序,html解释程序。html解释程序是 易扩网络搜索(2007年本人自主开发的搜索引擎)的一个小部分, 事实上为了网络搜索而使用整套解释程序是用高射炮打蚊子的行为,用DOM和regx应该能够实现,当时有这个想法,不过没有付诸行动。 下面是解释系统的框架图 Copyright (C) 2007-8 SGPRO 这个程序比较庞大,所以这里只能展示语法分析部分。 HTML源代码虽然有一套语法规则,但是仍然可以写的相当随意,所以解释程序容错能力要很高,如果是xml语法非常严格,相对应的解释程序应该简单的多(虽然我没有实现过) 语法分析(Syntax Analysis)的代码,不能单独编译,需要词法分析和错误处理等模块协作 下面2个数据结构必须说明 //1、 有限自动机,从html源代码中读取有效单词,目前只支持2种,一种是HTML有效单词,一种是Javascript有效单词 typedef struct { BOOL (*ReadHTMLToken)(Token *argToken, BOOL filterSpace); BOOL (*ReadJavaScriptToken)(Token *argToken, BOOL filterSpace);}LangDFA; // 2、语法解释器typedef struct{ HTMLObjectTree ObjectTree; // 对象树,属于解释器的目标生成对象 ObjectList ObjectArr; // 对象列表, 对象树的列表结构 HTMLInterpreterError *IntpErr; // 错误对象句柄 BOOL (*start)(BOOL (*LoadHTMLCode)(char *), // 启动接口 char filename[], Log_Type LogInfor, int (*LogFunc)(char strLog[])); HTMLInterpreterError(*GetInterpreterError)(); // 错误处理接口}HTMLInterpreter; //generate the object tree and summary array #include "sgpro_htmlInterpret_io.h"#include "sgpro_htmlInterpret_common.h"#include "sgpro_htmlInterpret_keywords.h"#include "sgpro_htmlInterpret_codebuffer.h" #include "sgpro_htmlInterpret_objecttree.h"#include "sgpro_htmlInterpret_errorhandle.h"#include "sgpro_htmlInterpret.h" #include "comstack.h" // 公用栈#include "sgpro_abstractstack.h" // 抽象堆栈 Token CurrentToken;Stack *ObjectStack; AbstractStack *ObjTreeStack; HTMLObjectTree ObjectTree;HTMLObjectTree hCurrent;ObjectList* ObjectArr; char title[1024];Token *href;Token *summary;int (*WriteSummary)(STRING);int (*WriteObject)(STRING);int (*WriteInfor)(STRING); extern HTMLCodeBuffer CodeBuff; extern LangDFA DFA; extern HTMLInterpreterError *globalError; extern char CurrentChar; BOOL StmtEndOfFront = false; HTMLInterpreter* CreateHTMLInterpreter(){ int i = 0; HTMLInterpreter* objHTMLIntp = (HTMLInterpreter* )malloc(sizeof(HTMLInterpreter)); if ( objHTMLIntp == NULL) { return objHTMLIntp; } //Init Error objHTMLIntp->IntpErr = new_HTMLInterperterError(); if (objHTMLIntp->IntpErr == NULL) { return NULL; } else { globalError = objHTMLIntp->IntpErr; } //Init global object array handle ObjectArr = &(objHTMLIntp->ObjectArr); //Init size ObjectArr->ObjectArrSize = DEF_OBJLIST_SIZE; //Init Current pointer ObjectArr->CurrentObjPointer = 0; //Init every element of object data array for ( i = 0; i < ObjectArr->ObjectArrSize; i++ ) { if (!InitHTMLObjectTreeNode(ObjectArr->data + i)) { SetInterpreterError(ErrType_CreateErr, "Create Object node error"); return NULL; } } strcpy(ObjectArr->data[ObjectArr->CurrentObjPointer++].objName, "sgpro_htmlInterpreter_objectList"); objHTMLIntp->start = HTMLInterpret; objHTMLIntp->GetInterpreterError = GetInterpreterError; return objHTMLIntp;} BOOL InitHTMLInterpreter(HTMLInterpreter* objHTMLIntp){ int i = 0; if (objHTMLIntp == NULL) { return false; } memset(objHTMLIntp, 0x00, sizeof(objHTMLIntp)); //Init Error objHTMLIntp->IntpErr = new_HTMLInterperterError(); if (objHTMLIntp->IntpErr == NULL) { return false; } else { globalError = objHTMLIntp->IntpErr; } //Init global object array handle ObjectArr = &(objHTMLIntp->ObjectArr); //Init size ObjectArr->ObjectArrSize = DEF_OBJLIST_SIZE; ObjectArr->data[0].AttribArrsize = ATTRIBARRSIZE; //Init every element of object data array for ( i = 1; i < ObjectArr->ObjectArrSize; i++ ) { ObjectArr->data[i] = ObjectArr->data[0]; } objHTMLIntp->start = HTMLInterpret; objHTMLIntp->GetInterpreterError = GetInterpreterError; return true;} static BOOL HTMLInterpret(BOOL (*LoadHTMLCode)(char *), char filename[], Log_Type LogInforType, int (*LogFunc)(char strLog[])){ BOOL result = false; if (!CreateToken(&CurrentToken)) { SetInterpreterError(ErrType_CreateErr, "Create token error"); return result; } if (!CreateStack(&ObjectStack, CreateToken, 1024)) { SetInterpreterError(ErrType_CreateErr, "Create stack error"); return result; } LogSetting(LogInforType, LogFunc); if (LoadHTMLCode == NULL) { CodeBuff.LoadHTMLCode = LoadHTMLCodeByFile; } else { CodeBuff.LoadHTMLCode = LoadHTMLCode; } if (CodeBuff.LoadHTMLCode(filename)) { if (result = Syntax_Analysis()) { ShowInfor("\nOK!\n"); } else { ShowInfor("Error Code: %02d\n, Error Message", GetInterpreterErrorCode(), GetInterpreterErrorString()); ShowInfor("\nFail!\n"); } } CloseAllFile(); DestroyStack(&ObjectStack); DestroyToken(&CurrentToken); return result;} BOOL Syntax_Analysis(){ if (!DFA.ReadHTMLToken(&CurrentToken, true)) { SetInterpreterError(ErrType_SyntaxErr, "No Token can be loaded"); return false; } if (!HTMLPage() && CurrentToken.type != null) { return false; } if (!StackEmpty(*ObjectStack)) { ShowInfor("Object stack not clear(Last object string:%s) when terminal the analyse!\n", GetTop(*ObjectStack).string); if (SetInterpreterError(ErrType_StackErr, "")) { sprintf(globalError->message, "Object stack not clear(Last object string:%s) when terminal the analyse!\n", GetTop(*ObjectStack).string); } return false; } return true;} BOOL HTMLPage(){ if (CurrentToken.type == MarkStart || CurrentToken.type == Summary || CurrentToken.type == String || CurrentToken.type == Space || CurrentToken.type == Atom) { if (!HTMLStatement(CurrentToken, NodeType_Sub)) { return false; } if ((CurrentToken.type == MarkStart || CurrentToken.type == Summary || CurrentToken.type == String || CurrentToken.type == Space || CurrentToken.type == Atom) && !HTMLPage()) { return false; } } return true;} BOOL HTMLStatement(Token argTK, NodeType type){ BOOL SummaryStatement = false; switch (argTK.type) { case Summary: case String: case Space: case Atom: if (!HTMLSummary(argTK))// statement => SummaryArr { return false; } else { SummaryStatement = true; ShowInfor("\n"); } break; case MarkStart: //< if (!DFA.ReadHTMLToken(&CurrentToken, true)) //filter the < bettween the next token { ShowInfor("Repected: HTML Mark, No token can load."); SetInterpreterError(ErrType_SyntaxErr, "Repected: HTML Mark, No token can load"); return false; } if (CurrentToken.type == Summary) //<summary { if (!HTMLObjectFront(CurrentToken, type)) // <object | <object/> | <object> { return false; } switch (CurrentToken.type) // > | /> | > { case Atom: if (IsHTMLMarkEnd(CurrentToken.string[0])) //<object> { if (!StackEmpty(*ObjectStack) && !strcmp(GetTop(*ObjectStack).string, "SCRIPT")) { if (JavaScript(CurrentToken)) // javascript { DFA.ReadHTMLToken(&CurrentToken, true); // </ } else { return false; } } else { DFA.ReadHTMLToken(&CurrentToken, true); if (!StackEmpty(*ObjectStack) && IsFilter(GetTop(*ObjectStack).string) ) //Filter objects { //DFA.ReadHTMLToken(&CurrentToken, true); while (CurrentToken.type != MarkEndStart && DFA.ReadHTMLToken(&CurrentToken, true)) { //DFA.ReadHTMLToken(&CurrentToken); if (CurrentToken.type == MarkEndStart ) { if (DFA.ReadHTMLToken(&CurrentToken, true)) { if (!strcmp(GetTop(*ObjectStack).string, strupr(CurrentToken.string))) { CodeBuff.RollBackReadPointer(); CodeBuff.RollBackReadPointer(); //CodeBuff.RollBackReadPointer(); //CodeBuff.RollBackReadPointer(); CurrentChar = CodeBuff.ReadChar(); DFA.ReadHTMLToken(&CurrentToken, true); break; } } else { ShowInfor("End scan but not match the object :%s\n", GetTop(*ObjectStack).string); if (SetInterpreterError(ErrType_SyntaxErr, "")) { sprintf(globalError->message, "End scan but not match the object :%s\n", GetTop(*ObjectStack).string); } } } } } } switch (CurrentToken.type) //<object> { case MarkEndStart:// <object></ if (!HTMLObjectRear(CurrentToken, type)) //<object></object> { return false; } StmtEndOfFront = false; break;// statement => <object></object> default: if (!HTMLStatement(CurrentToken, type))//<object><statement> { return false; } switch (CurrentToken.type) //<object><statement></ | summary | { case MarkEndStart: //<object><statement></ if (!HTMLObjectRear(CurrentToken, type)) //<object><statement></object> { return false; } break; }//<object><statement></object> } } break; case MarkEndObject: //<object /> if (!DFA.ReadHTMLToken(&CurrentToken, true)) { return false; } break; //2007-8-25 break object by < case MarkStart: return true; default: return false; } } }// end statement if (CurrentToken.type == Summary || CurrentToken.type == MarkStart || CurrentToken.type == String || CurrentToken.type == Space || CurrentToken.type == Atom) { if (!HTMLStatement(CurrentToken, SummaryStatement? type:NodeType_Next)) //statement | statements { return false; } } return true;} BOOL HTMLObjectFront(Token argTK, NodeType type){ Token stackRet; static int Root = 1; int i = 0; if (argTK.type == Summary) //<name { strupr(argTK.string); if (Push(ObjectStack, CopyToken, argTK)) { strcpy(ObjectArr->data[ObjectArr->CurrentObjPointer].objName, argTK.string); ShowInfor("Start Object: %s\n", GetTop(*ObjectStack).string); } else { ShowInfor("Object Stack overflow.\n"); SetInterpreterError(ErrType_StackErr, "Object Stack overflow"); return false; } //CurrentToken.string; Object Name; } DFA.ReadHTMLToken(&CurrentToken, true); switch (CurrentToken.type) { case Summary: //< name attributes if (!ObjectAttributes(CurrentToken)) { return false; } break; } //<name attributes /> | <name attributes> | <name> | <name/> switch (CurrentToken.type) // > | /> { case Atom: //case MarkEndObject: if (IsHTMLMarkEnd(CurrentToken.string[0])) // <name> | <name attributes > { if (IsAloneObject(GetTop(*ObjectStack).string)) {// ObjectFront => <BR> | <INPUT> if (Pop(ObjectStack, &stackRet)) { ShowInfor("End Object: %s\n", stackRet.string); } else { ShowInfor("Object Stack empty.\n"); SetInterpreterError(ErrType_StackErr, "Object Stack empty"); return false; } }// ObjectFront => <object> } break; case MarkEndObject: if (Pop(ObjectStack, &stackRet)) { ShowInfor("End Object: %s\n", stackRet.string); } else { ShowInfor("Object Stack empty.\n"); SetInterpreterError(ErrType_StackErr, "Object Stack empty"); return false; } //<object /> break; // <name /> case MarkStart: //break the object 2007-8-25 return true; default: return false; } if (ObjectArr->CurrentObjPointer + 1 >= ObjectArr->ObjectArrSize) { //ObjectArr->data = (struct _HTMLObjectNode *)realloc(ObjectArr->data, ObjectArr->ObjectArrSize + DEF_OBJLIST_INC); if (ObjectArr->data == NULL) { SetInterpreterError(ErrType_CreateErr, "Create Object array data error"); return false; } //Init every element of object data array for ( i = 0; i < DEF_OBJLIST_INC; i++ ) { if (!InitHTMLObjectTreeNode(ObjectArr->data + ObjectArr->ObjectArrSize + i)) { SetInterpreterError(ErrType_CreateErr, "Create Object node error"); return false; } } ObjectArr->ObjectArrSize += DEF_OBJLIST_INC; } ObjectArr->CurrentObjPointer++; return true;} BOOL ObjectAttributes(Token argTK){ if (!AttributeNode(argTK)) //<name attrib { return false; } FilterSpace(); switch (CurrentToken.type) { case Summary: if (!ObjectAttributes(CurrentToken)) // <name attribs... { return false; } break; case Atom: while (!IsHTMLMarkEnd(CurrentToken.string[0])) // <name attribs...> { ShowInfor("Warning: Repected \">\" Current: %s\n", CurrentToken.string); //sprintf(globalError->message, // "Repected: \">\" Current: %s\n", CurrentToken.string); //SetInterpreterError(ErrType_SyntaxErr, globalError->message); DFA.ReadHTMLToken(&CurrentToken, true); //return false; } break; case MarkEndObject: //<name attribs... /> break; default: if (CurrentToken.type != MarkStart) { ShowInfor("Warning: Repected \">\" \nCurrent:%s\n", CurrentToken.string); if (SetInterpreterError(ErrType_SyntaxErr, "")) { sprintf(globalError->message, "Repected: \">\" \nCurrent:%s\n", CurrentToken.string); } //return false; } } return true;} BOOL AttributeNode(Token argTK){ // attributename char name[ATTRIBNAMELEN] = ""; char value[ATTRIBVALUELEN] = ""; int iObjCurPnt = ObjectArr->CurrentObjPointer; int iAttribCurPnt = ObjectArr->data[iObjCurPnt].AttribArrCurrentPointer; //assign to object tree node ShowInfor("AttriName: %s, ", CurrentToken.string); strcpy(ObjectArr->data[iObjCurPnt].AttribArr[iAttribCurPnt].name, strupr(CurrentToken.string)); //strcpy(name, CurrentToken.string); if (!DFA.ReadHTMLToken(&CurrentToken, true)) { return false; } switch (CurrentToken.type) { case Atom: if (!IsEqual(CurrentToken.string[0])) // attributename = { if (IsHTMLMarkEnd(CurrentToken.string[0])) // attributename { ShowInfor("Value None.\n"); break; } return false; } DFA.ReadHTMLToken(&CurrentToken, true); do { if (CurrentToken.type == Space || // ' ' CurrentToken.string[0] == '>' || // > CurrentToken.type == MarkEndObject || // /> CurrentToken.type == MarkStart || // < CurrentToken.type == MarkEndStart) // </ { //printf("debug-type"); break; } else { strcat(value, CurrentToken.string); } } while (DFA.ReadHTMLToken(&CurrentToken, false) ); if (strlen(value)) { ShowInfor( "value: %s\n", value); strcpy(ObjectArr->data[iObjCurPnt].AttribArr[iAttribCurPnt].value, value); ObjectArr->data[iObjCurPnt].AttribArrCurrentPointer++; if (ObjectArr->data[iObjCurPnt].AttribArrCurrentPointer > ATTRIBARRSIZE) { ShowInfor( "Overflow: attributes array too long!\n"); // =? SetInterpreterError(ErrType_CreateErr, "Overflow: attributes array too long!"); return false; } } else { ShowInfor( "Repected: attribute value!\n"); // =? SetInterpreterError(ErrType_StackErr, "Repected: attribute value"); return false; } break; // No Value attribute node case Summary: case MarkEndObject: ShowInfor("Value None.\n"); break; default: return false; } return true;} BOOL HTMLObjectRear(Token argTK, NodeType type){ int index = 0; Token stackRet; if (argTK.type != MarkEndStart) { return false; } if (DFA.ReadHTMLToken(&CurrentToken, true) && CurrentToken.type == Summary) { //object end if (!IsAloneObject(CurrentToken.string)) { if (!StackEmpty(*ObjectStack)) { if (!strcmp(strupr(CurrentToken.string), GetTop(*ObjectStack).string)) { Pop(ObjectStack, &stackRet); ShowInfor("End Object: %s\n", stackRet.string); } else { ShowInfor( "[Warning] Current Object :%s Incompatiable Stack Object: %s, Warning has be passed.\n", CurrentToken.string, GetTop(*ObjectStack).string); //2007-06-17 Add index = IndexOfStack(*ObjectStack, CurrentToken, TokenEqual); if (index == -1) // have not the object front, throw the object rear { // } else // front of the object have not starting, pop them. { while (Pop(ObjectStack, &stackRet)) { if (!strcmp(strupr(CurrentToken.string), stackRet.string)) { ShowInfor( "End Object: %s\n", stackRet.string); break; } } } } } else { ShowInfor( "Object Stack empty.\n"); SetInterpreterError(ErrType_StackErr, "Object Stack empty"); return false; } } //</object while (DFA.ReadHTMLToken(&CurrentToken, true) && !IsHTMLMarkEnd(CurrentToken.string[0])); { DFA.ReadHTMLToken(&CurrentToken, true); // ObjectRear => </Object> return true; } } ShowInfor("Repected: HTML object end mark. Current: %s\n",CurrentToken.string); if (SetInterpreterError(ErrType_StackErr, "")) { sprintf(globalError->message, "Repected: HTML object end mark. Current: %s\n",CurrentToken.string); } return false;} BOOL HTMLSummary(Token argTK){ //HTMLObjectTreeNode *p = AbstStackGetPop(ObjectTreeStack); HTMLObjectTreeNode *p; int sumlen = 0; int arglen = strlen(argTK.string); p = ObjectArr->data + ObjectArr->CurrentObjPointer - 1; if (argTK.type != MarkStart && argTK.type != MarkEndStart && argTK.type != null) //SummaryArr => summary* { if (!strcmp(argTK.string, " ")) { strcpy(argTK.string ," "); } sumlen = strlen(p->summary); if(sumlen == 0) { p->summary = (char*)malloc(arglen*sizeof(char)); memset(p->summary, '\0', arglen); } else { p->summary = (char*)realloc(p->summary, (sumlen + arglen) * sizeof(char)); memset(p->summary + sumlen, '\0', arglen); } strcat(p->summary, argTK.string); ShowInfor( "%s", argTK.string); if (!DFA.ReadHTMLToken(&CurrentToken, true)) { return true; } HTMLSummary(CurrentToken); } else { return false; } return true;} void FilterSpace(){ if (CurrentToken.type == Space) { DFA.ReadHTMLToken(&CurrentToken, true); }} BOOL JavaScript(Token argTK){ BOOL jsarr = true; BOOL ret = false; do { while (jsarr) { jsarr = jsarr && DFA.ReadJavaScriptToken(&CurrentToken, true); } DFA.ReadHTMLToken(&CurrentToken, true); if (CurrentToken.type == MarkEndStart) { if (DFA.ReadHTMLToken(&CurrentToken, true)) { if (!strcmp(strupr(CurrentToken.string), "SCRIPT")) { CodeBuff.RollBackReadPointer(); CodeBuff.RollBackReadPointer(); CurrentChar = CodeBuff.ReadChar(); ret = true; break; } } } jsarr = true; } while (ret == false); return ret;} 下面是以html解释程序为内核的 WIN32应用程序 GUI, 左边是html源代码,右边是解释器输出的解释日志,追踪了对象数组的生成过程。

评论