/* l2xirexp.c builds recognizers for two kinds of regular expressions */ /* Based on rexpr.c, part of the PCCTS compiler/generator distribution */ /* * This file contains code for * * int rexpr(char *expr, char *s); * * which answers * * 1 if 's' is in the language described by the regular expression 'expr' * 0 if it is not * -1 if the regular expression is invalid * * Language membership is determined by constructing a non-deterministic * finite automata (NFA) from the regular expression. A depth- * first-search is performed on the NFA (graph) to check for a match of 's'. * Each non-epsilon arc consumes one character from 's'. Backtracking is * performed to check all possible paths through the NFA. * * Regular expressions follow the meta-language: * * ::= ( '|' )* * * ::= ( )* * * ::= {'~'} '[' ']' * | '(' ')' * | '{' '}' * | * * ::= { '*' | '+' } * * ::= ( )* * | { } '-' { } * * ::= Token[Atom] * * Notes: * ~ means complement the set in [..]. i.e. all characters * not listed * * means match 0 or more times (can be on expression * or atom) * + means match 1 or more times (can be on expression * or atom) * {} optional * () grouping * [] set of atoms * x-y all characters from x to y (found only in [..]) * \xx the character with value xx * * Examples: * [a-z]+ * match 1 or more lower-case letters (e.g. variable) * * 0x[0-9A-Fa-f]+ * match a hex number with 0x on front (e.g. 0xA1FF) * * [0-9]+.[0-9]+{e[0-9]+} * match a floating point number (e.g. 3.14e21) * * Code example: * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword * * Terence Parr * Purdue University * April 1991 */ /**********************************************************************/ /* some modifications by Peter Wilson, Catholic University of America */ /* * Complement changed from ~[...] to [^...] to match AWK/lex * Optional changed from {...} to (r)? to match AWK/lex * Swapped parameters in rexpr() * * Major additions/changes to support the EXPRESS LIKE operator: * string LIKE pattern -> TRUE or FALSE * * @ matches any letter * ^ matches any uppercase letter * ? matches any character * & matches remainder of string * # matches any digit * $ matches any substring terminated by space or end of string * * matches any number of characters * \ escape character * */ /**********************************************************************/ #include #include #include #include "l2xirexp.h" #include "l2xiidbg.h" static int regExpr(); static int andExpr(); static int expr(); static int repeatSymbol(); static int atomList(); static int next(); static ArcPtr newGraphArc(); static NodePtr newNode(); static int ArcBetweenGraphNode(); static Graph BuildNFA_atom(); static Graph BuildNFA_AB(); static Graph BuildNFA_AorB(); static Graph BuildNFA_set(); static Graph BuildNFA_Astar(); static Graph BuildNFA_Aplus(); static Graph BuildNFA_Aoptional(); static char *_c; static int rx_token, tokchar; static NodePtr accept; static NodePtr freelist = NULL; int like_op = 0; /* PRW = 1 iff processing EXPRESS LIKE pattern */ /* * return 1 if s in language described by expr * 0 if s is not * -1 if expr is an invalid regular expression */ rexpr(s, expr) char *expr, *s; { NodePtr p,q; Graph nfa; int result; entry_debug("rexpr (l2xirexp.c)"); sprintf(dbuffer, "rexpr(%s, %s);\n", s, expr); debug_print(dbuffer); like_op = 0; freelist = NULL; _c = expr; next(); if ( regExpr(&nfa) == -1 ) return -1; accept = nfa.right; result = match(nfa.left, s); /* free all your memory */ p = q = freelist; while ( p!=NULL ) { q = p->track; free(p); p = q; } exit_debug("rexpr"); return result; } /* * do a depth-first-search on the NFA looking for a path from start to * accept state labelled with the characters of 's'. */ match(automaton, s) NodePtr automaton; char *s; { ArcPtr p; if ( automaton == accept && *s == '\0' ) return 1; /* match */ for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */ { if ( p->label == Epsilon ) { if ( match(p->target, s) ) return 1; } else if ( p->label == *s ) if ( match(p->target, s+1) ) return 1; } return 0; } /* * ::= ( '|' {} )* * * Return -1 if syntax error * Return 0 if none found * Return 1 if a regExrp was found */ static regExpr(g) GraphPtr g; { Graph g1, g2; if ( andExpr(&g1) == -1 ) { return -1; } while ( rx_token == '|' ) { int a; next(); a = andExpr(&g2); if ( a == -1 ) return -1; /* syntax error below */ else if ( !a ) return 1; /* empty alternative */ g1 = BuildNFA_AorB(g1, g2); } if ( rx_token!='\0' ) return -1; *g = g1; return 1; } /* * ::= ( )* */ static andExpr(g) GraphPtr g; { Graph g1, g2; if ( expr(&g1) == -1 ) { return -1; } /* PRW while ( rx_token==Atom || rx_token=='{' || rx_token=='(' || rx_token=='~' || rx_token=='[' ) */ while ( rx_token==Atom || rx_token=='(' || rx_token=='[' ) { if (expr(&g2) == -1) return -1; g1 = BuildNFA_AB(g1, g2); } *g = g1; return 1; } /* * ::= {'~'} '[' ']' * | '(' ')' * | '{' '}' * | * PRW changed above to: * ::= '[' {'^'} ']' * | '(' ')' * | */ static expr(g) GraphPtr g; { int complement = 0; char s[257]; /* alloc space for string of char in [] */ /* PRW if ( rx_token == '~' || rx_token == '[' ) */ if ( rx_token == '[' ) { next(); if (rx_token == '^') { complement = 1; next();} /* PRW if ( rx_token == '~' ) {complement = 1; next();} * if ( rx_token != '[' ) return -1; * next(); */ if ( atomList( s, complement ) == -1 ) return -1; *g = BuildNFA_set( s ); if ( rx_token != ']' ) return -1; next(); repeatSymbol( g ); return 1; } if ( rx_token == '(' ) { next(); if ( regExpr( g ) == -1 ) return -1; if ( rx_token != ')' ) return -1; next(); repeatSymbol( g ); return 1; } /* PRW if ( rx_token == '{' ) * { * next(); * if ( regExpr( g ) == -1 ) return -1; * if ( rx_token != '}' ) return -1; * next(); * -- S p e c i a l C a s e O p t i o n a l { } -- * if ( rx_token != '*' && rx_token != '+' ) * { * *g = BuildNFA_Aoptional( *g ); * } * repeatSymbol( g ); * return 1; * } */ if ( rx_token == Atom ) { *g = BuildNFA_atom( tokchar ); next(); repeatSymbol( g ); return 1; } return -1; } /* * ::= { '*' | '+' } * PRW changed to ::= { '*' | '+' | '?' } */ static repeatSymbol(g) GraphPtr g; { switch ( rx_token ) { case '*' : *g = BuildNFA_Astar( *g ); next(); break; case '+' : *g = BuildNFA_Aplus( *g ); next(); break; case '?' : *g = BuildNFA_Aoptional( *g ); next(); break; } return 1; } /* * ::= { }* * { } '-' { } * * a-b is same as ab * q-a is same as q */ static atomList(p, complement) char *p; int complement; { static unsigned char set[256]; /* no duplicates */ int first, last, i; char *s = p; if ( rx_token != Atom ) return -1; for (i=0; i<256; i++) set[i] = 0; while ( rx_token == Atom ) { if ( !set[tokchar] ) *s++ = tokchar; set[tokchar] = 1; /* Add atom to set */ next(); if ( rx_token == '-' ) /* have we found '-' */ { first = *(s-1); /* Get last char */ next(); if ( rx_token != Atom ) return -1; else { last = tokchar; } for (i = first+1; i <= last; i++) { if ( !set[tokchar] ) *s++ = i; set[i] = 1; /* Add atom to set */ } next(); } } *s = '\0'; if ( complement ) { for (i=0; i<256; i++) set[i] = !set[i]; for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i; *s = '\0'; } return 1; } /* a somewhat stupid lexical analyzer */ static next() { /* PRW seperate scanners for regular expression and LIKE operator */ if (like_op == 0) { /* regex scanning */ while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++; if ( *_c=='\\' ) { _c++; if ( isdigit(*_c) ) { int n=0; while ( isdigit(*_c) ) { n = n*10 + (*_c++ - '0'); } if ( n>255 ) n=255; tokchar = n; } else { switch (*_c) { case 'n' : tokchar = '\n'; break; case 't' : tokchar = '\t'; break; case 'r' : tokchar = '\r'; break; default : tokchar = *_c; } _c++; } rx_token = Atom; } else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='-' && *_c!=')' && *_c!=']' && *_c!='?' && *_c!='+' && *_c!='*' && *_c!='^' && *_c!='|' ) { /* PRW deleted {, } and ~ while adding ^ and ? */ rx_token = Atom; tokchar = *_c++; } else { rx_token = tokchar = *_c++; } return; } /* end of regex scanner */ else if (like_op == 1) { /* LIKE scanning */ while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++; if ( *_c=='\\' ) { _c++; switch (*_c) { case 'n' : tokchar = '\n'; break; case 't' : tokchar = '\t'; break; case 'r' : tokchar = '\r'; break; default : tokchar = *_c; } _c++; rx_token = Atom; } else if ( isgraph(*_c) && *_c!='@' && *_c!='^' && *_c!='?' && *_c!='&' && *_c!='#' && *_c!='$' && *_c!='*' && *_c!='!' ) { rx_token = Atom; tokchar = *_c++; } else { rx_token = tokchar = *_c++; } return; } /* end of LIKE scanner */ } /* N F A B u i l d i n g R o u t i n e s */ static ArcPtr newGraphArc() { ArcPtr p; p = (ArcPtr) calloc(1, sizeof(Arc)); if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} if ( freelist != NULL ) p->track = (ArcPtr) freelist; freelist = (NodePtr) p; return p; } static NodePtr newNode() { NodePtr p; p = (NodePtr) calloc(1, sizeof(Node)); if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} if ( freelist != NULL ) p->track = freelist; freelist = p; return p; } static ArcBetweenGraphNodes(i, j, label) NodePtr i, j; int label; { ArcPtr a; a = newGraphArc(); if ( i->arcs == NULL ) i->arctail = i->arcs = a; else {(i->arctail)->next = a; i->arctail = a;} a->label = label; a->target = j; } static Graph BuildNFA_atom(label) int label; { Graph g; g.left = newNode(); g.right = newNode(); ArcBetweenGraphNodes(g.left, g.right, label); return( g ); } static Graph BuildNFA_AB(A, B) Graph A, B; { Graph g; ArcBetweenGraphNodes(A.right, B.left, Epsilon); g.left = A.left; g.right = B.right; return( g ); } static Graph BuildNFA_AorB(A, B) Graph A, B; { Graph g; g.left = newNode(); ArcBetweenGraphNodes(g.left, A.left, Epsilon); ArcBetweenGraphNodes(g.left, B.left, Epsilon); g.right = newNode(); ArcBetweenGraphNodes(A.right, g.right, Epsilon); ArcBetweenGraphNodes(B.right, g.right, Epsilon); return( g ); } static Graph BuildNFA_set( s ) char *s; { Graph g; if ( s == NULL ) return g; g.left = newNode(); g.right = newNode(); while ( *s != '\0' ) { ArcBetweenGraphNodes(g.left, g.right, *s++); } return g; } static Graph BuildNFA_Astar( A ) Graph A; { Graph g; g.left = newNode(); g.right = newNode(); ArcBetweenGraphNodes(g.left, A.left, Epsilon); ArcBetweenGraphNodes(g.left, g.right, Epsilon); ArcBetweenGraphNodes(A.right, g.right, Epsilon); ArcBetweenGraphNodes(A.right, A.left, Epsilon); return( g ); } static Graph BuildNFA_Aplus( A ) Graph A; { ArcBetweenGraphNodes(A.right, A.left, Epsilon); return( A ); } static Graph BuildNFA_Aoptional( A ) Graph A; { Graph g; g.left = newNode(); g.right = newNode(); ArcBetweenGraphNodes(g.left, A.left, Epsilon); ArcBetweenGraphNodes(g.left, g.right, Epsilon); ArcBetweenGraphNodes(A.right, g.right, Epsilon); return( g ); } /* LIKE operator code */ /************************************************************************/ /* likeExpr() Process likeExpr */ static likeExpr(g) GraphPtr g; { int complement = 0; char s[257]; int i; Graph g2; if (rx_token == '\\') { /* escape char */ next(); } if (rx_token == '!') { /* NOT char */ complement = 1; next(); } if (rx_token == Atom) { if (complement == 1) { do_not_atom(s, tokchar); *g = BuildNFA_set(s); } else { *g = BuildNFA_atom(tokchar); } next(); return(1); } else if ( rx_token == '@' || rx_token == '^' || rx_token == '?' || rx_token == '&' || rx_token == '#' || rx_token == '$' || rx_token == '*' ) { do_like_meta(rx_token, s, complement); *g = BuildNFA_set(s); if ( rx_token == '&' || rx_token == '*' || rx_token == '$' ) { /* multiple chars */ *g = BuildNFA_Astar(*g); if (rx_token == '$') { /* add 0 or more following space chars */ g2 = BuildNFA_atom(' '); g2 = BuildNFA_Astar(g2); *g = BuildNFA_AB(*g, g2); } } next(); return(1); } return(-1); /* should not be here */ } /* end LIKEEXPR */ /************************************************************************/ /************************************************************************/ /* likePat() Process likePat ::= ( )* */ static likePat(g) GraphPtr g; { Graph g1, g2; if (likeExpr(&g1) == -1) { return(-1); } while (rx_token != '\0') { if (likeExpr(&g2) == -1) { return(-1); } g1 = BuildNFA_AB(g1, g2); } *g = g1; return(1); } /* end LIKEPAT */ /************************************************************************/ /************************************************************************/ /* like_expr(expr,s) */ /* returns -1 if expr not an EXPRESS LIKE pattern */ /* 0 if s does not match pattern expr */ /* 1 if s matches pattern expr */ int like_expr(s, expr) char *s; /* string to be matched */ char *expr; /* the pattern */ { NodePtr p, q; Graph nfa; int result; entry_debug("like_expr (l2xirexp.c)"); sprintf(dbuffer, "like_expr(%s, %s);\n", s, expr); debug_print(dbuffer); like_op = 1; /* set correct scanner */ freelist = NULL; _c = expr; next(); if (likePat(&nfa) == -1) { return(-1); } accept = nfa.right; result = match(nfa.left, s); /* free memory */ p = q = freelist; while (p != NULL) { q = p->track; free(p); p = q; } exit_debug("like_expr"); return(result); } /* end LIKE_EXPR */ /************************************************************************/ /************************************************************************/ /* do_not_atom() Process !a */ int do_not_atom(p, c) char *p; int c; /* the negated atom char */ { static unsigned char set [256]; int i; char *s = p; for (i = 0; i < 256; i++) { /* all characters */ set[i] = 1; } set[c] = !set[c]; /* except for this one */ for (i = 1, s = p; i < 256; i++) { if (set[i]) *s++ = i; } *s = '\0'; return(1); } /* end DO_NOT_ATOM */ /************************************************************************/ /************************************************************************/ /* do_like_meta() Process EXPRESS LIKE meta character */ int do_like_meta(meta, p, complement) int meta; /* the meta character */ char *p; int complement; /* = 1 if complement the set */ { static unsigned char set[256]; int i; char *s = p; /* ensure all character are noted in the set */ for (i = 0; i < 256; i++) set[i] = 1; switch (meta) { case '@' : { /* a single alphabetic character */ for (i = 0; i < 256; i++) set[i] = 0; for (i = 'a'; i <= 'z'; i++) set[i] = 1; for (i = 'A'; i <= 'Z'; i++) set[i] = 1; break; } case '^' : { /* a single upper case alphabetic character */ for (i = 0; i < 256; i++) set[i] = 0; for (i = 'A'; i <= 'Z'; i++) set[i] = 1; break; } case '?' : { /* a single character */ break; } case '&' : { /* remainder of string */ break; } case '#' : { /* a single digit */ for (i = 0; i < 256; i++) set[i] = 0; for (i = '0'; i <= '9'; i++) set[i] = 1; break; } case '$' : { /* substring ended by space or EOS */ i = ' '; set[i] = !set[i]; break; } case '*' : { /* any character */ break; } default : { /* an ERROR */ return(-1); } } /* end switch */ if (complement == 1) { /* invert the set */ for (i = 0; i < 256; i++) set[i] = !set[i]; } /* add all the search characters to the string */ for (i = 1, s = p; i < 256; i++) { if (set[i]) *s++ = i; } *s = '\0'; return(1); } /* end DO_LIKE_META */ /************************************************************************/