/* ed style regular expressions */ /* Copyright (c) 1992, 1999, 2001 John E. Davis * * You may distribute under the terms of either the GNU General Public * License or the Perl Artistic License. */ #include "slinclud.h" #include "slang.h" #include "_slang.h" #define SET_BIT(b, n) b[(unsigned int) (n) >> 3] |= 1 << ((unsigned int) (n) % 8) #define TEST_BIT(b, n) (b[(unsigned int)(n) >> 3] & (1 << ((unsigned int) (n) % 8))) #define LITERAL 1 #define RANGE 2 /* [...] */ #define ANY 3 /* . */ #define BOL 4 /* ^ */ #define EOL 5 /* $ */ #define NTH_MATCH 6 /* \1 \2 ... \9 */ #define OPAREN 7 /* \( */ #define CPAREN 0x8 /* \) */ #define ANY_DIGIT 0x9 /* \d */ #define BOW 0xA /* \< */ #define EOW 0xB /* \> */ #if 0 #define NOT_LITERAL 0xC /* \~ */ #endif #define STAR 0x80 /* * */ #define LEAST_ONCE 0x40 /* + */ #define MAYBE_ONCE 0x20 /* ? */ #define MANY 0x10 /* {n,m} */ /* The rest are additions */ #define YES_CASE (STAR | BOL) #define NO_CASE (STAR | EOL) #define UPPERCASE(x) (cs ? (x) : UPPER_CASE(x)) #define LOWERCASE(x) (cs ? (x) : LOWER_CASE(x)) static unsigned char Word_Chars[256]; #define IS_WORD_CHAR(x) Word_Chars[(unsigned int) (x)] #if 0 static int ctx->open_paren_number; static char Closed_Paren_Matches[10]; static SLRegexp_Type *This_Reg; static unsigned char *This_Str; #endif typedef struct { SLRegexp_Type *reg; unsigned char *str; unsigned int len; char closed_paren_matches[10]; int open_paren_number; } Re_Context_Type; static unsigned char *do_nth_match (Re_Context_Type *ctx, int n, unsigned char *str, unsigned char *estr) { unsigned char *bpos; if (ctx->closed_paren_matches[n] == 0) return NULL; bpos = ctx->reg->beg_matches[n] + ctx->str; n = ctx->reg->end_matches[n]; if (n == 0) return(str); if (n > (int) (estr - str)) return (NULL); /* This needs fixed for case sensitive match */ if (0 != strncmp((char *) str, (char *) bpos, (unsigned int) n)) return (NULL); str += n; return (str); } /* returns pointer to the end of regexp or NULL */ static unsigned char *regexp_looking_at (Re_Context_Type *ctx, register unsigned char *str, unsigned char *estr, unsigned char *buf, register int cs) { register unsigned char p, p1; unsigned char *save_str, *tmpstr; int n, n0, n1; int save_num_open; char save_closed_matches[10]; p = *buf++; while (p != 0) { /* p1 = UPPERCASE(*buf); */ /* if (str < estr) c = UPPERCASE(*str); */ switch((unsigned char) p) { case BOW: if ((str != ctx->str) && ((str >= estr) || IS_WORD_CHAR(*(str - 1)) || (0 == IS_WORD_CHAR(*str)))) return NULL; break; case EOW: if ((str < estr) && IS_WORD_CHAR (*str)) return NULL; break; case YES_CASE: cs = 1; break; case NO_CASE: cs = 0; break; case OPAREN: ctx->open_paren_number++; ctx->reg->beg_matches[ctx->open_paren_number] = (int) (str - ctx->str); break; case CPAREN: n = ctx->open_paren_number; while (n > 0) { if (ctx->closed_paren_matches[n] != 0) { n--; continue; } ctx->closed_paren_matches[n] = 1; ctx->reg->end_matches[n] = (unsigned int) (str - (ctx->str + ctx->reg->beg_matches[n])); break; } break; #ifdef NOT_LITERAL case NOT_LITERAL: if ((str >= estr) || (*buf == UPPERCASE(*str))) return (NULL); str++; buf++; break; case MAYBE_ONCE | NOT_LITERAL: save_str = str; if ((str < estr) && (*buf != UPPERCASE(*str))) str++; buf++; goto match_rest; case NOT_LITERAL | LEAST_ONCE: /* match at least once */ if ((str >= estr) || (UPPERCASE(*str) == UPPERCASE(*buf))) return (NULL); str++; /* drop */ case STAR | NOT_LITERAL: save_str = str; p1 = *buf; while ((str < estr) && (UPPERCASE(*str) != p1)) str++; buf++; goto match_rest; /* this type consists of the expression + two bytes that determine number of matches to perform */ case MANY | NOT_LITERAL: p1 = *buf; buf++; n = n0 = (int) (unsigned char) *buf++; /* minimum number to match--- could be 0 */ n1 = (int) (unsigned char) *buf++; /* maximum number to match */ while (n && (str < estr) && (p1 != *str)) { n--; str++; } if (n) return (NULL); save_str = str; n = n1 - n0; while (n && (str < estr) && (p1 != *str)) { n--; str++; } goto match_rest; #endif /* NOT_LITERAL */ case LITERAL: if ((str >= estr) || (*buf != UPPERCASE(*str))) return (NULL); str++; buf++; break; case MAYBE_ONCE | LITERAL: save_str = str; if ((str < estr) && (*buf == UPPERCASE(*str))) str++; buf++; goto match_rest; case LITERAL | LEAST_ONCE: /* match at least once */ if ((str >= estr) || (UPPERCASE(*str) != UPPERCASE(*buf))) return (NULL); str++; /* drop */ case STAR | LITERAL: save_str = str; p1 = *buf; while ((str < estr) && (UPPERCASE(*str) == p1)) str++; buf++; goto match_rest; /* this type consists of the expression + two bytes that determine number of matches to perform */ case MANY | LITERAL: p1 = *buf; buf++; n = n0 = (int) (unsigned char) *buf++; /* minimum number to match--- could be 0 */ n1 = (int) (unsigned char) *buf++; /* maximum number to match */ while (n && (str < estr) && (p1 == *str)) { n--; str++; } if (n) return (NULL); save_str = str; n = n1 - n0; while (n && (str < estr) && (p1 == *str)) { n--; str++; } goto match_rest; case NTH_MATCH: if ((str = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr)) == NULL) return(NULL); buf++; break; case MAYBE_ONCE | NTH_MATCH: save_str = str; tmpstr = do_nth_match (ctx, (int) (unsigned char) *buf, str, estr); buf++; if (tmpstr != NULL) { str = tmpstr; goto match_rest; } continue; case LEAST_ONCE | NTH_MATCH: if ((str = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr)) == NULL) return(NULL); /* drop */ case STAR | NTH_MATCH: save_str = str; while (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr))) { str = tmpstr; } buf++; goto match_rest; case MANY | NTH_MATCH: return(NULL); /* needs done */ case RANGE: if (str >= estr) return (NULL); if (TEST_BIT(buf, UPPERCASE(*str)) == 0) return (NULL); buf += 32; str++; break; case MAYBE_ONCE | RANGE: save_str = str; if ((str < estr) && TEST_BIT(buf, UPPERCASE(*str))) str++; buf += 32; goto match_rest; case LEAST_ONCE | RANGE: if ((str >= estr) || (0 == TEST_BIT(buf, UPPERCASE(*str)))) return NULL; str++; /* drop */ case STAR | RANGE: save_str = str; while ((str < estr) && TEST_BIT(buf, UPPERCASE(*str))) str++; buf += 32; goto match_rest; /* The first 32 bytes correspond to the range and the two * following bytes indicate the min and max number of matches. */ case MANY | RANGE: /* minimum number to match--- could be 0 */ n = n0 = (int) (unsigned char) *(buf + 32); /* maximum number to match */ n1 = (int) (unsigned char) *(buf + 33); while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str)))) { n--; str++; } if (n) return (NULL); save_str = str; n = n1 - n0; while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str)))) { n--; str++; } buf += 34; /* 32 + 2 */ goto match_rest; case ANY_DIGIT: if ((str >= estr) || (*str > '9') || (*str < '0')) return (NULL); str++; break; case MAYBE_ONCE | ANY_DIGIT: save_str = str; if ((str < estr) && ((*str > '9') || (*str < '0'))) str++; goto match_rest; case LEAST_ONCE | ANY_DIGIT: if ((str >= estr) || ((*str > '9') || (*str < '0'))) return NULL; str++; /* drop */ case STAR | ANY_DIGIT: save_str = str; while ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++; goto match_rest; case MANY | ANY_DIGIT: /* needs finished */ return (NULL); case ANY: if ((str >= estr) || (*str == '\n')) return (NULL); str++; break; case MAYBE_ONCE | ANY: save_str = str; if ((str < estr) && (*str != '\n')) str++; goto match_rest; case LEAST_ONCE | ANY: if ((str >= estr) || (*str == '\n')) return (NULL); str++; /* drop */ case STAR | ANY: save_str = str; while ((str < estr) && (*str != '\n')) str++; goto match_rest; case MANY | ANY: return (NULL); /* needs finished */ case EOL: if ((str >= estr) || (*str == '\n')) return (str); return(NULL); default: return (NULL); } p = *buf++; continue; match_rest: if (save_str == str) { p = *buf++; continue; } /* if (p == EOL) * { * if (str < estr) return (NULL); else return (str); * } */ SLMEMCPY(save_closed_matches, ctx->closed_paren_matches, sizeof(save_closed_matches)); save_num_open = ctx->open_paren_number; while (str >= save_str) { tmpstr = regexp_looking_at (ctx, str, estr, buf, cs); if (tmpstr != NULL) return(tmpstr); SLMEMCPY(ctx->closed_paren_matches, save_closed_matches, sizeof(ctx->closed_paren_matches)); ctx->open_paren_number = save_num_open; str--; } return NULL; } if ((p != 0) && (p != EOL)) return (NULL); else return (str); } static void fixup_beg_end_matches (Re_Context_Type *ctx, SLRegexp_Type *r, unsigned char *str, unsigned char *epos) { int i; if (str == NULL) { r->beg_matches[0] = -1; r->end_matches[0] = 0; SLMEMSET(ctx->closed_paren_matches, 0, sizeof(ctx->closed_paren_matches)); } else { r->beg_matches[0] = (int) (str - ctx->str); r->end_matches[0] = (unsigned int) (epos - str); } for (i = 1; i < 10; i++) { if (ctx->closed_paren_matches [i] == 0) { r->beg_matches[i] = -1; r->end_matches[i] = 0; } } } static void init_re_context (Re_Context_Type *ctx, SLRegexp_Type *reg, unsigned char *str, unsigned int len) { memset ((char *) ctx, 0, sizeof (Re_Context_Type)); ctx->reg = reg; ctx->str = str; ctx->len = len; } unsigned char *SLang_regexp_match(unsigned char *str, unsigned int len, SLRegexp_Type *reg) { register unsigned char c = 0, *estr = str + len; int cs = reg->case_sensitive, lit = 0; unsigned char *buf = reg->buf, *epos = NULL; Re_Context_Type ctx_buf; if (reg->min_length > len) return NULL; init_re_context (&ctx_buf, reg, str, len); if (*buf == BOL) { if (NULL == (epos = regexp_looking_at (&ctx_buf, str, estr, buf + 1, cs))) str = NULL; fixup_beg_end_matches (&ctx_buf, reg, str, epos); return str; } if (*buf == NO_CASE) { buf++; cs = 0; } if (*buf == YES_CASE) { buf++; cs = 1; } if (*buf == LITERAL) { lit = 1; c = *(buf + 1); } else if ((*buf == OPAREN) && (*(buf + 1) == LITERAL)) { lit = 1; c = *(buf + 2); } while (str < estr) { ctx_buf.open_paren_number = 0; memset (ctx_buf.closed_paren_matches, 0, sizeof(ctx_buf.closed_paren_matches)); /* take care of leading chars */ if (lit) { while ((str < estr) && (c != UPPERCASE(*str))) str++; if (str >= estr) break; /* failed */ } if (NULL != (epos = regexp_looking_at(&ctx_buf, str, estr, buf, cs))) { fixup_beg_end_matches (&ctx_buf, reg, str, epos); return str; } str++; } fixup_beg_end_matches (&ctx_buf, reg, NULL, epos); return NULL; } static unsigned char *convert_digit(unsigned char *pat, int *nn) { int n = 0, m = 0; unsigned char c; while (c = (unsigned char) *pat, (c <= '9') && (c >= '0')) { pat++; n = 10 * n + (c - '0'); m++; } if (m == 0) { return (NULL); } *nn = n; return pat; } #define ERROR return (int) (pat - reg->pat) /* Returns 0 if successful or offset in pattern of error */ int SLang_regexp_compile (SLRegexp_Type *reg) { register unsigned char *buf, *ebuf, *pat; unsigned char *last = NULL, *tmppat; register unsigned char c; int i, reverse = 0, n, cs; int oparen = 0, nparen = 0; /* substring stuff */ int count, last_count, this_max_mm = 0, max_mm = 0, ordinary_search, no_osearch = 0, min_length = 0; unsigned char *mm_p = NULL, *this_mm_p = NULL; static int already_initialized; reg->beg_matches[0] = reg->end_matches[0] = 0; buf = reg->buf; ebuf = (reg->buf + reg->buf_len) - 2; /* make some room */ pat = reg->pat; cs = reg->case_sensitive; if (already_initialized == 0) { SLang_init_case_tables (); #ifdef IBMPC_SYSTEM SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\200-\232\240-\245\341-\353", 0); #else SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\277-\326\330-\336\340-\366\370-\376", 0); #endif already_initialized = 1; } i = 1; while (i < 10) { reg->beg_matches[i] = -1; reg->end_matches[i] = 0; i++; } if (*pat == '\\') { if (pat[1] == 'c') { cs = 1; pat += 2; no_osearch = 1; } else if (pat[1] == 'C') { cs = 0; pat += 2; no_osearch = 1; } } if (*pat == '^') { pat++; *buf++ = BOL; reg->must_match_bol = 1; } else reg->must_match_bol = 0; if (cs != reg->case_sensitive) { if (cs) *buf++ = YES_CASE; else *buf++ = NO_CASE; } *buf = 0; last_count = count = 0; while ((c = *pat++) != 0) { if (buf >= ebuf - 3) { SLang_doerror ("Pattern too large to be compiled."); ERROR; } count++; switch (c) { case '$': if (*pat != 0) goto literal_char; *buf++ = EOL; break; case '\\': c = *pat++; no_osearch = 1; switch(c) { case 'e': c = 033; goto literal_char; case 'n': c = '\n'; goto literal_char; case 't': c = '\t'; goto literal_char; case 'C': cs = 0; *buf++ = NO_CASE; break; case 'c': cs = 1; *buf++ = YES_CASE; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c = c - '0'; if ((int) c > nparen) ERROR; last = buf; *buf++ = NTH_MATCH; *buf++ = c; break; #ifdef NOT_LITERAL case '~': /* slang extension */ if ((c = *pat) == 0) ERROR; pat++; last = buf; *buf++ = NOT_LITERAL; *buf++ = c; min_length++; break; #endif case 'd': /* slang extension */ last = buf; *buf++ = ANY_DIGIT; min_length++; break; case '<': last = NULL; *buf++ = BOW; break; case '>': last = NULL; *buf++ = EOW; break; case '{': if (last == NULL) goto literal_char; *last |= MANY; tmppat = convert_digit(pat, &n); if (tmppat == NULL) ERROR; pat = tmppat; *buf++ = n; min_length += (n - 1); if (*pat == '\\') { *buf++ = n; } else if (*pat == ',') { pat++; if (*pat == '\\') { n = 255; } else { tmppat = convert_digit(pat, &n); if (tmppat == NULL) ERROR; pat = tmppat; if (*pat != '\\') ERROR; } *buf++ = n; } else ERROR; last = NULL; pat++; if (*pat != '}') ERROR; pat++; break; /* case '{' */ case '(': oparen++; if (oparen > 9) ERROR; *buf++ = OPAREN; break; case ')': if (oparen == 0) ERROR; oparen--; nparen++; *buf++ = CPAREN; break; case 0: ERROR; default: goto literal_char; } break; case '[': *buf = RANGE; last = buf++; if (buf + 32 >= ebuf) ERROR; for (i = 0; i < 32; i++) buf[i] = 0; c = *pat++; if (c == '^') { reverse = 1; SET_BIT(buf, '\n'); c = *pat++; } if (c == ']') { SET_BIT(buf, c); c = *pat++; } while (c && (c != ']')) { if (c == '\\') { c = *pat++; switch(c) { case 'n': c = '\n'; break; case 't': c = '\t'; break; case 0: ERROR; } } if (*pat == '-') { pat++; while (c < *pat) { if (cs == 0) { SET_BIT(buf, UPPERCASE(c)); SET_BIT(buf, LOWERCASE(c)); } else SET_BIT(buf, c); c++; } } if (cs == 0) { SET_BIT(buf, UPPERCASE(c)); SET_BIT(buf, LOWERCASE(c)); } else SET_BIT(buf, c); c = *pat++; } if (c != ']') ERROR; if (reverse) for (i = 0; i < 32; i++) buf[i] = buf[i] ^ 0xFF; reverse = 0; buf += 32; min_length++; break; case '.': last = buf; *buf++ = ANY; min_length++; break; case '*': if (last == NULL) goto literal_char; *last |= STAR; min_length--; last = NULL; break; case '+': if (last == NULL) goto literal_char; *last |= LEAST_ONCE; last = NULL; break; case '?': if (last == NULL) goto literal_char; *last |= MAYBE_ONCE; last = NULL; min_length--; break; literal_char: default: /* This is to keep track of longest substring */ min_length++; this_max_mm++; if (last_count + 1 == count) { if (this_max_mm == 1) { this_mm_p = buf; } else if (max_mm < this_max_mm) { mm_p = this_mm_p; max_mm = this_max_mm; } } else { this_mm_p = buf; this_max_mm = 1; } last_count = count; last = buf; *buf++ = LITERAL; *buf++ = UPPERCASE(c); } } *buf = 0; /* Check for ordinary search */ ebuf = buf; buf = reg->buf; if (no_osearch) ordinary_search = 0; else { ordinary_search = 1; while (buf < ebuf) { if (*buf != LITERAL) { ordinary_search = 0; break; } buf += 2; } } reg->osearch = ordinary_search; reg->must_match_str[15] = 0; reg->min_length = (min_length > 0) ? (unsigned int) min_length : 0; if (ordinary_search) { strncpy((char *) reg->must_match_str, (char *) reg->pat, 15); reg->must_match = 1; return(0); } /* check for longest substring of pattern */ reg->must_match = 0; if ((mm_p == NULL) && (this_mm_p != NULL)) mm_p = this_mm_p; if (mm_p == NULL) { return (0); } n = 15; pat = reg->must_match_str; buf = mm_p; while (n--) { if (*buf++ != LITERAL) break; *pat++ = *buf++; } *pat = 0; if (pat != reg->must_match_str) reg->must_match = 1; return(0); } char *SLregexp_quote_string (char *re, char *buf, unsigned int buflen) { char ch; char *b, *bmax; if (re == NULL) return NULL; b = buf; bmax = buf + buflen; while (b < bmax) { switch (ch = *re++) { case 0: *b = 0; return buf; case '$': case '\\': case '[': case ']': case '.': case '^': case '*': case '+': case '?': *b++ = '\\'; if (b == bmax) break; /* drop */ default: *b++ = ch; } } return NULL; } #if 0 #define MAX_EXP 4096 int main(int argc, char **argv) { FILE *fp; char *regexp, *file; char expbuf[MAX_EXP], buf[512]; SLRegexp_Type reg; file = argv[2]; regexp = argv[1]; if (NULL == (fp = fopen(file, "r"))) { fprintf(stderr, "File not open\n"); return(1); } reg.buf = expbuf; reg.buf_len = MAX_EXP; reg.pat = regexp; reg.case_sensitive = 1; if (!regexp_compile(®)) while (NULL != fgets(buf, 511, fp)) { if (reg.osearch) { if (NULL == strstr(buf, reg.pat)) continue; } else { if (reg.must_match && (NULL == strstr(buf, reg.must_match_str))) continue; if (0 == regexp_match(buf, buf + strlen(buf), ®)) continue; } fputs(buf, stdout); } return (0); } #endif