From f15aa3a552022743398a652165d76bf912c715e5 Mon Sep 17 00:00:00 2001 From: Guillaume Cottenceau Date: Mon, 14 May 2001 13:47:49 +0000 Subject: Initial revision --- mdk-stage1/dietlibc/libregex/rx.c | 434 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 mdk-stage1/dietlibc/libregex/rx.c (limited to 'mdk-stage1/dietlibc/libregex/rx.c') diff --git a/mdk-stage1/dietlibc/libregex/rx.c b/mdk-stage1/dietlibc/libregex/rx.c new file mode 100644 index 000000000..b28358c2e --- /dev/null +++ b/mdk-stage1/dietlibc/libregex/rx.c @@ -0,0 +1,434 @@ +#include +#include +#include +#include +#include + +/* this is ugly. + * the idea is to build a parse tree, then do some poor man's OOP with a + * generic matcher function call that is always that the start of each + * record, and a next pointer. When the parse tree is done, we need to + * recursively set the next pointers to point to the part of the parse + * tree that needs to match next. + * This is the prototype of the generic match function call pointer. + * The first argument is the "this" pointer, the second is the text to + * be matched against, ofs is the offset from the start of the matched + * text (so we can match "^") and matches is an array where match + * positions are stored. */ +/* now declared in regex.h: */ +/* typedef int (*matcher)(void*,const char*,int ofs,regmatch_t* matches,int plus,int eflags); */ + +/* one would think that this is approach is an order of magnitude slower + * than the standard NFA approach, but it isn't. The busybox grep took + * 0.26 seconds for a fixed string compared to 0.19 seconds for the + * glibc regex. */ + +/* first part: parse a regex into a parse tree */ +struct bracketed { + unsigned int cc[32]; +}; + +/* now declared in regex.h: +struct regex { + matcher m; + void* next; + int pieces; + int num; + struct branch *b; +}; */ + +struct atom { + matcher m; + void* next; + enum { EMPTY, REGEX, BRACKET, ANY, LINESTART, LINEEND, WORDSTART, WORDEND, CHAR, } type; + int bnum; + union { + struct regex r; + struct bracketed b; + char c; + } u; +}; + +struct piece { + matcher m; + void* next; + struct atom a; + unsigned char min,max; +}; + +struct branch { + matcher m; + void* next; + int num; + struct piece *p; +}; + +static void clearcc(unsigned int* x) { + memset(x,0,sizeof(*x)); +} + +static void setcc(unsigned int* x,unsigned int bit) { + x[bit/32]|=(1<<((bit%32)-1)); +} + +static int issetcc(unsigned int* x,unsigned int bit) { + return x[bit/32] & (1<<((bit%32)-1)); +} + +static const char* parsebracketed(struct bracketed*__restrict__ b,const char*__restrict__ s,regex_t*__restrict__ rx) { + const char* t; + int i,negflag=0; + if (*s!='[') return s; + t=s+1; + clearcc(b->cc); + if (*t=='^') { negflag=1; ++t; } + do { + if (*t==0) return s; + setcc(b->cc,rx->cflags®_ICASE?*t:tolower(*t)); + if (t[1]=='-' && t[2]!=']') { + for (i=*t+1; i<=t[2]; ++i) setcc(b->cc,rx->cflags®_ICASE?i:tolower(i)); + t+=2; + } + ++t; + } while (*t!=']'); + if (negflag) for (i=0; i<32; ++i) b->cc[i]=~b->cc[i]; + return t+1; +} + +static const char* parseregex(struct regex* r,const char* s,regex_t* rx); + +static int matchatom(void*__restrict__ x,const char*__restrict__ s,int ofs,struct __regex_t*__restrict__ preg,int plus,int eflags) { + register struct atom* a=(struct atom*)x; + int matchlen=0; + switch (a->type) { + case EMPTY: +// printf("matching EMPTY against \"%s\"\n",s); + preg->l[a->bnum].rm_so=preg->l[a->bnum].rm_eo=ofs; + goto match; + case REGEX: +// printf("matching REGEX against \"%s\"\n",s); + if ((matchlen=a->u.r.m(a,s,ofs,preg,0,eflags))>=0) { + preg->l[a->bnum].rm_so=ofs; + preg->l[a->bnum].rm_eo=ofs+matchlen; + goto match; + } + break; + case BRACKET: +// printf("matching BRACKET against \"%s\"\n",s); + matchlen=1; + if (*s=='\n' && (preg->cflags®_NEWLINE)) break; + if (*s && issetcc(a->u.b.cc,(preg->cflags®_ICASE?tolower(*s):*s))) + goto match; + break; + case ANY: +// printf("matching ANY against \"%s\"\n",s); + if (*s=='\n' && (preg->cflags®_NEWLINE)) break; + matchlen=1; + if (*s) goto match; + break; + case LINESTART: +// printf("matching LINESTART against \"%s\"\n",s); + if (ofs==0 && (eflags®_NOTBOL)==0) { + goto match; + } + break; + case LINEEND: +// printf("matching LINEEND against \"%s\"\n",s); + if ((*s && *s!='\n') || (eflags®_NOTEOL)==0) break; + goto match; + case WORDSTART: + if ((ofs==0 || isspace(s[-1])) && !isspace(*s)) + goto match; + break; + case WORDEND: + if (ofs>0 && !isspace(s[-1]) && isspace(*s)) + goto match; + break; + case CHAR: +// printf("matching CHAR %c against \"%s\"\n",a->u.c,s); + matchlen=1; + if (((preg->cflags®_ICASE)?tolower(*s):*s)==a->u.c) goto match; + break; + } + return -1; +match: + if (a->next) + return ((struct atom*)(a->next))->m(a->next,s+matchlen,ofs+matchlen,preg,plus+matchlen,eflags); + else + return plus+matchlen; +} + +static const char* parseatom(struct atom*__restrict__ a,const char*__restrict__ s,regex_t*__restrict__ rx) { + const char *tmp; + a->m=matchatom; + a->bnum=-1; + switch (*s) { + case '(': + a->bnum=++rx->brackets; + if (s[1]==')') { + a->type=EMPTY; + return s+2; + } + a->type=REGEX; + if ((tmp=parseregex(&a->u.r,s+1,rx))!=s) { + if (*tmp==')') + return tmp+1; + } + case 0: + case '|': + case ')': + return s; + case '[': + a->type=BRACKET; + if ((tmp=parsebracketed(&a->u.b,s,rx))!=s) + return tmp; + return s; + case '.': + a->type=ANY; + break; + case '^': + a->type=LINESTART; + break; + case '$': + a->type=LINEEND; + break; + case '\\': + if (!*++s) return s; + if (*s=='<') { + a->type=WORDSTART; + break; + } else if (*s=='>') { + a->type=WORDEND; + break; + } + default: + a->type=CHAR; + a->u.c=rx->cflags®_ICASE?*s:tolower(*s); + break; + } + return s+1; +} + +/* needs to do "greedy" matching, i.e. match as often as possible */ +static int matchpiece(void*__restrict__ x,const char*__restrict__ s,int ofs,struct __regex_t*__restrict__ preg,int plus,int eflags) { + register struct piece* a=(struct piece*)x; + int matchlen=0; + int tmp,num=0; + unsigned int *offsets=alloca(sizeof(int)*a->max); + offsets[0]=0; +// printf("matchpiece \"%s\"...\n",s); + /* first, try to match the atom as often as possible, up to a->max times */ + if (a->max == 1 && a->min == 1) + return a->a.m(&a->a,s+matchlen,ofs+matchlen,preg,0,eflags); + while (nummax) { + void* save=a->a.next; + a->a.next=0; + if ((tmp=a->a.m(&a->a,s+matchlen,ofs+matchlen,preg,0,eflags))>=0) { + a->a.next=save; + ++num; + matchlen+=tmp; + offsets[num]=tmp; + } else { + a->a.next=save; + break; + } + } + if (nummin) return -1; /* already at minimum matches; signal mismatch */ + /* then, while the rest does not match, back off */ + for (;;) { + if (a->next) + tmp=((struct atom*)(a->next))->m(a->next,s+matchlen,ofs+matchlen,preg,plus+matchlen,eflags); + else + tmp=plus+matchlen; + if (tmp>=0) break; /* it did match; don't back off any further */ + matchlen-=offsets[num]; + --num; + } + return tmp; +} + +static const char* parsepiece(struct piece*__restrict__ p,const char*__restrict__ s,regex_t*__restrict__ rx) { + const char* tmp=parseatom(&p->a,s,rx); + if (tmp==s) return s; + p->m=matchpiece; + p->min=p->max=1; + switch (*tmp) { + case '*': p->min=0; p->max=RE_DUP_MAX; break; + case '+': p->min=1; p->max=RE_DUP_MAX; break; + case '?': p->min=0; p->max=1; break; + case '{': + if (isdigit(*++tmp)) { + p->min=*tmp-'0'; p->max=RE_DUP_MAX; + while (isdigit(*++tmp)) p->min=p->min*10+*tmp-'0'; + if (*tmp==',') { + if (isdigit(*++tmp)) { + p->max=*tmp-'0'; + while (isdigit(*++tmp)) p->max=p->max*10+*tmp-'0'; + } + } + if (*tmp!='}') return s; + ++tmp; + } + default: + return tmp; + } + return tmp+1; +} + +/* trivial, just pass through */ +static int matchbranch(void*__restrict__ x,const char*__restrict__ s,int ofs,struct __regex_t*__restrict__ preg,int plus,int eflags) { + register struct branch* a=(struct branch*)x; + int tmp; + tmp=a->p->m(a->p,s,ofs,preg,plus,eflags); + if (tmp>=0) { + if (a->next) + return ((struct atom*)(a->next))->m(a->next,s+tmp,ofs+tmp,preg,plus+tmp,eflags); + else + return plus+tmp; + } + return -1; +} + +static const char* parsebranch(struct branch*__restrict__ b,const char*__restrict__ s,regex_t*__restrict__ rx,int*__restrict__ pieces) { + struct piece p; + const char *tmp; + b->m=matchbranch; + b->num=0; b->p=0; + for (;;) { + if (*s=='|') { + if (b->num==0) { + tmp=s+1; + p.a.type=EMPTY; + p.min=p.max=1; + } + } else { + tmp=parsepiece(&p,s,rx); + if (tmp==s) return s; + } + if (!(b->p=realloc(b->p,++b->num*sizeof(p)))) return s; + b->p[b->num-1]=p; + if (*s=='|') { ++tmp; break; } + s=tmp; + } + *pieces+=b->num; + return tmp; +} + +/* try the branches one by one */ +static int matchregex(void*__restrict__ x,const char*__restrict__ s,int ofs,struct __regex_t*__restrict__ preg,int plus,int eflags) { + register struct regex* a=(struct regex*)x; + int i,tmp; + for (i=0; inum; ++i) { + tmp=a->b[i].m(&a->b[i],s,ofs,preg,plus,eflags); + if (tmp>=0) { + if (a->next) + return ((struct atom*)(a->next))->m(a->next,s+tmp,ofs+tmp,preg,plus+tmp,eflags); + else + return plus+tmp; + } + } + return -1; +} + +static const char* parseregex(struct regex*__restrict__ r,const char*__restrict__ s,regex_t*__restrict__ p) { + struct branch b; + const char *tmp; + r->m=matchregex; + r->num=0; r->b=0; r->pieces=0; + for (;;) { + tmp=parsebranch(&b,s,p,&r->pieces); + if (tmp==s) return s; + if (!(r->b=realloc(r->b,++r->num*sizeof(b)))) return s; + r->b[r->num-1]=b; + s=tmp; + } + return tmp; +} + + +/* The matcher relies on the presence of next pointers, of which the + * parser does not know the correct destination. So we need an + * additional pass through the data structure that sets the next + * pointers correctly. */ +static void regex_putnext(struct regex* r,void* next); + +static void atom_putnext(struct atom*__restrict__ a,void*__restrict__ next) { + a->next=next; + if (a->type==REGEX) + regex_putnext(&a->u.r,next); +} + +static void piece_putnext(struct piece*__restrict__ p,void*__restrict__ next) { + p->next=next; + atom_putnext(&p->a,next); +} + +static void branch_putnext(struct branch*__restrict__ b,void*__restrict__ next) { + int i; + for (i=0; inum-1; ++i) + piece_putnext(&b->p[i],&b->p[i+1]); + piece_putnext(&b->p[i],0); + b->next=next; +} + +static void regex_putnext(struct regex*__restrict__ r,void*__restrict__ next) { + int i; + for (i=0; inum; ++i) + branch_putnext(&r->b[i],next); + r->next=next; +} + + + +int regcomp(regex_t*__restrict__ preg, const char*__restrict__ regex, int cflags) { + const char* t=parseregex(&preg->r,regex,preg); + if (t==regex) return -1; + regex_putnext(&preg->r,0); + preg->cflags=cflags; + return 0; +} + +int regexec(const regex_t*__restrict__ preg, const char*__restrict__ string, size_t nmatch, regmatch_t pmatch[], int eflags) { + int matched; + const char *orig=string; + ((regex_t*)preg)->l=alloca(sizeof(regmatch_t)*(preg->brackets+1)); + while (*string) { + matched=preg->r.m((void*)&preg->r,string,string-orig,(regex_t*)preg,0,eflags); + if (matched>=0) { + if ((preg->cflags®_NOSUB)==0) memmove(pmatch,preg->l,nmatch*sizeof(regmatch_t)); + return 0; + } + ++string; eflags|=REG_NOTBOL; + } + return REG_NOMATCH; +} + + + +void regfree(regex_t* preg) { + int i; + for (i=0; ir.num; ++i) { + free(preg->r.b[i].p); + free(preg->r.b); + } +} + +size_t regerror(int errcode, const regex_t*__restrict__ preg, char*__restrict__ errbuf, size_t errbuf_size) { + strncpy(errbuf,"invalid regular expression (sorry)",errbuf_size); + return strlen(errbuf); +} + + + + +#if 0 +int main() { + struct regex r; + int bnum=-1; + const char* t=parseregex(&r,"^a*ab$",&bnum); + regex_putnext(&r,0); + printf("%d pieces, %s\n",r.pieces,t); + printf("%d\n",r.m(&r,"aaab",0,0,0)); + return 0; +} +#endif -- cgit v1.2.1