#include "headers.h"

/*
 * URI-splitting regular expressions (POSIX extended syntax).
 *
 * Really hate using the preprocessor, but it makes sense in this context.
 *
 * REGEX_URI_RFC is the generic expression given in RFC 3986, Appendix B
 * ("Parsing a URI Reference with a Regular Expression").  It is a little
 * too general, but the hand-written attempt (REGEX_URI_OLD) was worse.
 * TODO: tighten the expression so that it is less liberal.
 *
 * REGEX_URI is the expression actually compiled by uri_parse().  It differs
 * from the RFC one only in that path, query and fragment are wrapped in one
 * extra group, so a single sub-match yields everything after the authority.
 *
 * Group numbering for REGEX_URI (group 0 is the whole match):
 *   1  scheme plus ':'        2  scheme
 *   3  "//" plus authority    4  authority
 *   5  path + query + fragment
 *   6  path                   7  '?' plus query     8  query
 *   9  '#' plus fragment     10  fragment
 */
#define REGEX_URI_OLD "^([^:/?#]+)://(([^/?#]+)+([^?#]*))(\\?([^#]*))?(#(.*))?"
/* '?' must be escaped as "\\?": a lone "\?" is a C escape for a bare '?'. */
#define REGEX_URI_RFC "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
#define REGEX_URI_NEW "^(([^:/?#]+):)?(//([^/?#]+))?([^?#]*)(\\?([^#]*))?(#(.*))?"
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?(([^?#]*)(\\?([^#]*))?(#(.*))?)"

/*
 * Number of regmatch_t slots regexec() is asked to fill.  REGEX_URI has ten
 * groups, but uri_parse() only reads slots up to PATH, so nine is enough.
 * It must stay strictly greater than the largest index below.
 */
#define SUBSTR_COUNT 9

/* Indices into the regmatch_t array (see the group numbering above). */
#define PROTO 2
#define FQDN 4
#define PATH 5

/*
 * Split a URI string into scheme, authority and path.
 *
 * uristr: NUL-terminated URI to parse.
 * res:    receives the pieces:
 *           res->proto  the scheme (text before the first ':'),
 *           res->fqdn   the authority (text after "//"),
 *           res->path   everything after the authority, i.e. path plus any
 *                       query and fragment, or "/" when that part is empty.
 *         Each field is set to a value returned by substr_extract();
 *         presumably heap-allocated and owned by the caller -- confirm
 *         against substr_extract().
 *
 * Returns 0 on success.  Returns 1 if an argument is NULL, if the expression
 * fails to compile (a message is printed on stderr), or if regexec() reports
 * anything other than a match.  On failure *res is left untouched.
 */
int uri_parse(const char *uristr, uri *res)
{
    int rc;
    regex_t regexp;
    regmatch_t match[SUBSTR_COUNT];
    char errbuf[BUFSIZ] = {0};

    if (uristr == NULL || res == NULL) {
        return 1;
    }

    rc = regcomp(&regexp, REGEX_URI, REG_EXTENDED);
    if (rc != 0) {
        regerror(rc, &regexp, errbuf, sizeof errbuf);
        fprintf(stderr, "regular expression error: %s\n", errbuf);
        return 1;
    }

    /*
     * Treat every non-zero result as failure, not just REG_NOMATCH: on any
     * error the match array has not been filled in.  The compiled pattern
     * is released right away because only the offsets are needed below.
     */
    rc = regexec(&regexp, uristr, SUBSTR_COUNT, match, 0);
    regfree(&regexp);
    if (rc != 0) {
        return 1;
    }

    /*
     * NOTE(review): PROTO and FQDN sit inside optional groups.  When the URI
     * has no scheme or no "//authority", their offsets are -1 and are passed
     * to substr_extract() unchanged -- confirm the helper tolerates that.
     */
    res->proto = substr_extract(uristr, match[PROTO].rm_so, match[PROTO].rm_eo);
    res->fqdn = substr_extract(uristr, match[FQDN].rm_so, match[FQDN].rm_eo);

    if ((match[PATH].rm_eo - match[PATH].rm_so) < 1) {
        /*
         * Nothing follows the authority.  Default to "/", which always asks
         * an HTTP server for its root document.  The value is produced by
         * substr_extract() rather than pointing at a string literal so that
         * res->path has the same ownership in both branches.
         */
        res->path = substr_extract("/", 0, 1);
    } else {
        res->path = substr_extract(uristr, match[PATH].rm_so, match[PATH].rm_eo);
    }

    return 0;
}