diff --git a/CHANGES b/CHANGES index 2128d6b..2568d6a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,9 @@ +2023/02/19 - v0.3.2 + -Improved URI parsing again + -Improved HTTP redirect handling capability + -Moved TLS out of dial and into main + -Improved debugging experience by shuffling around if(param[V])s + 2022/12/13 - v0.3.1b -Fixed Gopher support -Improved URI parsing diff --git a/README b/README index ae3feca..4e9ff0d 100644 --- a/README +++ b/README @@ -20,7 +20,7 @@ Right now, it retrieves data over HTTP or Gopher, and can support TLS, at least when it comes to HTTPS. I'd like a more general approach to TLS support in the future. -Its source code is small (< 400 lines of C total according to cloc right +Its source code is small (< 500 lines of C total according to cloc right now) and uses only POSIX routines -- there's no temptation to use GNU or BSD features because I develop against musl and tcc, and I don't even have the Linux manpages on any of my systems. @@ -36,7 +36,7 @@ Apportate also aims to have actually useful diagnostics; that is, compared to other tools, apportate aims to only provide useful error output. In the case of success, it follows the Rule of Silence; on unrecoverable errors, it aborts immediately. It supports multiple levels -of verbosity, and exposes almost all of its internals. +of verbosity, and exposes almost all of its internal operations for debugging. Its simple design and use should also make it relatively convenient for inclusion in shell scripts. 
diff --git a/src/connect.c b/src/connect.c index ae88410..7d3b949 100644 --- a/src/connect.c +++ b/src/connect.c @@ -1,19 +1,18 @@ #include "headers.h" -int dial(const char *fqdn, const char *proto, struct tls **tlsres) +int dial(const char *fqdn, const char *proto) { int sd; struct addrinfo *ainfo; - struct tls_config *tlshints; - if(getaddrinfo(fqdn, proto, 0, &ainfo)) + if( !(sd = socket(AF_INET, SOCK_STREAM, 0))) { return(0); } - if( !(sd = socket(ainfo->ai_family, SOCK_STREAM, 0))) + if(getaddrinfo(fqdn, proto, 0, &ainfo)) { return(0); } @@ -23,34 +22,5 @@ int dial(const char *fqdn, const char *proto, struct tls **tlsres) return(0); } - if(tlsres != 0) - { - close(sd); - - if( 0 == (*tlsres = tls_client())) - { - goto err_ssl; - } - - if( 0 == (tlshints = tls_config_new())) - { - goto err_ssl; - } - - if(tls_configure(*tlsres, tlshints)) - { - goto err_ssl; - } - - - if( (tls_connect(*tlsres, fqdn, proto))) - { - goto err_ssl; - } - } - return(sd); - - err_ssl: - return(0); } diff --git a/src/connect.h b/src/connect.h index 55f8b55..a053d69 100644 --- a/src/connect.h +++ b/src/connect.h @@ -8,4 +8,4 @@ /* sd -- successful connection returns a file descriptor connected to the fqdn */ /* ERRCONN -- couldn't connect */ /* ERRADDR -- couldn't get addrinfo */ -int dial(const char *fqdn, const char *proto, struct tls **tls_res); +int dial(const char *fqdn, const char *proto); diff --git a/src/http.c b/src/http.c index 7d81fed..38abb9d 100644 --- a/src/http.c +++ b/src/http.c @@ -1,27 +1,27 @@ #include "headers.h" const char REQ_HTTP[] = - { - "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" - }; + { + "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" + }; int reqgen_http(const char *path, const char *fqdn, char **nbuf) - { - int buflen; + { + int buflen; - buflen = (strlen(REQ_HTTP) + strlen(path) + strlen(fqdn) + 1); + buflen = (strlen(REQ_HTTP) + strlen(path) + strlen(fqdn) + 1); - if( !(*nbuf = calloc(buflen, sizeof(char)))) - { - return(ERRMEM); - } + if( !(*nbuf = calloc(buflen, 
sizeof(char)))) + { + return(ERRMEM); + } - sprintf(*nbuf, REQ_HTTP, path, fqdn); + sprintf(*nbuf, REQ_HTTP, path, fqdn); - return(0); - } + return(0); + } int resp_parse_http(char *data) { @@ -41,32 +41,51 @@ int resp_parse_http(char *data) return(0); } -char *http_header_extract(char *key, char *data) +/* http_header_extract - return the value corresponding to a key if it exists in an http response, otherwise null */ +/* key - key to extract corresponding value */ +/* data - response header from which to extract key's value */ +char *http_get_keyval(char *key, char *data) { - char *keyp, *keyp_end; - char *returnp; + char *buf; + /* data indices */ + char *d_ind, *d_ind2; - if( NULL == (keyp = strstr(data, key))) - { - goto err; - } - else - { - for(; 0 != *keyp && *keyp != ':'; keyp++); - for(; 0 != *keyp && !isalnum(*keyp); keyp++); - for(keyp_end = keyp; 0 != *keyp_end && '\r' != *keyp_end; keyp_end++); + buf = NULL; + d_ind = d_ind2 = data; - if( NULL == (returnp = calloc((int) (keyp_end - keyp), sizeof(char)))) + + /* we ensure that our key and each key we compare to are lower-case because some */ + /* servers will return mixed-case keys and others single-case. by doing this we */ + /* can use full key specifiers, which avoids false matches that would occur as a */ + /* result of using partial key specifiers e.g "location" instead of "ocation". 
*/ + key = buftolower(key); + + + for(;*data != '\0'; data++) + { + if(NULL != buf) { - goto err; + free(buf); + buf = NULL; + + for(; '\n' != *(data - 1); data++); + d_ind = d_ind2 = data; } - memcpy(returnp, keyp, (keyp_end - keyp)); + for(; *d_ind != ':'; d_ind++); + buf = substr_extract(data, 0, (d_ind - data)); + + buf = buftolower(buf); + + if(!strcmp(key, buf)) + { + free(buf); + for(data = d_ind+2; *(1+d_ind) != '\n'; d_ind++); + + buf = substr_extract(data, 0, (d_ind - data)); + return(buf); + } } - - return(returnp); - - err: return(NULL); } diff --git a/src/http.h b/src/http.h index 055b474..a2593b8 100644 --- a/src/http.h +++ b/src/http.h @@ -16,4 +16,4 @@ int resp_parse_http(char *data); exit(); } */ -char *http_header_extract(char *key, char *data); +char *http_get_keyval(char *key, char *data); diff --git a/src/main.c b/src/main.c index 1fa02df..b032512 100644 --- a/src/main.c +++ b/src/main.c @@ -36,7 +36,7 @@ char *outpath; int main(int argc, char **argv) { - int i, translen, redirnum; + int i, translen, redirnum, gotheader; int sockd; char *recvbufp; char *sendbufp, *offsetp, *errstr; @@ -60,6 +60,7 @@ int main(int argc, char **argv) { goto usage; } + for(i = 0; (i = getopt(argc, argv, "bo:qv")) != -1; i = 0) { switch(i) @@ -80,7 +81,7 @@ int main(int argc, char **argv) break; case 'v': - /* we handle v differently because we want to support different levels of verbosity */ + /* we handle v differently because we want to support different levels of verbosity */ if(!param[Q]) { param[V]++; @@ -88,7 +89,7 @@ int main(int argc, char **argv) break; default: - goto usage; + goto usage; } } urip = argv[optind]; @@ -125,26 +126,37 @@ int main(int argc, char **argv) } /* todo: init uristruct as well */ - - + + uristruct.proto = NULL; + uristruct.fqdn = NULL; + uristruct.path = NULL; + if( uri_parse(urip, &uristruct)) - { - errstr = "couldn't parse URI."; - goto err; - } + { + errstr = "couldn't parse URI."; + goto err; + } - if(param[V]) + if(param[V] >= 
2) { fprintf(stderr, "URI parsed, results follow...\nProtocol: %s\nFQDN: %s\nPath: %s\n", uristruct.proto, uristruct.fqdn, uristruct.path); + + if(param[V] >= 3) + { + fprintf(stderr, "length of proto: %lu\nlength of fqdn: %lu\nlength of path: %lu\n", strlen(uristruct.proto), strlen(uristruct.fqdn), strlen(uristruct.path)); + } } + sendbufp = reqgen(&uristruct); + + + /* we should probably modify uri_parse to return a zero value on failure... */ - /* make errstr display the URI -- need mastrcat? */ if( NULL == (sendbufp = reqgen(&uristruct))) { - errstr = "couldn't generate request. Unknown protocol?"; - goto err; + fprintf(stderr, "couldn't generate request. Unknown protocol: %s?\n", uristruct.proto); + exit(-1); } /* if outpath isn't set because we haven't received a -o param, */ @@ -158,57 +170,73 @@ int main(int argc, char **argv) outpath = "default"; } - /* todo: handle TLS a little better. use a global variable, move TLS handling out of dial? */ if(param[V]) { fprintf(stderr, "connecting to %s using protocol %s...\n", uristruct.fqdn, uristruct.proto); } - /* definitely going to break tls out of dial, this is really bad */ - if(!strcmp("https", uristruct.proto)) + /* once again, I'll repeat myself -- while a system *COULD* return 0, 1, or 2, it *SHOULD NEVER DO SO* in a sane environment */ + if( 2 >= (sockd = dial(uristruct.fqdn, uristruct.proto))) { - if(!dial(uristruct.fqdn, uristruct.proto, &tlsc)) + errstr = "failed to connect"; + goto err; + } + + /* todo: upgrade this to a more general mechanism */ + if(!strncmp("https", uristruct.proto, 5)) + { + if(NULL != tlsc) { - errstr = "failed to connect"; - goto err; + tls_reset(tlsc); } - if(param[V]) + struct tls_config *config = tls_config_new(); + tlsc = tls_client(); + tls_configure(tlsc, config); + + if(-1 == tls_connect_socket(tlsc, sockd, uristruct.fqdn)) { - fprintf(stderr, "setting up TLS...\n"); - } - } - else - { - /* once again, I'll repeat myself -- while a system *COULD* return 0, 1, or 2, it 
*SHOULD NEVER DO SO* in a sane environment */ - if( 2 >= (sockd = dial(uristruct.fqdn, uristruct.proto, NULL))) - { - errstr = "failed to connect"; + errstr = "failed to upgrade connection to use TLS, aborting\n"; goto err; } } - if(param[V]) + if(param[V] >= 2) { - fprintf(stderr, "Sending request...\n %s\n", sendbufp); + fprintf(stderr, "Sending request...\n-----REQUEST START-----\n%s\n-----REQUEST END-----\n", sendbufp); } if(NULL != tlsc) { - i = tls_write(tlsc, sendbufp, strlen(sendbufp)); + if(param[V]) + { + fprintf(stderr, "writing over tls...\n"); + } + + if( -1 == (i = tls_write(tlsc, sendbufp, strlen(sendbufp)))) + { + fprintf(stderr, "libtls internal error: "); + errstr = (char *) tls_error(tlsc); + goto err; + } } else { + if(param[V]) + { + fprintf(stderr, "writing over socket...\n"); + } + i = send(sockd, sendbufp, strlen(sendbufp), 0); } - if(param[V]) + if(param[V] >= 3) { fprintf(stderr, "sent: %d bytes\n", i); } /* actual read loop */ - int gotheader; + for(gotheader = 0, translen = 1; translen; memset(recvbufp, 0, BUFSIZ+1)) { if(NULL == tlsc) @@ -220,7 +248,7 @@ int main(int argc, char **argv) translen = tls_read(tlsc, recvbufp, BUFSIZ); } - if(param[V]) + if(param[V] >= 3) { fprintf(stderr, "recv: %d bytes\n", translen); } @@ -241,7 +269,7 @@ int main(int argc, char **argv) case 200: /* by now we have the first transmission from the server that we actually care about */ /* we just need to get the the end of the headres, now that we're done with 'em */ - if(param[V]) + if(param[V] >= 3) { fprintf(stderr, "200 OKAY, moving to end of header...\n"); } @@ -249,7 +277,11 @@ int main(int argc, char **argv) for(; NULL == (offsetp = strstr(recvbufp, "\r\n\r\n"));) { - fprintf(stderr, "Searching to end of header...\n"); + if(param[V] >= 2) + { + fprintf(stderr, "Searching to end of header...\n"); + } + if(NULL != tlsc) { tls_read(tlsc, recvbufp, BUFSIZ); @@ -270,12 +302,14 @@ int main(int argc, char **argv) /* intentional drop through from 301 to 302 */ 
case 301: case 302: - urip = http_header_extract("ocation", recvbufp); + urip = http_get_keyval("Location", recvbufp); if(param[V]) { - fprintf(stderr, "Redirecting to %s%s%s...", uristruct.proto, uristruct.fqdn, uristruct.path); + fprintf(stderr, "Redirecting to %s...\n", urip); } + redirnum++; + /* could use continue, but structured programming makes it easier to use goto in this circumstance... */ goto start; case 400: @@ -297,10 +331,8 @@ int main(int argc, char **argv) } i = fwrite(offsetp, sizeof(char), translen - (offsetp - recvbufp), filed); - if(param[V] >= 2) - fprintf(stdout, "%s", recvbufp); - if(param[V]) + if(param[V] >= 3) { fprintf(stderr, "fwrite: %d bytes\n", i); } @@ -308,6 +340,7 @@ int main(int argc, char **argv) offsetp = recvbufp; } + tls_free(tlsc); close(sockd); fclose(filed); free(sendbufp); @@ -327,3 +360,4 @@ int main(int argc, char **argv) fprintf(stderr, "%s: %s\n", argv[0], errstr); exit(EXIT_FAILURE); } + diff --git a/src/support.c b/src/support.c index 73f4f5a..ce66886 100644 --- a/src/support.c +++ b/src/support.c @@ -2,35 +2,34 @@ /* return a properly formatted request for any implemented protocol */ char *reqgen(uri *urip) - { - char *req; - int is_tls = 0; - - if(!strcmp("http", urip->proto) || !strcmp("https", urip->proto)) { - reqgen_http(urip->path, urip->fqdn, &req); + char *req; - if(!req) - { - return(NULL); - } + if(!strcmp("http", urip->proto) || !strcmp("https", urip->proto)) + { + reqgen_http(urip->path, urip->fqdn, &req); - return(req); + if(!req) + { + return(NULL); + } + + return(req); + } + else if(!strcmp(urip->proto, "gopher")) + { + reqgen_gopher(urip->path, &req); + + if(!req) + { + return(NULL); + } + + return(req); + } + + return(NULL); } - else if(!strcmp(urip->proto, "gopher")) - { - reqgen_gopher(urip->path, &req); - - if(!req) - { - return(NULL); - } - - return(req); - } - - return(NULL); - } /* takes a data buffer and returns an integer corresponding to the server's response value */ /* if not 
applicable, return -1 */ @@ -46,7 +45,7 @@ int resp_parse(char *data, uri *uristruct) return(-1); } - } + } /* return a pointer to a character array on the heap consisting of all bytes */ /* between start and end in str. */ @@ -59,35 +58,26 @@ char *substr_extract(const char *str, int start, int end) substr = NULL; /* account for zero index plus the nullterm */ - if( !(substr = malloc((substr_len + 1)))) + if( !(substr = calloc((substr_len + 1), sizeof(char)))) { return(NULL); } - memcpy(substr, str+start, substr_len); + memcpy(substr, str+start, substr_len); - return(substr); - } - -/* mastrcat -- improved string concat function. returns a pointer to the first element in a buffer containing the strings str1 */ -/* and str2 joined end-to-end. */ -char *mastrcat(char *str1, char *str2) - { - unsigned long int nbi, stri, nbsize; - char *nbuf; - nbi = stri = 0; - nbsize = (strlen(str1) + strlen(str2)); - nbuf = malloc(nbsize); - - for(stri = 0; str1[stri] != '\0'; nbi++, stri++) - { - nbuf[nbi] = str1[stri]; - } - - for(stri = 0; str2[stri] != '\0'; nbi++, stri++) - { - nbuf[nbi] = str2[stri]; - } - - return nbuf; + return(substr); + } + +char *buftolower(char *bufp) + { + int i; + char *nbufp; + nbufp = calloc(strlen(bufp), sizeof(char)); + + for(i = 0; '\0' != bufp[i]; i++) + { + nbufp[i] = tolower(bufp[i]); + } + + return(nbufp); } diff --git a/src/support.h b/src/support.h index ebd8e8f..b4628e6 100644 --- a/src/support.h +++ b/src/support.h @@ -37,4 +37,4 @@ typedef struct char *reqgen(uri *urip); int resp_parse(char *data, uri *uristruct); char *substr_extract(const char *str, int start, int end); -char *mastrcat(char *str1, char *str2); +char *buftolower(char *bufp); diff --git a/src/uri.c b/src/uri.c index 01bf903..ce65eef 100644 --- a/src/uri.c +++ b/src/uri.c @@ -2,14 +2,6 @@ /* really hate using the preprocessor, but it makes sense in this context */ -/* SUBSTR_COUNT is number of parenthetical substrings in REGEX_URI plus one */ -/* this regex is from the 
RFC describing URI syntax -- can't recall the */ -/* exact one right now. anyway, it's a little too general for my tastes, */ -/* but the one I came up with was trash (unsurprisingly) so here we are. */ -/* need to modify this in the future to be less liberal... */ -#define REGEX_URI_OLD "^([^:/?#]+)://(([^/?#]+)+([^?#]*))(\\?([^#]*))?(#(.*))?" -#define REGEX_URI_RFC "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?" -#define REGEX_URI_NEW "^(([^:/?#]+):)?(//([^/?#]+))?([^?#]*)(\\?([^#]*))?(#(.*))?" #define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?(([^?#]*)(\\?([^#]*))?(#(.*))?)" @@ -21,10 +13,8 @@ int uri_parse(const char *uristr, uri *res) { - int i; int regerrcode; char validp; - char *pathp; regex_t regexp; regmatch_t match[SUBSTR_COUNT+5]; char errbuf[BUFSIZ] = {0}; @@ -49,6 +39,7 @@ int uri_parse(const char *uristr, uri *res) /* with experience. */ res->proto = substr_extract(uristr, match[PROTO].rm_so, match[PROTO].rm_eo); res->fqdn = substr_extract(uristr, match[FQDN].rm_so, match[FQDN].rm_eo); + /* if the difference below is less than 1, our path doesn't exist. */ /* Compensate by setting it to '/' which will always return a root */ /* document from an HTTP server -- and, presumably, others. We'll */ @@ -61,16 +52,6 @@ int uri_parse(const char *uristr, uri *res) { /* we only have a simple path */ res->path = substr_extract(uristr, match[PATH].rm_so, match[PATH].rm_eo); - /* /\* we have a more complex path *\/ */ - /* if(0 != match[PATH+1].rm_so) */ - /* { */ - /* for(i = PATH; 0 != match[i].rm_so && i <= SUBSTR_COUNT; i++) */ - /* { */ - /* /\* memory leak here that needs to be addressed *\/ */ - /* res->path = mastrcat(res->path, substr_extract(uristr, match[i].rm_so, match[i].rm_eo)); */ - /* } */ - /* } */ - }