/* URL parser and translator; implementation of RFC 2396. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #ifdef HAVE_ICONV #include #endif #ifdef HAVE_IDN2_H #include #endif #include #include #include #include #ifdef HAVE_NETDB_H #include /* OS/2 needs this after sys/types.h */ #endif #ifdef HAVE_SYS_SOCKET_H #include #endif #ifdef HAVE_NETINET_IN_H #include #endif #ifdef HAVE_ARPA_INET_H #include #endif #include "elinks.h" #include "intl/libintl.h" #include "main/object.h" #include "protocol/protocol.h" #include "protocol/uri.h" #include "util/conv.h" #include "util/error.h" #include "util/file.h" #include "util/hash.h" #include "util/memory.h" #include "util/string.h" static inline int end_of_dir(unsigned char c) { /* This used to check for c == ';' as well. But section 3.3 * of RFC 2396 explicitly says that parameters in a path * segment "are not significant to the parsing of relative * references." */ return c == POST_CHAR || c == '#' || c == '?'; } static inline int is_uri_dir_sep(const struct uri *uri, unsigned char pos) { return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/'); } int is_in_domain(char *domain, char *server, int server_len) { int domain_len = strlen(domain); int len; if (domain_len > server_len) return 0; if (domain_len == server_len) return !c_strncasecmp(domain, server, server_len); len = server_len - domain_len; if (server[len - 1] != '.') return 0; return !c_strncasecmp(domain, server + len, domain_len); } int is_ip_address(const char *address, int addresslen) { /* The @address has well defined limits so it would be a shame to * allocate it. */ char buffer[IP_ADDRESS_BUFFER_SIZE]; if (addresslen >= sizeof(buffer)) return 0; safe_strncpy(buffer, address, addresslen + 1); #ifdef HAVE_INET_PTON #ifdef CONFIG_IPV6 { struct sockaddr_in6 addr6; if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0) return 1; } #endif /* CONFIG_IPV6 */ { struct in_addr addr4; if (inet_pton(AF_INET, buffer, &addr4) > 0) return 1; } return 0; #else /* FIXME: Is this ever the case? */ return 0; #endif /* HAVE_INET_PTON */ } int end_with_known_tld(const char *s, int slen) { int i; static const char *const tld[] = { "com", "edu", "net", "org", "gov", "mil", "int", "biz", "arpa", "aero", "coop", "club", "info", "museum", "expert", "name", "pro", NULL }; if (!slen) return -1; if (slen < 0) slen = strlen(s); for (i = 0; tld[i]; i++) { int tldlen = strlen(tld[i]); int pos = slen - tldlen; if (pos >= 0 && !c_strncasecmp(&s[pos], tld[i], tldlen)) return pos; } return -1; } /* XXX: this function writes to @name. */ static int check_whether_file_exists(char *name) { /* Check POST_CHAR etc ... */ static const char chars[] = POST_CHAR_S "#?"; int i; int namelen = strlen(name); if (file_exists(name)) return namelen; for (i = 0; i < sizeof(chars) - 1; i++) { char *pos = (char *)memchr(name, chars[i], namelen); int exists; if (!pos) continue; *pos = 0; exists = file_exists(name); *pos = chars[i]; if (exists) { return pos - name; } } return -1; } /* Encodes URIs without encoding stuff like fragments and query separators. */ static void encode_file_uri_string(struct string *string, char *uristring) { int filenamelen = check_whether_file_exists(uristring); encode_uri_string(string, uristring, filenamelen, 0); if (filenamelen > 0) add_to_string(string, uristring + filenamelen); } static inline int get_protocol_length(const char *url) { char *end = (char *) url; /* Seek the end of the protocol name if any. */ /* RFC1738: * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] * (but per its recommendations we accept "upalpha" too) */ while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.') end++; /* Now we make something to support our "IP version in protocol scheme * name" hack and silently chop off the last digit if it's there. The * IETF's not gonna notice I hope or it'd be going after us hard. */ if (end != url && isdigit(end[-1])) end--; /* Also return 0 if there's no protocol name (@end == @url). */ return (*end == ':' || isdigit(*end)) ? end - url : 0; } uri_errno_T parse_uri(struct uri *uri, char *uristring) { char *prefix_end, *host_end; #ifdef CONFIG_IPV6 char *lbracket, *rbracket; #endif assertm(uristring != NULL, "No uri to parse."); memset(uri, 0, sizeof(*uri)); /* Nothing to do for an empty url. */ if_assert_failed return 0; if (!*uristring) return URI_ERRNO_EMPTY; uri->string = uristring; uri->protocollen = get_protocol_length(uristring); /* Invalid */ if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL; /* Figure out whether the protocol is known */ uri->protocol = get_protocol(struri(uri), uri->protocollen); prefix_end = uristring + uri->protocollen; /* ':' */ /* Check if there's a digit after the protocol name. */ if (isdigit(*prefix_end)) { uri->ip_family = uristring[uri->protocollen] - '0'; prefix_end++; } if (*prefix_end != ':') return URI_ERRNO_INVALID_PROTOCOL; prefix_end++; /* Skip slashes */ if (prefix_end[0] == '/' && prefix_end[1] == '/') { if (prefix_end[2] == '/' && get_protocol_need_slash_after_host(uri->protocol)) return URI_ERRNO_TOO_MANY_SLASHES; prefix_end += 2; } else if (get_protocol_need_slashes(uri->protocol)) { return URI_ERRNO_NO_SLASHES; } if (get_protocol_free_syntax(uri->protocol)) { uri->data = prefix_end; uri->datalen = strlen(prefix_end); return URI_ERRNO_OK; } else if (uri->protocol == PROTOCOL_FILE) { int datalen = strcspn(prefix_end, "#" POST_CHAR_S); char *frag_or_post = prefix_end + datalen; /* Extract the fragment part. */ if (datalen >= 0) { if (*frag_or_post == '#') { uri->fragment = frag_or_post + 1; uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S); frag_or_post = uri->fragment + uri->fragmentlen; } if (*frag_or_post == POST_CHAR) { uri->post = frag_or_post + 1; } } else { datalen = strlen(prefix_end); } /* A bit of a special case, but using the "normal" host * parsing seems a bit scary at this point. (see bug 107). */ if (datalen > 9 && !c_strncasecmp(prefix_end, "localhost/", 10)) { prefix_end += 9; datalen -= 9; } uri->data = prefix_end; uri->datalen = datalen; return URI_ERRNO_OK; } /* Isolate host */ #ifdef CONFIG_IPV6 /* Get brackets enclosing IPv6 address */ lbracket = strchr(prefix_end, '['); if (lbracket) { rbracket = strchr(lbracket, ']'); /* [address] is handled only inside of hostname part (surprisingly). */ if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/")) uri->ipv6 = 1; else lbracket = rbracket = NULL; } else { rbracket = NULL; } #endif /* Possibly skip auth part */ host_end = prefix_end + strcspn(prefix_end, "@"); if (prefix_end + strcspn(prefix_end, "/") > host_end && *host_end) { /* we have auth info here */ char *user_end; /* Allow '@' in the password component */ while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?")) host_end = host_end + 1 + strcspn(host_end + 1, "@"); user_end = strchr(prefix_end, ':'); if (!user_end || user_end > host_end) { uri->user = prefix_end; uri->userlen = host_end - prefix_end; } else { uri->user = prefix_end; uri->userlen = user_end - prefix_end; uri->password = user_end + 1; uri->passwordlen = host_end - user_end - 1; } prefix_end = host_end + 1; } #ifdef CONFIG_IPV6 if (uri->ipv6) host_end = rbracket + strcspn(rbracket, ":/?"); else #endif host_end = prefix_end + strcspn(prefix_end, ":/?"); #ifdef CONFIG_IPV6 if (uri->ipv6) { int addrlen = rbracket - lbracket - 1; /* Check for valid length. * addrlen >= sizeof(hostbuf) is theorically impossible * but i keep the test in case of... Safer, imho --Zas */ assertm(addrlen >= 0 && addrlen < NI_MAXHOST, "parse_uri(): addrlen value is bad (%d) for URL '%s'. " "Problems are likely to be encountered. Please report " "this, it is a security bug!", addrlen, uristring); if_assert_failed return URI_ERRNO_IPV6_SECURITY; uri->host = lbracket + 1; uri->hostlen = addrlen; } else #endif { uri->host = prefix_end; uri->hostlen = host_end - prefix_end; /* Trim trailing '.'s */ if (uri->hostlen && uri->host[uri->hostlen - 1] == '.') return URI_ERRNO_TRAILING_DOTS; } if (*host_end == ':') { /* we have port here */ char *port_end = host_end + 1 + strcspn(host_end + 1, "/"); host_end++; uri->port = host_end; uri->portlen = port_end - host_end; if (uri->portlen == 0) return URI_ERRNO_NO_PORT_COLON; /* We only use 8 bits for portlen so better check */ if (uri->portlen != port_end - host_end) return URI_ERRNO_INVALID_PORT; /* test if port is number */ /* TODO: possibly lookup for the service otherwise? --pasky */ for (; host_end < port_end; host_end++) if (!isdigit(*host_end)) return URI_ERRNO_INVALID_PORT; /* Check valid port value, and let show an error message * about invalid url syntax. */ if (uri->port && uri->portlen) { int n; errno = 0; n = strtol(uri->port, NULL, 10); if (errno || !uri_port_is_valid(n)) return URI_ERRNO_INVALID_PORT; } } if (*host_end == '/') { host_end++; } else if (get_protocol_need_slash_after_host(uri->protocol)) { /* The need for slash after the host component depends on the * need for a host component. -- The dangerous mind of Jonah */ if (!uri->hostlen) return URI_ERRNO_NO_HOST; return URI_ERRNO_NO_HOST_SLASH; } /* Look for #fragment or POST_CHAR */ prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S); uri->data = host_end; uri->datalen = prefix_end - host_end; if (*prefix_end == '#') { uri->fragment = prefix_end + 1; uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S); prefix_end = uri->fragment + uri->fragmentlen; } if (*prefix_end == POST_CHAR) { uri->post = prefix_end + 1; } return URI_ERRNO_OK; } int get_uri_port(const struct uri *uri) { if (uri->port && uri->portlen) { const char *end = uri->port; int port = strtol(uri->port, (char **) &end, 10); if (end != uri->port) { assert(uri_port_is_valid(port)); return port; } } return get_protocol_port(uri->protocol); } #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN))) static inline int compare_component(const char *a, int alen, const char *b, int blen) { /* Check that the length and the strings are both set or unset */ if (alen != blen || !!a != !!b) return 0; /* Both are unset so that will make a perfect match */ if (!a || !alen) return 1; /* Let the higher forces decide */ return !memcmp(a, b, blen); } #define wants(x) (components & (x)) int compare_uri(const struct uri *a, const struct uri *b, uri_component_T components) { if (a == b) return 1; if (!components) return 0; assertm(can_compare_uri_components(components), "compare_uri() is a work in progress. Component unsupported"); return (!wants(URI_PROTOCOL) || a->protocol == b->protocol) && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family) && (!wants(URI_USER) || compare_component(a->user, a->userlen, b->user, b->userlen)) && (!wants(URI_PASSWORD) || compare_component(a->password, a->passwordlen, b->password, b->passwordlen)) && (!wants(URI_HOST) || compare_component(a->host, a->hostlen, b->host, b->hostlen)) && (!wants(URI_PORT) || compare_component(a->port, a->portlen, b->port, b->portlen)) && (!wants(URI_DATA) || compare_component(a->data, a->datalen, b->data, b->datalen)) && (!wants(URI_FRAGMENT) || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen)) && (!wants(URI_POST) || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0)); } /* We might need something more intelligent than this Swiss army knife. */ struct string * add_uri_to_string(struct string *string, const struct uri *uri, uri_component_T components) { /* Custom or unknown keep the URI untouched. */ if (uri->protocol == PROTOCOL_UNKNOWN) return add_to_string(string, struri(uri)); if (wants(URI_PROTOCOL)) { add_bytes_to_string(string, uri->string, uri->protocollen); if (wants(URI_IP_FAMILY) && uri->ip_family) add_long_to_string(string, uri->ip_family); add_char_to_string(string, ':'); if (get_protocol_need_slashes(uri->protocol)) add_to_string(string, "//"); } if (wants(URI_USER) && uri->userlen) { add_bytes_to_string(string, uri->user, uri->userlen); if (wants(URI_PASSWORD) && uri->passwordlen) { add_char_to_string(string, ':'); add_bytes_to_string(string, uri->password, uri->passwordlen); } add_char_to_string(string, '@'); } else if (wants(URI_PASSWORD) && uri->passwordlen) { add_bytes_to_string(string, uri->password, uri->passwordlen); } if (wants(URI_HOST) && uri->hostlen) { int add_host = 1; #ifdef CONFIG_IPV6 /* Rationale for wants(URI_PORT): The [notation] was invented * so that you can have an IPv6 addy and a port together. So * we want to use it when that happens, otherwise we need not * bother (that happens only when we want it for DNS anyway). * I insist on an implied elegancy of this way, but YMMV. ;-) * --pasky */ if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '['); #endif #ifdef CONFIG_IDN2 /* Support for the GNU International Domain Name library. * * http://www.gnu.org/software/libidn/libidn2/manual/libidn2.html */ if (wants(URI_IDN)) { char *host = NULL; #if defined(CONFIG_NLS) || defined(CONFIG_GETTEXT) if (current_charset != -1 && !is_cp_utf8(current_charset)) { int utf8_cp = get_cp_index("utf-8"); struct conv_table *ctable = get_translation_table(current_charset, utf8_cp); host = convert_string(ctable, uri->host, uri->hostlen, utf8_cp, CSM_NONE, NULL, NULL, NULL); } #endif if (!host) { host = memacpy(uri->host, uri->hostlen); } if (host) { char *idname; int code = idn2_to_ascii_8z(host, &idname, 0); /* FIXME: Return NULL if it coughed? --jonas */ if (code == IDN2_OK) { add_to_string(string, idname); free(idname); add_host = 0; } mem_free(host); } } #endif if (add_host) add_bytes_to_string(string, uri->host, uri->hostlen); #ifdef CONFIG_IPV6 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']'); #endif } if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) { if (uri->portlen) { add_char_to_string(string, ':'); add_bytes_to_string(string, uri->port, uri->portlen); } else if (wants(URI_DEFAULT_PORT) && uri->protocol != PROTOCOL_USER) { /* For user protocols we don't know a default port. * Should user protocols ports be configurable? */ int port = get_protocol_port(uri->protocol); add_char_to_string(string, ':'); add_long_to_string(string, port); } } /* Only add slash if we need to separate */ if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST) && wants(~(URI_DATA | URI_PORT)) && get_protocol_need_slash_after_host(uri->protocol)) add_char_to_string(string, '/'); if (wants(URI_DATA) && uri->datalen) add_bytes_to_string(string, uri->data, uri->datalen); /* We can not test uri->datalen here since we need to always * add '/'. */ if (wants(URI_PATH) || wants(URI_FILENAME)) { const char *filename = uri->data; const char *pos; assertm(!wants(URI_FILENAME) || components == URI_FILENAME, "URI_FILENAME should be used alone %d", components); if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) { #if defined(CONFIG_OS_WIN32) || defined(CONFIG_OS_DOS) if (uri->protocol != PROTOCOL_FILE) #endif /* FIXME: Add correct separator */ add_char_to_string(string, '/'); } if (uri->datalen) { if (uri->protocol == PROTOCOL_DATA) { char *e; add_to_string(string, "data"); e = get_extension_from_uri((struct uri *) uri); if (e) { add_to_string(string, e); mem_free(e); } return string; } for (pos = filename; *pos && !end_of_dir(*pos); pos++) if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos)) filename = pos + 1; add_bytes_to_string(string, filename, pos - filename); } } if (wants(URI_QUERY) && uri->datalen) { const char *query = (const char *)memchr(uri->data, '?', uri->datalen); assertm(URI_QUERY == components, "URI_QUERY should be used alone %d", components); if (!query) return string; query++; /* Check fragment and POST_CHAR */ return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S)); } if (wants(URI_FRAGMENT) && uri->fragmentlen) { add_char_to_string(string, '#'); add_bytes_to_string(string, uri->fragment, uri->fragmentlen); } if (wants(URI_POST) && uri->post) { add_char_to_string(string, POST_CHAR); add_to_string(string, uri->post); } else if (wants(URI_POST_INFO) && uri->post) { if (!strncmp(uri->post, "text/plain", 10)) { add_to_string(string, " (PLAIN TEXT DATA)"); } else if (!strncmp(uri->post, "multipart/form-data;", 20)) { add_to_string(string, " (MULTIPART FORM DATA)"); } else { add_to_string(string, " (POST DATA)"); } } return string; } #undef wants char * get_uri_string(const struct uri *uri, uri_component_T components) { struct string string; if (init_string(&string) && add_uri_to_string(&string, uri, components)) return string.source; done_string(&string); return NULL; } struct string * add_string_uri_to_string(struct string *string, char *uristring, uri_component_T components) { struct uri uri; if (parse_uri(&uri, uristring) != URI_ERRNO_OK) return NULL; return add_uri_to_string(string, &uri, components); } #define normalize_uri_reparse(str) normalize_uri(NULL, str) #define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri)) char * normalize_uri(struct uri *uri, char *uristring) { char *parse_string = uristring; char *src, *dest, *path; int need_slash = 0, keep_dslash = 1; int parse = (uri == NULL); struct uri uri_struct; if (!uri) uri = &uri_struct; /* We need to get the real (proxied) URI but lowercase relevant URI * parts along the way. */ do { if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK) return uristring; assert(uri->data); /* This is a maybe not the right place but both join_urls() and * get_translated_uri() through translate_url() calls this * function and then it already works on and modifies an * allocated copy. */ convert_to_lowercase_locale_indep(uri->string, uri->protocollen); if (uri->hostlen) convert_to_lowercase_locale_indep(uri->host, uri->hostlen); parse = 1; parse_string = uri->data; } while (uri->protocol == PROTOCOL_PROXY); if (get_protocol_free_syntax(uri->protocol)) return uristring; if (uri->protocol != PROTOCOL_UNKNOWN) { need_slash = get_protocol_need_slash_after_host(uri->protocol); keep_dslash = get_protocol_keep_double_slashes(uri->protocol); } path = uri->data - need_slash; dest = src = path; /* This loop mangles the URI string by removing ".." and "." segments. * However it must not alter "//" without reason; see bug 744. */ while (*dest) { /* If the following pieces are the LAST parts of URL, we remove * them as well. See RFC 2396 section 5.2 for details. */ if (end_of_dir(src[0])) { /* URL data contains no more path. */ memmove(dest, src, strlen(src) + 1); break; } if (!is_uri_dir_sep(uri, src[0])) { /* This is to reduce indentation */ } else if (src[1] == '.') { if (!src[2]) { /* /. - skip the dot */ *dest++ = *src; *dest = 0; break; } else if (is_uri_dir_sep(uri, src[2])) { /* /./ - strip that.. */ src += 2; continue; } else if (src[2] == '.' && (is_uri_dir_sep(uri, src[3]) || !src[3])) { /* /../ or /.. - skip it and preceding element. * * "/foo/bar" ... * ("/../" or "/..\0") ... * * Remove "bar" and the directory * separator that precedes it. The * separator will be added back in the * next iteration unless another ".." * follows, in which case it will be * added later. "bar" may be empty. */ while (dest > path) { dest--; if (is_uri_dir_sep(uri, *dest)) break; } /* "/foo" "/bar" ... * ("/../" or "/..\0") ... */ if (!src[3]) { /* /.. - add ending slash and stop */ *dest++ = *src; *dest = 0; break; } src += 3; continue; } } else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) { /* // - ignore first '/'. */ src += 1; continue; } /* We don't want to access memory past the NUL char. */ *dest = *src++; if (*dest) dest++; } return uristring; } /* The 'file' scheme URI comes in and bastardized URI comes out which consists * of just the complete path to file/directory, which the dumb 'file' protocol * backend can understand. No host parts etc, that is what this function is * supposed to chew. */ static struct uri * transform_file_url(struct uri *uri, const char *cwd) { char *path = uri->data; assert(uri->protocol == PROTOCOL_FILE && uri->data); /* Sort out the host part. We currently support only host "localhost" * (plus empty host part will be assumed to be "localhost" as well). * As our extensions, '.' will reference to the cwd on localhost * (originally, when the first thing after file:// wasn't "localhost/", * we assumed the cwd as well, and pretended that there's no host part * at all) and '..' to the directory parent to cwd. Another extension * is that if this is a DOS-like system, the first char in two-char * host part is uppercase letter and the second char is a colon, it is * assumed to be a local disk specification. */ /* TODO: Use FTP for non-localhost hosts. --pasky */ /* For URL "file://", we open the current directory. Some other * browsers instead open root directory, but AFAIK the standard does * not specify that and this was the original behaviour and it is more * consistent with our file://./ notation. */ /* Who would name their file/dir '...' ? */ if (*path == '.' || !*path) { struct string dir; if (!init_string(&dir)) return NULL; encode_uri_string(&dir, cwd, -1, 0); /* Either we will end up with '//' and translate_directories() * will shorten it or the '/' will mark the inserted cwd as a * directory. */ if (*path == '.') *path = '/'; /* Insert the current working directory. */ /* The offset is 7 == sizeof("file://") - 1. */ insert_in_string(&struri(uri), 7, dir.source, dir.length); done_string(&dir); return uri; } #ifdef DOS_FS if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2])) return NULL; #endif for (; *path && !dir_sep(*path); path++); /* FIXME: We will in fact assume localhost even for non-local hosts, * until we will support the FTP transformation. --pasky */ memmove(uri->data, path, strlen(path) + 1); return uri; } static char *translate_url(const char *url, char *cwd); char * join_urls(struct uri *base, const char *rel) { char *uristring, *path; int add_slash = 0; int translate = 0; int length = 0; /* See RFC 1808 */ /* TODO: Support for ';' ? (see the RFC) --pasky */ /* For '#', '?' and '//' we could use get_uri_string() but it might be * too expensive since it uses granular allocation scheme. I wouldn't * personally mind tho' because it would be cleaner. --jonas */ if (rel[0] == '#') { /* Strip fragment and post part from the base URI and append * the fragment string in @rel. */ length = base->fragment ? base->fragment - struri(base) - 1 : get_real_uri_length(base); } else if (rel[0] == '?') { /* Strip query, fragment and post part from the base URI and * append the query string in @rel. */ length = base->fragment ? base->fragment - struri(base) - 1 : get_real_uri_length(base); uristring = (char *)memchr(base->data, '?', base->datalen); if (uristring) length = uristring - struri(base); } else if (rel[0] == '/' && rel[1] == '/') { if (!get_protocol_need_slashes(base->protocol)) return NULL; /* Get `:' from the base URI and append the `//' part * from @rel. */ length = base->protocollen + 1; /* We need to sanitize the relative part and add stuff like * host slash. */ translate = 1; } /* If one of the tests above set @length to something useful */ if (length) { uristring = memacpy(struri(base), length); if (!uristring) return NULL; add_to_strn(&uristring, rel); if (translate) { char *translated; translated = translate_url(uristring, NULL); mem_free(uristring); return translated; } return normalize_uri_reparse(uristring); } /* Check if there is some protocol name to go for */ length = get_protocol_length(rel); if (length) { switch (get_protocol(rel, length)) { case PROTOCOL_UNKNOWN: case PROTOCOL_PROXY: /* Mysteriously proxy URIs are breaking here ... */ break; case PROTOCOL_FILE: /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg * to translate_url(). */ default: uristring = translate_url(rel, NULL); if (uristring) return uristring; } } assertm(base->data != NULL, "bad base url"); if_assert_failed return NULL; path = base->data; /* Either is path blank, but we've slash char before, or path is not * blank, but doesn't start by a slash (if we'd just stay along with * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it * should be enough, but I'm not sure and I don't want to break * anything --pasky). */ /* We skip first char of URL ('/') in parse_url() (ARGH). This * is reason of all this bug-bearing magic.. */ if (*path) { if (!is_uri_dir_sep(base, *path)) path--; } else { if (is_uri_dir_sep(base, path[-1])) path--; } if (!is_uri_dir_sep(base, rel[0])) { char *path_end; /* The URL is relative. */ if (!*path) { /* There's no path in the URL, but we're going to add * something there, and the something doesn't start by * a slash. So we need to insert a slash after the base * URL. Clever, eh? ;) */ add_slash = 1; } for (path_end = path; *path_end; path_end++) { if (end_of_dir(*path_end)) break; /* Modify the path pointer, so that it'll always point * above the last '/' in the URL; later, we'll copy the * URL only _TO_ this point, and anything after last * slash will be substituted by 'rel'. */ if (is_uri_dir_sep(base, *path_end)) path = path_end + 1; } } length = path - struri(base); uristring = (char *)mem_alloc(length + strlen(rel) + add_slash + 1); if (!uristring) return NULL; memcpy(uristring, struri(base), length); if (add_slash) uristring[length] = '/'; strcpy(uristring + length + add_slash, rel); return normalize_uri_reparse(uristring); } /* Tries to figure out what protocol @newurl might be specifying by checking if * it exists as a file locally or by checking parts of the host name. */ static protocol_T find_uri_protocol(char *newurl) { char *ch; /* First see if it is a file so filenames that look like hostnames * won't confuse us below. */ if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE; /* Yes, it would be simpler to make test for IPv6 address first, * but it would result in confusing mix of ifdefs ;-). */ /* FIXME: Ideas for improve protocol detection * * - Handle common hostnames. It could be part of the protocol backend * structure. [ www -> http, irc -> irc, news -> nntp, ... ] * * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ] */ ch = newurl + strcspn(newurl, ".:/@"); if (*ch == '@' || (*ch == ':' && *newurl != '[' && strchr(newurl, '@')) || !c_strncasecmp(newurl, "ftp.", 4)) { /* Contains user/password/ftp-hostname */ return PROTOCOL_FTP; #ifdef CONFIG_IPV6 } else if (*newurl == '[' && *ch == ':') { /* Candidate for IPv6 address */ char *bracket2, *colon2; ch++; bracket2 = strchr(ch, ']'); colon2 = strchr(ch, ':'); if (bracket2 && colon2 && bracket2 > colon2) return PROTOCOL_HTTP; #endif } else if (*newurl != '.' && *ch == '.') { /* Contains domain name? */ char *host_end, *domain; char *ipscan; /* Process the hostname */ for (domain = ch + 1; *(host_end = domain + strcspn(domain, ".:/?")) == '.'; domain = host_end + 1); /* It's IP? */ for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.'; ipscan++); if (!*ipscan || *ipscan == ':' || *ipscan == '/') return PROTOCOL_HTTP; /* It's two-letter or known TLD? */ if (host_end - domain == 2 || end_with_known_tld(domain, host_end - domain) >= 0) return PROTOCOL_HTTP; } return PROTOCOL_UNKNOWN; } #define MAX_TRANSLATION_ATTEMPTS 32 /* Returns an URI string that can be used internally. Adding protocol prefix, * missing slashes etc. */ static char * translate_url(const char *url, char *cwd) { char *newurl; struct uri uri; uri_errno_T uri_errno, prev_errno = URI_ERRNO_EMPTY; int retries = 0; /* Strip starting spaces */ while (*url == ' ') url++; if (!*url) return NULL; newurl = expand_tilde(url); /* XXX: Post data copy. */ if (!newurl) return NULL; parse_uri: /* Yay a goto loop. If we get some URI parse error and try to * fix it we go back to here and try again. */ /* Ordinary parse */ uri_errno = parse_uri(&uri, newurl); /* Bail out if the same error occurs twice */ if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) { if (retries > MAX_TRANSLATION_ATTEMPTS) { ERROR("Maximum number of parsing attempts exceeded " "for %s.", url); } mem_free(newurl); return NULL; } prev_errno = uri_errno; switch (uri_errno) { case URI_ERRNO_OK: /* Fix translation of 1.2.3.4:5 so IP address part won't be * interpreted as the protocol name. */ if (uri.protocol == PROTOCOL_UNKNOWN) { protocol_T protocol = find_uri_protocol(newurl); /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL * case. */ if (protocol != PROTOCOL_UNKNOWN) { struct string str; if (!init_string(&str)) return NULL; switch (protocol) { case PROTOCOL_FTP: add_to_string(&str, "ftp://"); encode_uri_string(&str, newurl, -1, 0); break; case PROTOCOL_HTTP: add_to_string(&str, "http://"); add_to_string(&str, newurl); break; case PROTOCOL_UNKNOWN: break; case PROTOCOL_FILE: default: add_to_string(&str, "file://"); if (!dir_sep(*newurl)) { #ifndef DOS_FS add_to_string(&str, "./"); #endif } add_to_string(&str, newurl); } mem_free(newurl); newurl = str.source; /* Work around the infinite loop prevention */ prev_errno = URI_ERRNO_EMPTY; goto parse_uri; } } /* If file:// URI is transformed we need to reparse. */ if (uri.protocol == PROTOCOL_FILE && cwd && *cwd && transform_file_url(&uri, cwd)) return normalize_uri_reparse(struri(&uri)); /* Translate the proxied URI too if proxy:// */ if (uri.protocol == PROTOCOL_PROXY) { char *data = translate_url(uri.data, cwd); int pos = uri.data - struri(&uri); if (!data) break; struri(&uri)[pos] = 0; insert_in_string(&struri(&uri), pos, data, strlen(data)); mem_free(data); return normalize_uri_reparse(struri(&uri)); } return normalize_uri_noparse(&uri); case URI_ERRNO_TOO_MANY_SLASHES: { char *from, *to; assert(uri.string[uri.protocollen] == ':' && uri.string[uri.protocollen + 1] == '/' && uri.string[uri.protocollen + 2] == '/'); from = to = uri.string + uri.protocollen + 3; while (*from == '/') from++; assert(to < from); memmove(to, from, strlen(from) + 1); goto parse_uri; } case URI_ERRNO_NO_SLASHES: { /* Try prefix:some.url -> prefix://some.url.. */ int slashes = 2; /* Check if only one '/' is needed. */ if (uri.string[uri.protocollen + 1] == '/') slashes--; insert_in_string(&newurl, uri.protocollen + 1, "//", slashes); goto parse_uri; } case URI_ERRNO_TRAILING_DOTS: { /* Trim trailing '.'s */ char *from = uri.host + uri.hostlen; char *to = from; assert(uri.host < to && to[-1] == '.' && *from != '.'); while (uri.host < to && to[-1] == '.') to--; assert(to < from); memmove(to, from, strlen(from) + 1); goto parse_uri; } case URI_ERRNO_NO_PORT_COLON: assert(uri.portlen == 0 && uri.string < uri.port && uri.port[-1] == ':'); memmove(uri.port - 1, uri.port, strlen(uri.port) + 1); goto parse_uri; case URI_ERRNO_NO_HOST_SLASH: { int offset = uri.port ? uri.port + uri.portlen - struri(&uri) : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */; assertm(uri.host != NULL, "uri.host not set after no host slash error"); insert_in_string(&newurl, offset, "/", 1); goto parse_uri; } case URI_ERRNO_INVALID_PROTOCOL: { const char *default_protocol; /* No protocol name */ protocol_T protocol = find_uri_protocol(newurl); struct string str; if (!init_string(&str)) return NULL; switch (protocol) { case PROTOCOL_FTP: add_to_string(&str, "ftp://"); encode_uri_string(&str, newurl, -1, 0); break; case PROTOCOL_HTTP: #ifdef CONFIG_SSL if (get_https_by_default()) add_to_string(&str, "https://"); else #endif add_to_string(&str, "http://"); add_to_string(&str, newurl); break; case PROTOCOL_UNKNOWN: default_protocol = get_default_protocol(); if (strcmp("file://", default_protocol)) { add_to_string(&str, default_protocol); add_to_string(&str, newurl); break; } case PROTOCOL_FILE: default: add_to_string(&str, "file://"); if (!dir_sep(*newurl)) add_to_string(&str, "./"); encode_file_uri_string(&str, newurl); } mem_free(newurl); newurl = str.source; goto parse_uri; } case URI_ERRNO_EMPTY: case URI_ERRNO_IPV6_SECURITY: case URI_ERRNO_NO_HOST: case URI_ERRNO_INVALID_PORT: case URI_ERRNO_INVALID_PORT_RANGE: /* None of these can be handled properly. */ break; } mem_free(newurl); return NULL; } struct uri * get_composed_uri(struct uri *uri, uri_component_T components) { char *string; assert(uri); if_assert_failed return NULL; string = get_uri_string(uri, components); if (!string) return NULL; uri = get_uri(string, URI_NONE); mem_free(string); return uri; } struct uri * get_translated_uri(char *uristring, char *cwd) { struct uri *uri; uristring = translate_url(uristring, cwd); if (!uristring) return NULL; uri = get_uri(uristring, URI_NONE); mem_free(uristring); return uri; } #define ADD_EXTENSION_FROM_TYPE(string, type, ext) \ if (!memcmp(string, type ";", sizeof(type ";") - 1) || \ !memcmp(string, type ",", sizeof(type ",") - 1)) \ return stracpy("." ext); char * get_extension_from_uri(struct uri *uri) { char *extension = NULL; int afterslash = 1; char *pos = uri->data; assert(pos); if (uri->protocol == PROTOCOL_DATA) { ADD_EXTENSION_FROM_TYPE(uri->data, "image/gif", "gif") ADD_EXTENSION_FROM_TYPE(uri->data, "image/jpeg", "jpg") ADD_EXTENSION_FROM_TYPE(uri->data, "image/png", "png") ADD_EXTENSION_FROM_TYPE(uri->data, "image/webp", "webp") ADD_EXTENSION_FROM_TYPE(uri->data, "image/avif", "avif") ADD_EXTENSION_FROM_TYPE(uri->data, "text/plain", "txt") ADD_EXTENSION_FROM_TYPE(uri->data, "text/html", "html") ADD_EXTENSION_FROM_TYPE(uri->data, "image/svg+xml", "svg") ADD_EXTENSION_FROM_TYPE(uri->data, "image/jxl", "jxl") return stracpy(""); } for (; *pos && !end_of_dir(*pos); pos++) { if (!afterslash && !extension && *pos == '.') { extension = pos; } else if (is_uri_dir_sep(uri, *pos)) { extension = NULL; afterslash = 1; } else { afterslash = 0; } } if (extension && extension < pos) return memacpy(extension, pos - extension); return NULL; } /* URI encoding, escaping unallowed characters. */ static inline int safe_char(unsigned char c) { /* RFC 2396, Page 8, Section 2.3 ;-) */ return isident(c) || c == '.' || c == '!' || c == '~' || c == '*' || c == '\''|| c == '(' || c == ')'; } void encode_uri_string(struct string *string, const char *name, int namelen, int convert_slashes) { char n[4]; const char *end; n[0] = '%'; n[3] = '\0'; if (namelen < 0) namelen = strlen(name); for (end = name + namelen; name < end; name++) { #if 0 /* This is probably correct only for query part of URI..? */ if (*name == ' ') add_char_to_string(data, len, '+'); else #endif if (safe_char(*name) || (!convert_slashes && *name == '/')) { add_char_to_string(string, *name); } else { /* Hex it. */ n[1] = hx((((int) *name) & 0xF0) >> 4); n[2] = hx(((int) *name) & 0xF); add_bytes_to_string(string, n, sizeof(n) - 1); } } } void encode_uri_string_percent(struct string *string, const char *name, int namelen) { char n[4]; const char *end; n[0] = '%'; n[3] = '\0'; if (namelen < 0) namelen = strlen(name); for (end = name + namelen; name < end; name++) { if ((unsigned char)(*name) < 128) { add_char_to_string(string, *name); } else { /* Hex it. */ n[1] = Hx((((int) *name) & 0xF0) >> 4); n[2] = Hx(((int) *name) & 0xF); add_bytes_to_string(string, n, sizeof(n) - 1); } } } void encode_win32_uri_string(struct string *string, char *name, int namelen) { char n[4]; char *end; n[0] = '%'; n[3] = '\0'; if (namelen < 0) namelen = strlen(name); for (end = name + namelen; name < end; name++) { if (safe_char(*name) || *name == ':' || *name == '\\') { add_char_to_string(string, *name); } else { /* Hex it. */ n[1] = hx((((int) *name) & 0xF0) >> 4); n[2] = hx(((int) *name) & 0xF); add_bytes_to_string(string, n, sizeof(n) - 1); } } } /* This function is evil, it modifies its parameter. */ /* XXX: but decoded string is _never_ longer than encoded string so it's an * efficient way to do that, imho. --Zas */ void decode_uri(char *src) { char *dst = src; unsigned char c; do { c = *src++; if (c == '%') { int x1 = unhx(*src); if (x1 >= 0) { int x2 = unhx(*(src + 1)); if (x2 >= 0) { x1 = (x1 << 4) + x2; if (x1 != 0) { /* don't allow %00 */ c = (unsigned char) x1; src += 2; } } } #if 0 } else if (c == '+') { /* As the comment in encode_uri_string suggests, '+' * should only be decoded in the query part of a URI * (should that be 'URL'?). I'm not bold enough to * disable this code, tho. -- Miciah */ c = ' '; #endif } *dst++ = c; } while (c != '\0'); } void decode_uri_string(struct string *string) { decode_uri(string->source); string->length = strlen(string->source); } void decode_uri_for_display(char *src) { decode_uri(src); for (; *src; src++) if (!isprint(*src) || iscntrl(*src)) *src = '*'; } void decode_uri_string_for_display(struct string *string) { decode_uri_for_display(string->source); string->length = strlen(string->source); } /* URI list */ #define URI_LIST_GRANULARITY 0x3 #define realloc_uri_list(list) \ mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \ URI_LIST_GRANULARITY) struct uri * add_to_uri_list(struct uri_list *list, struct uri *uri) { if (!realloc_uri_list(list)) return NULL; list->uris[list->size++] = get_uri_reference(uri); return uri; }; void free_uri_list(struct uri_list *list) { struct uri *uri; int index; if (!list->uris) return; foreach_uri (uri, index, list) { done_uri(uri); } mem_free_set(&list->uris, NULL); list->size = 0; } /* URI cache */ struct uri_cache_entry { struct uri uri; char string[1]; }; struct uri_cache { struct hash *map; struct elinks_object object; }; static struct uri_cache uri_cache; #ifdef CONFIG_DEBUG static inline void check_uri_sanity(struct uri *uri) { int pos; for (pos = 0; pos < uri->protocollen; pos++) if (c_isupper(uri->string[pos])) goto error; if (uri->hostlen) for (pos = 0; pos < uri->hostlen; pos++) if (c_isupper(uri->host[pos])) goto error; return; error: INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri)); } #else #define check_uri_sanity(uri) #endif static inline struct uri_cache_entry * get_uri_cache_entry(char *string, int length) { struct uri_cache_entry *entry; struct hash_item *item; assert(string && length > 0); if_assert_failed return NULL; item = get_hash_item(uri_cache.map, string, length); if (item) return (struct uri_cache_entry *)item->value; /* Setup a new entry */ entry = (struct uri_cache_entry *)mem_calloc(1, sizeof(*entry) + length); if (!entry) return NULL; object_nolock(&entry->uri, "uri"); memcpy(&entry->string, string, length); string = entry->string; if (parse_uri(&entry->uri, string) != URI_ERRNO_OK || !add_hash_item(uri_cache.map, string, length, entry)) { mem_free(entry); return NULL; } object_lock(&uri_cache); return entry; } struct uri * get_uri(char *string, uri_component_T components) { struct uri_cache_entry *entry; assert(string); if (components) { struct uri uri; if (parse_uri(&uri, string) != URI_ERRNO_OK) return NULL; return get_composed_uri(&uri, components); } if (!is_object_used(&uri_cache)) { uri_cache.map = init_hash8(); if (!uri_cache.map) return NULL; object_nolock(&uri_cache, "uri_cache"); } entry = get_uri_cache_entry(string, strlen(string)); if (!entry) { if (!is_object_used(&uri_cache)) free_hash(&uri_cache.map); return NULL; } check_uri_sanity(&entry->uri); object_nolock(&entry->uri, "uri"); object_lock(&entry->uri); return &entry->uri; } void done_uri(struct uri *uri) { char *string = struri(uri); int length = strlen(string); struct hash_item *item; struct uri_cache_entry *entry; assert(is_object_used(&uri_cache)); object_unlock(uri); if (is_object_used(uri)) return; item = get_hash_item(uri_cache.map, string, length); entry = (struct uri_cache_entry *)(item ? item->value : NULL); assertm(entry != NULL, "Releasing unknown URI [%s]", string); del_hash_item(uri_cache.map, item); mem_free(entry); /* Last URI frees the cache */ object_unlock(&uri_cache); if (!is_object_used(&uri_cache)) free_hash(&uri_cache.map); }