/* elinks/src/protocol/uri.c */

/* URL parser and translator; implementation of RFC 2396. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <ctype.h>
#include <errno.h>
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
#ifdef HAVE_IDN2_H
#include <idn2.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_NETDB_H
#include <netdb.h> /* OS/2 needs this after sys/types.h */
#endif

#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif

#include "elinks.h"

#include "intl/libintl.h"
#include "main/object.h"
#include "protocol/protocol.h"
#include "protocol/uri.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/file.h"
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"


/* Tell whether @c ends the path part of a URI, i.e. whether it is the query
 * marker, the fragment marker or the POST data marker. */
static inline int
end_of_dir(unsigned char c)
{
	/* ';' is intentionally not a terminator here (it used to be).
	 * Section 3.3 of RFC 2396 explicitly says that parameters in a
	 * path segment "are not significant to the parsing of relative
	 * references."  */
	if (c == '?' || c == '#')
		return 1;

	return c == POST_CHAR;
}

/* Tell whether @pos is a directory separator for @uri.  File URIs defer to
 * dir_sep(); every other protocol only accepts '/'. */
static inline int
is_uri_dir_sep(const struct uri *uri, unsigned char pos)
{
	if (uri->protocol == PROTOCOL_FILE)
		return dir_sep(pos);

	return pos == '/';
}


/* Check whether the host name @server (@server_len bytes long) is equal to
 * @domain or is a sub-domain of it.  The comparison ignores case.
 * Returns non-zero on match and zero otherwise. */
int
is_in_domain(char *domain, char *server, int server_len)
{
	int domain_len = strlen(domain);
	int prefix_len;

	/* The server name is too short to hold the domain at all. */
	if (server_len < domain_len)
		return 0;

	/* Equal lengths: the names have to be the same. */
	if (server_len == domain_len)
		return !c_strncasecmp(domain, server, server_len);

	/* The server name is longer: @domain has to be its suffix and the
	 * suffix has to start right after a '.' label separator. */
	prefix_len = server_len - domain_len;
	if (server[prefix_len - 1] != '.')
		return 0;

	return !c_strncasecmp(domain, server + prefix_len, domain_len);
}

/* Check whether the first @addresslen bytes of @address form a textual IP
 * address accepted by inet_pton() (IPv6 is only tried with CONFIG_IPV6).
 * Returns 1 if so, 0 otherwise; without HAVE_INET_PTON it is always 0. */
int
is_ip_address(const char *address, int addresslen)
{
	/* A textual address has a well defined maximum size, so a stack
	 * buffer is enough for the NUL-terminated copy inet_pton() needs. */
	char text[IP_ADDRESS_BUFFER_SIZE];

	if (addresslen >= sizeof(text))
		return 0;

	safe_strncpy(text, address, addresslen + 1);

#ifdef HAVE_INET_PTON
	{
#ifdef CONFIG_IPV6
		struct sockaddr_in6 sa6;
#endif /* CONFIG_IPV6 */
		struct in_addr in4;

#ifdef CONFIG_IPV6
		if (inet_pton(AF_INET6, text, &sa6.sin6_addr) > 0)
			return 1;
#endif /* CONFIG_IPV6 */

		return inet_pton(AF_INET, text, &in4) > 0;
	}
#else
	/* FIXME: Is this ever the case? */
	return 0;
#endif /* HAVE_INET_PTON */
}


/* Look for one of the built-in top level domain names at the very end of @s.
 * @slen is the length of @s; a negative value means "use strlen(@s)".
 * Returns the offset in @s where the matched name starts, or -1 if @slen is
 * zero or nothing matches.  The comparison ignores case. */
int
end_with_known_tld(const char *s, int slen)
{
	static const char *const tld[] =
	{ "com", "edu", "net",
	  "org", "gov", "mil",
	  "int", "biz", "arpa",
	  "aero", "coop", "club",
	  "info", "museum", "expert",
	  "name", "pro", NULL };
	const char *const *candidate;

	if (!slen) return -1;
	if (slen < 0) slen = strlen(s);

	for (candidate = tld; *candidate; candidate++) {
		int candidate_len = strlen(*candidate);
		int start = slen - candidate_len;

		/* The string is shorter than this name. */
		if (start < 0)
			continue;

		if (c_strncasecmp(&s[start], *candidate, candidate_len) == 0)
			return start;
	}

	return -1;
}

/* Figure out how much of @name is the name of an existing file.
 *
 * First the whole string is tried.  Failing that, for each of the POST, '#'
 * and '?' markers (in that order) the string is cut at the first occurrence
 * of the marker and the part in front of it is tried.
 *
 * Returns the length of the existing file name or -1 if none was found.
 *
 * XXX: this function writes to @name; the cut is undone before returning. */
static int
check_whether_file_exists(char *name)
{
	/* Check POST_CHAR etc ... */
	static const char chars[] = POST_CHAR_S "#?";
	const int namelen = strlen(name);
	size_t idx;

	if (file_exists(name))
		return namelen;

	for (idx = 0; idx + 1 < sizeof(chars); idx++) {
		char *cut = (char *) memchr(name, chars[idx], namelen);
		int found;

		if (cut) {
			/* Temporarily terminate the string at the marker. */
			*cut = 0;
			found = file_exists(name);
			*cut = chars[idx];

			if (found)
				return cut - name;
		}
	}

	return -1;
}

/* Encodes URIs without encoding stuff like fragments and query separators.
 *
 * The length reported by check_whether_file_exists() is handed to
 * encode_uri_string(); when that length is positive, whatever follows the
 * file name in @uristring is appended to @string as is. */
static void
encode_file_uri_string(struct string *string, char *uristring)
{
	const int filenamelen = check_whether_file_exists(uristring);

	encode_uri_string(string, uristring, filenamelen, 0);

	if (filenamelen <= 0)
		return;

	add_to_string(string, uristring + filenamelen);
}


/* Return the length of the protocol (scheme) name at the start of @url.
 *
 * The name is the leading run of alphanumerics, '+', '-' and '.'.  One
 * trailing digit is not counted as part of the name (see the "IP version"
 * hack below).  The name has to be followed by ':' or by that digit;
 * otherwise, and also when the name is empty, 0 is returned.
 *
 * Every byte is converted to unsigned char before it is handed to the
 * <ctype.h> classifiers: passing a negative plain char (any byte >= 0x80
 * where char is signed) to them is undefined behaviour. */
static inline int
get_protocol_length(const char *url)
{
	const char *end = url;

	/* Seek the end of the protocol name if any. */
	/* RFC1738:
	 * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
	 * (but per its recommendations we accept "upalpha" too) */
	while (isalnum((unsigned char) *end)
	       || *end == '+' || *end == '-' || *end == '.')
		end++;

	/* Now we make something to support our "IP version in protocol scheme
	 * name" hack and silently chop off the last digit if it's there. The
	 * IETF's not gonna notice I hope or it'd be going after us hard. */
	if (end != url && isdigit((unsigned char) end[-1]))
		end--;

	/* Also return 0 if there's no protocol name (@end == @url). */
	return (*end == ':' || isdigit((unsigned char) *end)) ? end - url : 0;
}

/* Split @uristring into its components and record them in @uri.
 *
 * @uri is zeroed first.  The component pointers stored in @uri point into
 * @uristring itself (nothing is copied), so @uristring has to stay alive
 * for as long as @uri is used.
 *
 * Returns URI_ERRNO_OK on success or the URI_ERRNO_* value describing the
 * first problem found.  translate_url() below uses those values to repair
 * the string and parse it again.
 *
 * Bytes handed to isdigit() are converted to unsigned char first; passing a
 * negative plain char to a <ctype.h> function is undefined behaviour. */
uri_errno_T
parse_uri(struct uri *uri, char *uristring)
{
	char *prefix_end, *host_end;
#ifdef CONFIG_IPV6
	char *lbracket, *rbracket;
#endif

	assertm(uristring != NULL, "No uri to parse.");
	memset(uri, 0, sizeof(*uri));

	/* Nothing to do for an empty url. */
	if_assert_failed return 0;
	if (!*uristring) return URI_ERRNO_EMPTY;

	uri->string = uristring;
	uri->protocollen = get_protocol_length(uristring);

	/* Invalid */
	if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;

	/* Figure out whether the protocol is known */
	uri->protocol = get_protocol(struri(uri), uri->protocollen);

	prefix_end = uristring + uri->protocollen; /* ':' */

	/* Check if there's a digit after the protocol name. */
	if (isdigit((unsigned char) *prefix_end)) {
		uri->ip_family = uristring[uri->protocollen] - '0';
		prefix_end++;
	}
	if (*prefix_end != ':')
		return URI_ERRNO_INVALID_PROTOCOL;
	prefix_end++;

	/* Skip slashes */

	if (prefix_end[0] == '/' && prefix_end[1] == '/') {
		if (prefix_end[2] == '/'
		    && get_protocol_need_slash_after_host(uri->protocol))
			return URI_ERRNO_TOO_MANY_SLASHES;

		prefix_end += 2;

	} else if (get_protocol_need_slashes(uri->protocol)) {
		return URI_ERRNO_NO_SLASHES;
	}

	if (get_protocol_free_syntax(uri->protocol)) {
		/* Everything after the prefix is opaque data. */
		uri->data = prefix_end;
		uri->datalen = strlen(prefix_end);
		return URI_ERRNO_OK;

	} else if (uri->protocol == PROTOCOL_FILE) {
		int datalen = strcspn(prefix_end, "#" POST_CHAR_S);
		char *frag_or_post = prefix_end + datalen;

		/* Extract the fragment part. */
		if (datalen >= 0) {
			if (*frag_or_post == '#') {
				uri->fragment = frag_or_post + 1;
				uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
				frag_or_post = uri->fragment + uri->fragmentlen;
			}
			if (*frag_or_post == POST_CHAR) {
				uri->post = frag_or_post + 1;
			}
		} else {
			datalen = strlen(prefix_end);
		}

		/* A bit of a special case, but using the "normal" host
		 * parsing seems a bit scary at this point. (see bug 107). */
		if (datalen > 9 && !c_strncasecmp(prefix_end, "localhost/", 10)) {
			/* Skip "localhost" but keep the '/' after it. */
			prefix_end += 9;
			datalen -= 9;
		}

		uri->data = prefix_end;
		uri->datalen = datalen;

		return URI_ERRNO_OK;
	}

	/* Isolate host */

#ifdef CONFIG_IPV6
	/* Get brackets enclosing IPv6 address */
	lbracket = strchr(prefix_end, '[');
	if (lbracket) {
		rbracket = strchr(lbracket, ']');
		/* [address] is handled only inside of hostname part (surprisingly). */
		if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
			uri->ipv6 = 1;
		else
			lbracket = rbracket = NULL;
	} else {
		rbracket = NULL;
	}
#endif

	/* Possibly skip auth part */
	host_end = prefix_end + strcspn(prefix_end, "@");

	if (prefix_end + strcspn(prefix_end, "/") > host_end
	    && *host_end) { /* we have auth info here */
		char *user_end;

		/* Allow '@' in the password component */
		while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
			host_end = host_end + 1 + strcspn(host_end + 1, "@");

		user_end = strchr(prefix_end, ':');

		if (!user_end || user_end > host_end) {
			/* No ':' in front of the '@': user name only. */
			uri->user = prefix_end;
			uri->userlen = host_end - prefix_end;
		} else {
			uri->user = prefix_end;
			uri->userlen = user_end - prefix_end;
			uri->password = user_end + 1;
			uri->passwordlen = host_end - user_end - 1;
		}
		prefix_end = host_end + 1;
	}

#ifdef CONFIG_IPV6
	if (uri->ipv6)
		host_end = rbracket + strcspn(rbracket, ":/?");
	else
#endif
		host_end = prefix_end + strcspn(prefix_end, ":/?");

#ifdef CONFIG_IPV6
	if (uri->ipv6) {
		int addrlen = rbracket - lbracket - 1;

		/* Check for valid length.
		 * addrlen >= sizeof(hostbuf) is theorically impossible
		 * but i keep the test in case of... Safer, imho --Zas */
		assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
			"parse_uri(): addrlen value is bad (%d) for URL '%s'. "
			"Problems are likely to be encountered. Please report "
			"this, it is a security bug!", addrlen, uristring);
		if_assert_failed return URI_ERRNO_IPV6_SECURITY;

		uri->host = lbracket + 1;
		uri->hostlen = addrlen;
	} else
#endif
	{
		uri->host = prefix_end;
		uri->hostlen = host_end - prefix_end;

		/* Trailing '.'s are not trimmed here; the caller is told
		 * about them and is expected to fix the string. */
		if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
			return URI_ERRNO_TRAILING_DOTS;
	}

	if (*host_end == ':') { /* we have port here */
		char *port_end = host_end + 1 + strcspn(host_end + 1, "/");

		host_end++;

		uri->port = host_end;
		uri->portlen = port_end - host_end;

		if (uri->portlen == 0)
			return URI_ERRNO_NO_PORT_COLON;

		/* We only use 8 bits for portlen so better check */
		if (uri->portlen != port_end - host_end)
			return URI_ERRNO_INVALID_PORT;

		/* test if port is number */
		/* TODO: possibly lookup for the service otherwise? --pasky */
		for (; host_end < port_end; host_end++)
			if (!isdigit((unsigned char) *host_end))
				return URI_ERRNO_INVALID_PORT;

		/* Check valid port value, and let show an error message
		 * about invalid url syntax. */
		if (uri->port && uri->portlen) {
			int n;

			errno = 0;
			n = strtol(uri->port, NULL, 10);
			if (errno || !uri_port_is_valid(n))
				return URI_ERRNO_INVALID_PORT;
		}
	}

	if (*host_end == '/') {
		host_end++;

	} else if (get_protocol_need_slash_after_host(uri->protocol)) {
		/* The need for slash after the host component depends on the
		 * need for a host component. -- The dangerous mind of Jonah */
		if (!uri->hostlen)
			return URI_ERRNO_NO_HOST;

		return URI_ERRNO_NO_HOST_SLASH;
	}

	/* Look for #fragment or POST_CHAR */
	prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
	uri->data = host_end;
	uri->datalen = prefix_end - host_end;

	if (*prefix_end == '#') {
		uri->fragment = prefix_end + 1;
		uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
		prefix_end = uri->fragment + uri->fragmentlen;
	}

	if (*prefix_end == POST_CHAR) {
		uri->post = prefix_end + 1;
	}

	return URI_ERRNO_OK;
}

/* Return the port number of @uri.  When the URI has no port component, or
 * no number can be read from it, the result of get_protocol_port() for the
 * URI's protocol is returned instead. */
int
get_uri_port(const struct uri *uri)
{
	const char *end;
	int port;

	if (!uri->port || !uri->portlen)
		return get_protocol_port(uri->protocol);

	end = uri->port;
	port = strtol(uri->port, (char **) &end, 10);

	/* strtol() did not consume anything: no number there. */
	if (end == uri->port)
		return get_protocol_port(uri->protocol);

	assert(uri_port_is_valid(port));
	return port;
}

/* Evaluates to non-zero when @comp has none of the URI_SPECIAL or URI_IDN
 * bits set.  compare_uri() asserts this about its component mask. */
#define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))

/* Compare two URI components given as pointer and length pairs.
 * Returns 1 when they match and 0 otherwise.  Two components match when
 * their lengths are equal, both pointers are set or both are unset, and,
 * if there is anything to compare, the bytes are identical. */
static inline int
compare_component(const char *a, int alen,
		  const char *b, int blen)
{
	const int a_set = (a != NULL);
	const int b_set = (b != NULL);

	/* The lengths and the "is set" states have to agree. */
	if (alen != blen) return 0;
	if (a_set != b_set) return 0;

	/* Nothing to compare on either side: a perfect match. */
	if (!a_set || alen == 0) return 1;

	/* Let the higher forces decide */
	return memcmp(a, b, blen) == 0;
}

/* Tests whether any of the bits in @x are set in the local variable
 * `components' of the function using the macro.  It is #undef'ed again
 * after add_uri_to_string(). */
#define wants(x) (components & (x))

/* Compare the parts of @a and @b selected by the @components bit mask.
 * Returns 1 when @a and @b are the same object or when every selected part
 * matches; returns 0 when @components is empty or some selected part
 * differs.  Masks containing URI_SPECIAL or URI_IDN bits are not supported
 * and trigger the assertion. */
int
compare_uri(const struct uri *a, const struct uri *b,
	    uri_component_T components)
{
	if (a == b) return 1;
	if (!components) return 0;

	assertm(can_compare_uri_components(components),
		"compare_uri() is a work in progress. Component unsupported");

	if (wants(URI_PROTOCOL) && a->protocol != b->protocol)
		return 0;

	if (wants(URI_IP_FAMILY) && a->ip_family != b->ip_family)
		return 0;

	if (wants(URI_USER)
	    && !compare_component(a->user, a->userlen, b->user, b->userlen))
		return 0;

	if (wants(URI_PASSWORD)
	    && !compare_component(a->password, a->passwordlen,
				  b->password, b->passwordlen))
		return 0;

	if (wants(URI_HOST)
	    && !compare_component(a->host, a->hostlen, b->host, b->hostlen))
		return 0;

	if (wants(URI_PORT)
	    && !compare_component(a->port, a->portlen, b->port, b->portlen))
		return 0;

	if (wants(URI_DATA)
	    && !compare_component(a->data, a->datalen, b->data, b->datalen))
		return 0;

	if (wants(URI_FRAGMENT)
	    && !compare_component(a->fragment, a->fragmentlen,
				  b->fragment, b->fragmentlen))
		return 0;

	if (wants(URI_POST)) {
		/* The post part has no stored length; it is NUL terminated. */
		int a_postlen = a->post ? strlen(a->post) : 0;
		int b_postlen = b->post ? strlen(b->post) : 0;

		if (!compare_component(a->post, a_postlen, b->post, b_postlen))
			return 0;
	}

	return 1;
}


/* We might need something more intelligent than this Swiss army knife. */
/* Append the parts of @uri selected by the @components bit mask to @string.
 *
 * The parts are emitted in URI order: protocol, user/password, host, port,
 * data or path/filename, query, fragment and finally the post data (or a
 * short textual description of it for URI_POST_INFO).
 *
 * A URI with protocol PROTOCOL_UNKNOWN is appended unmodified, whatever
 * @components says.
 *
 * Returns @string, or the value returned by the add_*_to_string() helper on
 * the paths that return early. */
struct string *
add_uri_to_string(struct string *string, const struct uri *uri,
		  uri_component_T components)
{
	/* Custom or unknown keep the URI untouched. */
	if (uri->protocol == PROTOCOL_UNKNOWN)
		return add_to_string(string, struri(uri));

 	if (wants(URI_PROTOCOL)) {
		add_bytes_to_string(string, uri->string, uri->protocollen);
		/* The IP family digit goes between the name and the ':'. */
		if (wants(URI_IP_FAMILY) && uri->ip_family)
			add_long_to_string(string, uri->ip_family);
		add_char_to_string(string, ':');
 		if (get_protocol_need_slashes(uri->protocol))
			add_to_string(string, "//");
 	}

 	if (wants(URI_USER) && uri->userlen) {
		add_bytes_to_string(string, uri->user, uri->userlen);

 		if (wants(URI_PASSWORD) && uri->passwordlen) {
			add_char_to_string(string, ':');
			add_bytes_to_string(string, uri->password,
						    uri->passwordlen);
 		}

		add_char_to_string(string, '@');

	} else if (wants(URI_PASSWORD) && uri->passwordlen) {
		/* Password without a user part: no ':' and no '@' is added. */
		add_bytes_to_string(string, uri->password, uri->passwordlen);
	}

 	if (wants(URI_HOST) && uri->hostlen) {
		/* Cleared when the IDN encoded host name was added instead. */
		int add_host = 1;

#ifdef CONFIG_IPV6
		/* Rationale for wants(URI_PORT): The [notation] was invented
		 * so that you can have an IPv6 addy and a port together. So
		 * we want to use it when that happens, otherwise we need not
		 * bother (that happens only when we want it for DNS anyway).
		 * I insist on an implied elegancy of this way, but YMMV. ;-)
		 * --pasky */
		if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
#endif
#ifdef CONFIG_IDN2
		/* Support for the GNU International Domain Name library.
		 *
		 * http://www.gnu.org/software/libidn/libidn2/manual/libidn2.html
		 */
		if (wants(URI_IDN)) {
			char *host = NULL;
#if defined(CONFIG_NLS) || defined(CONFIG_GETTEXT)
			/* idn2_to_ascii_8z() is fed UTF-8, so convert the
			 * host from the current charset when that differs. */
			if (current_charset != -1 && !is_cp_utf8(current_charset)) {
				int utf8_cp = get_cp_index("utf-8");
				struct conv_table *ctable = get_translation_table(current_charset, utf8_cp);
				host = convert_string(ctable, uri->host, uri->hostlen, utf8_cp, CSM_NONE,
					NULL, NULL, NULL);
			}
#endif
			if (!host) {
				/* No conversion done: use a NUL terminated
				 * copy of the host as it is. */
				host = memacpy(uri->host, uri->hostlen);
			}

			if (host) {
				char *idname;
				int code = idn2_to_ascii_8z(host, &idname, 0);

				/* FIXME: Return NULL if it coughed? --jonas */
				if (code == IDN2_OK) {
					add_to_string(string, idname);
					/* @idname is released with plain
					 * free(), @host with mem_free(). */
					free(idname);
					add_host = 0;
				}
				mem_free(host);
			}
		}
#endif
		if (add_host)
			add_bytes_to_string(string, uri->host, uri->hostlen);

#ifdef CONFIG_IPV6
		if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
#endif
 	}

 	if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
 		if (uri->portlen) {
			add_char_to_string(string, ':');
			add_bytes_to_string(string, uri->port, uri->portlen);

		} else if (wants(URI_DEFAULT_PORT)
			   && uri->protocol != PROTOCOL_USER) {
			/* For user protocols we don't know a default port.
			 * Should user protocols ports be configurable? */
			int port = get_protocol_port(uri->protocol);

			add_char_to_string(string, ':');
			add_long_to_string(string, port);
		}
	}

	/* Only add slash if we need to separate */
	if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
	    && wants(~(URI_DATA | URI_PORT))
	    && get_protocol_need_slash_after_host(uri->protocol))
		add_char_to_string(string, '/');

	if (wants(URI_DATA) && uri->datalen)
		add_bytes_to_string(string, uri->data, uri->datalen);

	/* We can not test uri->datalen here since we need to always
	 * add '/'. */
	if (wants(URI_PATH) || wants(URI_FILENAME)) {
		const char *filename = uri->data;
		const char *pos;

		assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
			"URI_FILENAME should be used alone %d", components);

		if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
#if defined(CONFIG_OS_WIN32) || defined(CONFIG_OS_DOS)
			if (uri->protocol != PROTOCOL_FILE)
#endif
			/* FIXME: Add correct separator */
			add_char_to_string(string, '/');
		}

		if (uri->datalen) {

			if (uri->protocol == PROTOCOL_DATA) {
				/* data: URIs get the fixed name "data"
				 * followed by the extension, if any, that
				 * get_extension_from_uri() reports. */
				char *e;
				add_to_string(string, "data");
				e = get_extension_from_uri((struct uri *) uri);

				if (e) {
					add_to_string(string, e);
					mem_free(e);
				}
				return string;
			}

			/* Walk to the end of the path.  For URI_FILENAME,
			 * @filename is moved past every directory separator
			 * met, so only the last segment remains. */
			for (pos = filename; *pos && !end_of_dir(*pos); pos++)
				if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
					filename = pos + 1;

			add_bytes_to_string(string, filename, pos - filename);
		}
	}

	if (wants(URI_QUERY) && uri->datalen) {
		const char *query = (const char *)memchr(uri->data, '?', uri->datalen);

		assertm(URI_QUERY == components,
			"URI_QUERY should be used alone %d", components);

		if (!query) return string;

		/* Skip the '?' itself. */
		query++;
		/* Check fragment and POST_CHAR */
		return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
	}

	if (wants(URI_FRAGMENT) && uri->fragmentlen) {
		add_char_to_string(string, '#');
		add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
	}

	if (wants(URI_POST) && uri->post) {
		add_char_to_string(string, POST_CHAR);
		add_to_string(string, uri->post);

	} else if (wants(URI_POST_INFO) && uri->post) {
		/* Describe the post data, judging by how it starts, rather
		 * than adding the data itself. */
		if (!strncmp(uri->post, "text/plain", 10)) {
			add_to_string(string, " (PLAIN TEXT DATA)");

		} else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
			add_to_string(string, " (MULTIPART FORM DATA)");

		} else {
			add_to_string(string, " (POST DATA)");
		}

	}

	return string;
}

#undef wants

/* Build a string from the parts of @uri selected by @components.
 * Returns the buffer of the string that was built, or NULL when either
 * init_string() or add_uri_to_string() fails. */
char *
get_uri_string(const struct uri *uri, uri_component_T components)
{
	struct string string;

	if (!init_string(&string)
	    || !add_uri_to_string(&string, uri, components)) {
		done_string(&string);
		return NULL;
	}

	return string.source;
}


/* Parse @uristring and append the parts selected by @components to @string.
 * Returns NULL when @uristring does not parse cleanly; otherwise returns
 * whatever add_uri_to_string() returns. */
struct string *
add_string_uri_to_string(struct string *string, char *uristring,
			 uri_component_T components)
{
	struct uri parsed;
	uri_errno_T status = parse_uri(&parsed, uristring);

	if (status == URI_ERRNO_OK)
		return add_uri_to_string(string, &parsed, components);

	return NULL;
}


/* Shorthands for the two ways of calling normalize_uri():
 * - normalize_uri_reparse() passes no struct uri, so @str is parsed first;
 * - normalize_uri_noparse() passes an already parsed @uri together with
 *   the string it was parsed from. */
#define normalize_uri_reparse(str)	normalize_uri(NULL, str)
#define normalize_uri_noparse(uri)	normalize_uri(uri, struri(uri))

/* Normalize @uristring in place and return it.
 *
 * The protocol name and the host are converted to lowercase, and "." and
 * ".." path segments are removed.  Double directory separators are
 * collapsed too, unless the protocol asks for them to be kept (unknown
 * protocols always keep them).
 *
 * When @uri is NULL, @uristring is parsed into a local struct first;
 * otherwise @uri is taken to be the already parsed form of @uristring.
 * For proxy URIs the data part is parsed again until the proxied URI is
 * reached, and that is the one whose path gets normalized.
 *
 * If parsing fails, or the protocol has free syntax, the string is returned
 * with only the changes made up to that point. */
char *
normalize_uri(struct uri *uri, char *uristring)
{
	char *parse_string = uristring;
	char *src, *dest, *path;
	int need_slash = 0, keep_dslash = 1;
	int parse = (uri == NULL);
	struct uri uri_struct;

	if (!uri) uri = &uri_struct;

	/* We need to get the real (proxied) URI but lowercase relevant URI
	 * parts along the way. */
	do {
		if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
			return uristring;

		assert(uri->data);

		/* This is a maybe not the right place but both join_urls() and
		 * get_translated_uri() through translate_url() calls this
		 * function and then it already works on and modifies an
		 * allocated copy. */
		convert_to_lowercase_locale_indep(uri->string, uri->protocollen);
		if (uri->hostlen) convert_to_lowercase_locale_indep(uri->host, uri->hostlen);

		/* Any further round has to parse the data part of this one. */
		parse = 1;
		parse_string = uri->data;
	} while (uri->protocol == PROTOCOL_PROXY);

	if (get_protocol_free_syntax(uri->protocol))
		return uristring;

	if (uri->protocol != PROTOCOL_UNKNOWN) {
		need_slash = get_protocol_need_slash_after_host(uri->protocol);
		keep_dslash = get_protocol_keep_double_slashes(uri->protocol);
	}

	/* parse_uri() leaves the slash after the host out of the data part;
	 * step back over it so that the path starts with the separator. */
	path = uri->data - need_slash;
	dest = src = path;

	/* This loop mangles the URI string by removing ".." and "." segments.
	 * However it must not alter "//" without reason; see bug 744.  */
	/* @src is the read position and @dest the write position; both move
	 * through the same buffer and @dest never gets ahead of @src. */
	while (*dest) {
		/* If the following pieces are the LAST parts of URL, we remove
		 * them as well. See RFC 2396 section 5.2 for details. */

		if (end_of_dir(src[0])) {
			/* URL data contains no more path. */
			memmove(dest, src, strlen(src) + 1);
			break;
		}

		if (!is_uri_dir_sep(uri, src[0])) {
			/* This is to reduce indentation */

		} else if (src[1] == '.') {
			if (!src[2]) {
				/* /. - skip the dot */
				*dest++ = *src;
				*dest = 0;
				break;

			} else if (is_uri_dir_sep(uri, src[2])) {
				/* /./ - strip that.. */
				src += 2;
				continue;

			} else if (src[2] == '.'
				   && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
				/* /../ or /.. - skip it and preceding element.
				 *
				 * <path> "/foo/bar" <dest> ...
				 * <src> ("/../" or "/..\0") ...
				 *
				 * Remove "bar" and the directory
				 * separator that precedes it.  The
				 * separator will be added back in the
				 * next iteration unless another ".."
				 * follows, in which case it will be
				 * added later.  "bar" may be empty.  */

				while (dest > path) {
					dest--;
					if (is_uri_dir_sep(uri, *dest)) break;
				}

				/* <path> "/foo" <dest> "/bar" ...
				 * <src> ("/../" or "/..\0") ... */
				if (!src[3]) {
					/* /.. - add ending slash and stop */
					*dest++ = *src;
					*dest = 0;
					break;
				}

				src += 3;
				continue;
			}

		} else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) {
			/* // - ignore first '/'. */
			src += 1;
			continue;
		}

		/* We don't want to access memory past the NUL char. */
		*dest = *src++;
		if (*dest) dest++;
	}

	return uristring;
}

/* Turn a parsed 'file' scheme URI into the bastardized form the dumb 'file'
 * protocol backend can understand: just the complete path to the file or
 * directory, without any host part.  The string of @uri is modified.
 *
 * Returns @uri on success and NULL on failure.  When the cwd was inserted
 * the string has changed under the component pointers, so the caller needs
 * to parse it again. */
static struct uri *
transform_file_url(struct uri *uri, const char *cwd)
{
	char *path = uri->data;
	struct string dir;

	assert(uri->protocol == PROTOCOL_FILE && uri->data);

	/* About the host part: only host "localhost" is supported at the
	 * moment, and an empty host part is assumed to mean "localhost" as
	 * well.  There are a few extensions of ours on top of that:
	 *
	 * - '.' refers to the cwd on localhost.  (Originally, when the
	 *   first thing after file:// wasn't "localhost/", the cwd was
	 *   assumed as well and we pretended that there was no host part
	 *   at all.)
	 * - '..' refers to the parent directory of the cwd.
	 * - On DOS-like systems, a two-char host part made of an uppercase
	 *   letter followed by a colon is assumed to be a local disk
	 *   specification. */
	/* TODO: Use FTP for non-localhost hosts. --pasky */

	if (*path != '.' && *path) {
#ifdef DOS_FS
		if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
			return NULL;
#endif

		/* Drop everything in front of the first directory separator.
		 *
		 * FIXME: We will in fact assume localhost even for non-local
		 * hosts, until we will support the FTP transformation.
		 * --pasky */
		while (*path && !dir_sep(*path))
			path++;

		memmove(uri->data, path, strlen(path) + 1);
		return uri;
	}

	/* The path is empty or starts with a dot.  (Who would name their
	 * file/dir '...' ?)
	 *
	 * For URL "file://", we open the current directory.  Some other
	 * browsers open the root directory instead, but AFAIK the standard
	 * does not specify that; this was the original behaviour and it is
	 * more consistent with our file://./ notation. */
	if (!init_string(&dir))
		return NULL;

	encode_uri_string(&dir, cwd, -1, 0);

	/* Either we will end up with '//' and translate_directories() will
	 * shorten it or the '/' will mark the inserted cwd as a directory. */
	if (*path == '.') *path = '/';

	/* Insert the current working directory. */
	/* The offset is 7 == sizeof("file://") - 1. */
	insert_in_string(&struri(uri), 7, dir.source, dir.length);

	done_string(&dir);
	return uri;
}

static char *translate_url(const char *url, char *cwd);

/* Resolve the relative reference @rel against the parsed base URI @base.
 *
 * Handled in this order:
 * - "#frag" and "?query": the matching tail of the base is replaced;
 * - "//host/...": only the "<protocol>:" prefix of the base is kept and the
 *   result is run through translate_url();
 * - a reference starting with a known protocol name (other than proxy) is
 *   handed to translate_url() by itself;
 * - anything else is joined with the path of the base.
 *
 * Returns a newly allocated string, or NULL on failure.
 * NOTE(review): presumably the caller releases it with mem_free() --
 * confirm against the callers. */
char *
join_urls(struct uri *base, const char *rel)
{
	char *uristring, *path;
	int add_slash = 0;
	int translate = 0;
	int length = 0;

	/* See RFC 1808 */
	/* TODO: Support for ';' ? (see the RFC) --pasky */

	/* For '#', '?' and '//' we could use get_uri_string() but it might be
	 * too expensive since it uses granular allocation scheme. I wouldn't
	 * personally mind tho' because it would be cleaner. --jonas */
	if (rel[0] == '#') {
		/* Strip fragment and post part from the base URI and append
		 * the fragment string in @rel. */
		length  = base->fragment
			? base->fragment - struri(base) - 1
			: get_real_uri_length(base);

	} else if (rel[0] == '?') {
		/* Strip query, fragment and post part from the base URI and
		 * append the query string in @rel. */
		length  = base->fragment ? base->fragment - struri(base) - 1
					 : get_real_uri_length(base);

		/* @uristring is only a scratch pointer to the '?' here. */
		uristring = (char *)memchr(base->data, '?', base->datalen);
		if (uristring) length = uristring - struri(base);

	} else if (rel[0] == '/' && rel[1] == '/') {
		if (!get_protocol_need_slashes(base->protocol))
			return NULL;

		/* Get `<protocol>:' from the base URI and append the `//' part
		 * from @rel. */
		length = base->protocollen + 1;

		/* We need to sanitize the relative part and add stuff like
		 * host slash. */
		translate = 1;
	}

	/* If one of the tests above set @length to something useful */
	if (length) {
		uristring = memacpy(struri(base), length);
		if (!uristring) return NULL;

		add_to_strn(&uristring, rel);

		if (translate) {
			char *translated;

			translated = translate_url(uristring, NULL);
			mem_free(uristring);
			return translated;
		}
		return normalize_uri_reparse(uristring);
	}

	/* Check if there is some protocol name to go for */
	length = get_protocol_length(rel);
	if (length) {
		switch (get_protocol(rel, length)) {
		case PROTOCOL_UNKNOWN:
		case PROTOCOL_PROXY:
			/* Mysteriously proxy URIs are breaking here ... */
			break;

		case PROTOCOL_FILE:
			/* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
			 * to translate_url(). */
		default:
			/* If the translation fails we fall through and treat
			 * @rel as a relative path after all. */
			uristring = translate_url(rel, NULL);
			if (uristring) return uristring;
		}
	}

	assertm(base->data != NULL, "bad base url");
	if_assert_failed return NULL;

	path = base->data;

	/* Either is path blank, but we've slash char before, or path is not
	 * blank, but doesn't start by a slash (if we'd just stay along with
	 * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
	 * should be enough, but I'm not sure and I don't want to break
	 * anything --pasky). */
	/* We skip first char of URL ('/') in parse_url() (ARGH). This
	 * is reason of all this bug-bearing magic.. */
	if (*path) {
		if (!is_uri_dir_sep(base, *path)) path--;
	} else {
		if (is_uri_dir_sep(base, path[-1])) path--;
	}

	if (!is_uri_dir_sep(base, rel[0])) {
		char *path_end;

		/* The URL is relative. */

		if (!*path) {
			/* There's no path in the URL, but we're going to add
			 * something there, and the something doesn't start by
			 * a slash. So we need to insert a slash after the base
			 * URL. Clever, eh? ;) */
			add_slash = 1;
		}

		for (path_end = path; *path_end; path_end++) {
			if (end_of_dir(*path_end)) break;
			/* Modify the path pointer, so that it'll always point
			 * above the last '/' in the URL; later, we'll copy the
			 * URL only _TO_ this point, and anything after last
			 * slash will be substituted by 'rel'. */
			if (is_uri_dir_sep(base, *path_end))
				path = path_end + 1;
		}
	}

	/* Base prefix + optional '/' + @rel + terminating NUL. */
	length = path - struri(base);
	uristring = (char *)mem_alloc(length + strlen(rel) + add_slash + 1);
	if (!uristring) return NULL;

	memcpy(uristring, struri(base), length);
	if (add_slash) uristring[length] = '/';
	strcpy(uristring + length + add_slash, rel);

	return normalize_uri_reparse(uristring);
}


/* Tries to figure out what protocol @newurl might be specifying by checking if
 * it exists as a file locally or by checking parts of the host name.
 *
 * Returns PROTOCOL_FILE, PROTOCOL_FTP or PROTOCOL_HTTP when one of the
 * heuristics matches and PROTOCOL_UNKNOWN otherwise.
 *
 * @newurl is not const because check_whether_file_exists() temporarily
 * writes to it.
 *
 * The byte handed to isdigit() is converted to unsigned char first; passing
 * a negative plain char to a <ctype.h> function is undefined behaviour. */
static protocol_T
find_uri_protocol(char *newurl)
{
	char *ch;

	/* First see if it is a file so filenames that look like hostnames
	 * won't confuse us below. */
	if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;

	/* Yes, it would be simpler to make test for IPv6 address first,
	 * but it would result in confusing mix of ifdefs ;-). */
	/* FIXME: Ideas for improve protocol detection
	 *
	 * - Handle common hostnames. It could be part of the protocol backend
	 *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
	 *
	 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
	 */

	/* @ch is the first of ".:/@" in the string, or its terminating NUL. */
	ch = newurl + strcspn(newurl, ".:/@");
	if (*ch == '@'
	    || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
	    || !c_strncasecmp(newurl, "ftp.", 4)) {
		/* Contains user/password/ftp-hostname */
		return PROTOCOL_FTP;

#ifdef CONFIG_IPV6
	} else if (*newurl == '[' && *ch == ':') {
		/* Candidate for IPv6 address */
		char *bracket2, *colon2;

		/* There has to be one more ':' in front of the ']'. */
		ch++;
		bracket2 = strchr(ch, ']');
		colon2 = strchr(ch, ':');
		if (bracket2 && colon2 && bracket2 > colon2)
			return PROTOCOL_HTTP;
#endif

	} else if (*newurl != '.' && *ch == '.') {
		/* Contains domain name? */
		char *host_end, *domain;
		char *ipscan;

		/* Process the hostname: when the loop is done, @domain is
		 * the start of the last dot separated label and @host_end
		 * is its end. */
		for (domain = ch + 1;
			*(host_end = domain + strcspn(domain, ".:/?")) == '.';
			domain = host_end + 1);

		/* It's IP?  From the first dot on, only digits and dots up
		 * to the end of the host. */
		for (ipscan = ch;
			isdigit((unsigned char) *ipscan) || *ipscan == '.';
			ipscan++);

		if (!*ipscan || *ipscan == ':' || *ipscan == '/')
			return PROTOCOL_HTTP;

		/* It's two-letter or known TLD? */
		if (host_end - domain == 2
		    || end_with_known_tld(domain, host_end - domain) >= 0)
			return PROTOCOL_HTTP;
	}

	return PROTOCOL_UNKNOWN;
}


/* Limit on the number of times translate_url() will try to repair and
 * parse a URI again before it gives up and reports an error. */
#define MAX_TRANSLATION_ATTEMPTS	32

/* Returns an URI string that can be used internally. Adding protocol prefix,
 * missing slashes etc. */
static char *
translate_url(const char *url, char *cwd)
{
	char *newurl;
	struct uri uri;
	uri_errno_T uri_errno, prev_errno = URI_ERRNO_EMPTY;
	int retries = 0;

	/* Strip starting spaces */
	while (*url == ' ') url++;
	if (!*url) return NULL;

	newurl = expand_tilde(url); /* XXX: Post data copy. */
	if (!newurl) return NULL;

parse_uri:
	/* Yay a goto loop. If we get some URI parse error and try to
	 * fix it we go back to here and try again. */
	/* Ordinary parse */
	uri_errno = parse_uri(&uri, newurl);

	/* Bail out if the same error occurs twice */
	if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
		if (retries > MAX_TRANSLATION_ATTEMPTS) {
			ERROR("Maximum number of parsing attempts exceeded "
			      "for %s.", url);
		}
		mem_free(newurl);
		return NULL;
	}

	prev_errno = uri_errno;

	switch (uri_errno) {
	case URI_ERRNO_OK:
		/* Fix translation of 1.2.3.4:5 so IP address part won't be
		 * interpreted as the protocol name. */
		if (uri.protocol == PROTOCOL_UNKNOWN) {
			protocol_T protocol = find_uri_protocol(newurl);

			/* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
			 * case. */
			if (protocol != PROTOCOL_UNKNOWN) {
				struct string str;

				if (!init_string(&str)) return NULL;

				switch (protocol) {
				case PROTOCOL_FTP:
					add_to_string(&str, "ftp://");
					encode_uri_string(&str, newurl, -1, 0);
					break;

				case PROTOCOL_HTTP:
					add_to_string(&str, "http://");
					add_to_string(&str, newurl);
					break;

				case PROTOCOL_UNKNOWN:
					break;

				case PROTOCOL_FILE:
				default:
					add_to_string(&str, "file://");
					if (!dir_sep(*newurl)) {
#ifndef DOS_FS
						add_to_string(&str, "./");
#endif
					}

					add_to_string(&str, newurl);
				}

				mem_free(newurl);
				newurl = str.source;

				/* Work around the infinite loop prevention */
				prev_errno = URI_ERRNO_EMPTY;
				goto parse_uri;
			}
		}

		/* If file:// URI is transformed we need to reparse. */
		if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
		    && transform_file_url(&uri, cwd))
			return normalize_uri_reparse(struri(&uri));

		/* Translate the proxied URI too if proxy:// */
		if (uri.protocol == PROTOCOL_PROXY) {
			char *data = translate_url(uri.data, cwd);
			int pos = uri.data - struri(&uri);

			if (!data) break;
			struri(&uri)[pos] = 0;
			insert_in_string(&struri(&uri), pos, data, strlen(data));
			mem_free(data);
			return normalize_uri_reparse(struri(&uri));
		}

		return normalize_uri_noparse(&uri);

	case URI_ERRNO_TOO_MANY_SLASHES:
	{
		char *from, *to;

		assert(uri.string[uri.protocollen] == ':'
		       && uri.string[uri.protocollen + 1] == '/'
		       && uri.string[uri.protocollen + 2] == '/');

		from = to = uri.string + uri.protocollen + 3;
		while (*from == '/') from++;

		assert(to < from);
		memmove(to, from, strlen(from) + 1);
		goto parse_uri;
	}
	case URI_ERRNO_NO_SLASHES:
	{
		/* Try prefix:some.url -> prefix://some.url.. */
		int slashes = 2;

		/* Check if only one '/' is needed. */
		if (uri.string[uri.protocollen + 1] == '/')
			slashes--;

		insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
		goto parse_uri;
	}
	case URI_ERRNO_TRAILING_DOTS:
	{
		/* Trim trailing '.'s */
		char *from = uri.host + uri.hostlen;
		char *to = from;

		assert(uri.host < to && to[-1] == '.' && *from != '.');

		while (uri.host < to && to[-1] == '.') to--;

		assert(to < from);
		memmove(to, from, strlen(from) + 1);
		goto parse_uri;
	}
	case URI_ERRNO_NO_PORT_COLON:
		assert(uri.portlen == 0
		       && uri.string < uri.port
		       && uri.port[-1] == ':');

		memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
		goto parse_uri;

	case URI_ERRNO_NO_HOST_SLASH:
	{
		int offset = uri.port
			   ? uri.port + uri.portlen - struri(&uri)
			   : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;

		assertm(uri.host != NULL, "uri.host not set after no host slash error");
		insert_in_string(&newurl, offset, "/", 1);
		goto parse_uri;
	}
	case URI_ERRNO_INVALID_PROTOCOL:
	{
		const char *default_protocol;
		/* No protocol name */
		protocol_T protocol = find_uri_protocol(newurl);
		struct string str;

		if (!init_string(&str)) return NULL;

		switch (protocol) {
			case PROTOCOL_FTP:
				add_to_string(&str, "ftp://");
				encode_uri_string(&str, newurl, -1, 0);
				break;

			case PROTOCOL_HTTP:
#ifdef CONFIG_SSL
				if (get_https_by_default())
					add_to_string(&str, "https://");
				else
#endif
				add_to_string(&str, "http://");
				add_to_string(&str, newurl);
				break;

			case PROTOCOL_UNKNOWN:
				default_protocol = get_default_protocol();

				if (strcmp("file://", default_protocol)) {
					add_to_string(&str, default_protocol);
					add_to_string(&str, newurl);
					break;
				}
			case PROTOCOL_FILE:
			default:
				add_to_string(&str, "file://");
				if (!dir_sep(*newurl))
					add_to_string(&str, "./");

				encode_file_uri_string(&str, newurl);
		}

		mem_free(newurl);
		newurl = str.source;

		goto parse_uri;
	}
	case URI_ERRNO_EMPTY:
	case URI_ERRNO_IPV6_SECURITY:
	case URI_ERRNO_NO_HOST:
	case URI_ERRNO_INVALID_PORT:
	case URI_ERRNO_INVALID_PORT_RANGE:
		/* None of these can be handled properly. */
		break;
	}

	mem_free(newurl);
	return NULL;
}


/* Build the string made of the selected @components of @uri with
 * get_uri_string() and look that string up through get_uri(). The temporary
 * string is always released. Returns NULL when @uri is NULL or the string
 * cannot be built; otherwise whatever get_uri() returns. */
struct uri *
get_composed_uri(struct uri *uri, uri_component_T components)
{
	struct uri *composed;
	char *text;

	assert(uri);
	if_assert_failed return NULL;

	text = get_uri_string(uri, components);
	if (!text) return NULL;

	composed = get_uri(text, URI_NONE);
	mem_free(text);

	return composed;
}

/* Run @uristring through translate_url() (with @cwd) and look the result up
 * through get_uri(). The translated string is released here. Returns NULL
 * when the translation fails; otherwise whatever get_uri() returns. */
struct uri *
get_translated_uri(char *uristring, char *cwd)
{
	char *translated = translate_url(uristring, cwd);
	struct uri *result = NULL;

	if (translated) {
		result = get_uri(translated, URI_NONE);
		mem_free(translated);
	}

	return result;
}

/* Return a copy of "." @ext from the enclosing function when @string starts
 * with the media type @type followed by ';' or ','.
 * strncmp() is used instead of memcmp() so that the comparison stops at the
 * terminating NUL of a @string shorter than @type rather than reading past
 * the end of it. */
#define ADD_EXTENSION_FROM_TYPE(string, type, ext)			\
	if (!strncmp(string, type ";", sizeof(type ";") - 1)	||	\
	    !strncmp(string, type ",", sizeof(type ",") - 1))		\
		return stracpy("." ext);

/* Guess a file name extension for @uri.
 *
 * For data: URIs the extension is derived from a small table of known media
 * types; an unrecognized type gives a copy of the empty string.
 *
 * For all other protocols the data part is scanned up to the first
 * end_of_dir() character (or the end of the string). The result is a copy of
 * the text from the first '.' of the last path segment, dot included, up to
 * where the scan stopped. A '.' that directly follows a directory separator
 * (or opens the data part) does not start an extension. NULL is returned
 * when no extension is found. */
char *
get_extension_from_uri(struct uri *uri)
{
	char *extension = NULL;
	int afterslash = 1;
	char *pos = uri->data;

	assert(pos);

	if (uri->protocol == PROTOCOL_DATA) {
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/gif",  "gif")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/jpeg", "jpg")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/png",  "png")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/webp",  "webp")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/avif",  "avif")
		ADD_EXTENSION_FROM_TYPE(uri->data, "text/plain", "txt")
		ADD_EXTENSION_FROM_TYPE(uri->data, "text/html",  "html")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/svg+xml", "svg")
		ADD_EXTENSION_FROM_TYPE(uri->data, "image/jxl", "jxl")
		return stracpy("");
	}

	for (; *pos && !end_of_dir(*pos); pos++) {
		if (!afterslash && !extension && *pos == '.') {
			/* First dot of the current segment that is not its
			 * first character. */
			extension = pos;
		} else if (is_uri_dir_sep(uri, *pos)) {
			/* New path segment: forget any earlier candidate. */
			extension = NULL;
			afterslash = 1;
		} else {
			afterslash = 0;
		}
	}

	if (extension && extension < pos)
		return memacpy(extension, pos - extension);

	return NULL;
}

/* URI encoding, escaping unallowed characters. */
/* Tell whether @c can be emitted without percent-escaping: either one of the
 * punctuation characters listed below or anything isident() accepts
 * (see RFC 2396, section 2.3). Returns 1 or 0. */
static inline int
safe_char(unsigned char c)
{
	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;
	default:
		return isident(c) ? 1 : 0;
	}
}

/* Append @name to @string, replacing every byte that is not safe_char() with
 * a "%xx" escape built with hx(). @namelen is the number of bytes to encode;
 * a negative value means "up to the terminating NUL". A '/' is copied
 * verbatim unless @convert_slashes is non-zero. Spaces get no special '+'
 * treatment here. */
void
encode_uri_string(struct string *string, const char *name, int namelen,
		  int convert_slashes)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	const char *stop;

	if (namelen < 0) namelen = strlen(name);
	stop = name + namelen;

	while (name < stop) {
		const char ch = *name++;

		if (safe_char(ch) || (ch == '/' && !convert_slashes)) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Two hex digits: high nibble first. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}

/* Append @name to @string, copying bytes below 128 unchanged and replacing
 * every other byte with a "%XX" escape built with Hx(). @namelen is the
 * number of bytes to process; a negative value means "up to the terminating
 * NUL". */
void
encode_uri_string_percent(struct string *string, const char *name, int namelen)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	int i;

	if (namelen < 0) namelen = strlen(name);

	for (i = 0; i < namelen; i++) {
		const char ch = name[i];
		const unsigned char byte = (unsigned char) ch;

		if (byte < 128) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Two hex digits: high nibble first. */
		escape[1] = Hx(byte >> 4);
		escape[2] = Hx(byte & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}

/* Append @name to @string, keeping safe_char() bytes as well as ':' and '\\'
 * verbatim and replacing everything else with a "%xx" escape built with
 * hx(). @namelen is the number of bytes to encode; a negative value means
 * "up to the terminating NUL". */
void
encode_win32_uri_string(struct string *string, char *name, int namelen)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	char *stop;

	if (namelen < 0) namelen = strlen(name);
	stop = name + namelen;

	while (name < stop) {
		const char ch = *name++;

		if (safe_char(ch) || ch == ':' || ch == '\\') {
			add_char_to_string(string, ch);
			continue;
		}

		/* Two hex digits: high nibble first. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}

/* This function is evil, it modifies its parameter. */
/* XXX: but decoded string is _never_ longer than encoded string so it's an
 * efficient way to do that, imho. --Zas */
/* Decode "%XX" escapes of @src in place. An escape is replaced only when both
 * characters after the '%' are accepted by unhx() and the decoded value is
 * not zero; anything else, including "%00", is copied through unchanged.
 * '+' is not translated to a space. */
void
decode_uri(char *src)
{
	char *out = src;

	for (;;) {
		unsigned char ch = *src++;

		if (ch == '%') {
			int hi = unhx(src[0]);
			/* Only look at the second digit when the first one
			 * is valid; it might be past the terminator. */
			int lo = hi >= 0 ? unhx(src[1]) : -1;

			if (lo >= 0) {
				int value = (hi << 4) + lo;

				if (value != 0) { /* don't allow %00 */
					ch = (unsigned char) value;
					src += 2;
				}
			}
		}

		*out++ = ch;
		if (ch == '\0') break;
	}
}

/* Decode the text held by @string in place with decode_uri() and update the
 * recorded length to match the decoded text. */
void
decode_uri_string(struct string *string)
{
	char *text = string->source;

	decode_uri(text);
	string->length = strlen(text);
}

/* Decode @src in place with decode_uri() and then replace every byte that is
 * not printable, or that is a control character, with '*'.
 * The byte is converted to unsigned char before it is handed to the <ctype.h>
 * classifiers: passing a negative plain char is undefined behaviour. */
void
decode_uri_for_display(char *src)
{
	decode_uri(src);

	for (; *src; src++) {
		const unsigned char c = (unsigned char) *src;

		if (!isprint(c) || iscntrl(c))
			*src = '*';
	}
}

/* Apply decode_uri_for_display() to the text held by @string and update the
 * recorded length to match the resulting text. */
void
decode_uri_string_for_display(struct string *string)
{
	char *text = string->source;

	decode_uri_for_display(text);
	string->length = strlen(text);
}


/* URI list */

/* Granularity argument passed to mem_align_alloc() when the list grows. */
#define URI_LIST_GRANULARITY 0x3

/* Make room for one more element in (list)->uris. Evaluates to false when
 * mem_align_alloc() fails. */
#define realloc_uri_list(list) \
	mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
			URI_LIST_GRANULARITY)

/* Append @uri to @list. The list stores the result of get_uri_reference(),
 * which free_uri_list() later hands to done_uri(). Returns @uri on success
 * and NULL when the list could not be enlarged. */
struct uri *
add_to_uri_list(struct uri_list *list, struct uri *uri)
{
	if (!realloc_uri_list(list))
		return NULL;

	list->uris[list->size++] = get_uri_reference(uri);

	return uri;
}

/* Hand every URI stored in @list to done_uri(), free the array and reset
 * the list to empty. Does nothing when the list has no array. */
void
free_uri_list(struct uri_list *list)
{
	if (list->uris) {
		struct uri *member;
		int i;

		foreach_uri (member, i, list) {
			done_uri(member);
		}

		mem_free_set(&list->uris, NULL);
		list->size = 0;
	}
}

/* URI cache */

/* One cached URI: the parsed structure followed by the storage of the URI
 * string it was parsed from. get_uri_cache_entry() allocates
 * sizeof(struct uri_cache_entry) plus the string length with mem_calloc(),
 * so the string actually extends beyond the single byte declared here and
 * the zero-filled remainder terminates it. */
struct uri_cache_entry {
	struct uri uri;
	char string[1];
};

/* The cache itself: @map is a hash looked up by URI string whose values are
 * struct uri_cache_entry pointers; @object is locked once per entry added
 * and unlocked once per entry removed, and the hash is freed when it is no
 * longer in use. */
struct uri_cache {
	struct hash *map;
	struct elinks_object object;
};

/* The single cache instance of this file. */
static struct uri_cache uri_cache;

#ifdef CONFIG_DEBUG
/* Debug-only check: report an internal error when the protocol or host part
 * of @uri contains an uppercase letter (as judged by c_isupper()). */
static inline void
check_uri_sanity(struct uri *uri)
{
	int uppercase = 0;
	int i;

	for (i = 0; !uppercase && i < uri->protocollen; i++)
		uppercase = c_isupper(uri->string[i]);

	for (i = 0; !uppercase && i < uri->hostlen; i++)
		uppercase = c_isupper(uri->host[i]);

	if (uppercase)
		INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
}
#else
#define check_uri_sanity(uri)
#endif

/* Return the cache entry for the @length bytes at @string, creating it when
 * the cache has none yet. A new entry gets its own copy of the string, is
 * parsed with parse_uri() and added to the hash; the cache object is locked
 * once for every entry created. Returns NULL for bad arguments, on
 * allocation failure, when the string does not parse cleanly or when the
 * entry cannot be added to the hash. */
static inline struct uri_cache_entry *
get_uri_cache_entry(char *string, int length)
{
	struct hash_item *hit;
	struct uri_cache_entry *fresh;

	assert(string && length > 0);
	if_assert_failed return NULL;

	hit = get_hash_item(uri_cache.map, string, length);
	if (hit) return (struct uri_cache_entry *)hit->value;

	/* Not cached yet: the zeroed allocation leaves room for the copy of
	 * the string and its terminator. */
	fresh = (struct uri_cache_entry *)mem_calloc(1, sizeof(*fresh) + length);
	if (!fresh) return NULL;

	object_nolock(&fresh->uri, "uri");
	memcpy(fresh->string, string, length);

	/* From here on only the entry's own copy of the string is used. */
	if (parse_uri(&fresh->uri, fresh->string) == URI_ERRNO_OK
	    && add_hash_item(uri_cache.map, fresh->string, length, fresh)) {
		object_lock(&uri_cache);
		return fresh;
	}

	mem_free(fresh);
	return NULL;
}

/* Return the URI object for @string.
 *
 * With a non-zero @components the string is parsed into a temporary
 * structure and the result of get_composed_uri() for the selected components
 * is returned instead.
 *
 * Otherwise the string is looked up in (or added to) the URI cache, which is
 * created on first use, and the cached object is returned with one more lock
 * on it. Returns NULL when parsing or an allocation fails. */
struct uri *
get_uri(char *string, uri_component_T components)
{
	struct uri_cache_entry *cached;

	assert(string);

	if (components) {
		struct uri parsed;

		return parse_uri(&parsed, string) == URI_ERRNO_OK
		     ? get_composed_uri(&parsed, components)
		     : NULL;
	}

	/* Set the cache up lazily. */
	if (!is_object_used(&uri_cache)) {
		uri_cache.map = init_hash8();
		if (!uri_cache.map) return NULL;
		object_nolock(&uri_cache, "uri_cache");
	}

	cached = get_uri_cache_entry(string, strlen(string));
	if (cached) {
		check_uri_sanity(&cached->uri);
		object_nolock(&cached->uri, "uri");
		object_lock(&cached->uri);

		return &cached->uri;
	}

	/* Nothing is using the cache: do not keep an empty hash around. */
	if (!is_object_used(&uri_cache))
		free_hash(&uri_cache.map);

	return NULL;
}

/* Drop one lock on @uri. When that was the last one, the matching cache
 * entry is removed from the hash and freed, the cache object is unlocked,
 * and the hash itself is freed once the cache is no longer in use. */
void
done_uri(struct uri *uri)
{
	char *key = struri(uri);
	int keylen = strlen(key);
	struct hash_item *slot;
	struct uri_cache_entry *owner = NULL;

	assert(is_object_used(&uri_cache));

	object_unlock(uri);
	if (!is_object_used(uri)) {
		slot = get_hash_item(uri_cache.map, key, keylen);
		if (slot)
			owner = (struct uri_cache_entry *)slot->value;

		assertm(owner != NULL, "Releasing unknown URI [%s]", key);
		del_hash_item(uri_cache.map, slot);
		mem_free(owner);

		/* Last URI frees the cache */
		object_unlock(&uri_cache);
		if (!is_object_used(&uri_cache))
			free_hash(&uri_cache.map);
	}
}