1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-06-20 00:15:31 +00:00
elinks/src/protocol/uri.c
2023-08-22 08:25:07 +02:00

1703 lines
42 KiB
C

/* URL parser and translator; implementation of RFC 2396. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <ctype.h>
#include <errno.h>
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
#ifdef HAVE_IDN2_H
#include <idn2.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_NETDB_H
#include <netdb.h> /* OS/2 needs this after sys/types.h */
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#include "elinks.h"
#include "intl/libintl.h"
#include "main/object.h"
#include "protocol/protocol.h"
#include "protocol/uri.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/file.h"
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"
/* Tells whether @c ends the directory part of a URI, i.e. whether it is the
 * query marker, the fragment marker or the POST data marker.
 *
 * ';' is intentionally not a terminator (it once was): section 3.3 of
 * RFC 2396 explicitly says that parameters in a path segment "are not
 * significant to the parsing of relative references." */
static inline int
end_of_dir(unsigned char c)
{
	if (c == '?' || c == '#')
		return 1;

	return c == POST_CHAR;
}
/* Tells whether the character @pos separates directories in @uri.
 * file: URIs defer to dir_sep(); every other protocol only accepts '/'. */
static inline int
is_uri_dir_sep(const struct uri *uri, unsigned char pos)
{
	if (uri->protocol == PROTOCOL_FILE)
		return dir_sep(pos);

	return pos == '/';
}
/* Checks whether the host name @server (of length @server_len) equals
 * @domain or is a subdomain of it. The comparison ignores case.
 * Returns non-zero on a match, zero otherwise. */
int
is_in_domain(char *domain, char *server, int server_len)
{
	int domain_len = strlen(domain);
	/* Where @domain would have to start inside @server. */
	int offset = server_len - domain_len;

	/* @server is too short to contain @domain at all. */
	if (offset < 0)
		return 0;

	/* A proper suffix only counts when it starts right after a '.'. */
	if (offset > 0 && server[offset - 1] != '.')
		return 0;

	return !c_strncasecmp(domain, server + offset, domain_len);
}
/* Tells whether the first @addresslen bytes of @address form an IP address
 * that inet_pton() accepts (IPv6 is tried first when CONFIG_IPV6 is set,
 * then IPv4). Returns 1 if so and 0 otherwise; without HAVE_INET_PTON the
 * answer is always 0. */
int
is_ip_address(const char *address, int addresslen)
{
	/* The @address has well defined limits so it would be a shame to
	 * allocate it. */
	char copy[IP_ADDRESS_BUFFER_SIZE];

	/* Anything that does not fit the buffer cannot be an address. */
	if (addresslen >= sizeof(copy))
		return 0;

	/* inet_pton() needs a NUL-terminated string. */
	safe_strncpy(copy, address, addresslen + 1);

#ifdef HAVE_INET_PTON
#ifdef CONFIG_IPV6
	{
		struct sockaddr_in6 sa6;

		if (inet_pton(AF_INET6, copy, &sa6.sin6_addr) > 0)
			return 1;
	}
#endif /* CONFIG_IPV6 */
	{
		struct in_addr in4;

		return inet_pton(AF_INET, copy, &in4) > 0;
	}
#else
	/* FIXME: Is this ever the case? */
	return 0;
#endif /* HAVE_INET_PTON */
}
/* Looks for one of a fixed set of top level domains at the very end of @s.
 * @slen is the length of @s; a negative value means "use strlen(@s)".
 * Returns the offset in @s where the matching TLD starts, or -1 when @s is
 * empty or ends with none of them. The comparison ignores case and the
 * first table entry that matches wins. */
int
end_with_known_tld(const char *s, int slen)
{
	static const char *const tld[] =
	{ "com", "edu", "net",
	  "org", "gov", "mil",
	  "int", "biz", "arpa",
	  "aero", "coop", "club",
	  "info", "museum", "expert",
	  "name", "pro", NULL };
	const char *const *entry;

	if (slen == 0)
		return -1;
	if (slen < 0)
		slen = strlen(s);

	for (entry = tld; *entry; entry++) {
		int tldlen = strlen(*entry);
		int start = slen - tldlen;

		/* This TLD is longer than the whole string. */
		if (start < 0)
			continue;

		if (c_strncasecmp(&s[start], *entry, tldlen) == 0)
			return start;
	}

	return -1;
}
/* Checks whether @name, or the part of it that precedes the first POST,
 * fragment or query marker, names an existing file.
 * Returns the length of the prefix that exists, or -1 if none does.
 *
 * XXX: this function writes to @name (it temporarily cuts the string at the
 * marker and restores the byte before returning). */
static int
check_whether_file_exists(char *name)
{
	/* Check POST_CHAR etc ... */
	static const char chars[] = POST_CHAR_S "#?";
	unsigned int idx;
	int namelen = strlen(name);

	if (file_exists(name))
		return namelen;

	for (idx = 0; idx + 1 < sizeof(chars); idx++) {
		char marker = chars[idx];
		char *cut = (char *) memchr(name, marker, namelen);
		int found;

		if (cut == NULL)
			continue;

		/* Test the prefix in place, then put the marker back. */
		*cut = '\0';
		found = file_exists(name);
		*cut = marker;

		if (found)
			return cut - name;
	}

	return -1;
}
/* Appends @uristring to @string, URI-encoding only the part that names an
 * existing file; what follows (fragment, query, POST data) is appended
 * untouched. When no existing file is found the length passed to
 * encode_uri_string() is -1. */
static void
encode_file_uri_string(struct string *string, char *uristring)
{
	const int filenamelen = check_whether_file_exists(uristring);

	encode_uri_string(string, uristring, filenamelen, 0);

	if (filenamelen <= 0)
		return;

	add_to_string(string, uristring + filenamelen);
}
/* Returns the length of the protocol (scheme) name at the start of @url, or
 * 0 when @url does not start with one.
 *
 * A single trailing digit is not counted as part of the name: it is the
 * "IP version in protocol scheme name" extension (e.g. "http4:"), so the
 * returned length points at either that digit or the ':'.
 *
 * Bytes are converted to unsigned char before being handed to the <ctype.h>
 * classifiers; passing a negative plain char there is undefined. */
static inline int
get_protocol_length(const char *url)
{
	const char *end = url;

	/* Seek the end of the protocol name if any. */
	/* RFC1738:
	 * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
	 * (but per its recommendations we accept "upalpha" too) */
	while (isalnum((unsigned char) *end)
	       || *end == '+' || *end == '-' || *end == '.')
		end++;

	/* Now we make something to support our "IP version in protocol scheme
	 * name" hack and silently chop off the last digit if it's there. The
	 * IETF's not gonna notice I hope or it'd be going after us hard. */
	if (end != url && isdigit((unsigned char) end[-1]))
		end--;

	/* Also return 0 if there's no protocol name (@end == @url). */
	return (*end == ':' || isdigit((unsigned char) *end)) ? end - url : 0;
}
/* Splits @uristring into its components and stores pointers into it (plus
 * lengths) in @uri. @uri is zeroed first and @uristring is not copied, so it
 * must stay alive for as long as @uri is used.
 *
 * Returns URI_ERRNO_OK on success, otherwise the URI_ERRNO_* code naming the
 * first problem found. On some errors @uri is already partly filled in.
 *
 * Handling depends on the protocol:
 *  - free-syntax protocols: everything after the prefix becomes the data;
 *  - file: only data, fragment and POST are split, and a leading
 *    "localhost" is skipped;
 *  - everything else: [user[:password]@]host[:port]/data[#fragment][POST]. */
uri_errno_T
parse_uri(struct uri *uri, char *uristring)
{
	char *prefix_end, *host_end;
#ifdef CONFIG_IPV6
	char *lbracket, *rbracket;
#endif

	assertm(uristring != NULL, "No uri to parse.");
	memset(uri, 0, sizeof(*uri));

	/* Bail out if the assertion above failed. */
	if_assert_failed return 0;

	/* Nothing to do for an empty url. */
	if (!*uristring) return URI_ERRNO_EMPTY;

	uri->string = uristring;
	uri->protocollen = get_protocol_length(uristring);

	/* Invalid */
	if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;

	/* Figure out whether the protocol is known */
	uri->protocol = get_protocol(struri(uri), uri->protocollen);

	prefix_end = uristring + uri->protocollen; /* ':' */

	/* Check if there's a digit after the protocol name. */
	if (isdigit((unsigned char) *prefix_end)) {
		uri->ip_family = uristring[uri->protocollen] - '0';
		prefix_end++;
	}
	if (*prefix_end != ':')
		return URI_ERRNO_INVALID_PROTOCOL;
	prefix_end++;

	/* Skip slashes */
	if (prefix_end[0] == '/' && prefix_end[1] == '/') {
		if (prefix_end[2] == '/'
		    && get_protocol_need_slash_after_host(uri->protocol))
			return URI_ERRNO_TOO_MANY_SLASHES;

		prefix_end += 2;

	} else if (get_protocol_need_slashes(uri->protocol)) {
		return URI_ERRNO_NO_SLASHES;
	}

	if (get_protocol_free_syntax(uri->protocol)) {
		uri->data = prefix_end;
		uri->datalen = strlen(prefix_end);
		return URI_ERRNO_OK;

	} else if (uri->protocol == PROTOCOL_FILE) {
		int datalen = strcspn(prefix_end, "#" POST_CHAR_S);
		char *frag_or_post = prefix_end + datalen;

		/* Extract the fragment part. */
		if (*frag_or_post == '#') {
			uri->fragment = frag_or_post + 1;
			uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
			frag_or_post = uri->fragment + uri->fragmentlen;
		}
		if (*frag_or_post == POST_CHAR) {
			uri->post = frag_or_post + 1;
		}

		/* A bit of a special case, but using the "normal" host
		 * parsing seems a bit scary at this point. (see bug 107). */
		if (datalen > 9 && !c_strncasecmp(prefix_end, "localhost/", 10)) {
			prefix_end += 9;
			datalen -= 9;
		}

		uri->data = prefix_end;
		uri->datalen = datalen;

		return URI_ERRNO_OK;
	}

	/* Isolate host */

#ifdef CONFIG_IPV6
	/* Get brackets enclosing IPv6 address */
	/* NOTE(review): the '[' is searched in everything up to the first
	 * '/', which also covers the auth part and, when there is no '/', a
	 * query string -- confirm that this is intended. */
	lbracket = strchr(prefix_end, '[');
	if (lbracket) {
		rbracket = strchr(lbracket, ']');
		/* [address] is handled only inside of hostname part (surprisingly). */
		if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
			uri->ipv6 = 1;
		else
			lbracket = rbracket = NULL;
	} else {
		rbracket = NULL;
	}
#endif

	/* Possibly skip auth part */
	host_end = prefix_end + strcspn(prefix_end, "@");

	if (prefix_end + strcspn(prefix_end, "/") > host_end
	    && *host_end) { /* we have auth info here */
		char *user_end;

		/* Allow '@' in the password component */
		while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
			host_end = host_end + 1 + strcspn(host_end + 1, "@");

		user_end = strchr(prefix_end, ':');

		if (!user_end || user_end > host_end) {
			uri->user = prefix_end;
			uri->userlen = host_end - prefix_end;
		} else {
			uri->user = prefix_end;
			uri->userlen = user_end - prefix_end;
			uri->password = user_end + 1;
			uri->passwordlen = host_end - user_end - 1;
		}
		prefix_end = host_end + 1;
	}

#ifdef CONFIG_IPV6
	if (uri->ipv6)
		host_end = rbracket + strcspn(rbracket, ":/?");
	else
#endif
		host_end = prefix_end + strcspn(prefix_end, ":/?");

#ifdef CONFIG_IPV6
	if (uri->ipv6) {
		int addrlen = rbracket - lbracket - 1;

		/* Check for valid length.
		 * addrlen >= sizeof(hostbuf) is theoretically impossible
		 * but i keep the test in case of... Safer, imho --Zas */
		assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
			"parse_uri(): addrlen value is bad (%d) for URL '%s'. "
			"Problems are likely to be encountered. Please report "
			"this, it is a security bug!", addrlen, uristring);
		if_assert_failed return URI_ERRNO_IPV6_SECURITY;

		uri->host = lbracket + 1;
		uri->hostlen = addrlen;
	} else
#endif
	{
		uri->host = prefix_end;
		uri->hostlen = host_end - prefix_end;

		/* Trailing '.'s are reported, not trimmed, here. */
		if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
			return URI_ERRNO_TRAILING_DOTS;
	}

	if (*host_end == ':') { /* we have port here */
		char *port_end = host_end + 1 + strcspn(host_end + 1, "/");

		host_end++;

		uri->port = host_end;
		uri->portlen = port_end - host_end;

		if (uri->portlen == 0)
			return URI_ERRNO_NO_PORT_COLON;

		/* We only use 8 bits for portlen so better check */
		if (uri->portlen != port_end - host_end)
			return URI_ERRNO_INVALID_PORT;

		/* test if port is number */
		/* TODO: possibly lookup for the service otherwise? --pasky */
		for (; host_end < port_end; host_end++)
			if (!isdigit((unsigned char) *host_end))
				return URI_ERRNO_INVALID_PORT;

		/* Check valid port value, and let show an error message
		 * about invalid url syntax. */
		if (uri->port && uri->portlen) {
			long n;

			errno = 0;
			n = strtol(uri->port, NULL, 10);
			/* Reject values that do not survive narrowing to int
			 * before the range check; otherwise a huge number
			 * could wrap around into the valid port range. */
			if (errno || n != (long) (int) n
			    || !uri_port_is_valid((int) n))
				return URI_ERRNO_INVALID_PORT;
		}
	}

	if (*host_end == '/') {
		host_end++;

	} else if (get_protocol_need_slash_after_host(uri->protocol)) {
		/* The need for slash after the host component depends on the
		 * need for a host component. -- The dangerous mind of Jonah */
		if (!uri->hostlen)
			return URI_ERRNO_NO_HOST;

		return URI_ERRNO_NO_HOST_SLASH;
	}

	/* Look for #fragment or POST_CHAR */
	prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
	uri->data = host_end;
	uri->datalen = prefix_end - host_end;

	if (*prefix_end == '#') {
		uri->fragment = prefix_end + 1;
		uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
		prefix_end = uri->fragment + uri->fragmentlen;
	}

	if (*prefix_end == POST_CHAR) {
		uri->post = prefix_end + 1;
	}

	return URI_ERRNO_OK;
}
/* Returns the port number given in @uri. If @uri carries no port, or the
 * port string does not start with a number, the default port of the URI's
 * protocol is returned instead. */
int
get_uri_port(const struct uri *uri)
{
	const char *end;
	int port;

	if (!uri->port || !uri->portlen)
		return get_protocol_port(uri->protocol);

	end = uri->port;
	port = strtol(uri->port, (char **) &end, 10);

	/* strtol() consumed nothing: no usable number. */
	if (end == uri->port)
		return get_protocol_port(uri->protocol);

	assert(uri_port_is_valid(port));
	return port;
}
/* Non-zero when @comp has neither the URI_SPECIAL nor the URI_IDN bits set;
 * compare_uri() asserts this for the components it is asked to compare. */
#define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
/* Compares two URI components given as pointer + length pairs.
 * Returns 1 when both are unset or empty with equal "set-ness", or when
 * they have the same length and the same bytes; returns 0 otherwise. */
static inline int
compare_component(const char *a, int alen,
		  const char *b, int blen)
{
	const int a_set = (a != NULL);
	const int b_set = (b != NULL);

	/* Check that the length and the strings are both set or unset */
	if (alen != blen)
		return 0;
	if (a_set != b_set)
		return 0;

	/* Nothing to compare byte-wise, so that is a perfect match */
	if (!a_set || alen == 0)
		return 1;

	/* Let the higher forces decide */
	return memcmp(a, b, blen) == 0;
}
/* Non-zero when the variable `components' in the enclosing scope has any of
 * the bits in @x set. */
#define wants(x) (components & (x))
/* Compares the parts of @a and @b selected by the @components bit mask.
 * Returns 1 when @a and @b are the same object or all selected parts are
 * equal, and 0 when a selected part differs or @components is empty.
 * URI_SPECIAL and URI_IDN components are not supported (asserted). */
int
compare_uri(const struct uri *a, const struct uri *b,
	    uri_component_T components)
{
	if (a == b) return 1;
	if (!components) return 0;

	assertm(can_compare_uri_components(components),
		"compare_uri() is a work in progress. Component unsupported");

	if (wants(URI_PROTOCOL) && a->protocol != b->protocol)
		return 0;

	if (wants(URI_IP_FAMILY) && a->ip_family != b->ip_family)
		return 0;

	if (wants(URI_USER)
	    && !compare_component(a->user, a->userlen, b->user, b->userlen))
		return 0;

	if (wants(URI_PASSWORD)
	    && !compare_component(a->password, a->passwordlen,
				  b->password, b->passwordlen))
		return 0;

	if (wants(URI_HOST)
	    && !compare_component(a->host, a->hostlen, b->host, b->hostlen))
		return 0;

	if (wants(URI_PORT)
	    && !compare_component(a->port, a->portlen, b->port, b->portlen))
		return 0;

	if (wants(URI_DATA)
	    && !compare_component(a->data, a->datalen, b->data, b->datalen))
		return 0;

	if (wants(URI_FRAGMENT)
	    && !compare_component(a->fragment, a->fragmentlen,
				  b->fragment, b->fragmentlen))
		return 0;

	if (wants(URI_POST)) {
		/* POST data has no stored length; measure it here. */
		int a_postlen = a->post ? strlen(a->post) : 0;
		int b_postlen = b->post ? strlen(b->post) : 0;

		return compare_component(a->post, a_postlen,
					 b->post, b_postlen);
	}

	return 1;
}
/* Appends the parts of @uri selected by the @components bit mask to @string.
 *
 * URIs with an unknown protocol are appended whole, whatever @components
 * says. URI_FILENAME and URI_QUERY are asserted to be used alone. Returns
 * @string on the normal path; the unknown-protocol and URI_QUERY exits return
 * whatever the string helper they end with returns.
 *
 * We might need something more intelligent than this Swiss army knife. */
struct string *
add_uri_to_string(struct string *string, const struct uri *uri,
uri_component_T components)
{
/* Custom or unknown keep the URI untouched. */
if (uri->protocol == PROTOCOL_UNKNOWN)
return add_to_string(string, struri(uri));
/* "<protocol>[<ip family>]:" plus "//" for protocols that need it. */
if (wants(URI_PROTOCOL)) {
add_bytes_to_string(string, uri->string, uri->protocollen);
if (wants(URI_IP_FAMILY) && uri->ip_family)
add_long_to_string(string, uri->ip_family);
add_char_to_string(string, ':');
if (get_protocol_need_slashes(uri->protocol))
add_to_string(string, "//");
}
/* "user[:password]@", or the bare password when only that is wanted. */
if (wants(URI_USER) && uri->userlen) {
add_bytes_to_string(string, uri->user, uri->userlen);
if (wants(URI_PASSWORD) && uri->passwordlen) {
add_char_to_string(string, ':');
add_bytes_to_string(string, uri->password,
uri->passwordlen);
}
add_char_to_string(string, '@');
} else if (wants(URI_PASSWORD) && uri->passwordlen) {
add_bytes_to_string(string, uri->password, uri->passwordlen);
}
if (wants(URI_HOST) && uri->hostlen) {
/* Cleared when the IDN conversion below already added the host. */
int add_host = 1;
#ifdef CONFIG_IPV6
/* Rationale for wants(URI_PORT): The [notation] was invented
* so that you can have an IPv6 addy and a port together. So
* we want to use it when that happens, otherwise we need not
* bother (that happens only when we want it for DNS anyway).
* I insist on an implied elegancy of this way, but YMMV. ;-)
* --pasky */
if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
#endif
#ifdef CONFIG_IDN2
/* Support for the GNU International Domain Name library.
*
* http://www.gnu.org/software/libidn/libidn2/manual/libidn2.html
*/
if (wants(URI_IDN)) {
char *host = NULL;
#if defined(CONFIG_NLS) || defined(CONFIG_GETTEXT)
/* Convert the host to UTF-8 first if the current charset is
 * something else. */
if (current_charset != -1 && !is_cp_utf8(current_charset)) {
int utf8_cp = get_cp_index("utf-8");
struct conv_table *ctable = get_translation_table(current_charset, utf8_cp);
host = convert_string(ctable, uri->host, uri->hostlen, utf8_cp, CSM_NONE,
NULL, NULL, NULL);
}
#endif
if (!host) {
host = memacpy(uri->host, uri->hostlen);
}
if (host) {
char *idname;
int code = idn2_to_ascii_8z(host, &idname, 0);
/* FIXME: Return NULL if it coughed? --jonas */
if (code == IDN2_OK) {
add_to_string(string, idname);
/* @idname was allocated by libidn2, so plain free() it. */
free(idname);
add_host = 0;
}
mem_free(host);
}
}
#endif
if (add_host)
add_bytes_to_string(string, uri->host, uri->hostlen);
#ifdef CONFIG_IPV6
if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
#endif
}
/* ":port" -- the explicit one if present, else the protocol default
 * when URI_DEFAULT_PORT is wanted. */
if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
if (uri->portlen) {
add_char_to_string(string, ':');
add_bytes_to_string(string, uri->port, uri->portlen);
} else if (wants(URI_DEFAULT_PORT)
&& uri->protocol != PROTOCOL_USER) {
/* For user protocols we don't know a default port.
* Should user protocols ports be configurable? */
int port = get_protocol_port(uri->protocol);
add_char_to_string(string, ':');
add_long_to_string(string, port);
}
}
/* Only add slash if we need to separate */
if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
&& wants(~(URI_DATA | URI_PORT))
&& get_protocol_need_slash_after_host(uri->protocol))
add_char_to_string(string, '/');
if (wants(URI_DATA) && uri->datalen)
add_bytes_to_string(string, uri->data, uri->datalen);
/* We can not test uri->datalen here since we need to always
* add '/'. */
if (wants(URI_PATH) || wants(URI_FILENAME)) {
const char *filename = uri->data;
const char *pos;
assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
"URI_FILENAME should be used alone %d", components);
if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
#if defined(CONFIG_OS_WIN32) || defined(CONFIG_OS_DOS)
if (uri->protocol != PROTOCOL_FILE)
#endif
/* FIXME: Add correct separator */
add_char_to_string(string, '/');
}
if (uri->datalen) {
/* data: URIs get the made-up name "data" plus an extension
 * derived from their MIME type. */
if (uri->protocol == PROTOCOL_DATA) {
char *e;
add_to_string(string, "data");
e = get_extension_from_uri((struct uri *) uri);
if (e) {
add_to_string(string, e);
mem_free(e);
}
return string;
}
/* Walk up to the end of the path; for URI_FILENAME keep moving
 * @filename past every directory separator so that only the last
 * segment is left. */
for (pos = filename; *pos && !end_of_dir(*pos); pos++)
if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
filename = pos + 1;
add_bytes_to_string(string, filename, pos - filename);
}
}
if (wants(URI_QUERY) && uri->datalen) {
const char *query = (const char *)memchr(uri->data, '?', uri->datalen);
assertm(URI_QUERY == components,
"URI_QUERY should be used alone %d", components);
if (!query) return string;
/* Skip the '?' itself. */
query++;
/* Check fragment and POST_CHAR */
return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
}
if (wants(URI_FRAGMENT) && uri->fragmentlen) {
add_char_to_string(string, '#');
add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
}
/* Either the raw POST data or a short human readable note about it. */
if (wants(URI_POST) && uri->post) {
add_char_to_string(string, POST_CHAR);
add_to_string(string, uri->post);
} else if (wants(URI_POST_INFO) && uri->post) {
if (!strncmp(uri->post, "text/plain", 10)) {
add_to_string(string, " (PLAIN TEXT DATA)");
} else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
add_to_string(string, " (MULTIPART FORM DATA)");
} else {
add_to_string(string, " (POST DATA)");
}
}
return string;
}
#undef wants
/* Builds a new string holding the parts of @uri selected by @components.
 * Returns the string's buffer on success, or NULL when the string could not
 * be initialized or add_uri_to_string() failed. */
char *
get_uri_string(const struct uri *uri, uri_component_T components)
{
	struct string string;

	if (!init_string(&string)
	    || !add_uri_to_string(&string, uri, components)) {
		done_string(&string);
		return NULL;
	}

	return string.source;
}
/* Parses @uristring and appends the parts selected by @components to
 * @string. Returns NULL when @uristring does not parse cleanly, otherwise
 * the result of add_uri_to_string(). */
struct string *
add_string_uri_to_string(struct string *string, char *uristring,
			 uri_component_T components)
{
	struct uri parsed;
	uri_errno_T status = parse_uri(&parsed, uristring);

	return status == URI_ERRNO_OK
		? add_uri_to_string(string, &parsed, components)
		: NULL;
}
/* Shorthands for normalize_uri(): _reparse() has the string parsed from
 * scratch, _noparse() reuses the already parsed @uri. */
#define normalize_uri_reparse(str) normalize_uri(NULL, str)
#define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri))
/* Normalizes @uristring in place and returns it: the protocol name and the
 * host are lowercased and "." and ".." path segments are removed.
 *
 * If @uri is NULL the string is parsed into a local struct uri first;
 * otherwise @uri is used as is for the first pass. For proxy URIs the
 * proxied URI found in the data part is parsed and processed as well. If
 * parsing fails, or the protocol has free syntax, the string is returned
 * without path normalization. */
char *
normalize_uri(struct uri *uri, char *uristring)
{
char *parse_string = uristring;
char *src, *dest, *path;
int need_slash = 0, keep_dslash = 1;
int parse = (uri == NULL);
struct uri uri_struct;
if (!uri) uri = &uri_struct;
/* We need to get the real (proxied) URI but lowercase relevant URI
* parts along the way. */
do {
if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
return uristring;
assert(uri->data);
/* This is maybe not the right place but both join_urls() and
* get_translated_uri() through translate_url() calls this
* function and then it already works on and modifies an
* allocated copy. */
convert_to_lowercase_locale_indep(uri->string, uri->protocollen);
if (uri->hostlen) convert_to_lowercase_locale_indep(uri->host, uri->hostlen);
/* Any further pass handles the proxied URI, which always needs parsing. */
parse = 1;
parse_string = uri->data;
} while (uri->protocol == PROTOCOL_PROXY);
if (get_protocol_free_syntax(uri->protocol))
return uristring;
if (uri->protocol != PROTOCOL_UNKNOWN) {
need_slash = get_protocol_need_slash_after_host(uri->protocol);
keep_dslash = get_protocol_keep_double_slashes(uri->protocol);
}
/* Start at the slash that precedes the data, if the protocol has one. */
path = uri->data - need_slash;
/* @src reads and @dest writes; @dest never gets ahead of @src. */
dest = src = path;
/* This loop mangles the URI string by removing ".." and "." segments.
* However it must not alter "//" without reason; see bug 744. */
while (*dest) {
/* If the following pieces are the LAST parts of URL, we remove
* them as well. See RFC 2396 section 5.2 for details. */
if (end_of_dir(src[0])) {
/* URL data contains no more path. */
memmove(dest, src, strlen(src) + 1);
break;
}
if (!is_uri_dir_sep(uri, src[0])) {
/* This is to reduce indentation */
} else if (src[1] == '.') {
if (!src[2]) {
/* /. - skip the dot */
*dest++ = *src;
*dest = 0;
break;
} else if (is_uri_dir_sep(uri, src[2])) {
/* /./ - strip that.. */
src += 2;
continue;
} else if (src[2] == '.'
&& (is_uri_dir_sep(uri, src[3]) || !src[3])) {
/* /../ or /.. - skip it and preceding element.
*
* <path> "/foo/bar" <dest> ...
* <src> ("/../" or "/..\0") ...
*
* Remove "bar" and the directory
* separator that precedes it. The
* separator will be added back in the
* next iteration unless another ".."
* follows, in which case it will be
* added later. "bar" may be empty. */
while (dest > path) {
dest--;
if (is_uri_dir_sep(uri, *dest)) break;
}
/* <path> "/foo" <dest> "/bar" ...
* <src> ("/../" or "/..\0") ... */
if (!src[3]) {
/* /.. - add ending slash and stop */
*dest++ = *src;
*dest = 0;
break;
}
src += 3;
continue;
}
} else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) {
/* // - ignore first '/'. */
src += 1;
continue;
}
/* We don't want to access memory past the NUL char. */
*dest = *src++;
if (*dest) dest++;
}
return uristring;
}
/* The 'file' scheme URI comes in and bastardized URI comes out which consists
* of just the complete path to file/directory, which the dumb 'file' protocol
* backend can understand. No host parts etc, that is what this function is
* supposed to chew. */
static struct uri *
transform_file_url(struct uri *uri, const char *cwd)
{
char *path = uri->data;
assert(uri->protocol == PROTOCOL_FILE && uri->data);
/* Sort out the host part. We currently support only host "localhost"
* (plus empty host part will be assumed to be "localhost" as well).
* As our extensions, '.' will reference to the cwd on localhost
* (originally, when the first thing after file:// wasn't "localhost/",
* we assumed the cwd as well, and pretended that there's no host part
* at all) and '..' to the directory parent to cwd. Another extension
* is that if this is a DOS-like system, the first char in two-char
* host part is uppercase letter and the second char is a colon, it is
* assumed to be a local disk specification. */
/* TODO: Use FTP for non-localhost hosts. --pasky */
/* For URL "file://", we open the current directory. Some other
* browsers instead open root directory, but AFAIK the standard does
* not specify that and this was the original behaviour and it is more
* consistent with our file://./ notation. */
/* Who would name their file/dir '...' ? */
if (*path == '.' || !*path) {
struct string dir;
if (!init_string(&dir))
return NULL;
encode_uri_string(&dir, cwd, -1, 0);
/* Either we will end up with '//' and translate_directories()
* will shorten it or the '/' will mark the inserted cwd as a
* directory. */
if (*path == '.') *path = '/';
/* Insert the current working directory. */
/* The offset is 7 == sizeof("file://") - 1. */
insert_in_string(&struri(uri), 7, dir.source, dir.length);
done_string(&dir);
return uri;
}
#ifdef DOS_FS
if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
return NULL;
#endif
for (; *path && !dir_sep(*path); path++);
/* FIXME: We will in fact assume localhost even for non-local hosts,
* until we will support the FTP transformation. --pasky */
memmove(uri->data, path, strlen(path) + 1);
return uri;
}
static char *translate_url(const char *url, char *cwd);
/* Resolves the reference @rel against the parsed URI @base.
 *
 * Returns a newly allocated URI string, or NULL on failure: an allocation
 * error, a "//" reference for a protocol that takes no slashes, a failed
 * translation, or a @base without data.
 *
 * Handled in this order: "#fragment", "?query" and "//host..." references,
 * then references that carry a known protocol of their own, and finally
 * plain absolute or relative paths. */
char *
join_urls(struct uri *base, const char *rel)
{
char *uristring, *path;
int add_slash = 0;
int translate = 0;
/* How many leading bytes of the base URI string the result keeps. */
int length = 0;
/* See RFC 1808 */
/* TODO: Support for ';' ? (see the RFC) --pasky */
/* For '#', '?' and '//' we could use get_uri_string() but it might be
* too expensive since it uses granular allocation scheme. I wouldn't
* personally mind tho' because it would be cleaner. --jonas */
if (rel[0] == '#') {
/* Strip fragment and post part from the base URI and append
* the fragment string in @rel. */
length = base->fragment
? base->fragment - struri(base) - 1
: get_real_uri_length(base);
} else if (rel[0] == '?') {
/* Strip query, fragment and post part from the base URI and
* append the query string in @rel. */
length = base->fragment ? base->fragment - struri(base) - 1
: get_real_uri_length(base);
/* @uristring only serves as a scratch pointer to the base's '?' here. */
uristring = (char *)memchr(base->data, '?', base->datalen);
if (uristring) length = uristring - struri(base);
} else if (rel[0] == '/' && rel[1] == '/') {
if (!get_protocol_need_slashes(base->protocol))
return NULL;
/* Get `<protocol>:' from the base URI and append the `//' part
* from @rel. */
length = base->protocollen + 1;
/* We need to sanitize the relative part and add stuff like
* host slash. */
translate = 1;
}
/* If one of the tests above set @length to something useful */
if (length) {
uristring = memacpy(struri(base), length);
if (!uristring) return NULL;
add_to_strn(&uristring, rel);
if (translate) {
char *translated;
translated = translate_url(uristring, NULL);
mem_free(uristring);
return translated;
}
return normalize_uri_reparse(uristring);
}
/* Check if there is some protocol name to go for */
length = get_protocol_length(rel);
if (length) {
switch (get_protocol(rel, length)) {
case PROTOCOL_UNKNOWN:
case PROTOCOL_PROXY:
/* Mysteriously proxy URIs are breaking here ... */
break;
case PROTOCOL_FILE:
/* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
* to translate_url(). */
default:
/* @rel is a URI of its own; if it cannot be translated it is
 * treated as a path below. */
uristring = translate_url(rel, NULL);
if (uristring) return uristring;
}
}
assertm(base->data != NULL, "bad base url");
if_assert_failed return NULL;
path = base->data;
/* Either is path blank, but we've slash char before, or path is not
* blank, but doesn't start by a slash (if we'd just stay along with
* is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
* should be enough, but I'm not sure and I don't want to break
* anything --pasky). */
/* We skip first char of URL ('/') in parse_url() (ARGH). This
* is reason of all this bug-bearing magic.. */
if (*path) {
if (!is_uri_dir_sep(base, *path)) path--;
} else {
if (is_uri_dir_sep(base, path[-1])) path--;
}
if (!is_uri_dir_sep(base, rel[0])) {
char *path_end;
/* The URL is relative. */
if (!*path) {
/* There's no path in the URL, but we're going to add
* something there, and the something doesn't start by
* a slash. So we need to insert a slash after the base
* URL. Clever, eh? ;) */
add_slash = 1;
}
for (path_end = path; *path_end; path_end++) {
if (end_of_dir(*path_end)) break;
/* Modify the path pointer, so that it'll always point
* above the last '/' in the URL; later, we'll copy the
* URL only _TO_ this point, and anything after last
* slash will be substituted by 'rel'. */
if (is_uri_dir_sep(base, *path_end))
path = path_end + 1;
}
}
/* Result = kept base prefix + optional '/' + @rel + NUL. */
length = path - struri(base);
uristring = (char *)mem_alloc(length + strlen(rel) + add_slash + 1);
if (!uristring) return NULL;
memcpy(uristring, struri(base), length);
if (add_slash) uristring[length] = '/';
strcpy(uristring + length + add_slash, rel);
return normalize_uri_reparse(uristring);
}
/* Guesses which protocol the scheme-less string @newurl is meant for, first
 * by checking whether it names a local file and then by looking at the parts
 * of the host name. Returns PROTOCOL_FILE, PROTOCOL_FTP, PROTOCOL_HTTP, or
 * PROTOCOL_UNKNOWN when no rule applies. */
static protocol_T
find_uri_protocol(char *newurl)
{
	char *sep;

	/* First see if it is a file so filenames that look like hostnames
	 * won't confuse us below. */
	if (check_whether_file_exists(newurl) >= 0)
		return PROTOCOL_FILE;

	/* Yes, it would be simpler to make test for IPv6 address first,
	 * but it would result in confusing mix of ifdefs ;-). */

	/* FIXME: Ideas for improve protocol detection
	 *
	 * - Handle common hostnames. It could be part of the protocol backend
	 *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
	 *
	 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
	 */

	/* The first character that could end a host name label. */
	sep = newurl + strcspn(newurl, ".:/@");

	if (*sep == '@'
	    || (*sep == ':' && *newurl != '[' && strchr(newurl, '@'))
	    || !c_strncasecmp(newurl, "ftp.", 4)) {
		/* Contains user/password/ftp-hostname */
		return PROTOCOL_FTP;

#ifdef CONFIG_IPV6
	} else if (*newurl == '[' && *sep == ':') {
		/* Candidate for IPv6 address */
		char *close_bracket, *next_colon;

		sep++;
		close_bracket = strchr(sep, ']');
		next_colon = strchr(sep, ':');
		if (close_bracket && next_colon && close_bracket > next_colon)
			return PROTOCOL_HTTP;
#endif

	} else if (*newurl != '.' && *sep == '.') {
		/* Contains domain name? */
		char *host_end, *domain;
		char *ipscan;

		/* Process the hostname: leave @domain at the last label and
		 * @host_end right behind it. */
		domain = sep + 1;
		for (;;) {
			host_end = domain + strcspn(domain, ".:/?");
			if (*host_end != '.')
				break;
			domain = host_end + 1;
		}

		/* It's IP? */
		ipscan = sep;
		while (isdigit(*ipscan) || *ipscan == '.')
			ipscan++;

		if (!*ipscan || *ipscan == ':' || *ipscan == '/')
			return PROTOCOL_HTTP;

		/* It's two-letter or known TLD? */
		if (host_end - domain == 2
		    || end_with_known_tld(domain, host_end - domain) >= 0)
			return PROTOCOL_HTTP;
	}

	return PROTOCOL_UNKNOWN;
}
#define MAX_TRANSLATION_ATTEMPTS 32
/* Returns an URI string that can be used internally. Adding protocol prefix,
 * missing slashes etc.
 *
 * @url is the user supplied string; leading spaces are skipped. @cwd, when
 * set and not empty, is the directory that file:// URIs are resolved
 * against. The result is newly allocated; NULL is returned when @url is
 * empty, an allocation fails, or the string cannot be repaired into
 * something that parses.
 *
 * The string is parsed over and over: every recoverable parse error is
 * patched up in the working copy, which is then parsed again. Seeing the
 * same error twice in a row, or needing too many attempts, ends the loop.
 * The working copy is released on every failure path. */
static char *
translate_url(const char *url, char *cwd)
{
	char *newurl;
	struct uri uri;
	uri_errno_T uri_errno, prev_errno = URI_ERRNO_EMPTY;
	int retries = 0;

	/* Strip starting spaces */
	while (*url == ' ') url++;
	if (!*url) return NULL;

	newurl = expand_tilde(url); /* XXX: Post data copy. */
	if (!newurl) return NULL;

parse_uri:
	/* Yay a goto loop. If we get some URI parse error and try to
	 * fix it we go back to here and try again. */
	/* Ordinary parse */
	uri_errno = parse_uri(&uri, newurl);

	/* Bail out if the same error occurs twice */
	if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
		if (retries > MAX_TRANSLATION_ATTEMPTS) {
			ERROR("Maximum number of parsing attempts exceeded "
			      "for %s.", url);
		}
		mem_free(newurl);
		return NULL;
	}

	prev_errno = uri_errno;

	switch (uri_errno) {
	case URI_ERRNO_OK:
		/* Fix translation of 1.2.3.4:5 so IP address part won't be
		 * interpreted as the protocol name. */
		if (uri.protocol == PROTOCOL_UNKNOWN) {
			protocol_T protocol = find_uri_protocol(newurl);

			/* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
			 * case. */
			if (protocol != PROTOCOL_UNKNOWN) {
				struct string str;

				if (!init_string(&str)) {
					/* Do not leak the working copy. */
					mem_free(newurl);
					return NULL;
				}

				switch (protocol) {
				case PROTOCOL_FTP:
					add_to_string(&str, "ftp://");
					encode_uri_string(&str, newurl, -1, 0);
					break;

				case PROTOCOL_HTTP:
					add_to_string(&str, "http://");
					add_to_string(&str, newurl);
					break;

				case PROTOCOL_UNKNOWN:
					break;

				case PROTOCOL_FILE:
				default:
					add_to_string(&str, "file://");
					if (!dir_sep(*newurl)) {
#ifndef DOS_FS
						add_to_string(&str, "./");
#endif
					}

					add_to_string(&str, newurl);
				}

				mem_free(newurl);
				newurl = str.source;

				/* Work around the infinite loop prevention */
				prev_errno = URI_ERRNO_EMPTY;
				goto parse_uri;
			}
		}

		/* If file:// URI is transformed we need to reparse. */
		if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
		    && transform_file_url(&uri, cwd))
			return normalize_uri_reparse(struri(&uri));

		/* Translate the proxied URI too if proxy:// */
		if (uri.protocol == PROTOCOL_PROXY) {
			char *data = translate_url(uri.data, cwd);
			int pos = uri.data - struri(&uri);

			if (!data) break;
			/* Cut the old proxied URI off and put the translated
			 * one in its place. */
			struri(&uri)[pos] = 0;
			insert_in_string(&struri(&uri), pos, data, strlen(data));
			mem_free(data);
			return normalize_uri_reparse(struri(&uri));
		}

		return normalize_uri_noparse(&uri);

	case URI_ERRNO_TOO_MANY_SLASHES:
	{
		char *from, *to;

		assert(uri.string[uri.protocollen] == ':'
		       && uri.string[uri.protocollen + 1] == '/'
		       && uri.string[uri.protocollen + 2] == '/');

		/* Keep "://" and drop every slash that follows it. */
		from = to = uri.string + uri.protocollen + 3;
		while (*from == '/') from++;

		assert(to < from);
		memmove(to, from, strlen(from) + 1);
		goto parse_uri;
	}
	case URI_ERRNO_NO_SLASHES:
	{
		/* Try prefix:some.url -> prefix://some.url.. */
		int slashes = 2;

		/* Check if only one '/' is needed. */
		if (uri.string[uri.protocollen + 1] == '/')
			slashes--;

		insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
		goto parse_uri;
	}
	case URI_ERRNO_TRAILING_DOTS:
	{
		/* Trim trailing '.'s */
		char *from = uri.host + uri.hostlen;
		char *to = from;

		assert(uri.host < to && to[-1] == '.' && *from != '.');

		while (uri.host < to && to[-1] == '.') to--;

		assert(to < from);
		memmove(to, from, strlen(from) + 1);
		goto parse_uri;
	}
	case URI_ERRNO_NO_PORT_COLON:
		assert(uri.portlen == 0
		       && uri.string < uri.port
		       && uri.port[-1] == ':');

		/* Drop the ':' that is not followed by a port. */
		memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
		goto parse_uri;

	case URI_ERRNO_NO_HOST_SLASH:
	{
		/* The '/' goes right behind the port or, without one, right
		 * behind the host. */
		int offset = uri.port
			   ? uri.port + uri.portlen - struri(&uri)
			   : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;

		assertm(uri.host != NULL, "uri.host not set after no host slash error");
		insert_in_string(&newurl, offset, "/", 1);
		goto parse_uri;
	}
	case URI_ERRNO_INVALID_PROTOCOL:
	{
		const char *default_protocol;
		/* No protocol name */
		protocol_T protocol = find_uri_protocol(newurl);
		struct string str;

		if (!init_string(&str)) {
			/* Do not leak the working copy. */
			mem_free(newurl);
			return NULL;
		}

		switch (protocol) {
		case PROTOCOL_FTP:
			add_to_string(&str, "ftp://");
			encode_uri_string(&str, newurl, -1, 0);
			break;

		case PROTOCOL_HTTP:
#ifdef CONFIG_SSL
			if (get_https_by_default())
				add_to_string(&str, "https://");
			else
#endif
				add_to_string(&str, "http://");
			add_to_string(&str, newurl);
			break;

		case PROTOCOL_UNKNOWN:
			default_protocol = get_default_protocol();
			if (strcmp("file://", default_protocol)) {
				add_to_string(&str, default_protocol);
				add_to_string(&str, newurl);
				break;
			}
			/* The default protocol is file://. */
			/* fall through */
		case PROTOCOL_FILE:
		default:
			add_to_string(&str, "file://");
			if (!dir_sep(*newurl))
				add_to_string(&str, "./");

			encode_file_uri_string(&str, newurl);
		}

		mem_free(newurl);
		newurl = str.source;

		goto parse_uri;
	}
	case URI_ERRNO_EMPTY:
	case URI_ERRNO_IPV6_SECURITY:
	case URI_ERRNO_NO_HOST:
	case URI_ERRNO_INVALID_PORT:
	case URI_ERRNO_INVALID_PORT_RANGE:
		/* None of these can be handled properly. */
		break;
	}

	mem_free(newurl);
	return NULL;
}
/* Build a URI out of the selected @components of @uri.
 *
 * The components are first rendered to a temporary string with
 * get_uri_string(); that string is then handed to get_uri() and released.
 * Returns whatever get_uri() returns, or NULL when @uri is missing or the
 * string could not be produced. */
struct uri *
get_composed_uri(struct uri *uri, uri_component_T components)
{
	struct uri *composed;
	char *text;

	assert(uri);
	if_assert_failed return NULL;

	text = get_uri_string(uri, components);
	if (text == NULL)
		return NULL;

	composed = get_uri(text, URI_NONE);
	mem_free(text);

	return composed;
}
/* Run @uristring through translate_url() (with @cwd) and look up the result
 * with get_uri(). The intermediate translated string is freed here.
 * Returns NULL when the translation fails or get_uri() returns NULL. */
struct uri *
get_translated_uri(char *uristring, char *cwd)
{
	struct uri *result = NULL;
	char *translated = translate_url(uristring, cwd);

	if (translated != NULL) {
		result = get_uri(translated, URI_NONE);
		mem_free(translated);
	}

	return result;
}
#define ADD_EXTENSION_FROM_TYPE(string, type, ext) \
if (!memcmp(string, type ";", sizeof(type ";") - 1) || \
!memcmp(string, type ",", sizeof(type ",") - 1)) \
return stracpy("." ext);
char *
get_extension_from_uri(struct uri *uri)
{
char *extension = NULL;
int afterslash = 1;
char *pos = uri->data;
assert(pos);
if (uri->protocol == PROTOCOL_DATA) {
ADD_EXTENSION_FROM_TYPE(uri->data, "image/gif", "gif")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/jpeg", "jpg")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/png", "png")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/webp", "webp")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/avif", "avif")
ADD_EXTENSION_FROM_TYPE(uri->data, "text/plain", "txt")
ADD_EXTENSION_FROM_TYPE(uri->data, "text/html", "html")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/svg+xml", "svg")
ADD_EXTENSION_FROM_TYPE(uri->data, "image/jxl", "jxl")
return stracpy("");
}
for (; *pos && !end_of_dir(*pos); pos++) {
if (!afterslash && !extension && *pos == '.') {
extension = pos;
} else if (is_uri_dir_sep(uri, *pos)) {
extension = NULL;
afterslash = 1;
} else {
afterslash = 0;
}
}
if (extension && extension < pos)
return memacpy(extension, pos - extension);
return NULL;
}
/* URI encoding, escaping disallowed characters. */

/* Tell whether @c can be copied into an encoded URI as is: anything
 * isident() accepts plus a handful of punctuation marks (see RFC 2396,
 * section 2.3). Returns 1 for such characters and 0 otherwise. */
static inline int
safe_char(unsigned char c)
{
	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;
	default:
		return isident(c) ? 1 : 0;
	}
}
/* Append @name to @string, replacing every byte that is not safe_char()
 * with a '%' followed by two hex digits from hx().
 *
 * @namelen is the number of bytes to encode; a negative value means "up to
 * the terminating NUL". A '/' is copied unchanged unless @convert_slashes
 * is non-zero, in which case it is escaped like any other unsafe byte.
 *
 * Spaces get no special treatment: turning them into '+' would probably be
 * correct only for the query part of a URI. */
void
encode_uri_string(struct string *string, const char *name, int namelen,
		  int convert_slashes)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	const char *cursor;
	const char *limit;

	if (namelen < 0)
		namelen = strlen(name);
	limit = name + namelen;

	for (cursor = name; cursor < limit; cursor++) {
		const char ch = *cursor;

		if (safe_char(ch) || (!convert_slashes && ch == '/')) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
/* Append @name to @string, escaping only the bytes with the top bit set.
 *
 * Bytes below 128 are copied verbatim; every other byte becomes '%'
 * followed by two hex digits from Hx(). A negative @namelen means "up to
 * the terminating NUL". */
void
encode_uri_string_percent(struct string *string, const char *name, int namelen)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	const char *cursor;
	const char *limit;

	if (namelen < 0)
		namelen = strlen(name);
	limit = name + namelen;

	for (cursor = name; cursor < limit; cursor++) {
		const char ch = *cursor;

		if ((unsigned char) ch < 128) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = Hx((((int) ch) & 0xF0) >> 4);
		escape[2] = Hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
/* Variant of encode_uri_string() that also lets ':' and '\\' through
 * unescaped, in addition to everything safe_char() accepts. All other
 * bytes become '%' followed by two hex digits from hx(). A negative
 * @namelen means "up to the terminating NUL". */
void
encode_win32_uri_string(struct string *string, char *name, int namelen)
{
	char escape[4] = { '%', '\0', '\0', '\0' };
	char *cursor;
	char *limit;

	if (namelen < 0)
		namelen = strlen(name);
	limit = name + namelen;

	for (cursor = name; cursor < limit; cursor++) {
		const char ch = *cursor;

		if (safe_char(ch) || ch == ':' || ch == '\\') {
			add_char_to_string(string, ch);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
/* Decode %XX escapes in @src, overwriting the buffer in place.
 *
 * Modifying the argument is nasty, but the decoded text can never be
 * longer than the encoded one, which makes this an efficient way of doing
 * it.
 *
 * A '%' is only treated as an escape when unhx() accepts both characters
 * that follow it and the resulting value is not zero; otherwise the '%' is
 * copied through unchanged and decoding carries on with the next
 * character. '+' is deliberately left alone: it should only stand for a
 * space in the query part of a URI. */
void
decode_uri(char *src)
{
	char *out = src;
	unsigned char ch;

	for (;;) {
		ch = *src++;

		if (ch == '%') {
			int high = unhx(src[0]);
			/* Look at the second digit only if the first was valid. */
			int low = (high >= 0) ? unhx(src[1]) : -1;

			if (low >= 0) {
				int value = (high << 4) + low;

				/* don't allow %00 */
				if (value != 0) {
					ch = (unsigned char) value;
					src += 2;
				}
			}
		}

		*out++ = ch;

		/* The terminator has been copied as well; we are done. */
		if (ch == '\0')
			break;
	}
}
/* Decode the %XX escapes held in @string in place and bring its recorded
 * length back in line with the (possibly shorter) decoded text. */
void
decode_uri_string(struct string *string)
{
	char *text = string->source;

	decode_uri(text);
	string->length = strlen(text);
}
/* Decode @src in place with decode_uri() and then replace every byte that
 * is not printable, or that is a control character, with '*'.
 *
 * Each byte is converted to unsigned char before being handed to the
 * <ctype.h> classifiers: passing a negative plain char (any byte >= 0x80
 * where char is signed) to isprint()/iscntrl() is undefined behaviour. */
void
decode_uri_for_display(char *src)
{
	decode_uri(src);

	for (; *src; src++) {
		const unsigned char c = (unsigned char) *src;

		if (!isprint(c) || iscntrl(c))
			*src = '*';
	}
}
/* Apply decode_uri_for_display() to the text held in @string and bring its
 * recorded length back in line with the resulting text. */
void
decode_uri_string_for_display(struct string *string)
{
	char *text = string->source;

	decode_uri_for_display(text);
	string->length = strlen(text);
}
/* URI list */
/* Granularity argument handed to mem_align_alloc() when a URI list grows.
 * NOTE(review): presumably a bit mask, i.e. the array would grow in steps
 * of URI_LIST_GRANULARITY + 1 entries -- confirm against mem_align_alloc(). */
#define URI_LIST_GRANULARITY 0x3
/* Ask mem_align_alloc() to resize (list)->uris from (list)->size to
 * (list)->size + 1 entries. Evaluates to the result of mem_align_alloc();
 * add_to_uri_list() treats a false value as failure. */
#define realloc_uri_list(list) \
mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
URI_LIST_GRANULARITY)
/* Append @uri to @list.
 *
 * The list is grown by one slot first; the slot then receives whatever
 * get_uri_reference() returns for @uri (free_uri_list() later passes each
 * stored entry to done_uri()).
 *
 * Returns @uri on success, or NULL when the list could not be grown, in
 * which case the list is left as it was. */
struct uri *
add_to_uri_list(struct uri_list *list, struct uri *uri)
{
	if (!realloc_uri_list(list))
		return NULL;

	list->uris[list->size++] = get_uri_reference(uri);

	return uri;
}
void
free_uri_list(struct uri_list *list)
{
struct uri *uri;
int index;
if (!list->uris) return;
foreach_uri (uri, index, list) {
done_uri(uri);
}
mem_free_set(&list->uris, NULL);
list->size = 0;
}
/* URI cache */
/* One cached URI: the parsed struct uri immediately followed by the text it
 * was parsed from. get_uri_cache_entry() allocates sizeof(entry) + length
 * bytes with mem_calloc() and copies length bytes of text into @string, so
 * the one declared element is the room left over for the terminator
 * (presumably already zero, given the calloc-style allocation). */
struct uri_cache_entry {
struct uri uri;
char string[1];
};
/* The cache proper. */
struct uri_cache {
/* Hash of struct uri_cache_entry, keyed by the URI text. Created by
 * get_uri() while the cache is unused and freed again once @object is no
 * longer in use. */
struct hash *map;
/* Usage tracking for the cache as a whole: locked once for every entry
 * added by get_uri_cache_entry(), unlocked once for every entry that
 * done_uri() removes. */
struct elinks_object object;
};
/* The single cache instance used by get_uri() and done_uri(). */
static struct uri_cache uri_cache;
#ifdef CONFIG_DEBUG
/* Debug-only check: report an internal error if the protocol or host part
 * of @uri contains an uppercase letter according to c_isupper(). Scanning
 * stops at the first offending character. Without CONFIG_DEBUG the check
 * expands to nothing. */
static inline void
check_uri_sanity(struct uri *uri)
{
	int uppercase = 0;
	int i;

	for (i = 0; !uppercase && i < uri->protocollen; i++)
		uppercase = !!c_isupper(uri->string[i]);

	for (i = 0; !uppercase && i < uri->hostlen; i++)
		uppercase = !!c_isupper(uri->host[i]);

	if (uppercase)
		INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
}
#else
#define check_uri_sanity(uri)
#endif
/* Find the cache entry for the first @length bytes of @string, creating it
 * if it is not in the cache yet.
 *
 * A new entry carries its own copy of the text, which is what gets parsed
 * and what serves as the hash key. Every entry that is successfully added
 * takes one lock on the cache object.
 *
 * Returns NULL on a missing or empty @string, on allocation failure, when
 * the text does not parse, or when it cannot be added to the hash. */
static inline struct uri_cache_entry *
get_uri_cache_entry(char *string, int length)
{
	struct hash_item *cached;
	struct uri_cache_entry *fresh;
	char *copy;

	assert(string && length > 0);
	if_assert_failed return NULL;

	cached = get_hash_item(uri_cache.map, string, length);
	if (cached != NULL)
		return (struct uri_cache_entry *) cached->value;

	/* Setup a new entry */
	fresh = (struct uri_cache_entry *) mem_calloc(1, sizeof(*fresh) + length);
	if (fresh == NULL)
		return NULL;

	object_nolock(&fresh->uri, "uri");

	/* From here on only the entry's private copy of the text is used. */
	copy = fresh->string;
	memcpy(copy, string, length);

	if (parse_uri(&fresh->uri, copy) != URI_ERRNO_OK)
		goto discard;
	if (!add_hash_item(uri_cache.map, copy, length, fresh))
		goto discard;

	object_lock(&uri_cache);
	return fresh;

discard:
	mem_free(fresh);
	return NULL;
}
/* Look up the URI for @string.
 *
 * With a non-zero @components mask the string is parsed into a temporary
 * struct uri and the job is delegated to get_composed_uri(). Otherwise the
 * whole string is looked up in (or added to) the URI cache, the cache's
 * hash being created first if the cache is currently unused, and the
 * cached URI is locked before it is returned.
 *
 * Returns NULL when the string does not parse, when the hash cannot be
 * created, or when no cache entry could be obtained. */
struct uri *
get_uri(char *string, uri_component_T components)
{
	struct uri_cache_entry *cached;

	assert(string);

	if (components) {
		struct uri parsed;

		if (parse_uri(&parsed, string) == URI_ERRNO_OK)
			return get_composed_uri(&parsed, components);

		return NULL;
	}

	if (!is_object_used(&uri_cache)) {
		uri_cache.map = init_hash8();
		if (uri_cache.map == NULL)
			return NULL;
		object_nolock(&uri_cache, "uri_cache");
	}

	cached = get_uri_cache_entry(string, strlen(string));
	if (cached != NULL) {
		check_uri_sanity(&cached->uri);
		object_nolock(&cached->uri, "uri");
		object_lock(&cached->uri);

		return &cached->uri;
	}

	/* Nothing holds the cache: do not keep an empty hash around. */
	if (!is_object_used(&uri_cache))
		free_hash(&uri_cache.map);

	return NULL;
}
void
done_uri(struct uri *uri)
{
char *string = struri(uri);
int length = strlen(string);
struct hash_item *item;
struct uri_cache_entry *entry;
assert(is_object_used(&uri_cache));
object_unlock(uri);
if (is_object_used(uri)) return;
item = get_hash_item(uri_cache.map, string, length);
entry = (struct uri_cache_entry *)(item ? item->value : NULL);
assertm(entry != NULL, "Releasing unknown URI [%s]", string);
del_hash_item(uri_cache.map, item);
mem_free(entry);
/* Last URI frees the cache */
object_unlock(&uri_cache);
if (!is_object_used(&uri_cache))
free_hash(&uri_cache.map);
}