2005-09-15 09:58:31 -04:00
|
|
|
#ifndef EL__PROTOCOL_URI_H
|
|
|
|
#define EL__PROTOCOL_URI_H
|
|
|
|
|
|
|
|
#include "main/object.h"
|
2022-01-15 13:26:04 -05:00
|
|
|
#include "protocol/protocol.h"
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2020-10-05 14:14:55 -04:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2019-04-21 06:27:40 -04:00
|
|
|
struct string;

/* POST_CHAR separates the URI proper from the POST data that ELinks keeps in
 * the same string (see the string and post members of struct uri).
 * POST_CHAR_S is the same byte spelled as a string literal. */
#define POST_CHAR 1
#define POST_CHAR_S "\001"

/* FILE_CHAR is placed before and after every file name that is embedded in
 * the POST data. */
#define FILE_CHAR '\002'
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* The uri structure is used to store the start position and length of commonly
|
|
|
|
* used uri fields. It is initialized by parse_uri(). It is possible that the
|
|
|
|
* start of a field is set but that the length is zero so instead of testing
|
|
|
|
* *uri-><fieldname> always use uri-><fieldname>len. */
|
|
|
|
/* XXX: Lots of places in the code assume that the string members point into
|
|
|
|
* the same string. That means if you need to use a NUL terminated uri field
|
|
|
|
* either temporarily modify the string, allocate a copy or change the function
|
|
|
|
* used to take a length parameter (in the reverse precedence - modifying the
|
|
|
|
* string should not be done since you never know what kind of memory actually
|
|
|
|
* contains the string --pasky). */
|
|
|
|
/* TODO: We should probably add path+query members instead of data. */
|
|
|
|
|
|
|
|
struct uri {
|
2008-07-11 04:27:46 -04:00
|
|
|
/** The start of the URI (and thus start of the protocol %string).
|
|
|
|
* The format of the whole %string is like:
|
|
|
|
* "http6://elinks.cz/dir/file?query#frag" ::POST_CHAR post_data "\0"
|
|
|
|
*
|
|
|
|
* The post_data is not really %part of the URI but ELinks keeps it
|
|
|
|
* in the same %string and can then distinguish between cache entries
|
|
|
|
* for different POST requests. See uri.post for its syntax. */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *string;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* The internal type of protocol. Can _never_ be PROTOCOL_UNKNOWN. */
|
2022-01-28 11:19:11 -05:00
|
|
|
protocol_T protocol; /* protocol_T */
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* A special ELinks extension allows i.e. 'http4' or 'ftp6' protocols,
|
|
|
|
* forcing the given IP family. 0 means the IP family is not forced. */
|
|
|
|
int ip_family;
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
char *user;
|
|
|
|
char *password;
|
|
|
|
char *host;
|
|
|
|
char *port;
|
2005-09-15 09:58:31 -04:00
|
|
|
/* @data can contain both the path and query uri fields.
|
|
|
|
* It can never be NULL but can have zero length. */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *data;
|
|
|
|
char *fragment;
|
2008-07-11 04:27:46 -04:00
|
|
|
|
|
|
|
/** POST data attached to the URI. If uri.string contains a
|
|
|
|
* ::POST_CHAR, then @c post points to the following
|
|
|
|
* character. Otherwise NULL. The syntax of the POST data
|
|
|
|
* is:
|
|
|
|
*
|
|
|
|
* [content-type '\\n']
|
|
|
|
* (hexadecimal-byte | ::FILE_CHAR file-name ::FILE_CHAR)*
|
|
|
|
*
|
|
|
|
* - If content-type is present, ELinks sends "Content-Type: ",
|
|
|
|
* content-type, and CRLF in the head of the POST request.
|
|
|
|
*
|
|
|
|
* - Each hexadecimal-byte is a byte for the body of the POST
|
|
|
|
* request. It is encoded as two lower-case hexadecimal
|
|
|
|
* digits, most significant first. For example, "0a" for
|
|
|
|
* ::ASCII_LF.
|
|
|
|
*
|
|
|
|
* - file-name is the name of a file that ELinks should send
|
|
|
|
* to the server. It is in the charset accepted by open(),
|
2008-07-11 07:15:32 -04:00
|
|
|
* and some characters (especially ::FILE_CHAR) are
|
|
|
|
* percent-encoded. */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *post;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* @protocollen should only be usable if @protocol is either
|
|
|
|
* PROTOCOL_USER or an uri string should be composed. */
|
|
|
|
unsigned int protocollen:16;
|
|
|
|
unsigned int userlen:16;
|
|
|
|
unsigned int passwordlen:16;
|
|
|
|
unsigned int hostlen:16;
|
|
|
|
unsigned int portlen:8;
|
2020-09-18 15:58:28 -04:00
|
|
|
unsigned int datalen;
|
2005-09-15 09:58:31 -04:00
|
|
|
unsigned int fragmentlen:16;
|
|
|
|
|
|
|
|
/* Flags */
|
|
|
|
unsigned int ipv6:1; /* URI contains IPv6 host */
|
|
|
|
unsigned int form:1; /* URI originated from form */
|
|
|
|
|
|
|
|
/* Usage count object. */
|
|
|
|
struct object object;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Result codes reported by the URI parser. */
enum uri_errno {
	URI_ERRNO_OK,			/* No error, parsing went well */
	URI_ERRNO_EMPTY,		/* Empty URI string */
	URI_ERRNO_INVALID_PROTOCOL,	/* No protocol found */
	URI_ERRNO_NO_SLASHES,		/* Missing slashes after the protocol */
	URI_ERRNO_TOO_MANY_SLASHES,	/* Too many slashes after the protocol */
	URI_ERRNO_TRAILING_DOTS,	/* '.' following the host */
	URI_ERRNO_NO_HOST,		/* Missing host part */
	URI_ERRNO_NO_PORT_COLON,	/* ':' following the host but no port */
	URI_ERRNO_NO_HOST_SLASH,	/* Missing slash after the host */
	URI_ERRNO_IPV6_SECURITY,	/* IPv6 security bug detected */
	URI_ERRNO_INVALID_PORT,		/* Bad port number */
	URI_ERRNO_INVALID_PORT_RANGE,	/* Port number outside of 0-65535 */
};

typedef int uri_errno_T;
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Initializes the members of the uri struct, as they are encountered.
|
|
|
|
* If an uri component is recognized both it's length and starting point is
|
|
|
|
* set. */
|
|
|
|
/* Returns what error was encountered or URI_ERRNO_OK if parsing went well. */
|
2022-01-28 11:22:11 -05:00
|
|
|
uri_errno_T parse_uri(struct uri *uri, char *uristring);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
|
|
|
|
/* Gives the raw zero-terminated URI string which the (struct uri) is
 * associated with. Chances are therefore high that this is the URI as it was
 * originally received and not some cheap reconstruction of it. */
#define struri(uri) ((uri)->string)
|
|
|
|
|
|
|
|
|
|
|
|
enum uri_component {
	/**** The "raw" URI components, one bit for each of them */
	URI_NONE		= 0,
	URI_PROTOCOL		= (1 << 0),
	URI_IP_FAMILY		= (1 << 1),
	URI_USER		= (1 << 2),
	URI_PASSWORD		= (1 << 3),
	URI_HOST		= (1 << 4),
	URI_PORT		= (1 << 5),
	URI_DEFAULT_PORT	= (1 << 6),
	URI_DATA		= (1 << 7),
	URI_FRAGMENT		= (1 << 8),
	URI_POST		= (1 << 9),
	URI_POST_INFO		= (1 << 10),

	/**** Flags that affect how the components above appear, or that
	 * stand for special mutations or mixups of some raw components. */

	/* Controls the ``encoding'' of URIs into Internationalized Domain
	 * Names. Hopefully only a few lowlevel places have to use it, and it
	 * should never be exposed to the user. */
	URI_IDN			= (1 << 11),

	/* Adds everything from uri->data and up, prefixed with a '/' */
	URI_PATH		= (1 << 12),

	/* Adds the file name, i.e. what lies between the last directory
	 * separator in uri->data and the end of the path. */
	URI_FILENAME		= (1 << 13),

	/* Adds the query part of uri->data without the '?' */
	URI_QUERY		= (1 << 14),

	/**** Predefined classes for formatting URIs */

	/* Special flags */
	URI_SPECIAL		= URI_DEFAULT_PORT | URI_PATH | URI_FILENAME | URI_QUERY,

	/* The usual suspects */
	URI_RARE		= URI_SPECIAL | URI_POST | URI_POST_INFO | URI_IDN,

	/* _Only_ for displaying URIs in dialogs or document titles. */
	URI_PUBLIC		= ~(URI_PASSWORD | URI_RARE) | URI_POST_INFO,

	/* For getting the original URI without internal post encoding */
	URI_ORIGINAL		= ~URI_RARE,

	/* For getting the URI without the #fragment */
	URI_BASE		= ~(URI_RARE | URI_FRAGMENT) | URI_POST,

	/* URI_BASE with the fragment bit added back */
	URI_BASE_FRAGMENT	= URI_BASE | URI_FRAGMENT,

	/* For getting a data-less URI (only the stuff up to the slash). */
	URI_SERVER		= ~(URI_RARE | URI_DATA | URI_FRAGMENT),

	/* For the HTTP Auth code */
	URI_HTTP_AUTH		= ~(URI_RARE | URI_USER | URI_PASSWORD | URI_DATA | URI_FRAGMENT),

	/* For the value of the HTTP "Host" header */
	URI_HTTP_HOST		= URI_HOST | URI_PORT | URI_IDN,

	/* For the host part of the HTTP referrer. Has no user info. */
	URI_HTTP_REFERRER_HOST	= URI_PROTOCOL | URI_HOST | URI_PORT,

	/* For the whole HTTP referrer. Has no user/passwd info. */
	URI_HTTP_REFERRER	= URI_HTTP_REFERRER_HOST | URI_DATA,

	/* For the HTTP CONNECT method info */
	URI_HTTP_CONNECT	= URI_HOST | URI_PORT | URI_DEFAULT_PORT,

	/* For adding the directory listing HTML header */
	URI_DIR_LOCATION	= URI_PROTOCOL | URI_HOST | URI_PORT | URI_IDN,

	/* For getting the host of a DNS query. Since URI_PORT is not asked
	 * for, IPv6 hostnames come without the brackets as a hidden bonus. */
	URI_DNS_HOST		= URI_HOST | URI_IDN,

	/* Host and port only */
	URI_HOST_PORT		= URI_HOST | URI_PORT,

	/* For adding the unproxied URI, IDN encoded, to a string */
	URI_PROXY		= ~(URI_RARE | URI_FRAGMENT) | URI_IDN,

	/* For comparing keepalive connection URIs */
	/* (An explicit IP family does not have to bother us here: whether the
	 * actual query goes over IPv4 or IPv6 is of no concern, only new
	 * connections are. What the user expects us to care about is of
	 * course another thing... ;-) --pasky */
	URI_KEEPALIVE		= URI_PROTOCOL | URI_USER | URI_PASSWORD | URI_HOST | URI_PORT,

	/* For the form action URI when the GET method is used */
	URI_FORM_GET		= URI_SERVER | URI_PATH,
};

typedef unsigned int uri_component_T;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* A list that keeps track of several URIs. Use mem_free() to free it. */
struct uri_list {
	int size;		/* Upper bound used by foreach_uri() */
	struct uri **uris;	/* The entries; foreach_uri() skips NULL ones */
};
|
|
|
|
|
|
|
|
/* Iterates over the non-NULL entries of @list. @index is set to the slot
 * number and @uri to the entry stored in that slot; NULL slots are skipped.
 * The statement following the macro is the loop body.
 *
 * The expansion ends in a complete if/else so that an `else' written after
 * the loop statement cannot attach to the macro's internal test. */
#define foreach_uri(uri, index, list) \
	for ((index) = 0; (index) < (list)->size; (index)++) \
		if (!((uri) = (list)->uris[(index)])) continue; else
|
|
|
|
|
|
|
|
/* Appends @uri to the URI list @list */
struct uri *add_to_uri_list(struct uri_list *list, struct uri *uri);

/* Frees every entry of the URI list @list */
void free_uri_list(struct uri_list *list);
|
|
|
|
|
|
|
|
|
|
|
|
/* A small URI struct cache to increase reusability. */
|
|
|
|
/* XXX: Now there are a few rules to abide.
|
|
|
|
*
|
|
|
|
* Any URI string that should be registered in the cache has to have lowercased
|
|
|
|
* both the protocol and hostname parts. This is strictly checked and will
|
|
|
|
* otherwise cause an assertion failure.
|
|
|
|
*
|
|
|
|
* However this will not be a problem if you either first call join_urls()
|
|
|
|
* which you want to do anyway to resolve relative references or use the
|
|
|
|
* get_translated_uri() interface.
|
|
|
|
*
|
|
|
|
* The remaining support for RFC 2396 section 3.1 is done through get_protocol()
|
|
|
|
* and get_user_program() which will treat upper case letters
|
|
|
|
* as equivalent to lower case in protocol names. */
|
|
|
|
|
|
|
|
/* Register a new URI in the cache where @components controls which parts are
|
|
|
|
* added to the returned URI. */
|
2022-01-28 09:56:59 -05:00
|
|
|
struct uri *get_uri(char *string, uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Dereference an URI from the cache */
|
|
|
|
void done_uri(struct uri *uri);
|
|
|
|
|
|
|
|
/* Takes a reference of an URI that is already registered in the cache:
 * applies object_lock() to @uri and hands the very same pointer back. */
static inline struct uri *get_uri_reference(struct uri *uri)
{
	object_lock(uri);

	return uri;
}
|
|
|
|
|
|
|
|
/* Get URI using the string returned by get_uri_string(@uri, @components) */
|
2022-01-28 09:56:59 -05:00
|
|
|
struct uri *get_composed_uri(struct uri *uri, uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Resolves an URI relative to a current working directory (CWD) and possibly
|
|
|
|
* extracts the fragment. It is possible to just use it to extract fragment
|
|
|
|
* and get the resulting URI from the cache.
|
|
|
|
* @uristring is the URI to resolve or translate.
|
|
|
|
* @cwd if non NULL @uristring will be translated using this CWD. */
|
2021-01-02 10:20:27 -05:00
|
|
|
struct uri *get_translated_uri(char *uristring, char *cwd);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Normalizes the directory structure given in uristring. XXX: The function
|
|
|
|
* modifies the uristring and returns it. The uri argument should be NULL
|
|
|
|
* if the uri is not the parsed uristring. */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *normalize_uri(struct uri *uri, char *uristring);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Check if two URIs are equal. If @components are 0 simply compare the whole
|
|
|
|
* URI else only compare the specific parts. */
|
2007-03-18 03:56:43 -04:00
|
|
|
int compare_uri(const struct uri *uri1, const struct uri *uri2,
|
2022-01-28 09:56:59 -05:00
|
|
|
uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* These functions recreate the URI string part by part. */
|
|
|
|
/* The @components bitmask describes the set of URI components used for
|
|
|
|
* construction of the URI string. */
|
|
|
|
|
|
|
|
/* Adds the components to an already initialized string. */
|
2019-04-21 06:27:40 -04:00
|
|
|
struct string *add_uri_to_string(struct string *string, const struct uri *uri,
|
2022-01-28 09:56:59 -05:00
|
|
|
uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Takes an uri string, parses it and adds the desired components. Useful if
|
|
|
|
* there is no struct uri around. */
|
2022-01-28 09:56:59 -05:00
|
|
|
struct string *add_string_uri_to_string(struct string *string, char *uristring, uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Returns the new URI string or NULL upon an error. */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *get_uri_string(const struct uri *uri,
|
2022-01-28 09:56:59 -05:00
|
|
|
uri_component_T components);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Returns either the uri's port number if available or the protocol's
|
|
|
|
* default port. It is zarro for user protocols. */
|
2007-03-18 03:56:43 -04:00
|
|
|
int get_uri_port(const struct uri *uri);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Range of TCP port numbers */
#define LOWEST_PORT	0
#define HIGHEST_PORT	65535

/* True when @port lies within LOWEST_PORT..HIGHEST_PORT, both inclusive.
 * Note that @port is evaluated twice. */
#define uri_port_is_valid(port) \
	((port) >= LOWEST_PORT && (port) <= HIGHEST_PORT)
|
|
|
|
|
|
|
|
|
|
|
|
/* Encodes @namelen bytes of @name and adds them to @string. A @namelen of -1
 * means strlen(@name). When the boolean @convert_slashes is zero, '/'-chars
 * are not encoded. */
void encode_uri_string(struct string *string, const char *name, int namelen,
		       int convert_slashes);

/* Special version for Windows directory listing */
void encode_win32_uri_string(struct string *string, char *name, int namelen);

void decode_uri_string(struct string *string);
void decode_uri(char *uristring);

/* Decode, and replace illicit screen chars with '*'. */
void decode_uri_string_for_display(struct string *string);
void decode_uri_for_display(char *uristring);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Returns an allocated string that holds the biggest possible extension.
 * For the url 'jabadaba.1.foo.gz' the extension returned is '1.foo.gz' */
char *get_extension_from_uri(struct uri *uri);

/* Resolves the @relative URI to absolute form by means of the @base URI.
 * Example: with @base being http://elinks.cz/ and @relative being #news
 * the outcome would be http://elinks.cz/#news */
char *join_urls(struct uri *base, const char *relative);

/* Returns the position if the end of string @s matches a known tld, or -1 if
 * it does not. With @slen < 0 the string length is obtained by a strlen()
 * call, otherwise @slen is taken as the length of @s. */
int end_with_known_tld(const char *s, int slen);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
get_real_uri_length(struct uri *uri)
|
|
|
|
{
|
|
|
|
return uri->post ? uri->post - struri(uri) - 1 : strlen(struri(uri));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Checks whether @address contains a valid IP address. */
int is_ip_address(const char *address, int addresslen);

/* Checks whether the domain matches the server.
 * I.e.
 * example.com matches www.example.com/
 * example.com doesn't match www.example.com.org/
 * example.com doesn't match www.example.comm/
 * example.com doesn't match example.co
 */
int is_in_domain(char *domain, char *server, int server_len);
|
2007-08-29 09:33:19 -04:00
|
|
|
|
2020-10-05 14:14:55 -04:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
#endif
|