elinks/src/encoding/encoding.c

/* Stream reading and decoding (mostly decompression) */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h> /* OS/2 needs this after sys/types.h */
#ifdef HAVE_FCNTL_H
#include <fcntl.h> /* OS/2 needs this after sys/types.h */
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "elinks.h"

#include "config/options.h"
#include "encoding/encoding.h"
#include "network/state.h"
#include "osdep/osdep.h"
#include "util/memory.h"
#include "util/string.h"


/*************************************************************************
  Dummy encoding (ENCODING_NONE)
*************************************************************************/

/* Private state of the pass-through ("none") backend. An instance is
 * allocated by dummy_open(), kept in stream->data and released by
 * dummy_close(). */
struct dummy_enc_data {
	/* Descriptor that dummy_read() reads from and dummy_close() closes. */
	int fd;
};

static int
dummy_open(struct stream_encoded *stream, int fd)
{
	stream->data = mem_alloc(sizeof(struct dummy_enc_data));
	if (!stream->data) return -1;

	((struct dummy_enc_data *) stream->data)->fd = fd;

	return 0;
}

static int
dummy_read(struct stream_encoded *stream, char *data, int len)
{
	return safe_read(((struct dummy_enc_data *) stream->data)->fd, data, len);
}

/* "Decodes" a buffer by duplicating it with memacpy(). On success the copy
 * is returned and *@new_len is set to @len; on failure NULL is returned and
 * *@new_len is left untouched. @stream is not used. */
static char *
dummy_decode_buffer(struct stream_encoded *stream, char *data, int len, int *new_len)
{
	char *copy = memacpy(data, len);

	if (copy) *new_len = len;

	return copy;
}

static void
dummy_close(struct stream_encoded *stream)
{
	close(((struct dummy_enc_data *) stream->data)->fd);
	mem_free(stream->data);
}

/* NULL-terminated extension list of the pass-through backend; it is empty
 * because no file name suffix selects the "none" encoding. */
static const char *const dummy_extensions[] = { NULL };

/* Backend descriptor of the pass-through ("none") encoding.
 * The initializers are positional, so their order has to follow the member
 * order of struct decoding_backend, which is declared outside this file.
 * NOTE(review): judging by how the members are accessed in this file, the
 * order is name, extensions, eopen, eread, decode_buffer, eclose -- confirm
 * against the declaration. */
static const struct decoding_backend dummy_decoding_backend = {
	"none",
	dummy_extensions,
	dummy_open,
	dummy_read,
	dummy_decode_buffer,
	dummy_close,
};


/* Dynamic backend area */

#include "encoding/brotli.h"
#include "encoding/bzip2.h"
#include "encoding/gzip.h"
#include "encoding/lzma.h"
#include "encoding/zstd.h"

/* Dispatch table of all decoding backends. The public functions in this file
 * index it directly with a stream_encoding_T value and perform no bounds
 * checking. Slot 0 holds the pass-through backend; guess_encoding() and
 * try_encoding_extensions() scan the slots from 1 up to ENCODINGS_KNOWN - 1.
 * NOTE(review): the entry order presumably has to mirror the numeric values
 * of the stream encoding constants (with ENCODING_NONE being 0) and the entry
 * count has to equal ENCODINGS_KNOWN -- confirm against encoding/encoding.h
 * whenever a backend is added or moved. */
static const struct decoding_backend *const decoding_backends[] = {
	&dummy_decoding_backend,
	&gzip_decoding_backend,
	&bzip2_decoding_backend,
	&lzma_decoding_backend,
	&brotli_decoding_backend,
	&zstd_decoding_backend
};


/*************************************************************************
  Public functions
*************************************************************************/


/* Creates a stream object using the backend selected by @encoding and binds
 * it to @fd.
 * Returns the new stream, or NULL when the stream could not be allocated or
 * the backend's open routine returned a negative value. This function itself
 * never closes @fd. @encoding is used as a table index without validation. */
struct stream_encoded *
open_encoded(int fd, stream_encoding_T encoding)
{
	struct stream_encoded *stream = (struct stream_encoded *)mem_alloc(sizeof(*stream));

	if (!stream) return NULL;

	stream->encoding = encoding;

	if (decoding_backends[stream->encoding]->eopen(stream, fd) < 0) {
		/* The backend refused; drop the half-built stream again. */
		mem_free(stream);
		return NULL;
	}

	return stream;
}

/* Read available data from stream and decode them. Note that when data change
 * their size during decoding, 'len' indicates desired size of _returned_ data,
 * not desired size of data read from stream. */
int
read_encoded(struct stream_encoded *stream, char *data, int len)
{
	return decoding_backends[stream->encoding]->eread(stream, data, len);
}

/* Decodes a complete file held in memory; it is not meant for partial data.
 * @data points to @len bytes of encoded input. The work is delegated to the
 * decode_buffer routine of the backend selected by @encoding, whose result is
 * returned as is; the length of the decoded chunk is reported through
 * *@new_len. */
char *
decode_encoded_buffer(struct stream_encoded *stream, stream_encoding_T encoding, char *data, int len,
		      int *new_len)
{
	const struct decoding_backend *backend = decoding_backends[encoding];

	return backend->decode_buffer(stream, data, len, new_len);
}

/* Closes encoded stream. Note that fd associated with the stream will be
 * closed here. */
void
close_encoded(struct stream_encoded *stream)
{
	decoding_backends[stream->encoding]->eclose(stream);
	mem_free(stream);
}


/* Returns the extension list registered by the backend for @encoding.
 * @encoding is used as a table index without validation. */
const char *const *listext_encoded(stream_encoding_T encoding)
{
	const struct decoding_backend *backend = decoding_backends[encoding];

	return backend->extensions;
}

/* Guesses the encoding of @filename from its suffix.
 * Every backend except the one in slot 0 is asked for its extension list and
 * the first extension that @filename ends with (exact, case-sensitive match)
 * decides the result. ENCODING_NONE is returned when nothing matches. */
stream_encoding_T
guess_encoding(char *filename)
{
	int name_len = strlen(filename);
	int idx;

	for (idx = 1; idx < ENCODINGS_KNOWN; idx++) {
		const char *const *suffix = decoding_backends[idx]->extensions;

		if (!suffix) continue;

		for (; *suffix; suffix++) {
			int suffix_len = strlen(*suffix);

			/* A suffix longer than the whole name cannot match. */
			if (suffix_len > name_len) continue;

			if (!strcmp(filename + name_len - suffix_len, *suffix))
				return idx;
		}
	}

	return ENCODING_NONE;
}

/* Returns the name string of the backend registered for @encoding.
 * @encoding is used as a table index without validation. */
const char *
get_encoding_name(stream_encoding_T encoding)
{
	const struct decoding_backend *backend = decoding_backends[encoding];

	return backend->name;
}


/* File reading */

/* Tries to open @filename with each of the supported encoding extensions
 * appended, in backend order (slot 0 is skipped).
 * On the first successful open() the descriptor is stored in *@fd, the
 * extension is left appended to @filename and the matching encoding is
 * returned. If no candidate can be opened, @filename is restored to its
 * original contents, *@fd holds the result of the last failed open() and
 * ENCODING_NONE is returned. */
static inline stream_encoding_T
try_encoding_extensions(struct string *filename, int *fd)
{
	const int base_length = filename->length;
	int enc;

	for (enc = 1; enc < ENCODINGS_KNOWN; enc++) {
		const char *const *suffix = listext_encoded(enc);

		if (!suffix) continue;

		while (*suffix) {
			add_to_string(filename, *suffix);
			*fd = open(filename->source, O_RDONLY | O_NOCTTY);

			/* Found one; keep the extended name for the caller. */
			if (*fd >= 0) return enc;

			/* No luck; cut the name back to its original form. */
			filename->length = base_length;
			filename->source[base_length] = 0;

			suffix++;
		}
	}

	return ENCODING_NONE;
}

/** Reads the file from @a stream in chunks of size @a readsize.
 *
 * @a page is initialized here. On success it holds the decoded data; on any
 * failure it has already been released with done_string().
 *
 * @a stream should be in blocking mode.  If it is in non-blocking
 * mode, this function can return an empty string in @a page just
 * because no more data is available yet, and the caller cannot know
 * whether the true end of the stream has been reached.
 *
 * @a readsize that is zero or negative is replaced by 4096.
 *
 * @return a connection state. S_OK if all is well, S_OUT_OF_MEM if @a page
 * could not be created or grown, the state derived from errno if the read
 * failed with errno set, and S_ENCODE_ERROR if the read failed without
 * setting errno. */
struct connection_state
read_file(struct stream_encoded *stream, int readsize, struct string *page)
{
	if (!init_string(page)) return connection_state(S_OUT_OF_MEM);

	/* We read with granularity of stt.st_size (given as @readsize) - this
	 * does best job for uncompressed files, and doesn't hurt for
	 * compressed ones anyway - very large files usually tend to inflate
	 * fast anyway. At least I hope ;).  --pasky */
	/* Also there because of bug in Linux. Read returns -EACCES when
	 * reading 0 bytes to invalid address so ensure never to try and
	 * allocate zero number of bytes. A negative size makes no sense
	 * either, so it gets the same default. */
	if (readsize <= 0) readsize = 4096;

	while (realloc_string(page, page->length + readsize)) {
		char *string_pos = page->source + page->length;
		int readlen;
		int saved_errno;

		/* Clear errno first so that a failure which does not set it
		 * (e.g. corrupted encoded data) is not mistaken for some
		 * stale, unrelated I/O error; then capture it right away,
		 * before any cleanup call gets a chance to overwrite it. */
		errno = 0;
		readlen = read_encoded(stream, string_pos, readsize);
		saved_errno = errno;

		if (readlen < 0) {
			done_string(page);

			/* If it is some I/O error (and errno is set) that will
			 * do. Since errno == 0 == S_WAIT and we cannot have
			 * that. */
			if (saved_errno)
				return connection_state_for_errno(saved_errno);

			/* FIXME: This is indeed an internal error. If read from
			 * a corrupted encoded file nothing or only some of the
			 * data will be read. */
			return connection_state(S_ENCODE_ERROR);
		}

		if (readlen == 0) {
			/* End of stream. NUL-terminate just in case. */
			page->source[page->length] = '\0';
			return connection_state(S_OK);
		}

		/* A short read (readlen < readsize) is deliberately not taken
		 * as end of stream; only a zero-length read ends the loop. */
		page->length += readlen;
	}

	/* Growing the page buffer failed. */
	done_string(page);
	return connection_state(S_OUT_OF_MEM);
}

/* Tells whether @filename is exactly "/dev/stdin" and @stt describes a FIFO
 * or, where S_ISSOCK is available, a socket. Returns 1 if so, 0 otherwise. */
static inline int
is_stdin_pipe(struct stat *stt, struct string *filename)
{
	if (strlcmp(filename->source, filename->length, "/dev/stdin", 10))
		return 0;

#ifdef S_ISSOCK
	/* On Mac OS X, /dev/stdin has type S_IFSOCK. (bug 616) */
	if (S_ISSOCK(stt->st_mode)) return 1;
#endif

	return S_ISFIFO(stt->st_mode) ? 1 : 0;
}

/* Opens @filename, works out its encoding and reads the decoded contents
 * into @page.
 *
 * If @filename cannot be opened and the "protocol.file.try_encoding_extensions"
 * option is set, the known encoding extensions are tried as well; on a hit
 * @filename is left with the matching extension appended. If the name opens
 * directly, the encoding is guessed from its suffix.
 *
 * @page is only initialized once read_file() is reached; on every error path
 * it is either untouched or already released.
 *
 * Returns S_OK on success, otherwise a state describing the failure: the
 * errno of the failed open(), an fstat() error, S_FILE_TYPE for files of an
 * unsupported type, S_OUT_OF_MEM, a too-big-file error, or whatever
 * read_file() reports. */
struct connection_state
read_encoded_file(struct string *filename, struct string *page)
{
	struct stream_encoded *stream;
	struct stat stt;
	stream_encoding_T encoding = ENCODING_NONE;
	int fd = open(filename->source, O_RDONLY | O_NOCTTY);
	/* errno is only meaningful when the open() above failed. When it
	 * succeeded, start out with S_FILE_TYPE, which is what the "not a
	 * regular file" check below reports if it leaves @state alone. */
	struct connection_state state = (fd == -1)
		? connection_state_for_errno(errno)
		: connection_state(S_FILE_TYPE);

	if (fd == -1 && get_opt_bool("protocol.file.try_encoding_extensions", NULL)) {
		encoding = try_encoding_extensions(filename, &fd);

	} else if (fd != -1) {
		encoding = guess_encoding(filename->source);
	}

	if (fd == -1) {
#ifdef HAVE_SYS_CYGWIN_H
		/* There is no /dev/stdin on Cygwin. */
		if (!strlcmp(filename->source, filename->length, "/dev/stdin", 10)) {
			fd = STDIN_FILENO;
		} else
#endif
		return state;
	}

	/* Some file was opened so let's get down to bi'ness */
	set_bin(fd);

	/* Do all the necessary checks before trying to read the file.
	 * @state code is used to block further progress. */
	if (fstat(fd, &stt)) {
		state = connection_state_for_errno(errno);

	} else if (!S_ISREG(stt.st_mode) && encoding != ENCODING_NONE) {
		/* We only want to open regular encoded files. */
		/* Leave @state as initialized above: the errno of the failed
		 * open() of the plain name, or S_FILE_TYPE if the name opened
		 * directly. */

	} else if (!S_ISREG(stt.st_mode) && !is_stdin_pipe(&stt, filename)
	           && !get_opt_bool("protocol.file.allow_special_files", NULL)) {
		state = connection_state(S_FILE_TYPE);

	} else if (!(stream = open_encoded(fd, encoding))) {
		state = connection_state(S_OUT_OF_MEM);

	} else {
		int readsize = (int) stt.st_size;

		/* Check if st_size will cause overflow. */
		/* FIXME: See bug 497 for info about support for big files. */
		if (readsize != stt.st_size || readsize < 0) {
#ifdef EFBIG
			state = connection_state_for_errno(EFBIG);
#else
			state = connection_state(S_FILE_ERROR);
#endif

		} else {
			state = read_file(stream, readsize, page);
		}

		/* close_encoded() closes @fd together with the stream, so
		 * return right here instead of closing the descriptor a
		 * second time below. */
		close_encoded(stream);
		return state;
	}

	/* No stream took ownership of @fd; release it ourselves. */
	close(fd);
	return state;
}