From 9079106894b985ff9052f4f4adb3e0c886602355 Mon Sep 17 00:00:00 2001 From: Kalle Olavi Niemitalo Date: Wed, 19 Aug 2009 00:18:20 +0300 Subject: [PATCH] bug 1083: Distinguish EOF from errors in read_encoded The 2009-08-15 fix for bug 1083 made read_encoded() return -1 of EOF, like decompress_data() expects. Unfortunately, read_file() too calls read_encoded(), and it treated the -1 as an error and reporter whatever error had been left in errno. This made it impossible to display local files, compressed or not. Apparently then, read_encoded() needs to distinguish between decoded bytes, EOF, EAGAIN, and true errors. Make it return an enum. --- src/encoding/bzip2.c | 67 ++++++++++++++++++++------- src/encoding/deflate.c | 77 +++++++++++++++++++++++++------ src/encoding/encoding.c | 98 +++++++++++++++++++++++----------------- src/encoding/encoding.h | 36 +++++++++++++++ src/encoding/lzma.c | 60 ++++++++++++++++++------ src/protocol/http/http.c | 10 ++-- 6 files changed, 255 insertions(+), 93 deletions(-) diff --git a/src/encoding/bzip2.c b/src/encoding/bzip2.c index 22196fb84..5c083780a 100644 --- a/src/encoding/bzip2.c +++ b/src/encoding/bzip2.c @@ -31,12 +31,11 @@ struct bz2_enc_data { /* The file descriptor from which we read. */ int fdread; - /* Initially 0; set to 1 when BZ2_bzDecompress indicates - * BZ_STREAM_END, which means it has found the bzip2-specific - * end-of-stream marker and all data has been decompressed. - * Then we neither read from the file nor call BZ2_bzDecompress - * any more. */ - int last_read; + /** Error code to be returned by all later bzip2_read() calls. + * ::READENC_EAGAIN is used here as a passive value that means + * no such error occurred yet. ::READENC_ERRNO is not allowed + * because there is no @c sticky_errno member here. */ + enum read_encoded_result sticky_result; /* A buffer for data that has been read from the file but not * yet decompressed. fbz_stream.next_in and fbz_stream.avail_in @@ -65,7 +64,7 @@ bzip2_open(struct stream_encoded *stream, int fd) * will be initialized on demand by bzip2_read. */ copy_struct(&data->fbz_stream, &null_bz_stream); data->fdread = fd; - data->last_read = 0; + data->sticky_result = READENC_EAGAIN; err = BZ2_bzDecompressInit(&data->fbz_stream, 0, 0); if (err != BZ_OK) { @@ -78,34 +77,64 @@ bzip2_open(struct stream_encoded *stream, int fd) return 0; } +static enum read_encoded_result +map_bzip2_ret(int ret) +{ + switch (ret) { + case BZ_STREAM_END: + return READENC_STREAM_END; + case BZ_DATA_ERROR: + case BZ_DATA_ERROR_MAGIC: + return READENC_DATA_ERROR; + case BZ_UNEXPECTED_EOF: + return READENC_UNEXPECTED_EOF; + case BZ_IO_ERROR: + return READENC_ERRNO; + case BZ_MEM_ERROR: + return READENC_MEM_ERROR; + case BZ_RUN_OK: /* not possible in decompression */ + case BZ_FLUSH_OK: /* likewise */ + case BZ_FINISH_OK: /* likewise */ + case BZ_OUTBUFF_FULL: /* only for BuffToBuff functions */ + case BZ_CONFIG_ERROR: + case BZ_SEQUENCE_ERROR: + case BZ_PARAM_ERROR: + default: + return READENC_INTERNAL; + } +} + static int bzip2_read(struct stream_encoded *stream, unsigned char *buf, int len) { struct bz2_enc_data *data = (struct bz2_enc_data *) stream->data; int err = 0; + int l = 0; - if (!data) return -1; + if (!data) return READENC_INTERNAL; assert(len > 0); + if_assert_failed return READENC_INTERNAL; - if (data->last_read) return -1; + if (data->sticky_result != READENC_EAGAIN) + return data->sticky_result; data->fbz_stream.avail_out = len; data->fbz_stream.next_out = buf; do { if (data->fbz_stream.avail_in == 0) { - int l = safe_read(data->fdread, data->buf, - ELINKS_BZ_BUFFER_LENGTH); + l = safe_read(data->fdread, data->buf, + ELINKS_BZ_BUFFER_LENGTH); if (l == -1) { if (errno == EAGAIN) break; else - return -1; /* I/O error */ + return READENC_ERRNO; /* I/O error */ } else if (l == 0) { /* EOF. It is error: we wait for more bytes */ - return -1; + return READENC_UNEXPECTED_EOF; } data->fbz_stream.next_in = data->buf; @@ -114,15 +143,19 @@ bzip2_read(struct stream_encoded *stream, unsigned char *buf, int len) err = BZ2_bzDecompress(&data->fbz_stream); if (err == BZ_STREAM_END) { - data->last_read = 1; + data->sticky_result = READENC_STREAM_END; break; } else if (err != BZ_OK) { - return -1; + return map_bzip2_ret(err); } } while (data->fbz_stream.avail_out > 0); - assert(len - data->fbz_stream.avail_out == data->fbz_stream.next_out - (char *) buf); - return len - data->fbz_stream.avail_out; + l = len - data->fbz_stream.avail_out; + assert(l == data->fbz_stream.next_out - (char *) buf); + if (l > 0) /* Positive return values are byte counts */ + return l; + else /* and others are from enum read_encoded_result */ + return data->sticky_result; } #ifdef CONFIG_SMALL diff --git a/src/encoding/deflate.c b/src/encoding/deflate.c index 9031409b5..3ba857fd6 100644 --- a/src/encoding/deflate.c +++ b/src/encoding/deflate.c @@ -29,9 +29,18 @@ struct deflate_enc_data { /* The file descriptor from which we read. */ int fdread; - unsigned int last_read:1; unsigned int after_first_read:1; + /** Error code to be returned by all later deflate_read() + * calls. ::READENC_EAGAIN is used here as a passive value + * that means no such error occurred yet. */ + enum read_encoded_result sticky_result; + + /** Error code to be set to @c errno by all later + * deflate_read() calls. This is interesting only when + * #sticky_result == ::READENC_ERRNO. */ + int sticky_errno; + /* A buffer for data that has been read from the file but not * yet decompressed. z_stream.next_in and z_stream.avail_in * refer to this buffer. */ @@ -59,8 +68,9 @@ deflate_open(int window_size, struct stream_encoded *stream, int fd) * will be initialized on demand by deflate_read. */ copy_struct(&data->deflate_stream, &null_z_stream); data->fdread = fd; - data->last_read = 0; data->after_first_read = 0; + data->sticky_result = READENC_EAGAIN; + data->sticky_errno = 0; err = inflateInit2(&data->deflate_stream, window_size); if (err != Z_OK) { @@ -88,6 +98,36 @@ deflate_gzip_open(struct stream_encoded *stream, int fd) return deflate_open(MAX_WBITS + 32, stream, fd); } +static void +deflate_set_sticky(struct deflate_enc_data *data, int ret, int save_errno) +{ + switch (ret) { + case Z_STREAM_END: + data->sticky_result = READENC_STREAM_END; + break; + case Z_DATA_ERROR: + case Z_NEED_DICT: + data->sticky_result = READENC_DATA_ERROR; + break; + case Z_ERRNO: + data->sticky_result = READENC_ERRNO; + data->sticky_errno = save_errno; + break; + case Z_MEM_ERROR: + data->sticky_result = READENC_MEM_ERROR; + break; + case Z_STREAM_ERROR: + case Z_BUF_ERROR: + case Z_VERSION_ERROR: + default: + data->sticky_result = READENC_INTERNAL; + break; + } +} + +/*! @return A positive number means that many bytes were + * written to the @a buf array. Otherwise, the value is + * enum read_encoded_result. */ static int deflate_read(struct stream_encoded *stream, unsigned char *buf, int len) { @@ -95,11 +135,16 @@ deflate_read(struct stream_encoded *stream, unsigned char *buf, int len) int err = 0; int l = 0; - if (!data) return -1; + if (!data) return READENC_INTERNAL; assert(len > 0); + if_assert_failed return READENC_INTERNAL; - if (data->last_read) return -1; + if (data->sticky_result != READENC_EAGAIN) { + if (data->sticky_result == READENC_ERRNO) + errno = data->sticky_errno; + return data->sticky_result; + } data->deflate_stream.avail_out = len; data->deflate_stream.next_out = buf; @@ -107,16 +152,16 @@ deflate_read(struct stream_encoded *stream, unsigned char *buf, int len) do { if (data->deflate_stream.avail_in == 0) { l = safe_read(data->fdread, data->buf, - ELINKS_DEFLATE_BUFFER_LENGTH); + ELINKS_DEFLATE_BUFFER_LENGTH); if (l == -1) { if (errno == EAGAIN) break; else - return -1; /* I/O error */ + return READENC_ERRNO; /* I/O error */ } else if (l == 0) { /* EOF. It is error: we wait for more bytes */ - return -1; + return READENC_UNEXPECTED_EOF; } data->deflate_stream.next_in = data->buf; @@ -156,17 +201,21 @@ restart: if (err == Z_OK) goto restart; } data->after_first_read = 1; - if (err == Z_STREAM_END) { - data->last_read = 1; - break; - } else if (err != Z_OK) { - data->last_read = 1; + if (err != Z_OK) { + deflate_set_sticky(data, err, errno); break; } } while (data->deflate_stream.avail_out > 0); - assert(len - data->deflate_stream.avail_out == data->deflate_stream.next_out - buf); - return len - data->deflate_stream.avail_out; + l = len - data->deflate_stream.avail_out; + assert(l == data->deflate_stream.next_out - buf); + if (l > 0) /* Positive return values are byte counts */ + return l; + else { /* and others are from enum read_encoded_result */ + if (data->sticky_result == READENC_ERRNO) + errno = data->sticky_errno; + return data->sticky_result; + } } static unsigned char * diff --git a/src/encoding/encoding.c b/src/encoding/encoding.c index 2c6e7da64..4056d9ffc 100644 --- a/src/encoding/encoding.c +++ b/src/encoding/encoding.c @@ -45,6 +45,9 @@ dummy_open(struct stream_encoded *stream, int fd) return 0; } +/*! @return A positive number means that many bytes were + * written to the @a data array. Otherwise, the value is + * enum read_encoded_result. */ static int dummy_read(struct stream_encoded *stream, unsigned char *data, int len) { @@ -53,10 +56,12 @@ dummy_read(struct stream_encoded *stream, unsigned char *data, int len) if (got > 0) return got; - else if (got == -1 && errno == EAGAIN) - return 0; + else if (got == 0) + return READENC_STREAM_END; + else if (errno == EAGAIN) + return READENC_EAGAIN; else - return -1; + return READENC_ERRNO; } static unsigned char * @@ -131,11 +136,8 @@ open_encoded(int fd, enum stream_encoding encoding) * size of _returned_ data, not desired size of data read from * stream. * - * @return the number of bytes written to the @a data array if - * something was decoded; 0 if no data is available yet but some may - * become available later; or -1 if there will be no further data, - * either because an error occurred or because an end-of-stream mark - * was reached. */ + * @return A positive number means that many bytes were written to the + * @a data array. Otherwise, the value is enum read_encoded_result. */ int read_encoded(struct stream_encoded *stream, unsigned char *data, int len) { @@ -241,6 +243,9 @@ try_encoding_extensions(struct string *filename, int *fd) struct connection_state read_file(struct stream_encoded *stream, int readsize, struct string *page) { + int readlen; + int save_errno; + if (!init_string(page)) return connection_state(S_OUT_OF_MEM); /* We read with granularity of stt.st_size (given as @readsize) - this @@ -252,46 +257,55 @@ read_file(struct stream_encoded *stream, int readsize, struct string *page) * allocate zero number of bytes. */ if (!readsize) readsize = 4096; - while (realloc_string(page, page->length + readsize)) { - unsigned char *string_pos = page->source + page->length; - int readlen = read_encoded(stream, string_pos, readsize); - - if (readlen < 0) { + for (;;) { + unsigned char *string_pos; + + if (!realloc_string(page, page->length + readsize)) { done_string(page); - - /* If it is some I/O error (and errno is set) that will - * do. Since errno == 0 == S_WAIT and we cannot have - * that. */ - if (errno) - return connection_state_for_errno(errno); - - /* FIXME: This is indeed an internal error. If readed from a - * corrupted encoded file nothing or only some of the - * data will be read. */ - return connection_state(S_ENCODE_ERROR); - - } else if (readlen == 0) { - /* NUL-terminate just in case */ - page->source[page->length] = '\0'; - return connection_state(S_OK); + return connection_state(S_OUT_OF_MEM); + } + + string_pos = page->source + page->length; + readlen = read_encoded(stream, string_pos, readsize); + if (readlen <= 0) { + save_errno = errno; /* in case of READENC_ERRNO */ + break; } page->length += readlen; -#if 0 - /* This didn't work so well as it should (I had to implement - * end of stream handling to bzip2 anyway), so I rather - * disabled this. */ - if (readlen < readsize) { - /* This is much safer. It should always mean that we - * already read everything possible, and it permits us - * more elegant of handling end of file with bzip2. */ - break; - } -#endif } - done_string(page); - return connection_state(S_OUT_OF_MEM); + switch (readlen) { + case READENC_ERRNO: + done_string(page); + return connection_state_for_errno(save_errno); + + case READENC_STREAM_END: + /* NUL-terminate just in case */ + page->source[page->length] = '\0'; + return connection_state(S_OK); + + case READENC_UNEXPECTED_EOF: + case READENC_DATA_ERROR: + done_string(page); + /* FIXME: This is indeed an internal error. If readed from a + * corrupted encoded file nothing or only some of the + * data will be read. */ + return connection_state(S_ENCODE_ERROR); + + case READENC_MEM_ERROR: + done_string(page); + return connection_state(S_OUT_OF_MEM); + + case READENC_EAGAIN: + case READENC_INTERNAL: + default: + ERROR("unexpected readlen==%d", readlen); + /* If you have a breakpoint in elinks_error(), + * you can examine page before it gets freed. */ + done_string(page); + return connection_state(S_INTERNAL); + } } static inline int diff --git a/src/encoding/encoding.h b/src/encoding/encoding.h index 68b656c85..f4293db2d 100644 --- a/src/encoding/encoding.h +++ b/src/encoding/encoding.h @@ -15,6 +15,35 @@ enum stream_encoding { ENCODINGS_KNOWN, }; +/** Special values returned by read_encoded() and + * decoding_backend.read. Positive numbers cannot be used in + * this enum because they mean byte counts as return values. + * Zero could be used but currently is not used. + * Do not rely on the order of values here. */ +enum read_encoded_result { + /** An error occurred and the code is in @c errno. */ + READENC_ERRNO = -1, + + /** Saw an end-of-file mark in the compressed data. */ + READENC_STREAM_END = -2, + + /** The data ended before the decompressor expected. */ + READENC_UNEXPECTED_EOF = -3, + + /** Cannot decompress anything yet: please provide more data. */ + READENC_EAGAIN = -4, + + /** The input data is malformed: for example, checksums don't + * match, or a header is missing. */ + READENC_DATA_ERROR = -5, + + /** Out of memory */ + READENC_MEM_ERROR = -6, + + /** An internal error occurred. */ + READENC_INTERNAL = -7 +}; + struct stream_encoded { enum stream_encoding encoding; void *data; @@ -23,9 +52,16 @@ struct stream_encoded { struct decoding_backend { const unsigned char *name; const unsigned char *const *extensions; + int (*open)(struct stream_encoded *stream, int fd); + + /*! @return A positive number means that many bytes were + * written to the @a data array. Otherwise, the value is + * enum read_encoded_result. */ int (*read)(struct stream_encoded *stream, unsigned char *data, int len); + unsigned char *(*decode_buffer)(unsigned char *data, int len, int *new_len); + void (*close)(struct stream_encoded *stream); }; diff --git a/src/encoding/lzma.c b/src/encoding/lzma.c index 702596268..b3ac893c8 100644 --- a/src/encoding/lzma.c +++ b/src/encoding/lzma.c @@ -25,7 +25,13 @@ struct lzma_enc_data { lzma_stream flzma_stream; int fdread; - int last_read; + + /** Error code to be returned by all later lzma_read() calls. + * ::READENC_EAGAIN is used here as a passive value that means + * no such error occurred yet. ::READENC_ERRNO is not allowed + * because there is no @c sticky_errno member here. */ + enum read_encoded_result sticky_result; + unsigned char buf[ELINKS_BZ_BUFFER_LENGTH]; }; @@ -42,7 +48,7 @@ lzma_open(struct stream_encoded *stream, int fd) copy_struct(&data->flzma_stream, &LZMA_STREAM_INIT_VAR); data->fdread = fd; - data->last_read = 0; + data->sticky_result = READENC_EAGAIN; err = lzma_auto_decoder(&data->flzma_stream, NULL, NULL); if (err != LZMA_OK) { @@ -55,34 +61,58 @@ lzma_open(struct stream_encoded *stream, int fd) return 0; } +static enum read_encoded_result +map_lzma_ret(lzma_ret ret) +{ + switch (ret) { + case LZMA_STREAM_END: + return READENC_STREAM_END; + case LZMA_DATA_ERROR: + case LZMA_HEADER_ERROR: + return READENC_DATA_ERROR; + case LZMA_MEM_ERROR: + return READENC_MEM_ERROR; + case LZMA_PROG_ERROR: + case LZMA_BUF_ERROR: + default: + return READENC_INTERNAL; + } +} + +/*! @return A positive number means that many bytes were + * written to the @a buf array. Otherwise, the value is + * enum read_encoded_result. */ static int lzma_read(struct stream_encoded *stream, unsigned char *buf, int len) { struct lzma_enc_data *data = (struct lzma_enc_data *) stream->data; int err = 0; + int l = 0; - if (!data) return -1; + if (!data) return READENC_INTERNAL; assert(len > 0); + if_assert_failed return READENC_INTERNAL; - if (data->last_read) return -1; + if (data->sticky_result != READENC_EAGAIN) + return data->sticky_result; data->flzma_stream.avail_out = len; data->flzma_stream.next_out = buf; do { if (data->flzma_stream.avail_in == 0) { - int l = safe_read(data->fdread, data->buf, - ELINKS_BZ_BUFFER_LENGTH); + l = safe_read(data->fdread, data->buf, + ELINKS_BZ_BUFFER_LENGTH); if (l == -1) { if (errno == EAGAIN) break; else - return -1; /* I/O error */ + return READENC_ERRNO; /* I/O error */ } else if (l == 0) { /* EOF. It is error: we wait for more bytes */ - return -1; + return READENC_UNEXPECTED_EOF; } data->flzma_stream.next_in = data->buf; @@ -91,15 +121,19 @@ lzma_read(struct stream_encoded *stream, unsigned char *buf, int len) err = lzma_code(&data->flzma_stream, LZMA_RUN); if (err == LZMA_STREAM_END) { - data->last_read = 1; + data->sticky_result = READENC_STREAM_END; break; - } else if (err != LZMA_OK) { - return -1; + } else if (err != LZMA_OK && err != LZMA_UNSUPPORTED_CHECK) { + return map_lzma_ret(err); } } while (data->flzma_stream.avail_out > 0); - assert(len - data->flzma_stream.avail_out == data->flzma_stream.next_out - buf); - return len - data->flzma_stream.avail_out; + l = len - data->flzma_stream.avail_out; + assert(l == data->flzma_stream.next_out - buf); + if (l > 0) /* Positive return values are byte counts */ + return l; + else /* and others are from enum read_encoded_result */ + return data->sticky_result; } static unsigned char * diff --git a/src/protocol/http/http.c b/src/protocol/http/http.c index 18cc641a3..d99a7f7ed 100644 --- a/src/protocol/http/http.c +++ b/src/protocol/http/http.c @@ -1121,16 +1121,12 @@ decompress_data(struct connection *conn, unsigned char *data, int len, did_read = read_encoded(conn->stream, output + *new_len, BIG_READ); - /* Do not break from the loop if did_read == 0. It - * means no decoded data is available yet, but some may - * become available later. This happens especially with - * the bzip2 decoder, which needs an entire compressed - * block as input before it generates any output. */ - if (did_read < 0) { + if (did_read > 0) + *new_len += did_read; + else if (did_read != READENC_EAGAIN) { state = FINISHING; break; } - *new_len += did_read; } while (len || (did_read == BIG_READ)); if (state == FINISHING) shutdown_connection_stream(conn);