From d26791bfbdcf6269dfcce6903578231bd8f2d34d Mon Sep 17 00:00:00 2001 From: Eremey Valetov Date: Sat, 13 Jun 2026 02:10:56 -0400 Subject: [PATCH] libarchive plugin: directory paths, round-trip test (M5-M6) The read handler now composes full directory paths from the cdir's directory ids rather than emitting bare leaf names: build_dir_path walks the parent chain (root dirid 0, depth-capped against cyclic cdirs), so multi-file archives with subdirectories list correctly. Master-block resolution (M4) and tagged long names (M6) already work through libuc2's extract and tag paths; this adds a libarchive round-trip test that creates archives at Huffman and rANS levels and verifies every byte back through libarchive's public API. Documents the plugin build recipe (libarchive source tree + static lib). Verified against libarchive 3.7.7; round-trip clean under valgrind. --- .../archive_read_support_format_uc2.c | 91 +++++++++++- docs/building.rst | 40 ++++++ tests/CMakeLists.txt | 20 +++ tests/src/test_libarchive_uc2.c | 134 ++++++++++++++++++ tests/test_cli_libarchive.cmake | 39 +++++ 5 files changed, 322 insertions(+), 2 deletions(-) create mode 100644 tests/src/test_libarchive_uc2.c create mode 100644 tests/test_cli_libarchive.cmake diff --git a/contrib/libarchive/archive_read_support_format_uc2.c b/contrib/libarchive/archive_read_support_format_uc2.c index cb37a04..9ff67ab 100644 --- a/contrib/libarchive/archive_read_support_format_uc2.c +++ b/contrib/libarchive/archive_read_support_format_uc2.c @@ -2,12 +2,16 @@ /* libarchive read handler for UC2 v3 archives. * - * Status: milestones 1-3. + * Status: milestones 1-6. * M1 -- bid() with UC2 magic check. * M2 -- read_header iterates uc2_read_cdir, maps each cdir entry to * libarchive's archive_entry shape (name, size, mode, mtime). * M3 -- read_data uses uc2_extract to decompress an entry, buffers * the result, then yields it via libarchive's pull-style API. 
+ * M4 -- master blocks resolve inside libuc2 during uc2_extract.
+ * M5 -- multi-file archives with full directory paths composed from
+ *       the cdir's directory ids (parent-before-child not assumed).
+ * M6 -- tagged entries (Win95 long names) resolved via uc2_get_tag.
  *
  * Strategy: on the first read_header call we slurp the entire archive
  * into memory through __archive_read_ahead, then drive libuc2 against
@@ -51,6 +55,7 @@ struct uc2_la_state {
 	/* Cached cdir entries. uc2_read_cdir is single-pass; we capture
 	 * everything on the first read_header call. */
 	struct uc2_entry *entries;
+	char **paths; /* composed full path per entry; strdup'd, freed in cleanup */
 	int n_entries;
 	int n_capacity;
 	int next_entry;
@@ -306,6 +311,79 @@ collect_entries(struct archive_read *a, struct uc2_la_state *st)
 	return (ARCHIVE_OK);
 }
 
+/* Append the full path of directory `id` (with a trailing slash) to
+ * buf. Returns the new offset, or -1 on overflow or when the depth
+ * cap is hit. UC2 directory ids are archive-global; root is 0. The
+ * depth cap breaks cycles in damaged directories. */
+static int
+build_dir_path(struct uc2_la_state *st, unsigned id,
+    char *buf, size_t cap, int depth)
+{
+	int i;
+
+	if (id == 0)
+		return (0); /* root: nothing appended, buf is not written */
+	if (depth > 64)
+		return (-1); /* cyclic or pathologically deep: corrupt cdir */
+	for (i = 0; i < st->n_entries; i++) { /* linear scan for the directory owning `id` */
+		struct uc2_entry *d = &st->entries[i];
+		if (d->is_dir && d->id == id) {
+			int off = build_dir_path(st, d->dirid, buf, cap, /* ancestors first */
+			    depth + 1);
+			int n;
+			if (off < 0)
+				return (-1);
+			n = snprintf(buf + off, cap - off, "%s/", d->name);
+			if (n < 0 || (size_t)n >= cap - off) /* output error or truncation */
+				return (-1);
+			return (off + n);
+		}
+	}
+	return (0); /* unknown parent: fall back to root */
+}
+
+/* Compose a full path for every entry: parent directories joined with
+ * '/', directories get a trailing slash. ARCHIVE_OK or ARCHIVE_FATAL. */
+static int
+compose_paths(struct archive_read *a, struct uc2_la_state *st)
+{
+	int i;
+
+	st->paths = (char **)calloc((size_t)st->n_entries, /* zeroed: unset slots stay NULL */
+	    sizeof *st->paths);
+	if (st->paths == NULL && st->n_entries > 0) { /* calloc(0, ...) may return NULL */
+		archive_set_error(&a->archive, ENOMEM,
+		    "UC2: out of memory composing paths");
+		return (ARCHIVE_FATAL);
+	}
+
+	for (i = 0; i < st->n_entries; i++) {
+		struct uc2_entry *e = &st->entries[i];
+		char buf[2048]; /* cap on one composed path, NUL included */
+		int off = build_dir_path(st, e->dirid, buf, sizeof buf, 0);
+		int n;
+		if (off < 0) { /* prefix overflowed buf or the parent chain hit the depth cap */
+			archive_set_error(&a->archive, EINVAL,
+			    "UC2: directory path too long");
+			return (ARCHIVE_FATAL);
+		}
+		n = snprintf(buf + off, sizeof buf - off, "%s%s",
+		    e->name, e->is_dir ? "/" : "");
+		if (n < 0 || (size_t)n >= sizeof buf - off) { /* output error or truncation */
+			archive_set_error(&a->archive, EINVAL,
+			    "UC2: entry path too long");
+			return (ARCHIVE_FATAL);
+		}
+		st->paths[i] = strdup(buf); /* heap copy outlives the stack buffer */
+		if (st->paths[i] == NULL) {
+			archive_set_error(&a->archive, ENOMEM,
+			    "UC2: out of memory composing paths");
+			return (ARCHIVE_FATAL);
+		}
+	}
+	return (ARCHIVE_OK);
+}
+
 static int
 uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
 {
@@ -321,6 +399,9 @@ uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
 
 		r = collect_entries(a, st);
 		if (r != ARCHIVE_OK) return r;
+
+		r = compose_paths(a, st);
+		if (r != ARCHIVE_OK) return r;
 	}
 
 	if (st->next_entry >= st->n_entries)
@@ -332,7 +413,7 @@ uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
 	st->entry_len = 0;
 	st->entry_yielded = 0;
 
-	archive_entry_set_pathname(entry, e->name);
+	archive_entry_set_pathname(entry, st->paths[st->next_entry - 1]);
 	archive_entry_set_size(entry, (la_int64_t)e->size);
 	archive_entry_set_mtime(entry, dos_to_unix_time(e->dos_time), 0);
 
@@ -409,6 +490,12 @@ uc2_la_cleanup(struct archive_read *a)
 		return (ARCHIVE_OK);
 	if (st->handle)
 		uc2_close(st->handle);
+	if (st->paths) {
+		int i;
+		for (i = 0; i < st->n_entries; i++)
+			free(st->paths[i]); /* NULL slots left by a failed compose are fine */
+		free(st->paths);
+	}
free(st->data); free(st->entries); free(st->entry_data); diff --git a/docs/building.rst b/docs/building.rst index 350726b..3b74643 100644 --- a/docs/building.rst +++ b/docs/building.rst @@ -41,6 +41,43 @@ Cross-compile from a Linux host using the DJGPP toolchain: This produces a DOS executable suitable for DOSBox or real hardware. +libarchive Read Plugin +---------------------- + +The optional libarchive read handler (``contrib/libarchive/``) lets any +libarchive consumer — ``bsdtar``, file managers, language bindings — +list and extract ``.uc2`` archives. It uses libarchive's internal +read-format API, so it builds against a libarchive **source tree** +rather than an installed ``-devel`` package. + +Unpack a libarchive release and build a static library (a +dependency-free configuration is enough for the plugin and its test): + +.. code-block:: sh + + curl -LO https://github.com/libarchive/libarchive/releases/download/v3.7.7/libarchive-3.7.7.tar.gz + tar xzf libarchive-3.7.7.tar.gz + cmake -S libarchive-3.7.7 -B larch-build -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF -DENABLE_TEST=OFF + cmake --build larch-build --target archive_static + +Then configure UC2 with the plugin enabled, pointing at the source tree +and the static library: + +.. code-block:: sh + + cmake -B build -DCMAKE_BUILD_TYPE=Release \ + -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON \ + -DLIBARCHIVE_SOURCE_DIR=$PWD/libarchive-3.7.7 \ + -DLIBARCHIVE_LIBRARY=$PWD/larch-build/libarchive/libarchive.a + cmake --build build + +This builds ``libuc2_libarchive.a`` and the ``libarchive_roundtrip`` +test, which creates archives at multiple compression levels and reads +them back through libarchive's public API, verifying every byte. The +plugin handles multi-file archives with directory paths, master-block +deduplication, and Win95 long names. 
+
 Build Options
 -------------
 
@@ -54,6 +91,9 @@ Build Options
    * - ``UC2_BUILD_TESTS``
      - ``ON``
      - Build test programs
+   * - ``UC2_BUILD_LIBARCHIVE_PLUGIN``
+     - ``OFF``
+     - Build the libarchive read handler (needs ``LIBARCHIVE_SOURCE_DIR``)
    * - ``CMAKE_BUILD_TYPE``
      - (none)
      - ``Release``, ``Debug``, ``RelWithDebInfo``
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 66c124a..d3ef8ff 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -155,6 +155,26 @@ if(Python3_Interpreter_FOUND)
     )
 endif()
 
+# libarchive plugin round-trip. Needs -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON,
+# -DLIBARCHIVE_SOURCE_DIR=<source tree>, and -DLIBARCHIVE_LIBRARY=<path to
+# libarchive.a> (a deps-disabled static build is enough; see docs).
+if(TARGET uc2_libarchive AND DEFINED LIBARCHIVE_LIBRARY
+   AND DEFINED LIBARCHIVE_SOURCE_DIR)
+    add_executable(test_libarchive_uc2 src/test_libarchive_uc2.c)
+    target_include_directories(test_libarchive_uc2 PRIVATE
+        "${LIBARCHIVE_SOURCE_DIR}/libarchive")
+    target_link_libraries(test_libarchive_uc2 PRIVATE
+        uc2_libarchive "${LIBARCHIVE_LIBRARY}" uc2)
+    target_compile_features(test_libarchive_uc2 PRIVATE c_std_99)
+    add_test(NAME libarchive_roundtrip
+        COMMAND ${CMAKE_COMMAND}
+            -DUC2_CLI=$  # NOTE(review): generator expression lost in extraction; restore the TARGET_FILE expression of the uc2 CLI target
+            -DLA_TEST=$<TARGET_FILE:test_libarchive_uc2>
+            -DTEST_DIR=${CMAKE_CURRENT_BINARY_DIR}/libarchive_test
+            -P ${CMAKE_CURRENT_SOURCE_DIR}/test_cli_libarchive.cmake
+    )
+endif()
+
 # Cross-tool round-trip: UC2 v3 <-> original uc2pro.exe via DOSBox-X
 add_test(NAME roundtrip_dosbox
     COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/scripts/roundtrip_dosbox.sh
diff --git a/tests/src/test_libarchive_uc2.c b/tests/src/test_libarchive_uc2.c
new file mode 100644
index 0000000..0b01fb5
--- /dev/null
+++ b/tests/src/test_libarchive_uc2.c
@@ -0,0 +1,134 @@
+/* Round-trip verification of the libarchive UC2 read plugin.
+ *
+ * Usage: test_libarchive_uc2 <archive.uc2> <original-dir>
+ *
+ * Opens the archive through libarchive's public API with the UC2
+ * format registered, walks every entry, extracts the data, and
+ * compares it byte-for-byte against <original-dir>/<entry-path>.
+ * Exit 0 only if every file entry matches.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <archive.h>
+#include <archive_entry.h>
+
+extern int archive_read_support_format_uc2(struct archive *); /* UC2 format registration, defined outside this file */
+
+static unsigned char *slurp(const char *path, size_t *out_len) /* whole file -> malloc'd buffer; exits on failure */
+{
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "FAIL: cannot open original %s\n", path);
+        exit(1);
+    }
+    fseek(f, 0, SEEK_END);
+    long n = ftell(f); /* file size; negative means ftell failed */
+    fseek(f, 0, SEEK_SET);
+    if (n < 0) {
+        fprintf(stderr, "FAIL: ftell %s\n", path);
+        exit(1);
+    }
+    unsigned char *buf = malloc(n > 0 ? (size_t)n : 1); /* never request 0 bytes */
+    if (!buf) {
+        fprintf(stderr, "FAIL: malloc\n");
+        exit(1);
+    }
+    *out_len = fread(buf, 1, (size_t)n, f); /* a short read surfaces later as a length mismatch */
+    fclose(f);
+    return buf; /* caller frees */
+}
+
+int main(int argc, char **argv) /* exit 0: all files match; 1: mismatch or error; 2: usage or setup */
+{
+    if (argc != 3) {
+        fprintf(stderr, "usage: %s <archive.uc2> <original-dir>\n",
+                argv[0]);
+        return 2;
+    }
+
+    struct archive *a = archive_read_new();
+    if (!a) return 2;
+    if (archive_read_support_format_uc2(a) != ARCHIVE_OK) {
+        fprintf(stderr, "FAIL: cannot register UC2 format: %s\n",
+                archive_error_string(a));
+        return 1;
+    }
+    if (archive_read_open_filename(a, argv[1], 65536) != ARCHIVE_OK) {
+        fprintf(stderr, "FAIL: open %s: %s\n", argv[1],
+                archive_error_string(a));
+        return 1;
+    }
+
+    int nfiles = 0, ndirs = 0, bad = 0;
+    struct archive_entry *e;
+    int r;
+    while ((r = archive_read_next_header(a, &e)) == ARCHIVE_OK) {
+        const char *name = archive_entry_pathname(e);
+        if (archive_entry_filetype(e) == AE_IFDIR) { /* directories carry no data to compare */
+            ndirs++;
+            continue;
+        }
+        la_int64_t want = archive_entry_size(e);
+
+        size_t cap = want > 0 ? (size_t)want + 1 : 1; /* one spare byte exposes entries longer than the header says */
+        unsigned char *got = malloc(cap);
+        if (!got) {
+            fprintf(stderr, "FAIL: malloc\n");
+            return 1;
+        }
+        size_t got_len = 0;
+        for (;;) {
+            la_ssize_t n = archive_read_data(a, got + got_len,
+                                             cap - got_len);
+            if (n < 0) {
+                fprintf(stderr, "FAIL: read_data %s: %s\n",
+                        name, archive_error_string(a));
+                return 1;
+            }
+            if (n == 0) /* end of this entry's data */
+                break;
+            got_len += (size_t)n;
+            if (got_len == cap) /* spare byte filled: entry is over-long */
+                break;
+        }
+
+        if ((la_int64_t)got_len != want) { /* too short, or over-long (size shown is then a lower bound) */
+            fprintf(stderr, "BAD: %s: size %zu, header said %lld\n",
+                    name, got_len, (long long)want);
+            bad++;
+            free(got);
+            nfiles++;
+            continue;
+        }
+
+        char opath[4096];
+        snprintf(opath, sizeof opath, "%s/%s", argv[2], name); /* reference copy under the original dir */
+        size_t ref_len;
+        unsigned char *ref = slurp(opath, &ref_len);
+        if (ref_len != got_len || memcmp(ref, got, got_len) != 0) {
+            fprintf(stderr, "BAD: %s: content mismatch "
+                    "(%zu vs %zu bytes)\n", name, got_len, ref_len);
+            bad++;
+        }
+        free(ref);
+        free(got);
+        nfiles++;
+    }
+    if (r != ARCHIVE_EOF) { /* loop ended on an error, not a clean end of archive */
+        fprintf(stderr, "FAIL: next_header: %s\n",
+                archive_error_string(a));
+        return 1;
+    }
+    archive_read_free(a);
+
+    printf("libarchive round-trip: %d files (%d dirs), %d bad\n",
+           nfiles, ndirs, bad);
+    if (nfiles == 0) { /* an archive with no file entries proves nothing */
+        fprintf(stderr, "FAIL: no file entries found\n");
+        return 1;
+    }
+    return bad ? 1 : 0;
+}
diff --git a/tests/test_cli_libarchive.cmake b/tests/test_cli_libarchive.cmake
new file mode 100644
index 0000000..7858c20
--- /dev/null
+++ b/tests/test_cli_libarchive.cmake
@@ -0,0 +1,39 @@
+# Round-trip test for the libarchive UC2 read plugin: the uc2 CLI
+# creates archives (Huffman and rANS), then test_libarchive_uc2 reads
+# them back through libarchive's public API and verifies every byte.
+
+file(REMOVE_RECURSE "${TEST_DIR}")  # start from an empty scratch directory
+file(MAKE_DIRECTORY "${TEST_DIR}/input/subdir")
+
+file(WRITE "${TEST_DIR}/input/hello.txt" "Hello from libarchive!\n")
+string(REPEAT "The quick brown fox jumps over the lazy dog.\n" 200 REPEATED)
+file(WRITE "${TEST_DIR}/input/repeated.txt" "${REPEATED}")
+string(RANDOM LENGTH 8192 RANDOM_SEED 99 BLOB)  # fixed seed
+file(WRITE "${TEST_DIR}/input/blob.dat" "${BLOB}")
+file(WRITE "${TEST_DIR}/input/subdir/nested_long_file_name.txt"
+    "nested content with a long name\n")
+file(WRITE "${TEST_DIR}/input/empty.dat" "")  # zero-length entry
+
+foreach(LEVEL 4 6)  # one archive per compression level under test
+    set(ARCHIVE "${TEST_DIR}/la${LEVEL}.uc2")
+    execute_process(
+        COMMAND "${UC2_CLI}" -q -w -L ${LEVEL} "${ARCHIVE}"
+            hello.txt repeated.txt blob.dat empty.dat subdir
+        WORKING_DIRECTORY "${TEST_DIR}/input"  # member names above are relative to input/
+        RESULT_VARIABLE RC
+    )
+    if(NOT RC EQUAL 0)
+        message(FATAL_ERROR "uc2 -w -L ${LEVEL} failed: ${RC}")
+    endif()
+
+    execute_process(
+        COMMAND "${LA_TEST}" "${ARCHIVE}" "${TEST_DIR}/input"
+        RESULT_VARIABLE RC
+        OUTPUT_VARIABLE OUT
+        ERROR_VARIABLE OUT  # same variable as OUTPUT_VARIABLE: stdout and stderr are merged
+    )
+    message(STATUS "L${LEVEL}: ${OUT}")  # printed before the check so failures show the verifier's output
+    if(NOT RC EQUAL 0)
+        message(FATAL_ERROR "libarchive round-trip failed at -L ${LEVEL}")
+    endif()
+endforeach()