libarchive plugin: directory paths, round-trip test (M5-M6)

The read handler now composes full directory paths from the cdir's
directory ids rather than emitting bare leaf names: build_dir_path
walks the parent chain (root dirid 0, depth-capped against cyclic
cdirs), so multi-file archives with subdirectories list correctly.
Master-block resolution (M4) and tagged long names (M6) already work
through libuc2's extract and tag paths; this adds a libarchive
round-trip test that creates archives at Huffman and rANS levels and
verifies every byte back through libarchive's public API. Documents
the plugin build recipe (libarchive source tree + static lib).

Verified against libarchive 3.7.7; round-trip clean under valgrind.
This commit is contained in:
Eremey Valetov
2026-06-13 02:10:56 -04:00
parent b86309542d
commit d26791bfbd
5 changed files with 322 additions and 2 deletions

View File

@@ -2,12 +2,16 @@
/* libarchive read handler for UC2 v3 archives.
*
* Status: milestones 1-3.
* Status: milestones 1-6.
* M1 -- bid() with UC2 magic check.
* M2 -- read_header iterates uc2_read_cdir, maps each cdir entry to
* libarchive's archive_entry shape (name, size, mode, mtime).
* M3 -- read_data uses uc2_extract to decompress an entry, buffers
* the result, then yields it via libarchive's pull-style API.
* M4 -- master blocks resolve inside libuc2 during uc2_extract.
* M5 -- multi-file archives with full directory paths composed from
* the cdir's directory ids (parent-before-child not assumed).
* M6 -- tagged entries (Win95 long names) resolved via uc2_get_tag.
*
* Strategy: on the first read_header call we slurp the entire archive
* into memory through __archive_read_ahead, then drive libuc2 against
@@ -51,6 +55,7 @@ struct uc2_la_state {
/* Cached cdir entries. uc2_read_cdir is single-pass; we capture
* everything on the first read_header call. */
struct uc2_entry *entries;
char **paths; /* composed full path per entry */
int n_entries;
int n_capacity;
int next_entry;
@@ -306,6 +311,79 @@ collect_entries(struct archive_read *a, struct uc2_la_state *st)
return (ARCHIVE_OK);
}
/* Append the full path of directory `id` (with a trailing slash) to
* buf. Returns the new offset, or -1 on overflow. UC2 directory ids
* are archive-global; root is 0. The depth cap breaks cycles in
* damaged directories. */
static int
build_dir_path(struct uc2_la_state *st, unsigned id,
char *buf, size_t cap, int depth)
{
int i;
if (id == 0)
return (0);
if (depth > 64)
return (-1); /* cyclic or pathologically deep: corrupt cdir */
for (i = 0; i < st->n_entries; i++) {
struct uc2_entry *d = &st->entries[i];
if (d->is_dir && d->id == id) {
int off = build_dir_path(st, d->dirid, buf, cap,
depth + 1);
int n;
if (off < 0)
return (-1);
n = snprintf(buf + off, cap - off, "%s/", d->name);
if (n < 0 || (size_t)n >= cap - off)
return (-1);
return (off + n);
}
}
return (0); /* unknown parent: fall back to root */
}
/* Compose a full path for every entry: parent directories joined with
* '/', directories themselves carrying a trailing slash. */
static int
compose_paths(struct archive_read *a, struct uc2_la_state *st)
{
int i;
st->paths = (char **)calloc((size_t)st->n_entries,
sizeof *st->paths);
if (st->paths == NULL && st->n_entries > 0) {
archive_set_error(&a->archive, ENOMEM,
"UC2: out of memory composing paths");
return (ARCHIVE_FATAL);
}
for (i = 0; i < st->n_entries; i++) {
struct uc2_entry *e = &st->entries[i];
char buf[2048];
int off = build_dir_path(st, e->dirid, buf, sizeof buf, 0);
int n;
if (off < 0) {
archive_set_error(&a->archive, EINVAL,
"UC2: directory path too long");
return (ARCHIVE_FATAL);
}
n = snprintf(buf + off, sizeof buf - off, "%s%s",
e->name, e->is_dir ? "/" : "");
if (n < 0 || (size_t)n >= sizeof buf - off) {
archive_set_error(&a->archive, EINVAL,
"UC2: entry path too long");
return (ARCHIVE_FATAL);
}
st->paths[i] = strdup(buf);
if (st->paths[i] == NULL) {
archive_set_error(&a->archive, ENOMEM,
"UC2: out of memory composing paths");
return (ARCHIVE_FATAL);
}
}
return (ARCHIVE_OK);
}
static int
uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
{
@@ -321,6 +399,9 @@ uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
r = collect_entries(a, st);
if (r != ARCHIVE_OK) return r;
r = compose_paths(a, st);
if (r != ARCHIVE_OK) return r;
}
if (st->next_entry >= st->n_entries)
@@ -332,7 +413,7 @@ uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
st->entry_len = 0;
st->entry_yielded = 0;
archive_entry_set_pathname(entry, e->name);
archive_entry_set_pathname(entry, st->paths[st->next_entry - 1]);
archive_entry_set_size(entry, (la_int64_t)e->size);
archive_entry_set_mtime(entry, dos_to_unix_time(e->dos_time), 0);
@@ -409,6 +490,12 @@ uc2_la_cleanup(struct archive_read *a)
return (ARCHIVE_OK);
if (st->handle)
uc2_close(st->handle);
if (st->paths) {
int i;
for (i = 0; i < st->n_entries; i++)
free(st->paths[i]);
free(st->paths);
}
free(st->data);
free(st->entries);
free(st->entry_data);

View File

@@ -41,6 +41,43 @@ Cross-compile from a Linux host using the DJGPP toolchain:
This produces a DOS executable suitable for DOSBox or real hardware.
libarchive Read Plugin
----------------------
The optional libarchive read handler (``contrib/libarchive/``) lets any
libarchive consumer — ``bsdtar``, file managers, language bindings —
list and extract ``.uc2`` archives. It uses libarchive's internal
read-format API, so it builds against a libarchive **source tree**
rather than an installed ``-devel`` package.
Unpack a libarchive release and build a static library (a
dependency-free configuration is enough for the plugin and its test):
.. code-block:: sh
curl -LO https://github.com/libarchive/libarchive/releases/download/v3.7.7/libarchive-3.7.7.tar.gz
tar xzf libarchive-3.7.7.tar.gz
cmake -S libarchive-3.7.7 -B larch-build -DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF -DENABLE_TEST=OFF
cmake --build larch-build --target archive_static
Then configure UC2 with the plugin enabled, pointing at the source tree
and the static library:
.. code-block:: sh
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DUC2_BUILD_LIBARCHIVE_PLUGIN=ON \
-DLIBARCHIVE_SOURCE_DIR=$PWD/libarchive-3.7.7 \
-DLIBARCHIVE_LIBRARY=$PWD/larch-build/libarchive/libarchive.a
cmake --build build
This builds ``libuc2_libarchive.a`` and the ``libarchive_roundtrip``
test, which creates archives at multiple compression levels and reads
them back through libarchive's public API, verifying every byte. The
plugin handles multi-file archives with directory paths, master-block
deduplication, and Win95 long names.
Build Options
-------------
@@ -54,6 +91,9 @@ Build Options
* - ``UC2_BUILD_TESTS``
- ``ON``
- Build test programs
* - ``UC2_BUILD_LIBARCHIVE_PLUGIN``
- ``OFF``
- Build the libarchive read handler (needs ``LIBARCHIVE_SOURCE_DIR``; its round-trip test also needs ``LIBARCHIVE_LIBRARY``)
* - ``CMAKE_BUILD_TYPE``
- (none)
- ``Release``, ``Debug``, ``RelWithDebInfo``

View File

@@ -155,6 +155,26 @@ if(Python3_Interpreter_FOUND)
)
endif()
# libarchive plugin round-trip. Needs -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON,
# -DLIBARCHIVE_SOURCE_DIR=<source tree>, and -DLIBARCHIVE_LIBRARY=<built
# libarchive.a> (a deps-disabled static build is enough; see docs).
# The test is registered only when the plugin target exists and both
# cache variables were supplied; otherwise it is silently left out.
if(TARGET uc2_libarchive AND DEFINED LIBARCHIVE_LIBRARY
AND DEFINED LIBARCHIVE_SOURCE_DIR)
add_executable(test_libarchive_uc2 src/test_libarchive_uc2.c)
# archive.h / archive_entry.h are taken from the source tree's
# libarchive/ subdirectory rather than from an installed package.
target_include_directories(test_libarchive_uc2 PRIVATE
"${LIBARCHIVE_SOURCE_DIR}/libarchive")
# Links the plugin, the prebuilt libarchive library, and uc2.
target_link_libraries(test_libarchive_uc2 PRIVATE
uc2_libarchive "${LIBARCHIVE_LIBRARY}" uc2)
target_compile_features(test_libarchive_uc2 PRIVATE c_std_99)
# The test itself is a CMake script run in -P mode; it receives the
# CLI binary, the verifier binary, and a scratch directory via -D.
add_test(NAME libarchive_roundtrip
COMMAND ${CMAKE_COMMAND}
-DUC2_CLI=$<TARGET_FILE:uc2-cli>
-DLA_TEST=$<TARGET_FILE:test_libarchive_uc2>
-DTEST_DIR=${CMAKE_CURRENT_BINARY_DIR}/libarchive_test
-P ${CMAKE_CURRENT_SOURCE_DIR}/test_cli_libarchive.cmake
)
endif()
# Cross-tool round-trip: UC2 v3 <-> original uc2pro.exe via DOSBox-X
add_test(NAME roundtrip_dosbox
COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/scripts/roundtrip_dosbox.sh

View File

@@ -0,0 +1,134 @@
/* Round-trip verification of the libarchive UC2 read plugin.
*
* Usage: test_libarchive_uc2 <archive.uc2> <originals-dir>
*
* Opens the archive through libarchive's public API with the UC2
* format registered, walks every entry, extracts the data, and
* compares it byte-for-byte against <originals-dir>/<entry-name>.
* Exit 0 only if every file entry matches.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <archive.h>
#include <archive_entry.h>
extern int archive_read_support_format_uc2(struct archive *);
/* Read the whole file at `path` into a freshly malloc'd buffer.
 *
 * On success returns the buffer (never NULL; an empty file still gets
 * a 1-byte allocation) and stores the number of bytes read in
 * *out_len.  The caller owns the buffer and must free() it.
 *
 * This is a test helper: any failure (open, seek, tell, allocation,
 * read error) prints a "FAIL:" line to stderr and exits with status 1
 * instead of returning.
 */
static unsigned char *slurp(const char *path, size_t *out_len)
{
    FILE *f = fopen(path, "rb");
    if (!f) {
        fprintf(stderr, "FAIL: cannot open original %s\n", path);
        exit(1);
    }

    /* Size the file by seeking to its end; a failed seek would make
     * the ftell() result meaningless, so check it. */
    if (fseek(f, 0, SEEK_END) != 0) {
        fprintf(stderr, "FAIL: fseek %s\n", path);
        exit(1);
    }
    long n = ftell(f);
    if (n < 0) {
        fprintf(stderr, "FAIL: ftell %s\n", path);
        exit(1);
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr, "FAIL: fseek %s\n", path);
        exit(1);
    }

    /* malloc(0) may legally return NULL, so always ask for >= 1. */
    unsigned char *buf = malloc(n > 0 ? (size_t)n : 1);
    if (!buf) {
        fprintf(stderr, "FAIL: malloc\n");
        exit(1);
    }

    size_t got = fread(buf, 1, (size_t)n, f);
    /* A short count alone is reported through *out_len; an actual I/O
     * error must not masquerade as a shorter reference file. */
    if (ferror(f)) {
        fprintf(stderr, "FAIL: read %s\n", path);
        exit(1);
    }
    fclose(f);

    *out_len = got;
    return buf;
}
/* Entry point: test_libarchive_uc2 <archive.uc2> <originals-dir>
 *
 * Opens argv[1] through libarchive with the UC2 format registered and,
 * for every non-directory entry, reads the entry data and compares it
 * byte-for-byte with the file <originals-dir>/<entry pathname>.
 *
 * Exit status:
 *   0  every file entry matched and at least one file entry was seen
 *   1  a libarchive call failed, an entry mismatched, or the archive
 *      contained no file entries
 *   2  bad usage, or the archive object could not be allocated
 *
 * An entry counts as bad when it yields fewer OR more bytes than its
 * header size, or when its content differs from the original.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <archive.uc2> <originals-dir>\n",
                argv[0]);
        return 2;
    }

    struct archive *a = archive_read_new();
    if (!a)
        return 2;
    if (archive_read_support_format_uc2(a) != ARCHIVE_OK) {
        fprintf(stderr, "FAIL: cannot register UC2 format: %s\n",
                archive_error_string(a));
        archive_read_free(a);
        return 1;
    }
    if (archive_read_open_filename(a, argv[1], 65536) != ARCHIVE_OK) {
        fprintf(stderr, "FAIL: open %s: %s\n", argv[1],
                archive_error_string(a));
        archive_read_free(a);
        return 1;
    }

    int nfiles = 0, ndirs = 0, bad = 0;
    struct archive_entry *e;
    int r;
    while ((r = archive_read_next_header(a, &e)) == ARCHIVE_OK) {
        const char *name = archive_entry_pathname(e);

        /* Directories carry no data to compare; just count them. */
        if (archive_entry_filetype(e) == AE_IFDIR) {
            ndirs++;
            continue;
        }
        if (name == NULL) {
            /* Passing NULL to %s or building a path from it is UB. */
            fprintf(stderr, "FAIL: file entry without a pathname\n");
            archive_read_free(a);
            return 1;
        }

        la_int64_t want = archive_entry_size(e);
        /* Bytes we expect to read; a non-positive size expects none. */
        size_t target = want > 0 ? (size_t)want : 0;
        unsigned char *got = malloc(target > 0 ? target : 1);
        if (!got) {
            fprintf(stderr, "FAIL: malloc\n");
            archive_read_free(a);
            return 1;
        }

        size_t got_len = 0;
        while (got_len < target) {
            la_ssize_t n = archive_read_data(a, got + got_len,
                                             target - got_len);
            if (n < 0) {
                fprintf(stderr, "FAIL: read_data %s: %s\n",
                        name, archive_error_string(a));
                free(got);
                archive_read_free(a);
                return 1;
            }
            if (n == 0)
                break;
            got_len += (size_t)n;
        }

        /* Having filled the buffer does not prove the entry ended
         * there: probe for one more byte so an entry that yields more
         * data than its header promised is caught too. */
        int overrun = 0;
        if (got_len == target) {
            unsigned char extra;
            la_ssize_t n = archive_read_data(a, &extra, 1);
            if (n < 0) {
                fprintf(stderr, "FAIL: read_data %s: %s\n",
                        name, archive_error_string(a));
                free(got);
                archive_read_free(a);
                return 1;
            }
            overrun = n > 0;
        }

        if (overrun) {
            fprintf(stderr, "BAD: %s: data continues past header "
                    "size %lld\n", name, (long long)want);
            bad++;
            free(got);
            nfiles++;
            continue;
        }
        if ((la_int64_t)got_len != want) {
            fprintf(stderr, "BAD: %s: size %zu, header said %lld\n",
                    name, got_len, (long long)want);
            bad++;
            free(got);
            nfiles++;
            continue;
        }

        char opath[4096];
        int plen = snprintf(opath, sizeof opath, "%s/%s", argv[2], name);
        if (plen < 0 || (size_t)plen >= sizeof opath) {
            /* A truncated path would compare against the wrong file. */
            fprintf(stderr, "FAIL: original path too long for %s\n",
                    name);
            free(got);
            archive_read_free(a);
            return 1;
        }

        size_t ref_len;
        unsigned char *ref = slurp(opath, &ref_len);
        if (ref_len != got_len || memcmp(ref, got, got_len) != 0) {
            fprintf(stderr, "BAD: %s: content mismatch "
                    "(%zu vs %zu bytes)\n", name, got_len, ref_len);
            bad++;
        }
        free(ref);
        free(got);
        nfiles++;
    }
    if (r != ARCHIVE_EOF) {
        /* Fetch the message before the handle is released. */
        fprintf(stderr, "FAIL: next_header: %s\n",
                archive_error_string(a));
        archive_read_free(a);
        return 1;
    }
    archive_read_free(a);

    printf("libarchive round-trip: %d files (%d dirs), %d bad\n",
           nfiles, ndirs, bad);
    if (nfiles == 0) {
        /* An archive that lists nothing must not count as a pass. */
        fprintf(stderr, "FAIL: no file entries found\n");
        return 1;
    }
    return bad ? 1 : 0;
}

View File

@@ -0,0 +1,39 @@
# Round-trip test for the libarchive UC2 read plugin: the uc2 CLI
# creates archives (Huffman and rANS), then test_libarchive_uc2 reads
# them back through libarchive's public API and verifies every byte.
#
# Run in script mode (cmake -P) with these variables defined via -D:
#   UC2_CLI   - path to the uc2 command-line binary
#   LA_TEST   - path to the test_libarchive_uc2 verifier binary
#   TEST_DIR  - scratch directory; wiped and recreated on every run

# Start from a clean scratch tree so stale archives cannot mask failures.
file(REMOVE_RECURSE "${TEST_DIR}")
file(MAKE_DIRECTORY "${TEST_DIR}/input/subdir")

# Fixture set: a short text file, highly repetitive text, a block of
# pseudo-random characters (fixed seed), a file inside a subdirectory
# with a long name, and an empty file.
file(WRITE "${TEST_DIR}/input/hello.txt" "Hello from libarchive!\n")
string(REPEAT "The quick brown fox jumps over the lazy dog.\n" 200 REPEATED)
file(WRITE "${TEST_DIR}/input/repeated.txt" "${REPEATED}")
string(RANDOM LENGTH 8192 RANDOM_SEED 99 BLOB)
file(WRITE "${TEST_DIR}/input/blob.dat" "${BLOB}")
file(WRITE "${TEST_DIR}/input/subdir/nested_long_file_name.txt"
"nested content with a long name\n")
file(WRITE "${TEST_DIR}/input/empty.dat" "")

# One archive per compression level under test.
foreach(LEVEL 4 6)
set(ARCHIVE "${TEST_DIR}/la${LEVEL}.uc2")
# Create the archive from inside input/ so the names passed to the CLI
# are relative to the directory the verifier later reads originals from.
execute_process(
COMMAND "${UC2_CLI}" -q -w -L ${LEVEL} "${ARCHIVE}"
hello.txt repeated.txt blob.dat empty.dat subdir
WORKING_DIRECTORY "${TEST_DIR}/input"
RESULT_VARIABLE RC
)
if(NOT RC EQUAL 0)
message(FATAL_ERROR "uc2 -w -L ${LEVEL} failed: ${RC}")
endif()
# Read the archive back through libarchive and compare against input/.
# stdout and stderr are captured into the same variable for the log.
execute_process(
COMMAND "${LA_TEST}" "${ARCHIVE}" "${TEST_DIR}/input"
RESULT_VARIABLE RC
OUTPUT_VARIABLE OUT
ERROR_VARIABLE OUT
)
message(STATUS "L${LEVEL}: ${OUT}")
if(NOT RC EQUAL 0)
message(FATAL_ERROR "libarchive round-trip failed at -L ${LEVEL}")
endif()
endforeach()