Compare commits
74 Commits
v3.0.0-alp
...
v3.0.0-alp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc767a1739 | ||
|
|
ad923d7ea0 | ||
|
|
62a90af101 | ||
|
|
43cf875dfe | ||
|
|
5e0f3852c6 | ||
|
|
13e29ee211 | ||
|
|
247de54352 | ||
|
|
09cdc80986 | ||
|
|
c394106c56 | ||
|
|
d26791bfbd | ||
|
|
b86309542d | ||
|
|
217bf9e53f | ||
|
|
ac01b32273 | ||
|
|
efd41dceb1 | ||
|
|
84672c00b6 | ||
|
|
7825eb47b2 | ||
|
|
bd0d1911b1 | ||
|
|
1a7b760848 | ||
|
|
c4db7cc58f | ||
|
|
779c8b1a28 | ||
|
|
446158e855 | ||
|
|
4a51918b83 | ||
|
|
6d8087fd6f | ||
|
|
79e0505fc3 | ||
|
|
87c5cf3b48 | ||
|
|
345aabd423 | ||
|
|
994c584918 | ||
|
|
b697baef43 | ||
|
|
844c1ab092 | ||
|
|
3dcfb3c4c4 | ||
|
|
5c01fec996 | ||
|
|
dae8a503e4 | ||
|
|
97e05ad81a | ||
|
|
157a517006 | ||
|
|
162cf462b6 | ||
|
|
d65c9ba9e2 | ||
|
|
d121c2083f | ||
|
|
b93f1b2a8f | ||
|
|
33773e6220 | ||
|
|
38c0898bc2 | ||
|
|
6d59bc27db | ||
|
|
e8f0ba5628 | ||
|
|
db94be6043 | ||
|
|
6a71c8ec95 | ||
|
|
7b1833a94c | ||
|
|
5107b659bc | ||
|
|
72669a01bb | ||
|
|
4c5661eb33 | ||
|
|
92e1b85cea | ||
|
|
b042b4b48b | ||
|
|
75a5ea541e | ||
|
|
6e62a7aa28 | ||
|
|
7c832ac7dd | ||
|
|
eddecfcfc2 | ||
|
|
f1eb4b6452 | ||
|
|
09ba31da1c | ||
|
|
c731bd75c2 | ||
|
|
8a7326d668 | ||
|
|
382f4ae6ce | ||
|
|
c736b19bae | ||
|
|
be7085c4d3 | ||
|
|
ab2d37286c | ||
|
|
de51cfea7c | ||
|
|
8e70d4cab9 | ||
|
|
a30c8cf694 | ||
|
|
7691dcc4fa | ||
|
|
9525a81e11 | ||
|
|
ff06506bbc | ||
|
|
40af7e877e | ||
|
|
145c948804 | ||
|
|
91c63a7207 | ||
|
|
e837d98c81 | ||
|
|
59231fc8f9 | ||
|
|
42aabbfb46 |
4
.gitattributes
vendored
4
.gitattributes
vendored
@@ -4,3 +4,7 @@ original/UC2_source/**/*.exe binary
|
||||
original/UC2_source/**/*.DAT binary
|
||||
original/UC2_source/**/*.LIB binary
|
||||
original/UC2_source/**/*.PRJ binary
|
||||
|
||||
# Test corpus and archives must be byte-exact
|
||||
tests/corpus/** binary
|
||||
tests/archives/** binary
|
||||
|
||||
108
.github/workflows/build.yml
vendored
Normal file
108
.github/workflows/build.yml
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
name: Build
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- { os: ubuntu-latest, name: Linux }
|
||||
- { os: macos-latest, name: macOS }
|
||||
- { os: windows-latest, name: Windows (MSVC) }
|
||||
runs-on: ${{ matrix.os }}
|
||||
name: ${{ matrix.name }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Lint -- forbid assert(side-effect)
|
||||
if: runner.os == 'Linux'
|
||||
run: python3 tests/scripts/check_assert_side_effects.py
|
||||
- name: Configure
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
- name: Build
|
||||
run: cmake --build build --config Release
|
||||
- name: Smoke test (Unix)
|
||||
if: runner.os != 'Windows'
|
||||
run: ./build/cli/uc2 -h
|
||||
- name: Smoke test (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
run: .\build\cli\Release\uc2.exe -h
|
||||
- name: Test
|
||||
run: ctest --test-dir build --output-on-failure -C Release
|
||||
|
||||
libarchive:
|
||||
runs-on: ubuntu-latest
|
||||
name: libarchive plugin
|
||||
env:
|
||||
LIBARCHIVE_VERSION: 3.7.7
|
||||
LIBARCHIVE_SHA256: 4cc540a3e9a1eebdefa1045d2e4184831100667e6d7d5b315bb1cbc951f8ddff
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Fetch libarchive source
|
||||
run: |
|
||||
curl -fsSLO "https://github.com/libarchive/libarchive/releases/download/v${LIBARCHIVE_VERSION}/libarchive-${LIBARCHIVE_VERSION}.tar.gz"
|
||||
echo "${LIBARCHIVE_SHA256} libarchive-${LIBARCHIVE_VERSION}.tar.gz" | sha256sum -c -
|
||||
tar xzf "libarchive-${LIBARCHIVE_VERSION}.tar.gz"
|
||||
- name: Build libarchive static (dependency-free)
|
||||
run: |
|
||||
cmake -S "libarchive-${LIBARCHIVE_VERSION}" -B larch-build \
|
||||
-DCMAKE_BUILD_TYPE=Release -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
|
||||
-DBUILD_SHARED_LIBS=OFF -DENABLE_TEST=OFF -DENABLE_TAR=OFF \
|
||||
-DENABLE_CPIO=OFF -DENABLE_CAT=OFF -DENABLE_UNZIP=OFF \
|
||||
-DENABLE_WERROR=OFF -DENABLE_ZLIB=OFF -DENABLE_BZip2=OFF \
|
||||
-DENABLE_LZMA=OFF -DENABLE_ZSTD=OFF -DENABLE_LZ4=OFF \
|
||||
-DENABLE_LIBXML2=OFF -DENABLE_EXPAT=OFF -DENABLE_OPENSSL=OFF \
|
||||
-DENABLE_LIBB2=OFF -DENABLE_ICONV=OFF -DENABLE_ACL=OFF \
|
||||
-DENABLE_XATTR=OFF -DENABLE_CNG=OFF -DENABLE_MBEDTLS=OFF \
|
||||
-DENABLE_NETTLE=OFF -DENABLE_PCREPOSIX=OFF -DENABLE_PCRE2POSIX=OFF
|
||||
cmake --build larch-build --target archive_static -j
|
||||
- name: Configure UC2 with libarchive plugin
|
||||
run: |
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DUC2_BUILD_LIBARCHIVE_PLUGIN=ON \
|
||||
-DLIBARCHIVE_SOURCE_DIR="$PWD/libarchive-${LIBARCHIVE_VERSION}" \
|
||||
-DLIBARCHIVE_LIBRARY="$PWD/larch-build/libarchive/libarchive.a"
|
||||
- name: Build
|
||||
run: cmake --build build -j
|
||||
- name: Round-trip test
|
||||
run: ctest --test-dir build --output-on-failure -R libarchive_roundtrip
|
||||
|
||||
djgpp:
|
||||
runs-on: ubuntu-latest
|
||||
name: DOS (DJGPP)
|
||||
env:
|
||||
DJGPP_URL: https://github.com/andrewwutw/build-djgpp/releases/download/v3.4/djgpp-linux64-gcc1220.tar.bz2
|
||||
DJGPP_SHA256: 8464f17017d6ab1b2bb2df4ed82357b5bf692e6e2b7fee37e315638f3d505f00
|
||||
# Keep host include dirs out of the cross-compiler's search path in
|
||||
# every step (the toolchain file also forces -nostdinc, but a stray
|
||||
# CPATH on the runner would otherwise leak glibc headers).
|
||||
CPATH: ''
|
||||
CPLUS_INCLUDE_PATH: ''
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install DJGPP cross-toolchain
|
||||
run: |
|
||||
# The prebuilt DJGPP binutils (ar, ld) are linked against the
|
||||
# flex runtime; install it so they load on a clean runner.
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libfl2
|
||||
curl -fsSL -o djgpp.tar.bz2 "$DJGPP_URL"
|
||||
echo "${DJGPP_SHA256} djgpp.tar.bz2" | sha256sum -c -
|
||||
sudo tar xjf djgpp.tar.bz2 -C /opt # -> /opt/djgpp
|
||||
- name: Configure (DJGPP toolchain)
|
||||
run: |
|
||||
cmake -B build-dos \
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/djgpp.cmake \
|
||||
-DDJGPP_ROOT=/opt/djgpp -DCMAKE_BUILD_TYPE=Release
|
||||
- name: Build
|
||||
run: cmake --build build-dos -j
|
||||
- name: Verify DOS executable
|
||||
run: |
|
||||
file build-dos/cli/uc2.exe
|
||||
file build-dos/cli/uc2.exe | grep -q "DJGPP go32 DOS extender" \
|
||||
|| { echo "uc2.exe is not a DJGPP DOS executable"; exit 1; }
|
||||
45
.github/workflows/docs.yml
vendored
Normal file
45
.github/workflows/docs.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Docs
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: pages
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Install Sphinx
|
||||
run: pip install -r docs/requirements.txt
|
||||
|
||||
- name: Build docs
|
||||
run: sphinx-build -b html docs docs/_build/html
|
||||
|
||||
- uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
path: docs/_build/html
|
||||
|
||||
deploy:
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- id: deployment
|
||||
uses: actions/deploy-pages@v4
|
||||
11
.gitignore
vendored
11
.gitignore
vendored
@@ -1,7 +1,18 @@
|
||||
# Build
|
||||
build/
|
||||
build-*/
|
||||
cmake-build-*/
|
||||
|
||||
# CTest run outputs (when ctest is invoked outside of build/)
|
||||
Testing/
|
||||
|
||||
# Python bytecode
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
# Docs build
|
||||
docs/_build/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
@@ -9,3 +9,10 @@ include(cmake/UC2Platform.cmake)
|
||||
|
||||
add_subdirectory(lib)
|
||||
add_subdirectory(cli)
|
||||
add_subdirectory(contrib/libarchive)
|
||||
|
||||
option(UC2_BUILD_TESTS "Build tests" ON)
|
||||
if(UC2_BUILD_TESTS AND NOT DJGPP)
|
||||
enable_testing()
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
57
CREDITS.md
57
CREDITS.md
@@ -8,30 +8,71 @@ versioning. The original source code is preserved in `original/UC2_source/`.
|
||||
|
||||
- Website: <https://nicodevries.com/professional/>
|
||||
|
||||
## Source Code Release
|
||||
## 2015 LGPL Source Release
|
||||
|
||||
**Danny Bezemer** facilitated the public release of the UC2 source code in
|
||||
2015 under the LGPL-3.0 license.
|
||||
In December 2015, **Vladislav Sagunov** asked Nico de Vries whether
|
||||
the UC2 source could be re-released under a free licence. De Vries
|
||||
agreed and personally re-published the full UC2 source under the GNU
|
||||
LGPL-3.0 (with a small Borland C/C++ runtime carve-out for DOS-specific
|
||||
code). The release notes are preserved verbatim in
|
||||
`original/UC2_source/Read Me First.txt`.
|
||||
|
||||
## Portable Decompressor (unuc2 / libunuc2)
|
||||
|
||||
**Jan Bobrowski** wrote a clean-room portable decompressor (2020--2021) that
|
||||
forms the foundation of this project's decompression engine. The library
|
||||
(`libunuc2`) is licensed under LGPL-3.0; the CLI tool (`unuc2`) is licensed
|
||||
under GPL-3.0.
|
||||
under GPL-3.0-or-later.
|
||||
|
||||
The following files in this repository derive directly from Bobrowski's
|
||||
work and retain his licence (see `docs/license-audit.md` for the full
|
||||
provenance table):
|
||||
|
||||
- `lib/src/decompress.c` (LGPL-3.0-only) -- derived from `libunuc2.c`
|
||||
- `lib/src/list.h` (LGPL-3.0-only) -- byte-identical to upstream
|
||||
- `lib/include/uc2/libuc2.h` (LGPL-3.0-only) -- derived from `libunuc2.h`
|
||||
- `cli/src/main.c` (GPL-3.0-or-later) -- derived from `unuc2.c`,
|
||||
with substantial additions for archive creation, OTS, and benchmarking
|
||||
- `cli/src/compat/compat_win32.c` (LGPL-3.0-only)
|
||||
- `cli/src/compat/compat_dos.c` (LGPL-3.0-only, DOS adaptation by Valetov)
|
||||
|
||||
The SuperMaster dictionary (`lib/src/super.bin`) is bit-identical to the
|
||||
copy shipped in `original/unuc2-0.6/` and to the data extracted from
|
||||
de Vries's 1992 binaries.
|
||||
|
||||
- Website: <http://torinak.com/~jb/unuc2/>
|
||||
- Original source preserved in `original/unuc2-0.6/`
|
||||
|
||||
## Additional Contributors
|
||||
|
||||
- **Jan-Pieter Cornet** -- early testing, archive samples, and format
|
||||
- **Danny Bezemer** -- co-authored UC2 with de Vries during the
|
||||
original 1992-1996 development.
|
||||
- **Jan-Pieter Cornet** -- early testing, archive samples, and format
|
||||
documentation contributions to the unuc2 project.
|
||||
- **Vladislav Sagunov** -- maintained UC2 resources and documentation.
|
||||
|
||||
## UC2 v3.0.0 Revival
|
||||
|
||||
**Eremey Valetov** -- project revival, CMake build system, cross-platform
|
||||
porting, and ongoing development.
|
||||
**Eremey Valetov** -- project revival, including:
|
||||
|
||||
- CMake build system and cross-platform porting (Linux, macOS, MSVC, DJGPP)
|
||||
- LZ77+Huffman compression engine (compatible with original UC2 Pro)
|
||||
- rANS entropy coder (method 10, levels 6--9)
|
||||
- Content-defined chunking (CDC) with Gear rolling hash
|
||||
- Merkle DAG content addressing
|
||||
- Cross-archive block store for shared deduplication
|
||||
- SimHash near-duplicate detection
|
||||
- Delta compression for binary patching
|
||||
- Content-aware preprocessing (BCJ, BWT, delta filter)
|
||||
- LZ4 ultra-fast compression
|
||||
- BLAKE3 cryptographic hashing
|
||||
- SHA-256 (FIPS 180-4) implementation
|
||||
- OpenTimestamps integration (proof parser, walker, archive trailer)
|
||||
- Dictionary metadata for cross-archive sharing
|
||||
- Backward compatibility with original UC2 Pro (verified via DOSBox-X)
|
||||
- Automated test infrastructure (19 unit tests, DOSBox-X cross-tool testing)
|
||||
|
||||
All files under "UC2 v3.0.0 Revival" are licensed GPL-3.0-or-later by
|
||||
Eremey Valetov (2026). See `docs/license-audit.md` for the per-file
|
||||
provenance table and the LGPL-3.0 / GPL-3.0 chain rationale.
|
||||
|
||||
- GitHub: <https://github.com/evvaletov/uc2>
|
||||
|
||||
98
README.md
98
README.md
@@ -1,15 +1,32 @@
|
||||
# UC2 v3.0.0 — UltraCompressor II
|
||||
# UC2 v3.0.0 -- UltraCompressor II
|
||||
|
||||
A cross-platform revival of UltraCompressor II, the DOS-era archiver by
|
||||
Nico de Vries (1992--1996). UC2 was notable for its advanced deduplication
|
||||
("master blocks"), file versioning, and competitive compression ratios on
|
||||
the hardware of its day.
|
||||
|
||||
This project brings UC2 back as a modern, portable C99 tool. Phase 1
|
||||
(current) provides decompression and archive listing, built on Jan
|
||||
Bobrowski's clean-room portable decompressor
|
||||
([unuc2/libunuc2](http://torinak.com/~jb/unuc2/)). Phase 2 will add
|
||||
compression using the original algorithms.
|
||||
UC2 v3 brings it back as a modern, portable C99 tool with full
|
||||
backward compatibility -- archives created by UC2 v3 can be extracted
|
||||
by the original 1992 UC2 Pro, and vice versa.
|
||||
|
||||
## Features
|
||||
|
||||
- **Full compression and decompression** -- LZ77+Huffman (levels 2--5),
|
||||
rANS entropy coding (levels 6--9), LZ4 ultra-fast mode
|
||||
- **Backward compatible** with the original UC2 Pro (verified via
|
||||
automated DOSBox-X cross-tool round-trip testing)
|
||||
- **Content-defined chunking** (CDC) with Gear rolling hash for
|
||||
position-independent deduplication
|
||||
- **Merkle DAG** content addressing with 64-bit hashes
|
||||
- **Cross-archive dedup** via shared block store
|
||||
- **Near-duplicate detection** via SimHash
|
||||
- **Delta compression** for binary patching
|
||||
- **Content-aware preprocessing** -- BCJ (x86), BWT (text), delta filter
|
||||
- **BLAKE3** cryptographic hashing for archive integrity
|
||||
- **Benchmark mode** -- test all methods on input data
|
||||
- **Personality** -- warm, confident status messages (`-q` for scripting)
|
||||
- Directory archival with nested hierarchies
|
||||
- Cross-platform: Linux, macOS, Windows (MSVC), DOS (DJGPP)
|
||||
|
||||
## Building
|
||||
|
||||
@@ -18,53 +35,84 @@ Requires CMake >= 3.16 and a C99 compiler (GCC, Clang, or MSVC).
|
||||
```sh
|
||||
cmake -B build
|
||||
cmake --build build
|
||||
ctest --test-dir build # 16 unit tests
|
||||
```
|
||||
|
||||
The binary is at `build/cli/uc2`.
|
||||
|
||||
## Usage
|
||||
|
||||
```sh
|
||||
uc2 -w archive.uc2 files... # Create archive
|
||||
uc2 archive.uc2 # Extract all files
|
||||
uc2 -l archive.uc2 # List contents
|
||||
uc2 -t archive.uc2 # Test integrity
|
||||
uc2 -d /tmp/out archive.uc2 # Extract to directory
|
||||
uc2 -w -L 8 archive.uc2 files... # Create with rANS Tight
|
||||
uc2 -B files... # Benchmark all methods
|
||||
```
|
||||
uc2 archive.uc2 # Extract all files
|
||||
uc2 -l archive.uc2 # List contents
|
||||
uc2 -t archive.uc2 # Test archive integrity
|
||||
uc2 -d /tmp/out archive.uc2 # Extract to directory
|
||||
uc2 -l archive.uc2 '*.txt' # List matching files
|
||||
uc2 -p archive.uc2 readme.txt # Extract to stdout
|
||||
```
|
||||
|
||||
### Compression Levels
|
||||
|
||||
| Level | Method | Description |
|
||||
|-------|--------|-------------|
|
||||
| 2 | Huffman | Fast |
|
||||
| 3 | Huffman | Normal |
|
||||
| 4 | Huffman | Tight (default) |
|
||||
| 5 | Huffman | Ultra |
|
||||
| 6 | rANS | Fast |
|
||||
| 7 | rANS | Normal |
|
||||
| 8 | rANS | Tight |
|
||||
| 9 | rANS | Ultra |
|
||||
|
||||
Levels 2--5 produce archives readable by the original 1992 UC2 Pro.
|
||||
Levels 6--9 use rANS entropy coding (UC2 v3 only, better compression).
|
||||
|
||||
### Options
|
||||
|
||||
| Flag | Description |
|
||||
|------|-------------|
|
||||
| `-w` | Create archive |
|
||||
| `-l` | List archive contents |
|
||||
| `-t` | Test archive integrity |
|
||||
| `-a` | Include all file versions (not just latest) |
|
||||
| `-L n` | Compression level (2--9) |
|
||||
| `-B` | Benchmark all methods on input files |
|
||||
| `-d path` | Extract to specified directory |
|
||||
| `-f` | Overwrite existing files |
|
||||
| `-p` | Extract to stdout |
|
||||
| `-q` | Quiet (suppress status messages) |
|
||||
| `-a` | Include all file versions |
|
||||
| `-D` | Skip directory metadata; `-DD` also skips file metadata |
|
||||
| `-T` | Tab-separated output (for scripting) |
|
||||
| `-T` | Tab-separated output |
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
UC2/
|
||||
lib/ libuc2 decompression library
|
||||
cli/ uc2 command-line tool
|
||||
original/ preserved original sources (reference only)
|
||||
cmake/ build system modules
|
||||
tests/ test archives and test programs
|
||||
lib/ libuc2 compression/decompression library
|
||||
include/uc2/ public headers (libuc2, uc2_cdc, uc2_merkle, uc2_rans, ...)
|
||||
src/ library implementation
|
||||
cli/ uc2 command-line tool
|
||||
tests/ unit tests and test corpus
|
||||
original/ preserved original UC2 Pro sources (reference only)
|
||||
docs/ Sphinx documentation
|
||||
```
|
||||
|
||||
## Credits
|
||||
|
||||
- **Nico de Vries** -- Original UltraCompressor II (1992--1996)
|
||||
- **Danny Bezemer** -- Co-author of the original UltraCompressor II (1992--1996)
|
||||
- **Jan Bobrowski** -- Clean-room portable decompressor (unuc2/libunuc2, 2020--2021)
|
||||
- **Eremey Valetov** -- UC2 v3 revival, compression engine, deduplication, and ongoing development
|
||||
|
||||
See [CREDITS.md](CREDITS.md) for full attribution.
|
||||
|
||||
## History
|
||||
|
||||
- **v1.0--v2.3** (1992--1996) Original DOS releases by Nico de Vries
|
||||
- **2015** Source code re-released under LGPL-3.0 by Nico de Vries (at the request of Vladislav Sagunov)
|
||||
- **2020--2021** Jan Bobrowski writes unuc2/libunuc2 (portable decompressor)
|
||||
- **2026** UC2 v3.0.0: cross-platform revival
|
||||
- **2026** UC2 v3.0.0: cross-platform revival with full compression engine,
|
||||
backward compatibility, and modern deduplication
|
||||
|
||||
## License
|
||||
|
||||
GPL-3.0. See [LICENSE](LICENSE) and [CREDITS.md](CREDITS.md) for full
|
||||
attribution.
|
||||
GPL-3.0. See [LICENSE](LICENSE) and [CREDITS.md](CREDITS.md).
|
||||
|
||||
308
ROADMAP.md
Normal file
308
ROADMAP.md
Normal file
@@ -0,0 +1,308 @@
|
||||
# UC2 Roadmap
|
||||
|
||||
## Phase 1: Decompression MVP (DONE)
|
||||
|
||||
- [x] Port Bobrowski's libunuc2 decompression engine
|
||||
- [x] CLI tool with list/extract/test/pipe modes
|
||||
- [x] CMake build system (Linux, MSVC fallback for super.bin)
|
||||
- [x] Win32 compat layer carried over
|
||||
- [x] Tagged v3.0.0-alpha.1
|
||||
|
||||
## Phase 2: Original Compression Engine (DONE)
|
||||
|
||||
- [x] Port LZ77+Huffman compressor from `ULTRACMP.CPP`, `TREEGEN.CPP`, `TREEENC.CPP`
|
||||
- [x] Write as the inverse of the decompressor (Bobrowski's code is the spec)
|
||||
- [x] Compression levels 2=Fast, 3=Normal, 4=Tight, 5=Ultra
|
||||
- [x] CLI create mode (`uc2 -w`), compression level flag (`-L`)
|
||||
- [x] SuperMaster dictionary support (built-in 49 KB dictionary)
|
||||
- [x] Round-trip testing: 37 unit tests + CLI integration tests
|
||||
- [x] Round-trip testing vs original `uc2pro.exe` in DOSBox
|
||||
(Direction: original creates -> UC2 v3 extracts -- verified.
|
||||
The reverse direction was initially a limitation -- the original
|
||||
UC2 Pro could not read early UC2 v3 archives -- and has since
|
||||
been resolved; see the backward-compatibility item below.)
|
||||
- [x] Backward compatibility with original UC2 Pro (listing +
|
||||
extraction verified for multi-file archives in both directions
|
||||
in automated DOSBox-X test).
|
||||
- [x] Custom Huffman tree optimization: use default tree for first
|
||||
small block (< 256 ibuf entries), custom trees for larger
|
||||
blocks. Matches the original's bFlag logic. 37% compression
|
||||
improvement on text data while maintaining backward compat.
|
||||
- [x] UC2 personality: status messages continuing the original's
|
||||
tradition ("Everything went OK", compression level names,
|
||||
"Fast, reliable and superior compression"). Suppressed by -q.
|
||||
|
||||
## Phase 3: Modernized Master-Block Deduplication
|
||||
|
||||
UC2's signature feature from 1992, ahead of its time. Modernize into
|
||||
something no mainstream archiver offers.
|
||||
|
||||
- [x] Content-fingerprint file grouping (FNV-1a hash of first 4096 bytes)
|
||||
- [x] Custom master-block generation from largest file in each group
|
||||
- [x] MASMETA central directory records with full metadata
|
||||
- [x] Masters compressed with SuperMaster, files compressed with custom master
|
||||
- [x] CLI integration test validating master deduplication round-trip
|
||||
- [x] Content-defined chunking (CDC) with Gear rolling hash
|
||||
(`uc2_cdc.h`): chunker library + integration into archive
|
||||
creation. Files sharing content at ANY position (not just
|
||||
identical prefixes) are now grouped for master-block dedup.
|
||||
- [x] Merkle DAG of deduplicated blocks (`uc2_merkle.h`):
|
||||
content-addressable chunk trees with 64-bit FNV-1a hashes,
|
||||
structural similarity comparison, single-byte-change resilience.
|
||||
8 unit tests including partial overlap detection.
|
||||
- [x] Cross-archive dedup via shared block store (`uc2_blockstore.h`):
|
||||
content-addressable chunk storage with two-level directory
|
||||
layout, dedup statistics, read-back verification.
|
||||
6 unit tests including cross-archive dedup scenario.
|
||||
- [x] Near-duplicate detection via SimHash (`uc2_simhash.h`):
|
||||
64-bit locality-sensitive fingerprint with Hamming distance,
|
||||
detects patched executables (16 changed bytes in 8KB: dist <= 8).
|
||||
6 unit tests.
|
||||
- [x] Delta compression (`uc2_delta.h`): binary diff with COPY/INSERT
|
||||
instructions, hash-based source matching. 96-byte patch in 16KB
|
||||
file -> >50% size savings. 6 unit tests including round-trip.
|
||||
|
||||
## Phase 4: Modern Compression Backends
|
||||
|
||||
Pluggable algorithms behind new method IDs; original Method 4 kept for
|
||||
backward compatibility.
|
||||
|
||||
- [x] rANS entropy coder (`uc2_rans.h`) integrated into archive format
|
||||
as method 10. Levels 6-9 use rANS (vs 2-5 Huffman). 32-bit
|
||||
table-based rANS, <5% overhead vs Shannon entropy. End-to-end
|
||||
round-trip verified (create/list/extract/verify).
|
||||
- [x] zstd-inspired dictionary compression (`uc2_dict.h`): formal
|
||||
dictionary metadata with content-hash IDs, integrity checksums,
|
||||
serialization format, and cross-archive sharing via block store.
|
||||
6 unit tests including round-trip and corruption detection.
|
||||
- [x] LZ4 ultra-fast mode (`uc2_lz4.h`): single-probe hash table,
|
||||
O(1) match finding, 4-byte minimum match, variable-length
|
||||
literal/match token encoding. 6 unit tests including
|
||||
text, binary, all-same, incompressible, and small inputs.
|
||||
- [x] Content-aware preprocessing (`uc2_preprocess.h`):
|
||||
BCJ (E8/E9 x86 address normalization), BWT (Burrows-Wheeler
|
||||
for text), delta filter (byte-wise with configurable stride),
|
||||
automatic content detection (text/x86/structured/binary).
|
||||
11 unit tests.
|
||||
- [x] Built-in benchmark mode (`uc2 -B files...`): tests all 8 Huffman/rANS
|
||||
levels plus LZ4, reports compressed size, ratio, and timing.
|
||||
|
||||
## Phase 5: Quantum-Resistant Encryption
|
||||
|
||||
No mainstream archiver offers post-quantum encryption.
|
||||
|
||||
- [ ] CRYSTALS-Kyber (NIST PQC standard) for key encapsulation, pure C
|
||||
(PQClean project, public domain)
|
||||
- [ ] AES-256-GCM for authenticated payload encryption
|
||||
- [ ] Hybrid mode: classical ECDH + Kyber for transition period
|
||||
- [ ] Passphrase-based key derivation via Argon2
|
||||
- [ ] Per-file selective encryption within archives
|
||||
- [ ] Plausible deniability: multi-archive-in-one with separate passwords.
|
||||
Each password decrypts a different archive layer. Under hostile
|
||||
pressure, revealing one password gives access to a decoy layer
|
||||
while the real archive remains hidden and indistinguishable from
|
||||
random padding. (Inspired by VeraCrypt hidden volumes.)
|
||||
|
||||
## Phase 6: DOS / FreeDOS / Retro-Computing
|
||||
|
||||
- [x] DJGPP cross-compilation toolchain: `cmake/djgpp.cmake`
|
||||
builds `uc2.exe` against the prebuilt DJGPP gcc 7.2 / 12.2 from
|
||||
`andrewwutw/build-djgpp`. Output is a 32-bit DPMI DOS executable
|
||||
(MZ + COFF + go32 stub). See `cmake/README-djgpp.md` for the
|
||||
one-time setup (CPATH unset is required on hosts that export it).
|
||||
- [x] DOSBox-X smoke test: `tests/scripts/dos_smoke.sh` runs `uc2 -h`
|
||||
and `uc2 -l <archive>` under DOSBox-X via the flatpak; verifies
|
||||
the cross-compiled binary actually loads under a real DPMI host.
|
||||
Real vintage hardware test still pending.
|
||||
- [ ] Method 80 (Turbo) support
|
||||
- [ ] Multi-volume archive spanning across physical media (floppies)
|
||||
- [ ] Self-extracting archives per platform (DOS COM/EXE, Linux ELF, Windows PE)
|
||||
- [ ] ANSI art progress display, CP850 codepage handling
|
||||
- [ ] Position as the archiver for retrocomputing preservation:
|
||||
disk images, ROM collections, BBS archive redistribution
|
||||
|
||||
## Phase 7: Cryptographic Integrity & Timestamping
|
||||
|
||||
- [x] BLAKE3 content hashing (`uc2_blake3.h`): pure C implementation,
|
||||
256-bit digests, incremental and one-shot API, constant-time
|
||||
comparison, tree hashing structure. 7 unit tests including
|
||||
avalanche, incremental-vs-oneshot, and single-byte updates.
|
||||
- [x] SHA-256 (`uc2_sha256.h`): pure-C FIPS 180-4 implementation,
|
||||
one-shot and incremental API. 6 unit tests against published
|
||||
test vectors (empty, "abc", 56-byte, 1M `'a'`, byte-by-byte
|
||||
incremental, every-split-point boundary).
|
||||
- [x] OpenTimestamps integration (`uc2_ots.h`): pure-C parser,
|
||||
serializer, and walker for the standard `.ots` proof format.
|
||||
Append-only sidecar trailer (magic-bracketed, reverse-scan-safe)
|
||||
stores the proof verbatim and preserves backward compatibility
|
||||
with the original UC2 Pro reader. Walker supports the
|
||||
calendar-path subset (APPEND, PREPEND, SHA256); proofs with other
|
||||
crypto ops are accepted as structurally valid but flagged for
|
||||
`ots verify` follow-up. CLI: `--ots-attach`, `--ots-extract`,
|
||||
`--ots-info`; `uc2 -t` recomputes archive SHA-256 and verifies
|
||||
the leaf and walk. Strict-canonical-varint parser, 64-bit
|
||||
overflow check, depth-bounded recursion, varbytes cap.
|
||||
17 unit tests.
|
||||
- [ ] OTS upgrade: fetch the upgraded proof from the calendar after
|
||||
the Bitcoin attestation has been minted (~1-6h), replace the
|
||||
pending-only trailer with the Bitcoin block-header attestation.
|
||||
- [ ] Useful for legal/forensic archiving, software provenance, digital
|
||||
preservation
|
||||
|
||||
## Phase 8: Decentralized & Cloud Integration
|
||||
|
||||
- [ ] IPFS pinning: `uc2 --ipfs-pin archive.uc2` to publish,
|
||||
`uc2 --ipfs-get <CID>` to retrieve
|
||||
- [ ] Content-addressable dedup maps directly to IPFS CIDs;
|
||||
master blocks become sharable across users ("swarm dedup")
|
||||
- [ ] Cloud archiving backend: `uc2 --s3 s3://bucket/path` for
|
||||
streaming compress-to-cloud with dedup-aware incremental uploads
|
||||
- [ ] Filecoin/Sia for decentralized paid storage (optional)
|
||||
|
||||
## Phase 9: Zero-Knowledge Proofs (Experimental)
|
||||
|
||||
ZK proofs extend the Merkle DAG and encryption layers with
|
||||
privacy-preserving verification. Most valuable for decentralized and
|
||||
compliance scenarios; heavyweight, so implemented as an optional module.
|
||||
|
||||
- [ ] **Prove archive integrity without revealing contents** -- ZK proof
|
||||
that the archive's Merkle root matches claimed file hashes, without
|
||||
exposing the tree structure. Enables auditing of IPFS-shared
|
||||
encrypted archives.
|
||||
- [ ] **Selective disclosure from encrypted archives** -- prove a specific
|
||||
file (by hash) exists in an encrypted archive without decrypting
|
||||
anything else. Useful for collaborative encrypted team archives.
|
||||
- [ ] **Verifiable deduplication** -- ZK proof that master-block dedup was
|
||||
performed correctly across archives without revealing block contents.
|
||||
Builds trust in distributed dedup without data leaks.
|
||||
- [ ] **Compliance proofs** -- prove properties ("archive created before
|
||||
date Y", "archive does not contain file with hash H") without
|
||||
revealing contents. For regulatory/legal use cases.
|
||||
- [ ] Implementation: Halo2 or Bulletproofs (no trusted setup) via
|
||||
Rust-to-C wrapper or WASM bridge; compile-time optional module.
|
||||
STARKs preferred over SNARKs for quantum resistance alignment
|
||||
with Phase 5.
|
||||
|
||||
### ZK Feasibility Notes
|
||||
|
||||
ZK adds genuine value for privacy-focused decentralized archiving (Phases
|
||||
7--8) but is heavyweight for a CLI tool. SNARKs require pairing-friendly
|
||||
curves (not quantum-resistant); **STARKs are preferred** as they align
|
||||
with the post-quantum direction and need no trusted setup. Proof
|
||||
generation is slow (seconds to minutes for complex circuits) so this is
|
||||
an opt-in feature, not on the critical path. Prototype in a fork first.
|
||||
|
||||
## Phase 10: Ecosystem Integrations
|
||||
|
||||
### libarchive plugin
|
||||
|
||||
Highest-leverage integration. Adding UC2 read/write support to libarchive
|
||||
makes `.uc2` a first-class format for `bsdtar`, `cmake`, `pkg(8)`,
|
||||
file-roller, Ark, and dozens of other tools across the Linux ecosystem.
|
||||
|
||||
- [-] libarchive read handler (decompression/listing): milestones
|
||||
1-3 shipped -- bid() recognises UC2 magic; read_header() slurps
|
||||
the archive, walks uc2_read_cdir, yields each entry mapped onto
|
||||
archive_entry; read_data() drives uc2_extract through a buffering
|
||||
write callback and yields the result via libarchive's pull API.
|
||||
Memory scales with archive size in v1. Remaining: master-block
|
||||
dependency tracking (M4), seekable adapter (deferred), bsdtar
|
||||
round-trip test (M7), upstream PR (M8).
|
||||
- [ ] libarchive write handler (compression; unblocked now that Phase 2 is done)
|
||||
|
||||
### Streaming dedup ingestion
|
||||
|
||||
Position UC2 as a deduplicating storage layer that other tools pipe into.
|
||||
No other CLI archiver offers this.
|
||||
|
||||
```sh
|
||||
rsync -a /data/ | uc2 --ingest repo.uc2 # dedup on receive
|
||||
tar cf - /project | uc2 --ingest backup.uc2 # dedup tar stream
|
||||
cp -a /snapshot/ | uc2 --ingest backup.uc2 # incremental dedup
|
||||
```
|
||||
|
||||
- [x] `uc2 --ingest` mode v1: stdin -> CDC -> sidecar blockstore at
|
||||
`<archive>.blocks/` -> chunk-hash manifest. `uc2 --ingest-restore`
|
||||
reverses the round-trip. Tested: small/multichunk round-trip,
|
||||
idempotent dedup on repeat ingest, empty stream, bad-magic
|
||||
rejection. Now legacy: writer defaults to v2.
|
||||
- [x] `uc2 --ingest` v2 (default): self-contained archive with the
|
||||
chunk pool embedded inside the archive file itself. No sidecar
|
||||
directory. Manifest entries carry absolute file offsets;
|
||||
duplicate hashes share an offset (intra-call dedup).
|
||||
Cross-archive dedup is not preserved -- the trade-off is the
|
||||
single-file UX. v1 archives still readable for restore.
|
||||
- [ ] `uc2 --ingest` v3: integrate with master-block archive layout
|
||||
so output is a real UC2 v3 archive consumable by uc2 -x / -l
|
||||
- [ ] Tar-entry preservation: parse tar boundaries inside --ingest
|
||||
so individual files are recoverable as archive entries
|
||||
- [ ] Incremental snapshots: `uc2 snapshot /path repo.uc2`
|
||||
(borg/restic-style deduplicating backups without filesystem support)
|
||||
|
||||
### Foreign archive format support
|
||||
|
||||
Read (and optionally write) other archive formats, enabling UC2 as a
|
||||
universal archive tool and migration path for legacy collections.
|
||||
|
||||
- [ ] ZIP read/write (deflate, store; the universal baseline format)
|
||||
- [ ] RAR read (v4/v5; for extraction from existing collections)
|
||||
- [ ] TGZ/tar.gz read/write (tar + gzip; Unix ecosystem staple)
|
||||
- [ ] ISO 9660 read (CD/DVD images; retro-computing preservation)
|
||||
|
||||
### File manager plugins
|
||||
|
||||
Bobrowski already shipped prototypes; update for UC2 v3.
|
||||
|
||||
- [ ] Midnight Commander VFS plugin (update `misc/mc.ext` and `misc/uuc2`)
|
||||
- [ ] Total Commander WCX plugin (update `misc/unuc2-wcx.c`)
|
||||
|
||||
## Phase 11: Advanced Features
|
||||
|
||||
- [ ] Archive-as-filesystem: FUSE mount for `.uc2` on Linux (read-only,
|
||||
decompress-on-the-fly with master-block caching)
|
||||
- [ ] Compression tournaments / community challenges
|
||||
- [ ] Neural/learned compression preprocessor (modern platforms only,
|
||||
not DOS -- optional compile-time module)
|
||||
- [ ] Jupyter kernel for interactive archive exploration and compression
|
||||
research (Python, building on foxkernel experience):
|
||||
- Rich HTML tables for archive listings with compression ratios
|
||||
- Interactive dedup graph visualization (master-block DAG: which
|
||||
files share blocks, space savings)
|
||||
- Inline benchmark charts comparing methods/levels (ratio vs speed)
|
||||
- Version diff visualization between archive snapshots
|
||||
- Huffman tree / ANS state table visualization for algorithm
|
||||
development
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
- Create reference UC2 archives using original `uc2pro.exe` in DOSBox
|
||||
- Unit tests: magic detection, Fletcher checksum, CP850->UTF-8
|
||||
- Integration: extract test archives, compare SHA-256 against manifest
|
||||
- Phase 2: round-trip (new compress -> old extract in DOSBox, and vice versa)
|
||||
- Phase 3+: dedup correctness, cross-archive block sharing
|
||||
- Phase 5: encryption round-trip, key derivation vectors
|
||||
- Phase 9: ZK proof soundness and completeness
|
||||
|
||||
## Maintenance Log
|
||||
|
||||
- 2026-06-11: Fixed the rANS (L6-9) extraction crash and >64KB silent
|
||||
corruption (git-bug d747658, closed): master COMPRESS records now
|
||||
carry the real method (10 at L6-9); the rANS decoder consumes the
|
||||
EOB pair instead of desyncing the bit cursor; bits_feed handles
|
||||
short reads without overrunning its buffer; compressor chunk loads
|
||||
and rANS output flushing respect the 64KB circular-window edge.
|
||||
Found debugging extraction on sdf.org (NetBSD 10) but reproducible
|
||||
everywhere. New regression test: cli_bigfile. Follow-up filed:
|
||||
bf73896 (ftell offsets >4GB truncate silently; P2).
|
||||
|
||||
- 2026-06-13: DOS build now has CI coverage (DJGPP v3.4 toolchain,
|
||||
sha-pinned; builds uc2.exe via cmake/djgpp.cmake; git-bug 9379647).
|
||||
Consolidated the two DJGPP toolchain files onto djgpp.cmake and
|
||||
removed the redundant djgpp-toolchain.cmake.
|
||||
- 2026-06-13: Damaged-archive decode hardening (git-bug f049d6d):
|
||||
decompress_block match-length overflow guard (runtime check
|
||||
replacing an NDEBUG assert), decompress_cdir end-bounding, and a
|
||||
CLI handle/FILE leak fix on the cdir-error path. A prefix-sweep
|
||||
fuzzer drove the fixes; a residual rare cdir-parser OOB it surfaces
|
||||
is tracked for a systematic hardening + fuzzing pass (git-bug
|
||||
69e8e52).
|
||||
@@ -1,5 +1,7 @@
|
||||
# uc2 command-line tool
|
||||
|
||||
include(GNUInstallDirs)
|
||||
|
||||
add_executable(uc2-cli src/main.c)
|
||||
set_target_properties(uc2-cli PROPERTIES OUTPUT_NAME uc2)
|
||||
|
||||
@@ -13,13 +15,36 @@ target_include_directories(uc2-cli PRIVATE
|
||||
|
||||
target_compile_features(uc2-cli PRIVATE c_std_99)
|
||||
|
||||
# Skip installation when uc2 is embedded via add_subdirectory()
|
||||
if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
|
||||
install(TARGETS uc2-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
if(UNIX)
|
||||
install(FILES uc2.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
target_sources(uc2-cli PRIVATE src/compat/compat_win32.c)
|
||||
# Shared compat headers (err.h, fnmatch.h) — both MSVC and MinGW lack these
|
||||
target_include_directories(uc2-cli PRIVATE src/compat/include)
|
||||
if(MSVC)
|
||||
# MSVC standalone headers (unistd.h, utime.h, getopt.h) — no #include_next
|
||||
target_sources(uc2-cli PRIVATE src/compat/getopt.c)
|
||||
target_include_directories(uc2-cli PRIVATE src/compat/include/msvc)
|
||||
else()
|
||||
# MinGW/Clang: headers that wrap system headers via #include_next
|
||||
target_include_directories(uc2-cli PRIVATE src/compat/include/posix)
|
||||
endif()
|
||||
target_compile_definitions(uc2-cli PRIVATE
|
||||
NO_OLDNAMES
|
||||
g_err g_errx g_warn g_warnx g_vwarn g_vwarnx g_verr g_verrx
|
||||
g_getprogname g_setlinebuf g_fnmatch
|
||||
g_compat__utf8_console g_compat__wpath g_fopen
|
||||
g_access g_mkdir g_utime
|
||||
g_compat__utf8_console g_compat__wpath
|
||||
g_access g_unlink g_chdir g_mkdir g_chmod g_utime
|
||||
g_opendir
|
||||
)
|
||||
elseif(DJGPP)
|
||||
target_sources(uc2-cli PRIVATE src/compat/compat_dos.c)
|
||||
# Only add the err.h and fnmatch.h headers, not sys/ overrides
|
||||
target_include_directories(uc2-cli PRIVATE src/compat/include/dos)
|
||||
endif()
|
||||
|
||||
158
cli/src/compat/compat_dos.c
Normal file
158
cli/src/compat/compat_dos.c
Normal file
@@ -0,0 +1,158 @@
|
||||
/* SPDX-License-Identifier: LGPL-3.0-only */
|
||||
|
||||
/* DOS/DJGPP compatibility layer for UC2.
|
||||
Provides BSD err.h functions and fnmatch for DJGPP,
|
||||
which lacks these POSIX/BSD extensions.
|
||||
Copyright © Jan Bobrowski 2020 / Licence: LGPL
|
||||
Adapted for DOS by Eremey Valetov 2026 */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* err/errx/warn/warnx family */
|
||||
|
||||
#include "err.h"
|
||||
|
||||
/* Name reported in diagnostics; replaced by setprogname(). */
static const char *_progname = "uc2";

/* Return the program name used as the prefix of warn/err messages. */
const char *getprogname(void)
{
	return _progname;
}

/* Record the basename of argv0 as the program name.
 *
 * The basename starts after the last '/', '\\' or ':' so that both
 * "C:\\BIN\\UC2.EXE" and the drive-relative form "C:UC2.EXE" yield
 * "UC2.EXE".  The pointer is kept, not copied, so argv0 must outlive
 * every later diagnostic (true for main()'s argv).
 *
 * A NULL argv0, or one with an empty basename, leaves the current
 * name untouched instead of crashing or blanking the prefix. */
void setprogname(const char *argv0)
{
	const char *base;

	if (!argv0)
		return;

	base = argv0;
	for (const char *q = argv0; *q; q++)
		if (*q == '/' || *q == '\\' || *q == ':')
			base = q + 1;

	if (*base)
		_progname = base;
}
|
||||
|
||||
/* Print "<progname>: <formatted message>: <errno text>\n" to stderr.
 *
 * f  printf-style format, or NULL to print only the errno text.
 * a  arguments for f.
 *
 * errno is captured on entry: the stdio calls that emit the prefix and
 * the message are allowed to modify it, which would otherwise make the
 * reported reason describe the wrong error. */
void vwarn(const char *f, va_list a)
{
	int saved_errno = errno;

	fprintf(stderr, "%s: ", getprogname());
	if (f) {
		vfprintf(stderr, f, a);
		fprintf(stderr, ": ");
	}
	fprintf(stderr, "%s\n", strerror(saved_errno));
	fflush(stderr);
}
|
||||
|
||||
void vwarnx(const char *f, va_list a)
|
||||
{
|
||||
fprintf(stderr, "%s: ", getprogname());
|
||||
if (f)
|
||||
vfprintf(stderr, f, a);
|
||||
fprintf(stderr, "\n");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
void warn(const char *f, ...)
|
||||
{
|
||||
va_list a;
|
||||
va_start(a, f);
|
||||
vwarn(f, a);
|
||||
va_end(a);
|
||||
}
|
||||
|
||||
void warnx(const char *f, ...)
|
||||
{
|
||||
va_list a;
|
||||
va_start(a, f);
|
||||
vwarnx(f, a);
|
||||
va_end(a);
|
||||
}
|
||||
|
||||
void verr(int x, const char *f, va_list a)
|
||||
{
|
||||
vwarn(f, a);
|
||||
exit(x);
|
||||
}
|
||||
|
||||
void verrx(int x, const char *f, va_list a)
|
||||
{
|
||||
vwarnx(f, a);
|
||||
exit(x);
|
||||
}
|
||||
|
||||
void err(int x, const char *f, ...)
|
||||
{
|
||||
va_list a;
|
||||
va_start(a, f);
|
||||
verr(x, f, a);
|
||||
}
|
||||
|
||||
void errx(int x, const char *f, ...)
|
||||
{
|
||||
va_list a;
|
||||
va_start(a, f);
|
||||
verrx(x, f, a);
|
||||
}
|
||||
|
||||
/* fnmatch */
|
||||
|
||||
#include "fnmatch.h"
|
||||
|
||||
/* Match `string` against the shell wildcard `pattern`.
 *
 * Supported syntax:
 *   ?      any single character
 *   *      any run of characters (possibly empty)
 *   [...]  one character from a set; `a-z` denotes a range, a leading
 *          `!` negates the set, and `]` is literal when it comes first
 *   \x     the character x taken literally, unless FNM_NOESCAPE is set
 *
 * With FNM_PATHNAME a '/' in `string` is matched only by a literal '/'
 * in `pattern`.  Only FNM_PATHNAME and FNM_NOESCAPE are examined; other
 * flag bits are ignored.
 *
 * Returns 0 on a match, FNM_NOMATCH otherwise.  Each `*` is resolved by
 * recursion on the remainder of the pattern. */
int fnmatch(const char *pattern, const char *string, int flags)
{
	for (;;) {
		char c = *pattern++;
		switch (c) {
		case '\\':
			if (*pattern && !(flags & FNM_NOESCAPE))
				c = *pattern++;
			/* fallthrough: compare the (possibly unescaped) literal */
		default:
			if (c != *string++)
				return FNM_NOMATCH;
			if (!c)
				return 0;	/* both ended together */
			continue;
		case '?':
			c = *string++;
			if (!c || (flags & FNM_PATHNAME && c == '/'))
				return FNM_NOMATCH;
			continue;
		case '*':
			/* Try the rest of the pattern at every suffix of string. */
			do {
				if (fnmatch(pattern, string, flags) == 0)
					return 0;
				if (flags & FNM_PATHNAME && *string == '/')
					return FNM_NOMATCH;
			} while (*string++);
			return FNM_NOMATCH;
		case '[': {
			int negate = (*pattern == '!');
			if (negate)
				pattern++;

			/* p walks the set members; pattern is advanced to the
			   closing ']'.  The first member is skipped unseen so a
			   leading ']' counts as a member, not as the terminator. */
			const char *p = pattern;
			if (!*pattern++)
				return FNM_NOMATCH;
			while (*pattern != ']') {
				if (!*pattern)
					return FNM_NOMATCH;	/* unterminated set */
				pattern++;
			}

			c = *string;
			/* An exhausted subject can never satisfy a set; bail out
			   before the cursor is moved past the terminating NUL. */
			if (!c)
				return FNM_NOMATCH;
			string++;
			if (flags & FNM_PATHNAME && c == '/')
				return FNM_NOMATCH;

			/* Range bounds are compared as unsigned char so that
			   characters above 0x7f order by their byte value. */
			unsigned char uc = (unsigned char)c;
			int matched = 0;
			for (;;) {
				if (c == *p++) {
					matched = 1;
					break;
				}
				if (*p == '-' && p + 1 < pattern) {
					unsigned char lo = (unsigned char)p[-1];
					unsigned char hi = (unsigned char)p[1];
					if (lo <= uc && uc <= hi) {
						matched = 1;
						break;
					}
					p++;
				}
				if (p == pattern)
					break;
			}
			if (matched == negate)
				return FNM_NOMATCH;
			pattern++;	/* step over ']' */
			continue;
		}
		}
	}
}
|
||||
@@ -1,4 +1,9 @@
|
||||
/* Copyright © Jan Bobrowski 2020 / Licence: LGPL */
|
||||
/* SPDX-License-Identifier: LGPL-3.0-only */
|
||||
|
||||
/* Win32 compatibility layer for UC2 CLI.
|
||||
Provides POSIX/BSD functions missing from MSVC and MinGW.
|
||||
All file operations use wide-char Windows APIs for UTF-8 support.
|
||||
Copyright (c) Jan Bobrowski 2020 / Licence: LGPL */
|
||||
|
||||
#define NO_OLDNAMES
|
||||
#include <stdlib.h>
|
||||
@@ -110,7 +115,7 @@ const char *getprogname(void)
|
||||
{
|
||||
static char name[256];
|
||||
if (!name[0]) {
|
||||
#ifdef _pgmptr
|
||||
#ifdef _WIN32
|
||||
char *p = _pgmptr;
|
||||
char *q = p;
|
||||
int n;
|
||||
@@ -215,6 +220,18 @@ wchar_t *compat__wpath(const char *path);
|
||||
|
||||
#ifdef g_compat__utf8_console
|
||||
#include <fcntl.h>
|
||||
#ifdef _MSC_VER
|
||||
/* MSVC: use CRT initializer table (.CRT$XCU) instead of GCC constructor */
|
||||
static void __cdecl compat__utf8_console_init(void)
|
||||
{
|
||||
setvbuf(stdout, 0, _IOFBF, 1<<16);
|
||||
setvbuf(stderr, 0, _IOFBF, 1<<16);
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
}
|
||||
#pragma section(".CRT$XCU", read)
|
||||
__declspec(allocate(".CRT$XCU"))
|
||||
static void (__cdecl *compat__utf8_console_p)(void) = compat__utf8_console_init;
|
||||
#else
|
||||
__attribute__((constructor))
|
||||
void compat__utf8_console(void)
|
||||
{
|
||||
@@ -223,6 +240,7 @@ void compat__utf8_console(void)
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef g_compat__wpath
|
||||
wchar_t *compat__wpath(const char *path)
|
||||
@@ -253,6 +271,22 @@ int access(const char *path, int mode)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_unlink
|
||||
int unlink(const char *path)
|
||||
{
|
||||
wchar_t *wpath = compat__wpath(path);
|
||||
return wpath ? _wunlink(wpath) : -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_chdir
|
||||
int chdir(const char *path)
|
||||
{
|
||||
wchar_t *wpath = compat__wpath(path);
|
||||
return wpath ? _wchdir(wpath) : -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_mkdir
|
||||
int mkdir(const char *path, int mode)
|
||||
{
|
||||
@@ -266,16 +300,85 @@ int mkdir(const char *path, int mode)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_chmod
|
||||
int chmod(const char *path, int mode)
|
||||
{
|
||||
wchar_t *wpath = compat__wpath(path);
|
||||
return wpath ? _wchmod(wpath, mode) : -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_utime
|
||||
#include <sys/utime.h>
|
||||
int utime(const char *path, struct utimbuf *ut)
|
||||
/* The Windows SDK declares an inline utime() in <sys/utime.h>. Our
|
||||
* shim utime.h substitutes utime -> compat__utime at the call site so
|
||||
* UC2's UTF-8 paths round-trip through compat__wpath. */
|
||||
int compat__utime(const char *path, struct utimbuf *ut)
|
||||
{
|
||||
wchar_t *wpath = compat__wpath(path);
|
||||
if (!wpath)
|
||||
return -1;
|
||||
struct __utimbuf32 wut = {.actime = ut->actime, .modtime = ut->modtime};
|
||||
struct __utimbuf32 wut = {.actime = (long)ut->actime, .modtime = (long)ut->modtime};
|
||||
return _wutime32(wpath, &wut);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef g_opendir
|
||||
#include "dirent.h"
|
||||
|
||||
struct UC2_DIR {
|
||||
HANDLE handle;
|
||||
WIN32_FIND_DATAW find;
|
||||
int first;
|
||||
struct dirent ent;
|
||||
};
|
||||
|
||||
DIR *opendir(const char *path)
|
||||
{
|
||||
wchar_t *wpath = compat__wpath(path);
|
||||
if (!wpath)
|
||||
return 0;
|
||||
size_t n = wcslen(wpath);
|
||||
if (n + 3 >= MAX_PATH)
|
||||
return 0;
|
||||
wchar_t pat[MAX_PATH];
|
||||
wcscpy(pat, wpath);
|
||||
if (n > 0 && pat[n-1] != L'\\' && pat[n-1] != L'/')
|
||||
pat[n++] = L'\\';
|
||||
pat[n++] = L'*';
|
||||
pat[n] = 0;
|
||||
DIR *d = malloc(sizeof *d);
|
||||
if (!d) return 0;
|
||||
d->handle = FindFirstFileW(pat, &d->find);
|
||||
if (d->handle == INVALID_HANDLE_VALUE) {
|
||||
free(d);
|
||||
return 0;
|
||||
}
|
||||
d->first = 1;
|
||||
return d;
|
||||
}
|
||||
|
||||
struct dirent *readdir(DIR *d)
|
||||
{
|
||||
if (!d) return 0;
|
||||
if (!d->first && !FindNextFileW(d->handle, &d->find))
|
||||
return 0;
|
||||
d->first = 0;
|
||||
int rc = WideCharToMultiByte(CP_UTF8, 0, d->find.cFileName, -1,
|
||||
d->ent.d_name, sizeof d->ent.d_name,
|
||||
0, 0);
|
||||
if (rc <= 0) return 0;
|
||||
return &d->ent;
|
||||
}
|
||||
|
||||
int closedir(DIR *d)
|
||||
{
|
||||
if (!d) return -1;
|
||||
if (d->handle != INVALID_HANDLE_VALUE)
|
||||
FindClose(d->handle);
|
||||
free(d);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
68
cli/src/compat/getopt.c
Normal file
68
cli/src/compat/getopt.c
Normal file
@@ -0,0 +1,68 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Minimal POSIX getopt() for MSVC.
|
||||
Supports short options, including ones that take a required argument (e.g., "d:"). */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "include/msvc/getopt.h"
|
||||
|
||||
char *optarg;
|
||||
int optind = 1, opterr = 1, optopt;
|
||||
|
||||
int getopt(int argc, char *const argv[], const char *optstring)
|
||||
{
|
||||
static int optpos = 0;
|
||||
|
||||
if (optind >= argc || !argv[optind])
|
||||
return -1;
|
||||
|
||||
if (argv[optind][0] != '-' || !argv[optind][1])
|
||||
return -1;
|
||||
|
||||
if (argv[optind][1] == '-' && !argv[optind][2]) {
|
||||
optind++;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!optpos)
|
||||
optpos = 1;
|
||||
|
||||
int c = argv[optind][optpos];
|
||||
const char *p = strchr(optstring, c);
|
||||
|
||||
if (!p || c == ':') {
|
||||
optopt = c;
|
||||
if (opterr && optstring[0] != ':')
|
||||
fprintf(stderr, "%s: invalid option -- '%c'\n", argv[0], c);
|
||||
if (!argv[optind][++optpos]) {
|
||||
optind++;
|
||||
optpos = 0;
|
||||
}
|
||||
return '?';
|
||||
}
|
||||
|
||||
if (p[1] == ':') {
|
||||
if (argv[optind][optpos + 1]) {
|
||||
optarg = &argv[optind][optpos + 1];
|
||||
} else if (++optind >= argc) {
|
||||
optopt = c;
|
||||
if (opterr && optstring[0] != ':')
|
||||
fprintf(stderr, "%s: option requires an argument -- '%c'\n",
|
||||
argv[0], c);
|
||||
optpos = 0;
|
||||
return optstring[0] == ':' ? ':' : '?';
|
||||
} else {
|
||||
optarg = argv[optind];
|
||||
}
|
||||
optind++;
|
||||
optpos = 0;
|
||||
} else {
|
||||
if (!argv[optind][++optpos]) {
|
||||
optind++;
|
||||
optpos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
18
cli/src/compat/include/dos/err.h
Normal file
18
cli/src/compat/include/dos/err.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef _ERR_H
|
||||
#define _ERR_H
|
||||
#ifdef __GNUC__
|
||||
#define err_noreturn __attribute__((noreturn))
|
||||
//#define err_noreturn [[noreturn]]
|
||||
#else
|
||||
#define err_noreturn
|
||||
#endif
|
||||
err_noreturn void err(int x, const char* f, ...);
|
||||
err_noreturn void errx(int x, const char* f, ...);
|
||||
void warn(const char* f, ...);
|
||||
void warnx(const char* f, ...);
|
||||
#include <stdarg.h>
|
||||
void verr(int x, const char* f, va_list a);
|
||||
void verrx(int x, const char* f, va_list a);
|
||||
void vwarn(const char* f, va_list a);
|
||||
void vwarnx(const char* f, va_list a);
|
||||
#endif
|
||||
15
cli/src/compat/include/dos/fnmatch.h
Normal file
15
cli/src/compat/include/dos/fnmatch.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#ifndef _FNMATCH_H
|
||||
#define _FNMATCH_H
|
||||
|
||||
#define FNM_PATHNAME 0x1
|
||||
#define FNM_NOESCAPE 0x2
|
||||
#define FNM_PERIOD 0x4
|
||||
#define FNM_LEADING_DIR 0x8
|
||||
#define FNM_CASEFOLD 0x10
|
||||
|
||||
#define FNM_NOMATCH 1
|
||||
#define FNM_NOSYS (-1)
|
||||
|
||||
int fnmatch(const char *, const char *, int);
|
||||
|
||||
#endif
|
||||
@@ -1,8 +1,11 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
#ifndef _ERR_H
|
||||
#define _ERR_H
|
||||
#ifdef __GNUC__
|
||||
#define err_noreturn __attribute__((noreturn))
|
||||
//#define err_noreturn [[noreturn]]
|
||||
#elif defined(_MSC_VER)
|
||||
#define err_noreturn __declspec(noreturn)
|
||||
#else
|
||||
#define err_noreturn
|
||||
#endif
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
#ifndef _FNMATCH_H
|
||||
#define _FNMATCH_H
|
||||
|
||||
|
||||
28
cli/src/compat/include/msvc/dirent.h
Normal file
28
cli/src/compat/include/msvc/dirent.h
Normal file
@@ -0,0 +1,28 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Minimal POSIX dirent.h for MSVC.
|
||||
*
|
||||
* Implements only what UC2's archive scanner uses: opendir, readdir,
|
||||
* closedir, and a struct dirent with d_name. UTF-8 paths are
|
||||
* round-tripped through the wide-char Win32 APIs to match the rest of
|
||||
* the compat layer. d_name is sized to hold a Windows MAX_PATH-long
|
||||
* filename re-encoded to UTF-8 (worst case: 3 bytes per UTF-16 code unit). */
|
||||
|
||||
#ifndef _COMPAT_DIRENT_H
|
||||
#define _COMPAT_DIRENT_H
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#define UC2_DIRENT_NAME_MAX 1024 /* >= 259 UTF-16 units * 3 UTF-8 bytes + NUL (778); assumes cFileName is WCHAR[MAX_PATH] */
|
||||
|
||||
struct dirent {
|
||||
char d_name[UC2_DIRENT_NAME_MAX];
|
||||
};
|
||||
|
||||
typedef struct UC2_DIR DIR;
|
||||
|
||||
DIR *opendir(const char *path);
|
||||
struct dirent *readdir(DIR *d);
|
||||
int closedir(DIR *d);
|
||||
|
||||
#endif
|
||||
12
cli/src/compat/include/msvc/getopt.h
Normal file
12
cli/src/compat/include/msvc/getopt.h
Normal file
@@ -0,0 +1,12 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Minimal POSIX getopt for MSVC */
|
||||
#ifndef _COMPAT_GETOPT_H
|
||||
#define _COMPAT_GETOPT_H
|
||||
|
||||
extern char *optarg;
|
||||
extern int optind, opterr, optopt;
|
||||
|
||||
int getopt(int argc, char *const argv[], const char *optstring);
|
||||
|
||||
#endif
|
||||
42
cli/src/compat/include/msvc/unistd.h
Normal file
42
cli/src/compat/include/msvc/unistd.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Minimal POSIX unistd.h for MSVC */
|
||||
#ifndef _COMPAT_UNISTD_H
|
||||
#define _COMPAT_UNISTD_H
|
||||
|
||||
#include <io.h>
|
||||
#include <direct.h>
|
||||
|
||||
#ifndef F_OK
|
||||
#define F_OK 0
|
||||
#endif
|
||||
#ifndef R_OK
|
||||
#define R_OK 4
|
||||
#endif
|
||||
#ifndef W_OK
|
||||
#define W_OK 2
|
||||
#endif
|
||||
#ifndef X_OK
|
||||
#define X_OK 0
|
||||
#endif
|
||||
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX 260
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
#ifndef S_ISDIR
|
||||
#define S_ISDIR(m) (((m) & _S_IFMT) == _S_IFDIR)
|
||||
#endif
|
||||
#ifndef S_ISREG
|
||||
#define S_ISREG(m) (((m) & _S_IFMT) == _S_IFREG)
|
||||
#endif
|
||||
|
||||
/* Provided by compat_win32.c (UTF-8-aware via wide-char APIs) */
|
||||
int access(const char *path, int mode);
|
||||
int unlink(const char *path);
|
||||
int chdir(const char *path);
|
||||
int mkdir(const char *path, int mode);
|
||||
int chmod(const char *path, int mode);
|
||||
|
||||
#endif
|
||||
30
cli/src/compat/include/msvc/utime.h
Normal file
30
cli/src/compat/include/msvc/utime.h
Normal file
@@ -0,0 +1,30 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* POSIX utime.h for MSVC.
|
||||
*
|
||||
* Modern MSVC SDKs (Windows 10 SDK 10.0.26100+) provide both
|
||||
* struct utimbuf and an inline wrapper named utime() in <sys/utime.h>.
|
||||
* The inline wrapper is not UTF-8-aware: it forwards to _utime32,
|
||||
* which interprets the path in the local ANSI codepage.
|
||||
*
|
||||
* UC2 needs UTF-8 paths to round-trip correctly, so this shim
|
||||
* substitutes utime() with our compat__utime(), which goes through
|
||||
* compat__wpath() before calling _wutime32. */
|
||||
|
||||
#ifndef _COMPAT_UTIME_H
|
||||
#define _COMPAT_UTIME_H
|
||||
|
||||
#include <sys/utime.h>
|
||||
|
||||
#ifdef _COMPAT_UTIMBUF_FALLBACK
|
||||
#include <time.h>
|
||||
struct utimbuf {
|
||||
time_t actime;
|
||||
time_t modtime;
|
||||
};
|
||||
#endif
|
||||
|
||||
int compat__utime(const char *path, struct utimbuf *ut);
|
||||
#define utime compat__utime
|
||||
|
||||
#endif
|
||||
1417
cli/src/main.c
1417
cli/src/main.c
File diff suppressed because it is too large
Load Diff
183
cli/uc2.1
Normal file
183
cli/uc2.1
Normal file
@@ -0,0 +1,183 @@
|
||||
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
||||
.Dd June 11, 2026
|
||||
.Dt UC2 1
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm uc2
|
||||
.Nd UltraCompressor II archiver
|
||||
.Sh SYNOPSIS
|
||||
.Nm
|
||||
.Op Fl afpqDT
|
||||
.Op Fl C Ar directory
|
||||
.Op Fl d Ar destination
|
||||
.Ar archive.uc2
|
||||
.Op Ar file ...
|
||||
.Nm
|
||||
.Fl l
|
||||
.Op Fl aqT
|
||||
.Ar archive.uc2
|
||||
.Op Ar file ...
|
||||
.Nm
|
||||
.Fl t
|
||||
.Op Fl aq
|
||||
.Ar archive.uc2
|
||||
.Op Ar file ...
|
||||
.Nm
|
||||
.Fl w
|
||||
.Op Fl q
|
||||
.Op Fl L Ar level
|
||||
.Ar archive.uc2
|
||||
.Ar file ...
|
||||
.Nm
|
||||
.Fl B
|
||||
.Ar file ...
|
||||
.Nm
|
||||
.Fl -ots-attach Ar proof.ots
|
||||
.Op Fl f
|
||||
.Ar archive.uc2
|
||||
.Nm
|
||||
.Fl -ots-extract
|
||||
.Ar archive.uc2 out.ots
|
||||
.Nm
|
||||
.Fl -ots-info
|
||||
.Ar archive.uc2
|
||||
.Nm
|
||||
.Fl -ingest Ar archive
|
||||
.Nm
|
||||
.Fl -ingest-restore Ar archive
|
||||
.Sh DESCRIPTION
|
||||
.Nm
|
||||
is a cross-platform revival of UltraCompressor II, the MS-DOS archiver
|
||||
published by Nico de Vries (AIP-NL) between 1992 and 1996.
|
||||
It reads and writes the original UC2 archive format byte-compatibly:
|
||||
archives created by the original
|
||||
.Pa uc2pro.exe
|
||||
extract correctly, and archives created at compression levels 2 through 5
|
||||
can be read by the original DOS program.
|
||||
.Pp
|
||||
Without a mode option,
|
||||
.Nm
|
||||
extracts the named archive into the current directory (or into the
|
||||
directory given with
|
||||
.Fl d ) .
|
||||
If
|
||||
.Ar file
|
||||
arguments are given, only matching entries are processed; shell-style
|
||||
wildcards are accepted.
|
||||
.Pp
|
||||
Compression levels 2 through 5 use the original LZ77 and Huffman
|
||||
bitstream.
|
||||
Levels 6 through 9 replace the entropy stage with rANS coding for
|
||||
better ratios; such archives require
|
||||
.Nm
|
||||
version 3 and cannot be read by the original DOS program.
|
||||
Files with similar content are grouped and compressed against shared
|
||||
master blocks, the deduplication scheme UC2 introduced in 1992.
|
||||
.Sh OPTIONS
|
||||
.Bl -tag -width Ds
|
||||
.It Fl a
|
||||
Process all stored versions of each file, not only the most recent.
|
||||
.It Fl B
|
||||
Benchmark every compression method on the given input files and print
|
||||
a comparison table; no archive is written.
|
||||
.It Fl C Ar directory
|
||||
Change to
|
||||
.Ar directory
|
||||
before reading or writing any files.
|
||||
Applies in every mode.
|
||||
.It Fl c , Fl p
|
||||
Write extracted data to standard output instead of creating files.
|
||||
.It Fl D
|
||||
Do not restore modification times and permissions of directories.
|
||||
Given twice, file metadata is not restored either.
|
||||
.It Fl d Ar destination
|
||||
Extract into
|
||||
.Ar destination
|
||||
instead of the current directory.
|
||||
.It Fl f
|
||||
Overwrite existing files when extracting, and allow
|
||||
.Fl -ots-attach
|
||||
to replace an existing proof.
|
||||
.It Fl h
|
||||
Print version, usage, and option summary.
|
||||
.It Fl L Ar level
|
||||
Select the compression level for
|
||||
.Fl w :
|
||||
2 (Fast), 3 (Normal), 4 (Tight, the default), 5 (Ultra) for the
|
||||
backward-compatible Huffman methods, or 6 through 9 for rANS.
|
||||
.It Fl l
|
||||
List the archive contents.
|
||||
.It Fl q
|
||||
Quiet operation; suppress status messages.
|
||||
.It Fl T
|
||||
Separate listing columns with tabs instead of spaces.
|
||||
.It Fl t
|
||||
Test archive integrity without writing any files.
|
||||
.It Fl w
|
||||
Create (write) an archive from the given files and directories.
|
||||
.It Fl x
|
||||
Extract; this is the default mode.
|
||||
.El
|
||||
.Pp
|
||||
The long options operate on OpenTimestamps proofs and the streaming
|
||||
deduplication store:
|
||||
.Bl -tag -width Ds
|
||||
.It Fl -ots-attach Ar proof.ots
|
||||
Attach an OpenTimestamps proof to the archive after verifying that the
|
||||
proof commits to the archive contents.
|
||||
The proof is stored in a trailer that the original DOS program ignores.
|
||||
.It Fl -ots-extract
|
||||
Write the attached proof to the named output file, suitable for
|
||||
.Ql ots verify .
|
||||
.It Fl -ots-info
|
||||
Describe the attached proof.
|
||||
.It Fl -ingest Ar archive
|
||||
Read a data stream from standard input into a content-addressed
|
||||
deduplicating block store inside
|
||||
.Ar archive .
|
||||
.It Fl -ingest-restore Ar archive
|
||||
Write the ingested stream from
|
||||
.Ar archive
|
||||
to standard output.
|
||||
.El
|
||||
.Sh EXIT STATUS
|
||||
.Ex -std
|
||||
.Sh EXAMPLES
|
||||
Create an archive at the default level and list it:
|
||||
.Bd -literal -offset indent
|
||||
$ uc2 -w backup.uc2 project/ notes.txt
|
||||
$ uc2 -l backup.uc2
|
||||
.Ed
|
||||
.Pp
|
||||
Extract it elsewhere, overwriting existing files (the destination
|
||||
directory must exist):
|
||||
.Bd -literal -offset indent
|
||||
$ mkdir -p /tmp/restore
|
||||
$ uc2 -f -d /tmp/restore backup.uc2
|
||||
.Ed
|
||||
.Pp
|
||||
Create a smaller archive readable only by
|
||||
.Nm
|
||||
version 3, then verify it:
|
||||
.Bd -literal -offset indent
|
||||
$ uc2 -w -L 9 backup.uc2 project/
|
||||
$ uc2 -t backup.uc2
|
||||
.Ed
|
||||
.Sh HISTORY
|
||||
UltraCompressor II was written by Nico de Vries at AIP-NL and sold as
|
||||
shareware from 1992 to 1996.
|
||||
De Vries re-released the original source under the LGPL in 2015; Jan
|
||||
Bobrowski's libunuc2 (2020) built a portable decompressor from it.
|
||||
.Nm
|
||||
version 3 builds on that lineage as a full reimplementation in C99,
|
||||
maintained by Eremey Valetov.
|
||||
.Sh AUTHORS
|
||||
.An Nico de Vries
|
||||
(original UltraCompressor II),
|
||||
.An Jan Bobrowski
|
||||
(libunuc2),
|
||||
.An Eremey Valetov Aq Mt evaletov@protonmail.com
|
||||
(version 3).
|
||||
.Pp
|
||||
Source and issue tracker:
|
||||
.Lk https://github.com/evvaletov/uc2
|
||||
79
cmake/README-djgpp.md
Normal file
79
cmake/README-djgpp.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# DJGPP cross-compile
|
||||
|
||||
This builds `uc2.exe` for DOS / FreeDOS using the DJGPP toolchain.
|
||||
The output is a 32-bit protected-mode DOS executable that runs under
|
||||
the bundled `cwsdpmi.exe` extender (or any DPMI host).
|
||||
|
||||
## One-time setup
|
||||
|
||||
1. Get a DJGPP cross-toolchain. The simplest source is the prebuilt
|
||||
release from `andrewwutw/build-djgpp`:
|
||||
|
||||
```sh
|
||||
curl -fsLO https://github.com/andrewwutw/build-djgpp/releases/download/v3.4/djgpp-linux64-gcc1220.tar.bz2
|
||||
sudo mkdir -p /opt && sudo tar xjf djgpp-linux64-gcc1220.tar.bz2 -C /opt
|
||||
```
|
||||
|
||||
This puts the toolchain at `/opt/djgpp/`. Use any prefix; pass
|
||||
it as `-DDJGPP_ROOT=<prefix>` when configuring.
|
||||
|
||||
2. (Linux hosts) Make sure your shell has not exported `CPATH` or
|
||||
`CPLUS_INCLUDE_PATH`. Some distros and dev environments
|
||||
(Intel oneAPI, certain conda envs) export them. GCC honours these
|
||||
regardless of `-nostdinc`, so any host include directory listed there
|
||||
ends up *first* in the cross-compiler's search path -- typically
|
||||
pulling in glibc headers that fail to compile against DJGPP libc.
|
||||
Either `unset CPATH CPLUS_INCLUDE_PATH` for the build shell, or
|
||||
wrap the cmake invocation in `env -u CPATH -u CPLUS_INCLUDE_PATH`.
|
||||
|
||||
## Build
|
||||
|
||||
```sh
|
||||
unset CPATH CPLUS_INCLUDE_PATH
|
||||
cmake -B build-djgpp \
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/djgpp.cmake \
|
||||
-DDJGPP_ROOT=/opt/djgpp
|
||||
cmake --build build-djgpp
|
||||
```
|
||||
|
||||
Output: `build-djgpp/cli/uc2` (also linked as `uc2.exe`). Copy it
|
||||
plus `cwsdpmi.exe` (shipped with DJGPP at
|
||||
`<DJGPP_ROOT>/i586-pc-msdosdjgpp/bin/cwsdpmi.exe`) to a DOS volume.
|
||||
|
||||
## Status
|
||||
|
||||
- Compiles clean against the DJGPP gcc 12.2.0 toolchain (the
|
||||
`cmake/djgpp.cmake` include paths are pinned to that version; the
|
||||
CI job and the andrewwutw v3.4 release both use 12.2.0).
|
||||
- Library (`libuc2.a`) builds without changes.
|
||||
- CLI uses the DOS compat layer in `cli/src/compat/compat_dos.c` for
|
||||
the BSD `err.h` and POSIX `fnmatch` shims.
|
||||
- Runtime smoke test verified: `uc2 -h` and `uc2 -l <archive>` both
|
||||
succeed under DOSBox-X via `tests/scripts/dos_smoke.sh`.
|
||||
|
||||
## Smoke test
|
||||
|
||||
```sh
|
||||
# 1. Get CWSDPMI extender:
|
||||
curl -fsLO http://www.delorie.com/pub/djgpp/current/v2misc/csdpmi7b.zip
|
||||
unzip csdpmi7b.zip -d /tmp/cwsdpmi
|
||||
|
||||
# 2. Run the smoke test (needs flatpak install com.dosbox_x.DOSBox-X):
|
||||
tests/scripts/dos_smoke.sh \
|
||||
build-djgpp/cli/uc2.exe \
|
||||
/tmp/cwsdpmi/bin/CWSDPMI.EXE \
|
||||
tests/archives/basic.uc2
|
||||
```
|
||||
|
||||
Exits 0 on success. Skips (with a "SKIP: ..." line) if any of:
|
||||
the DJGPP build was not run, CWSDPMI.EXE is missing, or DOSBox-X is
|
||||
not installed.
|
||||
|
||||
## Notes
|
||||
|
||||
- The toolchain sets `CMAKE_SYSTEM_NAME Generic` and `-nostdinc` with
|
||||
explicit DJGPP include paths, so the compiler check links a test
|
||||
binary (rather than running one) and host headers never leak in.
|
||||
- DJGPP's `unistd.h` provides POSIX-shaped APIs; most of the existing
|
||||
source compiles unchanged. The library has no DOS-specific code
|
||||
paths.
|
||||
@@ -2,6 +2,11 @@
|
||||
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
|
||||
add_compile_options(-Wall -Wextra -Wno-unused-parameter)
|
||||
if(DJGPP)
|
||||
# DJGPP needs gnu99 for PATH_MAX and other POSIX extensions
|
||||
add_compile_options(-std=gnu99)
|
||||
endif()
|
||||
elseif(MSVC)
|
||||
add_compile_options(/W3)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
endif()
|
||||
|
||||
34
cmake/djgpp.cmake
Normal file
34
cmake/djgpp.cmake
Normal file
@@ -0,0 +1,34 @@
|
||||
# CMake toolchain file for DJGPP cross-compilation (DOS target)
|
||||
#
|
||||
# Usage: cmake -B build-dos -DCMAKE_TOOLCHAIN_FILE=cmake/djgpp.cmake
|
||||
# cmake --build build-dos
|
||||
|
||||
set(CMAKE_SYSTEM_NAME Generic)
|
||||
set(CMAKE_SYSTEM_PROCESSOR i586)
|
||||
|
||||
set(DJGPP_ROOT "/usr/local/djgpp" CACHE PATH "DJGPP installation root")
|
||||
set(DJGPP_PREFIX "i586-pc-msdosdjgpp" CACHE STRING "DJGPP toolchain prefix")
|
||||
|
||||
set(CMAKE_C_COMPILER "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-gcc")
|
||||
set(CMAKE_CXX_COMPILER "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-g++")
|
||||
set(CMAKE_ASM_COMPILER "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-gcc")
|
||||
set(CMAKE_AR "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-ar" CACHE FILEPATH "")
|
||||
set(CMAKE_RANLIB "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-ranlib" CACHE FILEPATH "")
|
||||
set(CMAKE_STRIP "${DJGPP_ROOT}/bin/${DJGPP_PREFIX}-strip" CACHE FILEPATH "")
|
||||
|
||||
# This DJGPP cross-compiler has /usr/include baked in and -nostdinc doesn't
# remove it. Using -I (not -isystem) puts the DJGPP paths before /usr/include
# so the correct headers are always found first.
#
# The compiler's private headers live under lib/gcc/<prefix>/<gcc-version>/.
# The version defaults to the 12.2.0 toolchain used by CI; override it with
# -DDJGPP_GCC_VERSION=<x.y.z> when building against a different DJGPP release.
set(DJGPP_GCC_VERSION "12.2.0" CACHE STRING "GCC version of the DJGPP toolchain")
set(_DJGPP_GCC_DIR "${DJGPP_ROOT}/lib/gcc/${DJGPP_PREFIX}/${DJGPP_GCC_VERSION}")
set(_DJGPP_NOSTDINC "-nostdinc -I${_DJGPP_GCC_DIR}/include -I${_DJGPP_GCC_DIR}/include-fixed -I${DJGPP_ROOT}/${DJGPP_PREFIX}/sys-include")
set(CMAKE_C_FLAGS_INIT "${_DJGPP_NOSTDINC}")
set(CMAKE_ASM_FLAGS_INIT "${_DJGPP_NOSTDINC}")
|
||||
|
||||
set(CMAKE_FIND_ROOT_PATH "${DJGPP_ROOT}/${DJGPP_PREFIX}")
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
|
||||
set(CMAKE_EXECUTABLE_SUFFIX ".exe")
|
||||
|
||||
set(DJGPP TRUE)
|
||||
set(DOS TRUE)
|
||||
58
contrib/libarchive/CMakeLists.txt
Normal file
58
contrib/libarchive/CMakeLists.txt
Normal file
@@ -0,0 +1,58 @@
|
||||
# Optional libarchive read-format plugin for UC2.
|
||||
#
|
||||
# Enable with -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON. Because libarchive's
|
||||
# read-format API is internal (archive_read_private.h, __archive_read_*),
|
||||
# the plugin links against a libarchive source tree rather than the
|
||||
# installed -devel package. Pass -DLIBARCHIVE_SOURCE_DIR=<path> to a
|
||||
# checkout (or extracted release tarball) of libarchive.
|
||||
|
||||
option(UC2_BUILD_LIBARCHIVE_PLUGIN
|
||||
"Build the libarchive read-format plugin (milestone 1: bid)" OFF)
|
||||
|
||||
if(NOT UC2_BUILD_LIBARCHIVE_PLUGIN)
|
||||
return()
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED LIBARCHIVE_SOURCE_DIR)
|
||||
message(WARNING
|
||||
"UC2_BUILD_LIBARCHIVE_PLUGIN=ON but LIBARCHIVE_SOURCE_DIR is "
|
||||
"not set. Pass -DLIBARCHIVE_SOURCE_DIR=<path-to-libarchive-source>; "
|
||||
"the plugin needs libarchive's private headers (archive_read_private.h, "
|
||||
"archive_platform.h). Skipping plugin build.")
|
||||
return()
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${LIBARCHIVE_SOURCE_DIR}/libarchive/archive_read_private.h")
|
||||
message(WARNING
|
||||
"LIBARCHIVE_SOURCE_DIR=${LIBARCHIVE_SOURCE_DIR} does not look like "
|
||||
"a libarchive source tree (archive_read_private.h not found). "
|
||||
"Skipping plugin build.")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# libarchive's headers live alongside its own config.h; set up enough
|
||||
# private include paths for archive_platform.h to compile against the
|
||||
# build host's <archive.h>.
|
||||
add_library(uc2_libarchive STATIC archive_read_support_format_uc2.c)
|
||||
target_include_directories(uc2_libarchive PRIVATE
|
||||
"${LIBARCHIVE_SOURCE_DIR}/libarchive"
|
||||
)
|
||||
target_link_libraries(uc2_libarchive PRIVATE uc2)
|
||||
target_compile_features(uc2_libarchive PRIVATE c_std_99)
|
||||
|
||||
# archive_platform.h pulls in build-time configuration that is normally
|
||||
# generated by libarchive's own configure. Bypass that path: tell the
|
||||
# compiler we are not using HAVE_CONFIG_H, and provide enough feature
|
||||
# guesses to keep the platform header satisfied.
|
||||
target_compile_definitions(uc2_libarchive PRIVATE
|
||||
PLATFORM_CONFIG_H="archive_platform_config.uc2.h"
|
||||
)
|
||||
configure_file(archive_platform_config.uc2.h.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/archive_platform_config.uc2.h"
|
||||
COPYONLY)
|
||||
target_include_directories(uc2_libarchive PRIVATE
|
||||
"${CMAKE_CURRENT_BINARY_DIR}"
|
||||
)
|
||||
|
||||
message(STATUS "UC2: libarchive plugin enabled "
|
||||
"(milestone 1, source=${LIBARCHIVE_SOURCE_DIR})")
|
||||
166
contrib/libarchive/README.md
Normal file
166
contrib/libarchive/README.md
Normal file
@@ -0,0 +1,166 @@
|
||||
# UC2 read-format plugin for libarchive
|
||||
|
||||
This directory contains the design and a skeleton implementation of a
|
||||
read-only `.uc2` format handler for libarchive. The goal is to make
|
||||
UC2 archives transparently extractable by every libarchive-using tool
|
||||
(`bsdtar`, `cmake`, `pkg(8)`, file-roller, Ark, and others).
|
||||
|
||||
## Status
|
||||
|
||||
- **Milestones 1-3 shipped.** `archive_read_support_format_uc2.c`
|
||||
implements:
|
||||
- `bid()` -- `__archive_read_ahead` reads the first 4 bytes,
|
||||
returns 64 on UC2 magic.
|
||||
- `read_header()` -- on first call, slurps the entire archive
|
||||
into memory via `__archive_read_ahead` + `__archive_read_consume`,
|
||||
opens a `libuc2` handle bound to the slurped buffer, walks
|
||||
`uc2_read_cdir` to cache every entry (with `uc2_get_tag`
|
||||
resolution for tagged entries), then yields entries one per call
|
||||
via `archive_entry_set_pathname` / `set_size` / `set_mtime` /
|
||||
`set_filetype` / `set_perm`.
|
||||
- `read_data()` -- on first call per entry, runs `uc2_extract`
|
||||
with a buffering write callback, then yields the whole entry
|
||||
in one slice; subsequent calls return `ARCHIVE_EOF`.
|
||||
- `read_data_skip()` and `cleanup()` -- correct.
|
||||
- Memory model: archive is slurped fully on the first `read_header`,
|
||||
so memory use scales with archive size. Acceptable for v1; future
|
||||
work can swap in a seekable adapter when the underlying filter
|
||||
supports `__archive_read_seek`.
|
||||
- `CMakeLists.txt` activates with `-DUC2_BUILD_LIBARCHIVE_PLUGIN=ON
|
||||
-DLIBARCHIVE_SOURCE_DIR=<libarchive-checkout>`. The pin against a
|
||||
source tree (rather than `find_package(LibArchive)`) is required
|
||||
because the read-format API is internal -- the public `-devel`
|
||||
package ships only `archive.h` and `archive_entry.h`.
|
||||
|
||||
## Integration recipe (manual, until upstream merge)
|
||||
|
||||
To actually exercise the plugin from `bsdtar`, the plugin must be
|
||||
linked into the libarchive binary itself (the relevant API is internal
|
||||
and not exported from the system shared library). Two paths:
|
||||
|
||||
1. **Drop-in patch.** Copy `archive_read_support_format_uc2.c` into
|
||||
`libarchive/libarchive/`, then add one line to
|
||||
`libarchive/libarchive/archive_read_support_format_all.c`:
|
||||
|
||||
```c
|
||||
archive_read_support_format_uc2(a);
|
||||
```
|
||||
|
||||
plus one entry in `libarchive/libarchive/CMakeLists.txt` next to
|
||||
the other `archive_read_support_format_*.c` sources. Rebuild
|
||||
libarchive; then `bsdtar -tf archive.uc2` lists entries.
|
||||
|
||||
2. **External link.** Build `libuc2_libarchive.a` from this directory
|
||||
(`cmake -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON -DLIBARCHIVE_SOURCE_DIR=...`).
|
||||
Build a custom `libarchive_static.a` that includes the same
|
||||
`LIBARCHIVE_SOURCE_DIR`. Link both into a small driver program
|
||||
that calls `archive_read_support_format_uc2(a)`.
|
||||
|
||||
The upstream PR (milestone 8 in the original issue) replaces both
|
||||
recipes with a single first-class `bsdtar` integration.
|
||||
|
||||
## Why an out-of-tree skeleton?
|
||||
|
||||
libarchive's read-format plugin API is internal.
|
||||
`__archive_read_register_format` is a private, non-exported function, not part of the
|
||||
public ABI. An out-of-tree `.so` cannot be loaded into an unmodified
|
||||
libarchive at runtime.
|
||||
|
||||
The supported integration paths are:
|
||||
|
||||
1. **Upstream merge.** Submit
|
||||
`archive_read_support_format_uc2.c` as a PR against
|
||||
`libarchive/libarchive`. Once merged, distros pick it up and
|
||||
every tool that links libarchive sees `.uc2` automatically. This
|
||||
is the long-term goal.
|
||||
|
||||
2. **Patched libarchive build.** Distribute a small patch that
|
||||
includes the UC2 plugin against a known libarchive version.
|
||||
Useful for testing before upstream merge and for users who want
|
||||
`.uc2` support before the upstream release reaches their distro.
|
||||
|
||||
3. **Static-library wrapper.** Build the plugin as part of a custom
|
||||
tool that statically links libarchive + this plugin. Useful for
|
||||
demo binaries; not a substitute for upstream merge because the
|
||||
wrapper still won't be picked up by `bsdtar` etc.
|
||||
|
||||
## Architecture
|
||||
|
||||
UC2 archives use a fixed front header (29 bytes), a record stream
|
||||
of compressed bodies, and a compressed central directory whose
|
||||
offset is recorded in the front header. The central directory
|
||||
holds OHEAD records for masters, dirs, and files; entry attributes
|
||||
are in OSMETA + DIRMETA / FILEMETA.
|
||||
|
||||
The plugin uses libuc2 for parsing and decompression and adapts the
|
||||
results to libarchive's `struct archive_entry` model. libuc2 already
|
||||
exposes a streaming read API (`uc2_open`, `uc2_read_cdir`,
|
||||
`uc2_extract`) and is GPL-3.0 / LGPL-3.0; the plugin is GPL-3.0-or-later
|
||||
to match the cli/main.c license boundary. See
|
||||
[`docs/license-audit.md`](../../docs/license-audit.md) for the
|
||||
provenance table.
|
||||
|
||||
### Callback responsibilities
|
||||
|
||||
- **bid**: read the first 4 bytes via `__archive_read_ahead`, check
|
||||
for the UC2 magic (`0x1A324355`). Return 64 on match, 0 otherwise.
|
||||
libarchive uses the highest bid to pick a format; 64 is the
|
||||
conventional "format-recognised" score.
|
||||
|
||||
- **read_header**: on first call, open the libuc2 handle and read
|
||||
the central directory into memory. On every call, return one
|
||||
entry's metadata via `archive_entry_*` setters. When entries are
|
||||
exhausted, return `ARCHIVE_EOF`.
|
||||
|
||||
- **read_data**: stream decompressed bytes for the current entry.
|
||||
libuc2's `uc2_extract` invokes a write callback per chunk; the
|
||||
plugin needs to convert this push model into libarchive's pull
|
||||
model (the standard way: a small ring buffer, plus a generator
|
||||
loop or coroutine). The simplest first implementation buffers
|
||||
the whole entry, which is correct but increases memory pressure
|
||||
for very large files; refine later.
|
||||
|
||||
- **read_data_skip**: advance to the next entry without producing
|
||||
data. Decompression cannot be safely skipped (master-block
|
||||
dependencies), so the plugin still decompresses, just discards.
|
||||
|
||||
- **cleanup**: close the libuc2 handle, free buffers.
|
||||
|
||||
### libuc2 IO callbacks
|
||||
|
||||
libuc2 takes user-supplied callbacks for read/alloc/free/warn. The
|
||||
plugin wires these to libarchive's filter stack:
|
||||
- `read` -> `__archive_read_seek` + `__archive_read_ahead`
|
||||
- `alloc`/`free` -> `malloc`/`free`
|
||||
- `warn` -> push to libarchive's warning log via
|
||||
`archive_set_error`.
|
||||
|
||||
## Build
|
||||
|
||||
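The CMake target only configures when `LIBARCHIVE_SOURCE_DIR` points at a
libarchive source tree, because the plugin needs libarchive's private headers.
The distro development packages (`dnf install libarchive-devel` on Fedora/RHEL,
`apt install libarchive-dev` on Debian) ship only the public headers and are
not sufficient on their own; use a libarchive source checkout.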
|
||||
|
||||
```sh
|
||||
cmake -B build -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON -DLIBARCHIVE_SOURCE_DIR=<libarchive-checkout>
|
||||
cmake --build build --target uc2_libarchive
|
||||
```
|
||||
|
||||
The built object can be linked into a libarchive-using application or
|
||||
patched into libarchive's source tree (`libarchive/libarchive/`).
|
||||
|
||||
## Roadmap
|
||||
|
||||
The plugin began as a stub library that registered a no-op format.
The implementation milestones, in order (see Status above for what has shipped):
|
||||
|
||||
1. bid function with magic check (~20 lines)
|
||||
2. read_header for the first entry only (single-file archives)
|
||||
3. read_data for uncompressed-by-master entries
|
||||
4. Master-block decompression and dependency tracking
|
||||
5. Multi-file archives + directory entries
|
||||
6. Tagged entries (long names, extended attributes)
|
||||
7. Round-trip test against bsdtar built from a patched libarchive
|
||||
8. Upstream PR
|
||||
|
||||
Each milestone is independently shippable as a working subset.
|
||||
56
contrib/libarchive/archive_platform_config.uc2.h.in
Normal file
56
contrib/libarchive/archive_platform_config.uc2.h.in
Normal file
@@ -0,0 +1,56 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Minimal stand-in for libarchive's generated config.h.
|
||||
*
|
||||
* libarchive's archive_platform.h refuses to include if it can't find
|
||||
* either a generated config.h (via HAVE_CONFIG_H + autoconf) or a
|
||||
* hand-built PLATFORM_CONFIG_H. This file is the latter. Only the
|
||||
* minimum needed to compile our bid() and read_header() is set; full
|
||||
* libarchive functionality is not required from this translation unit.
|
||||
*/
|
||||
|
||||
#ifndef UC2_ARCHIVE_PLATFORM_CONFIG_H
|
||||
#define UC2_ARCHIVE_PLATFORM_CONFIG_H
|
||||
|
||||
/* Common POSIX feature presence (Linux/macOS/BSD). Adjust by host
|
||||
* if cross-compiling onto something different. */
|
||||
#define HAVE_STDINT_H 1
|
||||
#define HAVE_INTTYPES_H 1
|
||||
#define HAVE_STDLIB_H 1
|
||||
#define HAVE_STRING_H 1
|
||||
#define HAVE_SYS_TYPES_H 1
|
||||
#define HAVE_UNISTD_H 1
|
||||
#define HAVE_ERRNO_H 1
|
||||
#define HAVE_LIMITS_H 1
|
||||
#define HAVE_FCNTL_H 1
|
||||
#define HAVE_LOCALE_H 1
|
||||
#define HAVE_WCHAR_H 1
|
||||
#define HAVE_WCTYPE_H 1
|
||||
#define HAVE_TIME_H 1
|
||||
#define HAVE_ICONV_H 1
|
||||
#define HAVE_LANGINFO_H 1
|
||||
#define HAVE_DECL_NL_LANGINFO 1
|
||||
#define HAVE_DECL_INT32_MAX 1
|
||||
#define HAVE_DECL_INT32_MIN 1
|
||||
#define HAVE_DECL_UINT32_MAX 1
|
||||
#define HAVE_DECL_INT64_MAX 1
|
||||
#define HAVE_DECL_INT64_MIN 1
|
||||
#define HAVE_DECL_UINT64_MAX 1
|
||||
#define HAVE_DECL_INTMAX_MAX 1
|
||||
#define HAVE_DECL_INTMAX_MIN 1
|
||||
#define HAVE_DECL_UINTMAX_MAX 1
|
||||
#define HAVE_DECL_SIZE_MAX 1
|
||||
#define HAVE_DECL_SSIZE_MAX 1
|
||||
|
||||
/* iconv shape on glibc / FreeBSD.  ICONV_CONST is a qualifier slot (it is
 * written in front of a type, as in `ICONV_CONST char *`), so it must
 * expand to nothing or to `const` -- never to a string literal. */
#define ICONV_CONST
|
||||
|
||||
/* libarchive expects these to be defined to 0 or 1. */
|
||||
#define HAVE_ZLIB_H 0
|
||||
#define HAVE_BZLIB_H 0
|
||||
#define HAVE_LZMA_H 0
|
||||
#define HAVE_LZO1X_H 0
|
||||
#define HAVE_LZ4_H 0
|
||||
#define HAVE_ZSTD_H 0
|
||||
|
||||
#endif
|
||||
510
contrib/libarchive/archive_read_support_format_uc2.c
Normal file
510
contrib/libarchive/archive_read_support_format_uc2.c
Normal file
@@ -0,0 +1,510 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* libarchive read handler for UC2 v3 archives.
|
||||
*
|
||||
* Status: milestones 1-6.
|
||||
* M1 -- bid() with UC2 magic check.
|
||||
* M2 -- read_header iterates uc2_read_cdir, maps each cdir entry to
|
||||
* libarchive's archive_entry shape (name, size, mode, mtime).
|
||||
* M3 -- read_data uses uc2_extract to decompress an entry, buffers
|
||||
* the result, then yields it via libarchive's pull-style API.
|
||||
* M4 -- master blocks resolve inside libuc2 during uc2_extract.
|
||||
* M5 -- multi-file archives with full directory paths composed from
|
||||
* the cdir's directory ids (parent-before-child not assumed).
|
||||
* M6 -- tagged entries (Win95 long names) resolved via uc2_get_tag.
|
||||
*
|
||||
* Strategy: on the first read_header call we slurp the entire archive
|
||||
* into memory through __archive_read_ahead, then drive libuc2 against
|
||||
* that buffer. This is correct for any input but uses memory equal
|
||||
* to the archive size; future revisions can swap in a seekable adapter
|
||||
* when the underlying filter supports __archive_read_seek.
|
||||
*
|
||||
* Built against libarchive's internal API
|
||||
* (archive_read_private.h, __archive_read_ahead,
|
||||
* __archive_read_register_format), so it must compile inside a
|
||||
* libarchive source tree. Pass -DLIBARCHIVE_SOURCE_DIR=<path> to
|
||||
* cmake to build standalone.
|
||||
*/
|
||||
|
||||
#include "archive_platform.h"
|
||||
|
||||
#include "archive.h"
|
||||
#include "archive_entry.h"
|
||||
#include "archive_private.h"
|
||||
#include "archive_read_private.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <uc2/libuc2.h>
|
||||
|
||||
#define ARCHIVE_FORMAT_UC2 0xC0FF0000 /* placeholder format code */
|
||||
|
||||
struct uc2_la_state {
|
||||
/* Slurped archive */
|
||||
uint8_t *data;
|
||||
size_t len;
|
||||
int slurped; /* 0 = not yet, 1 = done */
|
||||
|
||||
/* libuc2 */
|
||||
uc2_handle handle;
|
||||
|
||||
/* Cached cdir entries. uc2_read_cdir is single-pass; we capture
|
||||
* everything on the first read_header call. */
|
||||
struct uc2_entry *entries;
|
||||
char **paths; /* composed full path per entry */
|
||||
int n_entries;
|
||||
int n_capacity;
|
||||
int next_entry;
|
||||
char label[12];
|
||||
|
||||
/* Per-entry decompressed buffer for read_data. */
|
||||
uint8_t *entry_data;
|
||||
size_t entry_cap;
|
||||
size_t entry_len;
|
||||
int entry_yielded;
|
||||
};
|
||||
|
||||
/* libuc2 IO callbacks bound to the slurped buffer. */
|
||||
static int
|
||||
slurp_read(void *ctx, unsigned pos, void *buf, unsigned len)
|
||||
{
|
||||
struct uc2_la_state *st = (struct uc2_la_state *)ctx;
|
||||
if ((size_t)pos >= st->len)
|
||||
return 0;
|
||||
unsigned avail = (unsigned)(st->len - pos);
|
||||
if (len > avail)
|
||||
len = avail;
|
||||
memcpy(buf, st->data + pos, len);
|
||||
return (int)len;
|
||||
}
|
||||
|
||||
/* Allocation callback: plain malloc; the context pointer is not used. */
static void *
slurp_alloc(void *ctx, unsigned size)
{
    void *block = malloc(size);

    (void)ctx;
    return block;
}
|
||||
|
||||
/* Release callback matching slurp_alloc: plain free; the context pointer
 * is not used. */
static void
slurp_free(void *ctx, void *ptr)
{
    free(ptr);
    (void)ctx;
}
|
||||
|
||||
static struct uc2_io slurp_io = {
|
||||
.read = slurp_read,
|
||||
.alloc = slurp_alloc,
|
||||
.free = slurp_free,
|
||||
.warn = NULL,
|
||||
};
|
||||
|
||||
/* Push-style write callback for uc2_extract.  Buffer everything and
 * let read_data yield it in one slice.
 *
 * `data`/`cap` describe a heap buffer (may start out NULL/0), `len` is the
 * number of bytes stored so far, and `err` is set to 1 when a write could
 * not be buffered (size overflow or allocation failure). */
struct extract_buf {
    uint8_t *data;
    size_t cap;
    size_t len;
    int err;
};

/* Append `len` bytes from `p` to the extract_buf passed as `ctx`, growing
 * the buffer by doubling (4096-byte minimum).  Returns `len` on success,
 * or -1 with eb->err set on failure; on failure the buffer and its
 * contents are left as they were. */
static int
extract_write(void *ctx, const void *p, unsigned len)
{
    struct extract_buf *eb = (struct extract_buf *)ctx;
    size_t need;

    /* Nothing to append.  Returning early also keeps memcpy away from a
     * buffer that has not been allocated yet (NULL destination). */
    if (len == 0)
        return 0;

    /* Would eb->len + len wrap around size_t? */
    if ((size_t)len > (size_t)-1 - eb->len) {
        eb->err = 1;
        return -1;
    }
    need = eb->len + (size_t)len;

    if (need > eb->cap) {
        size_t ncap = eb->cap ? eb->cap : 4096;
        uint8_t *np;

        while (ncap < need) {
            /* Doubling would overflow: settle for the exact size. */
            if (ncap > ((size_t)-1) / 2) {
                ncap = need;
                break;
            }
            ncap *= 2;
        }
        np = realloc(eb->data, ncap);
        if (!np) {
            eb->err = 1;
            return -1;
        }
        eb->data = np;
        eb->cap = ncap;
    }

    memcpy(eb->data + eb->len, p, len);
    eb->len = need;
    return (int)len;
}
|
||||
|
||||
/* DOS date/time -> Unix time_t (UTC; DOS times are local but we treat
 * them as UTC since timezone info is not present in the archive).
 *
 * Layout of `dos_time` as decoded below: bits 0-4 seconds/2, 5-10 minutes,
 * 11-15 hours, 16-20 day of month, 21-24 month (1-12), 25-31 years since
 * 1980.
 *
 * The conversion is done with plain calendar arithmetic rather than
 * timegm()/_mkgmtime()/mktime(): the first two are non-standard, and
 * mktime() would apply the host's local timezone.  Out-of-range fields
 * (month 0 or 13-15, day 0, hour > 23, ...) are carried over linearly, the
 * way timegm() normalises a struct tm.
 *
 * NOTE(review): where time_t is 32 bits wide, dates past January 2038 do
 * not fit in the return type. */
static time_t
dos_to_unix_time(unsigned dos_time)
{
    /* Days preceding each month in a non-leap year. */
    static const int days_before_month[12] = {
        0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
    };
    int64_t sec  = (int64_t)(dos_time & 0x1f) * 2;
    int64_t min  = (int64_t)((dos_time >> 5) & 0x3f);
    int64_t hour = (int64_t)((dos_time >> 11) & 0x1f);
    int64_t mday = (int64_t)((dos_time >> 16) & 0x1f);
    int     mon  = (int)((dos_time >> 21) & 0x0f) - 1;      /* 0-based */
    int64_t year = 1980 + (int64_t)((dos_time >> 25) & 0x7f);
    int64_t prev;
    int64_t days;
    int     leap;

    /* Fold an out-of-range month into the neighbouring year. */
    if (mon < 0) {
        mon += 12;
        year -= 1;
    } else if (mon > 11) {
        mon -= 12;
        year += 1;
    }

    /* Days from 1970-01-01 to January 1st of `year`: 365 per year plus
     * the leap days in between (477 leap years precede 1970). */
    prev = year - 1;
    days = (year - 1970) * 365 + (prev / 4 - prev / 100 + prev / 400) - 477;

    leap = (year % 4 == 0) && (year % 100 != 0 || year % 400 == 0);
    days += days_before_month[mon];
    if (leap && mon >= 2)
        days += 1;
    days += mday - 1;

    return (time_t)(days * 86400 + hour * 3600 + min * 60 + sec);
}
|
||||
|
||||
static int uc2_la_bid(struct archive_read *, int);
|
||||
static int uc2_la_read_header(struct archive_read *, struct archive_entry *);
|
||||
static int uc2_la_read_data(struct archive_read *, const void **,
|
||||
size_t *, int64_t *);
|
||||
static int uc2_la_read_data_skip(struct archive_read *);
|
||||
static int uc2_la_cleanup(struct archive_read *);
|
||||
|
||||
int
|
||||
archive_read_support_format_uc2(struct archive *_a)
|
||||
{
|
||||
struct archive_read *a = (struct archive_read *)_a;
|
||||
struct uc2_la_state *state;
|
||||
int r;
|
||||
|
||||
archive_check_magic(_a, ARCHIVE_READ_MAGIC,
|
||||
ARCHIVE_STATE_NEW, "archive_read_support_format_uc2");
|
||||
|
||||
state = (struct uc2_la_state *)calloc(1, sizeof(*state));
|
||||
if (state == NULL) {
|
||||
archive_set_error(_a, ENOMEM,
|
||||
"Out of memory allocating UC2 reader state");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
|
||||
r = __archive_read_register_format(a,
|
||||
state,
|
||||
"uc2",
|
||||
uc2_la_bid,
|
||||
NULL,
|
||||
uc2_la_read_header,
|
||||
uc2_la_read_data,
|
||||
uc2_la_read_data_skip,
|
||||
NULL,
|
||||
uc2_la_cleanup,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
if (r != ARCHIVE_OK)
|
||||
free(state);
|
||||
return (r);
|
||||
}
|
||||
|
||||
/* Bid callback: peek at the first four bytes of the stream.
 * Returns 64 when they are the UC2 signature 'U' 'C' '2' 0x1A, 0 when
 * they are not, and -1 when four bytes cannot be read. */
static int
uc2_la_bid(struct archive_read *a, int best_bid)
{
    static const unsigned char uc2_magic[4] = { 0x55, 0x43, 0x32, 0x1A };
    const unsigned char *head;

    (void)best_bid;

    head = __archive_read_ahead(a, sizeof uc2_magic, NULL);
    if (head == NULL)
        return (-1);

    return (memcmp(head, uc2_magic, sizeof uc2_magic) == 0 ? 64 : 0);
}
|
||||
|
||||
/* Slurp the entire archive into state->data via __archive_read_ahead +
|
||||
* __archive_read_consume. Returns ARCHIVE_OK or ARCHIVE_FATAL. */
|
||||
static int
|
||||
slurp_archive(struct archive_read *a, struct uc2_la_state *st)
|
||||
{
|
||||
for (;;) {
|
||||
ssize_t avail;
|
||||
const void *p = __archive_read_ahead(a, 1, &avail);
|
||||
if (p == NULL) {
|
||||
if (avail < 0) {
|
||||
archive_set_error(&a->archive, EIO,
|
||||
"UC2: read error while slurping archive");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
break; /* clean EOF */
|
||||
}
|
||||
if (avail <= 0)
|
||||
break;
|
||||
|
||||
if (st->len + (size_t)avail > st->len /* overflow guard */) {
|
||||
size_t need = st->len + (size_t)avail;
|
||||
if (need > (size_t)0x80000000u) {
|
||||
archive_set_error(&a->archive, ENOMEM,
|
||||
"UC2: archive too large to slurp (>2GB)");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
/* grow to power-of-two */
|
||||
size_t cap = st->len ? st->len : 4096;
|
||||
while (cap < need) cap *= 2;
|
||||
uint8_t *np = (uint8_t *)realloc(st->data, cap);
|
||||
if (!np) {
|
||||
archive_set_error(&a->archive, ENOMEM,
|
||||
"UC2: out of memory slurping archive");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
st->data = np;
|
||||
}
|
||||
memcpy(st->data + st->len, p, (size_t)avail);
|
||||
st->len += (size_t)avail;
|
||||
__archive_read_consume(a, avail);
|
||||
}
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
/* Walk uc2_read_cdir and cache all entries. Tagged entries have
|
||||
* uc2_get_tag called to fully resolve names. */
|
||||
static int
|
||||
collect_entries(struct archive_read *a, struct uc2_la_state *st)
|
||||
{
|
||||
st->handle = uc2_open(&slurp_io, st);
|
||||
if (st->handle == NULL) {
|
||||
archive_set_error(&a->archive, EINVAL,
|
||||
"UC2: uc2_open failed");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
if (st->n_entries >= st->n_capacity) {
|
||||
int ncap = st->n_capacity ? st->n_capacity * 2 : 32;
|
||||
struct uc2_entry *ne = (struct uc2_entry *)realloc(
|
||||
st->entries, (size_t)ncap * sizeof *ne);
|
||||
if (!ne) {
|
||||
archive_set_error(&a->archive, ENOMEM,
|
||||
"UC2: out of memory collecting entries");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
st->entries = ne;
|
||||
st->n_capacity = ncap;
|
||||
}
|
||||
|
||||
struct uc2_entry *e = &st->entries[st->n_entries];
|
||||
int ret = uc2_read_cdir(st->handle, e);
|
||||
if (ret == UC2_End)
|
||||
break;
|
||||
if (ret < 0) {
|
||||
archive_set_error(&a->archive, EINVAL,
|
||||
"UC2: uc2_read_cdir failed: %s",
|
||||
uc2_message(st->handle, ret));
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
|
||||
while (ret == UC2_TaggedEntry) {
|
||||
char *tag;
|
||||
void *data;
|
||||
unsigned size;
|
||||
ret = uc2_get_tag(st->handle, e, &tag, &data, &size);
|
||||
if (ret < 0) {
|
||||
archive_set_error(&a->archive, EINVAL,
|
||||
"UC2: uc2_get_tag failed: %s",
|
||||
uc2_message(st->handle, ret));
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
}
|
||||
|
||||
st->n_entries++;
|
||||
}
|
||||
|
||||
uc2_finish_cdir(st->handle, st->label);
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
/* Append the full path of directory `id` (with a trailing slash) to
|
||||
* buf. Returns the new offset, or -1 on overflow. UC2 directory ids
|
||||
* are archive-global; root is 0. The depth cap breaks cycles in
|
||||
* damaged directories. */
|
||||
static int
|
||||
build_dir_path(struct uc2_la_state *st, unsigned id,
|
||||
char *buf, size_t cap, int depth)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (id == 0)
|
||||
return (0);
|
||||
if (depth > 64)
|
||||
return (-1); /* cyclic or pathologically deep: corrupt cdir */
|
||||
for (i = 0; i < st->n_entries; i++) {
|
||||
struct uc2_entry *d = &st->entries[i];
|
||||
if (d->is_dir && d->id == id) {
|
||||
int off = build_dir_path(st, d->dirid, buf, cap,
|
||||
depth + 1);
|
||||
int n;
|
||||
if (off < 0)
|
||||
return (-1);
|
||||
n = snprintf(buf + off, cap - off, "%s/", d->name);
|
||||
if (n < 0 || (size_t)n >= cap - off)
|
||||
return (-1);
|
||||
return (off + n);
|
||||
}
|
||||
}
|
||||
return (0); /* unknown parent: fall back to root */
|
||||
}
|
||||
|
||||
/* Compose a full path for every entry: parent directories joined with
|
||||
* '/', directories themselves carrying a trailing slash. */
|
||||
static int
|
||||
compose_paths(struct archive_read *a, struct uc2_la_state *st)
|
||||
{
|
||||
int i;
|
||||
|
||||
st->paths = (char **)calloc((size_t)st->n_entries,
|
||||
sizeof *st->paths);
|
||||
if (st->paths == NULL && st->n_entries > 0) {
|
||||
archive_set_error(&a->archive, ENOMEM,
|
||||
"UC2: out of memory composing paths");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
|
||||
for (i = 0; i < st->n_entries; i++) {
|
||||
struct uc2_entry *e = &st->entries[i];
|
||||
char buf[2048];
|
||||
int off = build_dir_path(st, e->dirid, buf, sizeof buf, 0);
|
||||
int n;
|
||||
if (off < 0) {
|
||||
archive_set_error(&a->archive, EINVAL,
|
||||
"UC2: directory path too long");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
n = snprintf(buf + off, sizeof buf - off, "%s%s",
|
||||
e->name, e->is_dir ? "/" : "");
|
||||
if (n < 0 || (size_t)n >= sizeof buf - off) {
|
||||
archive_set_error(&a->archive, EINVAL,
|
||||
"UC2: entry path too long");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
st->paths[i] = strdup(buf);
|
||||
if (st->paths[i] == NULL) {
|
||||
archive_set_error(&a->archive, ENOMEM,
|
||||
"UC2: out of memory composing paths");
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
}
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
static int
|
||||
uc2_la_read_header(struct archive_read *a, struct archive_entry *entry)
|
||||
{
|
||||
struct uc2_la_state *st = (struct uc2_la_state *)a->format->data;
|
||||
|
||||
a->archive.archive_format = ARCHIVE_FORMAT_UC2;
|
||||
a->archive.archive_format_name = "UC2";
|
||||
|
||||
if (!st->slurped) {
|
||||
int r = slurp_archive(a, st);
|
||||
if (r != ARCHIVE_OK) return r;
|
||||
st->slurped = 1;
|
||||
|
||||
r = collect_entries(a, st);
|
||||
if (r != ARCHIVE_OK) return r;
|
||||
|
||||
r = compose_paths(a, st);
|
||||
if (r != ARCHIVE_OK) return r;
|
||||
}
|
||||
|
||||
if (st->next_entry >= st->n_entries)
|
||||
return (ARCHIVE_EOF);
|
||||
|
||||
struct uc2_entry *e = &st->entries[st->next_entry++];
|
||||
|
||||
/* Reset per-entry buffer state. */
|
||||
st->entry_len = 0;
|
||||
st->entry_yielded = 0;
|
||||
|
||||
archive_entry_set_pathname(entry, st->paths[st->next_entry - 1]);
|
||||
archive_entry_set_size(entry, (la_int64_t)e->size);
|
||||
archive_entry_set_mtime(entry, dos_to_unix_time(e->dos_time), 0);
|
||||
|
||||
if (e->is_dir) {
|
||||
archive_entry_set_filetype(entry, AE_IFDIR);
|
||||
archive_entry_set_perm(entry, 0755);
|
||||
} else {
|
||||
archive_entry_set_filetype(entry, AE_IFREG);
|
||||
mode_t mode = 0644;
|
||||
if (e->attr & UC2_Attr_R) mode &= ~0222;
|
||||
archive_entry_set_perm(entry, mode);
|
||||
}
|
||||
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
static int
|
||||
uc2_la_read_data(struct archive_read *a,
|
||||
const void **buff, size_t *size, int64_t *offset)
|
||||
{
|
||||
struct uc2_la_state *st = (struct uc2_la_state *)a->format->data;
|
||||
|
||||
if (st->next_entry == 0 || st->entry_yielded) {
|
||||
*buff = NULL;
|
||||
*size = 0;
|
||||
*offset = 0;
|
||||
return (ARCHIVE_EOF);
|
||||
}
|
||||
|
||||
struct uc2_entry *e = &st->entries[st->next_entry - 1];
|
||||
if (e->is_dir || e->size == 0) {
|
||||
st->entry_yielded = 1;
|
||||
*buff = NULL;
|
||||
*size = 0;
|
||||
*offset = 0;
|
||||
return (ARCHIVE_EOF);
|
||||
}
|
||||
|
||||
/* Decompress the whole entry once. */
|
||||
struct extract_buf eb = { .data = st->entry_data, .cap = st->entry_cap };
|
||||
int ret = uc2_extract(st->handle, &e->xi, e->size,
|
||||
extract_write, &eb);
|
||||
st->entry_data = eb.data;
|
||||
st->entry_cap = eb.cap;
|
||||
st->entry_len = eb.len;
|
||||
|
||||
if (ret < 0 || eb.err) {
|
||||
archive_set_error(&a->archive, EIO,
|
||||
"UC2: uc2_extract failed: %s",
|
||||
uc2_message(st->handle, ret));
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
|
||||
st->entry_yielded = 1;
|
||||
*buff = st->entry_data;
|
||||
*size = st->entry_len;
|
||||
*offset = 0;
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
static int
|
||||
uc2_la_read_data_skip(struct archive_read *a)
|
||||
{
|
||||
struct uc2_la_state *st = (struct uc2_la_state *)a->format->data;
|
||||
st->entry_yielded = 1;
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
|
||||
static int
|
||||
uc2_la_cleanup(struct archive_read *a)
|
||||
{
|
||||
struct uc2_la_state *st = (struct uc2_la_state *)a->format->data;
|
||||
if (st == NULL)
|
||||
return (ARCHIVE_OK);
|
||||
if (st->handle)
|
||||
uc2_close(st->handle);
|
||||
if (st->paths) {
|
||||
int i;
|
||||
for (i = 0; i < st->n_entries; i++)
|
||||
free(st->paths[i]);
|
||||
free(st->paths);
|
||||
}
|
||||
free(st->data);
|
||||
free(st->entries);
|
||||
free(st->entry_data);
|
||||
free(st);
|
||||
a->format->data = NULL;
|
||||
return (ARCHIVE_OK);
|
||||
}
|
||||
0
docs/_static/.gitkeep
vendored
Normal file
0
docs/_static/.gitkeep
vendored
Normal file
201
docs/blog/uc2-revival-writeup.md
Normal file
201
docs/blog/uc2-revival-writeup.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# Reviving UltraCompressor II: a 1992 DOS archiver, ported forward
|
||||
|
||||
Subtitle candidates:
|
||||
- *Show HN: UC2 v3 - 1992 DOS archiver, ported to modern C99* (HN)
|
||||
- *UltraCompressor II revival: rANS, CDC, BLAKE3, OpenTimestamps* (Lobsters)
|
||||
|
||||
---
|
||||
|
||||
In 1992, Nico de Vries shipped UltraCompressor II for DOS. It
|
||||
competed with PKZIP and ARJ. Among the things it did that were
|
||||
unusual for the era: master-block deduplication. If an archive
|
||||
contained several similar files, UC2 could store one shared "master"
|
||||
block and represent each file as a delta against it - within a
|
||||
single archive, on a single floppy, in 4 MB of RAM.
|
||||
|
||||
UC2 v3.0.0-alpha.1 is a port forward. Its compressor produces
|
||||
bitstreams that the original `uc2pro.exe` (UC2 Pro v2.3, 1992)
|
||||
accepts and extracts at byte-identical fidelity. It also adds
|
||||
content-defined chunking, an rANS entropy coder, BLAKE3 + SHA-256
|
||||
hashing, and an OpenTimestamps integration so an archive can be
|
||||
anchored to a Bitcoin block at creation time.
|
||||
|
||||
This post is the story of how it got here.
|
||||
|
||||
## The lineage
|
||||
|
||||
UC2 has passed through several pairs of hands across thirty-four
|
||||
years:
|
||||
|
||||
1. **Nico de Vries (1992-1996)** wrote UC2, with assistance from
|
||||
Danny Bezemer, Jan-Pieter Cornet, and others credited in the
|
||||
original `U_MANUAL.TXT`.
|
||||
|
||||
2. **2015 LGPL release**. In December 2015, Vladislav Sagunov asked
|
||||
de Vries whether the source could be re-released under a free
|
||||
licence. De Vries agreed and published the full source under the
|
||||
GNU LGPL-3.0 (with a small Borland C/C++ runtime carve-out for
|
||||
DOS-specific code). That release lives in this repo unchanged
|
||||
under `original/UC2_source/`, including the `Read Me First.txt`
|
||||
from de Vries himself.
|
||||
|
||||
3. **Jan Bobrowski (2020-2021)** wrote a clean-room portable
|
||||
*decompressor* in C, called `unuc2` / `libunuc2`. The library is
|
||||
LGPL-3.0; the CLI tool is GPL-3.0-or-later. Bobrowski's
|
||||
decompressor compiles cleanly on POSIX, MSVC, and (with care) DOS,
|
||||
and his code is what most modern UC2 work builds on.
|
||||
|
||||
4. **Eremey Valetov (2026)**: that's me. What I've added is the
|
||||
*compressor* that pairs with Bobrowski's decompressor, plus
|
||||
several modules of compression / dedup / integrity work.
|
||||
|
||||
Bobrowski-derived files in the repo retain LGPL-3.0; new work is
|
||||
GPL-3.0-or-later. Per-file provenance is in
|
||||
[`docs/license-audit.md`](../license-audit.md); the 1992 source and
|
||||
the 2020 release are preserved unmodified.
|
||||
|
||||
## What's in v3
|
||||
|
||||
### Bitstream-compatible LZ77 + Huffman
|
||||
|
||||
The compressor in `lib/src/compress.c` is the inverse of Bobrowski's
|
||||
decompressor and produces UC2 v3 archives whose method-4 bitstream is
|
||||
accepted by `uc2pro.exe`. Cross-tool round-trip is in CI: a
|
||||
`tests/scripts/roundtrip_dosbox.sh` job runs the original 1992 binary
|
||||
in DOSBox-X against archives built by the modern tool (and vice
|
||||
versa) and verifies that extracted files are bit-identical to the
|
||||
inputs.
|
||||
|
||||
Compression levels 2-5 (Fast, Normal, Tight, Ultra) match the
|
||||
original's IDs. The original's `bFlag` heuristic for choosing
|
||||
between default and custom Huffman trees on small blocks is
|
||||
preserved.
|
||||
|
||||
### Master-block deduplication, modernised
|
||||
|
||||
The 1992 UC2 grouped files by an exact prefix match and built one
|
||||
master block per group. v3 layers content-defined chunking (CDC) on
|
||||
top: file pairs that share large blocks of content at *non-aligned*
|
||||
positions can also be grouped, since the chunker breaks both files
|
||||
on the same content-defined boundaries. CDC uses a Gear rolling
|
||||
hash with an average chunk size of 4 KiB.
|
||||
|
||||
Several additional modules ship as libraries with their own unit
|
||||
tests, used so far by the compressor's master-block selection logic
|
||||
and exposed for callers:
|
||||
- a Merkle DAG of deduplicated blocks (`uc2_merkle.h`),
|
||||
- a content-addressable cross-archive block store (`uc2_blockstore.h`),
|
||||
- SimHash near-duplicate detection (`uc2_simhash.h`),
|
||||
- byte-level delta compression (`uc2_delta.h`).
|
||||
|
||||
These extend the format with new metadata records. Method-4 (the
|
||||
1992 bitstream) remains untouched, so old readers handle the file
|
||||
data; new readers see the additional dedup hints.
|
||||
|
||||
### Modern compression backends
|
||||
|
||||
Phase 4 added pluggable backends behind new method IDs. Method 4
|
||||
(the original Huffman) is kept as-is for round-trip with the 1992
|
||||
binary.
|
||||
|
||||
- **Method 10**: rANS entropy coder. 32-bit table-based. Selected
|
||||
by levels 6-9.
|
||||
- **LZ4**: ultra-fast mode, exposed via the `uc2_lz4.h` library and
|
||||
the `uc2 -B` benchmark; not yet a first-class archive backend.
|
||||
- **Content-aware preprocessing** (`uc2_preprocess.h`): BCJ for x86
|
||||
address normalisation, BWT for text, byte-stride delta filter.
|
||||
- **Dictionary metadata** (`uc2_dict.h`): zstd-inspired formal
|
||||
dictionary records with content-hash IDs and integrity checksums.
|
||||
- **`uc2 -B`**: built-in benchmark mode runs all methods on the
|
||||
input and prints ratio + timing per method.
|
||||
|
||||
### Cryptographic integrity
|
||||
|
||||
Phase 7 anchored the archive's content hash:
|
||||
|
||||
- **BLAKE3** (`uc2_blake3.h`) for fast content hashing.
|
||||
- **SHA-256** (`uc2_sha256.h`, FIPS 180-4) for OpenTimestamps
|
||||
compatibility.
|
||||
- **OpenTimestamps integration** (`uc2_ots.h`): the archive's SHA-256
|
||||
can be anchored to a Bitcoin block via a public calendar server,
|
||||
and the resulting proof is stored in a magic-bracketed sidecar
|
||||
trailer appended after the recorded archive bytes. The 1992 reader
|
||||
ignores the trailer (it uses the front header's recorded length),
|
||||
preserving backward compatibility. Extracted proofs are
|
||||
byte-identical to standard `.ots` files; the cross-check test runs
|
||||
them through `python-opentimestamps` to confirm round-trip parsing.
|
||||
|
||||
The OTS parser is conservative about hostile input: strict-canonical
|
||||
varints, depth-bounded recursion, and a varbytes size cap; the leaf digest must
|
||||
match the recomputed archive SHA-256 before `--ots-attach` accepts a
|
||||
proof.
|
||||
|
||||
## A demonstration
|
||||
|
||||
```sh
|
||||
# Create an archive
|
||||
$ uc2 -w -L4 demo.uc2 file1.txt file2.txt
|
||||
UC2 compression level: Tight
|
||||
Created demo.uc2 (2 files, 0 dirs, 1 master, 215 bytes)
|
||||
Everything went OK
|
||||
|
||||
# Extract with the original UC2 Pro v2.3 in DOSBox-X
|
||||
$ dosbox -conf <(echo -e "[autoexec]\nmount C: .\nC:\nuc2pro.exe -x demo.uc2")
|
||||
# -> bit-identical files
|
||||
|
||||
# Anchor the archive to the Bitcoin blockchain
|
||||
$ ots stamp demo.uc2 # produces demo.uc2.ots from a calendar
|
||||
$ uc2 --ots-attach demo.uc2.ots demo.uc2
|
||||
Attached 396-byte OTS proof to demo.uc2
|
||||
|
||||
$ uc2 -t demo.uc2
|
||||
Testing archive integrity...
|
||||
OTS proof: leaf matches; structure verified
|
||||
Everything went OK
|
||||
```
|
||||
|
||||
Cross-tool round-trip is in CI. The OTS extracted output is
|
||||
verified against the upstream `python-opentimestamps` parser when
|
||||
that package is installed (the test skips cleanly otherwise).
|
||||
|
||||
## What's coming
|
||||
|
||||
The roadmap is in [`ROADMAP.md`](../../ROADMAP.md), with each item
|
||||
tracked as a `git-bug` issue. The next things on the list are
|
||||
practical:
|
||||
|
||||
- **DJGPP cross-compile** so v3 actually runs on DOS. The compat
|
||||
layer is already in `cli/src/compat/compat_dos.c`; the
|
||||
cross-compile target and DOSBox-X CI are the missing pieces.
|
||||
- **libarchive read handler** so `.uc2` is a recognised format for
|
||||
tools that integrate with libarchive.
|
||||
- **`uc2 --ingest` streaming mode** for piping `tar` or `rsync` into
|
||||
a deduplicating sink.
|
||||
|
||||
Beyond that, the issue tracker has speculative items for
|
||||
post-quantum encryption, IPFS integration, and zero-knowledge
|
||||
proofs. Those are research directions, not promises.
|
||||
|
||||
## Why bother?
|
||||
|
||||
Two reasons.
|
||||
|
||||
First, archive formats are a load-bearing piece of computing
|
||||
history. Preserving the 1992 source unchanged, the 2015 LGPL
|
||||
re-release unchanged, and the 2020 portable decompressor unchanged
|
||||
- all in the same repository as the modern port - is what makes
|
||||
the format survive its hosting choices.
|
||||
|
||||
Second, the master-block design from 1992 turns out to be a
|
||||
surprisingly clean substrate to bolt content-defined chunking,
|
||||
content-addressable storage, and verifiable timestamps onto. Phase
|
||||
3 and Phase 7 work landed without breaking the 1992 reader. Doing
|
||||
the same project as a wrapper around `gzip` would have been more
|
||||
work for less reach.
|
||||
|
||||
The repo, with full source, license trail, test suite, and
|
||||
roadmap, is at <https://github.com/evvaletov/uc2>.
|
||||
|
||||
---
|
||||
|
||||
*Eremey Valetov, May 2026.*
|
||||
113
docs/building.rst
Normal file
113
docs/building.rst
Normal file
@@ -0,0 +1,113 @@
|
||||
Building
|
||||
========
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
- CMake >= 3.16
|
||||
- C99 compiler: GCC, Clang, or MSVC
|
||||
- Optional: DJGPP cross-compiler for DOS builds
|
||||
|
||||
Linux / macOS
|
||||
-------------
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build
|
||||
ctest --test-dir build
|
||||
|
||||
The binary is ``build/cli/uc2`` and the library is
|
||||
``build/lib/libuc2.a``.
|
||||
|
||||
Windows (MSVC)
|
||||
--------------
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
ctest --test-dir build -C Release
|
||||
|
||||
DOS (DJGPP Cross-Compilation)
|
||||
-----------------------------
|
||||
|
||||
Cross-compile from a Linux host using the DJGPP toolchain:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
cmake -B build-dos -DCMAKE_TOOLCHAIN_FILE=cmake/djgpp.cmake
|
||||
cmake --build build-dos
|
||||
|
||||
This produces a DOS executable suitable for DOSBox or real hardware.
|
||||
|
||||
libarchive Read Plugin
|
||||
----------------------
|
||||
|
||||
The optional libarchive read handler (``contrib/libarchive/``) lets any
|
||||
libarchive consumer — ``bsdtar``, file managers, language bindings —
|
||||
list and extract ``.uc2`` archives. It uses libarchive's internal
|
||||
read-format API, so it builds against a libarchive **source tree**
|
||||
rather than an installed ``-devel`` package.
|
||||
|
||||
Unpack a libarchive release and build a static library (a
|
||||
dependency-free configuration is enough for the plugin and its test):
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
curl -LO https://github.com/libarchive/libarchive/releases/download/v3.7.7/libarchive-3.7.7.tar.gz
|
||||
tar xzf libarchive-3.7.7.tar.gz
|
||||
cmake -S libarchive-3.7.7 -B larch-build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=OFF -DENABLE_TEST=OFF
|
||||
cmake --build larch-build --target archive_static
|
||||
|
||||
Then configure UC2 with the plugin enabled, pointing at the source tree
|
||||
and the static library:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DUC2_BUILD_LIBARCHIVE_PLUGIN=ON \
|
||||
-DLIBARCHIVE_SOURCE_DIR=$PWD/libarchive-3.7.7 \
|
||||
-DLIBARCHIVE_LIBRARY=$PWD/larch-build/libarchive/libarchive.a
|
||||
cmake --build build
|
||||
|
||||
This builds ``libuc2_libarchive.a`` and the ``libarchive_roundtrip``
|
||||
test, which creates archives at multiple compression levels and reads
|
||||
them back through libarchive's public API, verifying every byte. The
|
||||
plugin handles multi-file archives with directory paths, master-block
|
||||
deduplication, and Win95 long names.
|
||||
|
||||
Build Options
|
||||
-------------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 30 15 55
|
||||
|
||||
* - Option
|
||||
- Default
|
||||
- Description
|
||||
* - ``UC2_BUILD_TESTS``
|
||||
- ``ON``
|
||||
- Build test programs
|
||||
* - ``UC2_BUILD_LIBARCHIVE_PLUGIN``
|
||||
- ``OFF``
|
||||
- Build the libarchive read handler (needs ``LIBARCHIVE_SOURCE_DIR``)
|
||||
* - ``CMAKE_BUILD_TYPE``
|
||||
- (none)
|
||||
- ``Release``, ``Debug``, ``RelWithDebInfo``
|
||||
|
||||
Running Tests
|
||||
-------------
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
ctest --test-dir build --output-on-failure
|
||||
|
||||
Tests include:
|
||||
|
||||
- **identify**: UC2 magic detection
|
||||
- **extract**: decompression against reference archives
|
||||
- **roundtrip**: compress → archive → decompress → verify (8 patterns
|
||||
× 4 compression levels = 32 tests)
|
||||
29
docs/conf.py
Normal file
29
docs/conf.py
Normal file
@@ -0,0 +1,29 @@
|
||||
project = "UC2"
|
||||
copyright = "2026, Eremey Valetov"
|
||||
author = "Eremey Valetov"
|
||||
release = "3.0.0"
|
||||
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx.ext.githubpages",
|
||||
]
|
||||
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build"]
|
||||
|
||||
html_theme = "furo"
|
||||
html_static_path = ["_static"]
|
||||
html_title = "UC2 — UltraCompressor II"
|
||||
html_logo = None
|
||||
html_favicon = None
|
||||
|
||||
html_theme_options = {
|
||||
"source_repository": "https://github.com/evvaletov/uc2",
|
||||
"source_branch": "main",
|
||||
"source_directory": "docs/",
|
||||
}
|
||||
|
||||
intersphinx_mapping = {
|
||||
"python": ("https://docs.python.org/3", None),
|
||||
}
|
||||
196
docs/format.rst
Normal file
196
docs/format.rst
Normal file
@@ -0,0 +1,196 @@
|
||||
UC2 Archive Format
|
||||
==================
|
||||
|
||||
This documents the binary format as implemented by the original UC2
|
||||
v2.x and supported by UC2 v3.
|
||||
|
||||
Archive Layout
|
||||
--------------
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
FHEAD (13 bytes)
|
||||
XHEAD (16 bytes)
|
||||
File data blocks (compressed bitstreams)
|
||||
COMPRESS + compressed central directory
|
||||
|
||||
All multi-byte integers are little-endian.
|
||||
|
||||
FHEAD — File Header
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 15 70
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Field
|
||||
* - 0
|
||||
- 4
|
||||
- Magic: ``UC2\x1A`` (0x1A324355)
|
||||
* - 4
|
||||
- 4
|
||||
- Component length
|
||||
* - 8
|
||||
- 4
|
||||
- Component length + 0x01B2C3D4 (validation)
|
||||
* - 12
|
||||
- 1
|
||||
- Damage protection flag
|
||||
|
||||
XHEAD — Extended Header
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 15 70
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Field
|
||||
* - 13
|
||||
- 4
|
||||
- Cdir volume (always 1)
|
||||
* - 17
|
||||
- 4
|
||||
- Cdir offset
|
||||
* - 21
|
||||
- 2
|
||||
- Fletcher checksum of raw cdir
|
||||
* - 23
|
||||
- 1
|
||||
- Busy flag
|
||||
* - 24
|
||||
- 2
|
||||
- Version made by (e.g. 200 = v2.00)
|
||||
* - 26
|
||||
- 2
|
||||
- Version needed to extract
|
||||
* - 28
|
||||
- 1
|
||||
- Reserved
|
||||
|
||||
Central Directory
|
||||
-----------------
|
||||
|
||||
The central directory is itself compressed using the UC2 compression
|
||||
engine. It is located at the offset specified in XHEAD and preceded by
|
||||
a COMPRESS record.
|
||||
|
||||
Each directory entry begins with a 1-byte type tag:
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 85
|
||||
|
||||
* - 1
|
||||
- Directory entry (OSMETA + DIRMETA)
|
||||
* - 2
|
||||
- File entry (OSMETA + FILEMETA + COMPRESS + LOCATION)
|
||||
* - 3
|
||||
- Master entry (MASMETA + COMPRESS + LOCATION)
|
||||
* - 4
|
||||
- End of central directory
|
||||
|
||||
The directory ends with XTAIL (17 bytes) + archive serial (4 bytes).
|
||||
|
||||
Master Blocks
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
Masters are LZ77 dictionary prefixes that pre-fill the sliding window
|
||||
before decompression, allowing back-references into shared content
|
||||
across files. Three kinds exist:
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 85
|
||||
|
||||
* - 0
|
||||
- **SuperMaster** — built-in 49 152-byte dictionary, decompressed
|
||||
from a static blob embedded in the library.
|
||||
* - 1
|
||||
- **NoMaster** — 512 zero bytes (minimal dictionary).
|
||||
* - ≥ 2
|
||||
- **Custom master** — archive-specific, described by a MASMETA
|
||||
record in the central directory.
|
||||
|
||||
MASMETA (20 bytes):
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 15 70
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Field
|
||||
* - 0
|
||||
- 4
|
||||
- Master index (≥ 2)
|
||||
* - 4
|
||||
- 4
|
||||
- Content key (FNV-1a hash)
|
||||
* - 8
|
||||
- 4
|
||||
- Total uncompressed size of referring files
|
||||
* - 12
|
||||
- 4
|
||||
- Number of referring files
|
||||
* - 16
|
||||
- 2
|
||||
- Master data length (uncompressed, ≤ 65 535)
|
||||
* - 18
|
||||
- 2
|
||||
- Fletcher checksum of master data
|
||||
|
||||
A master entry in the cdir is: type byte (3) + MASMETA (20) +
|
||||
COMPRESS (10) + LOCATION (8) = 39 bytes. The compressed master data
|
||||
is stored at the location pointed to by LOCATION; it is itself
|
||||
compressed using another master (typically SuperMaster).
|
||||
|
||||
Compression Format
|
||||
------------------
|
||||
|
||||
UC2 uses LZ77 with Huffman entropy coding. The bitstream consists of
|
||||
blocks, each containing:
|
||||
|
||||
1. **Block-present flag** (1 bit): 1 = block follows, 0 = end of stream.
|
||||
|
||||
2. **Huffman tree** encoded as:
|
||||
|
||||
- Tree-changed flag (1 bit): 0 = use default tree, 1 = new tree.
|
||||
- Type flags (2 bits): ``has_lo | has_hi << 1``, controlling which
|
||||
symbol ranges are encoded.
|
||||
- Tree-encoding tree (15 × 3-bit lengths).
|
||||
- Delta-coded symbol lengths with RLE (344 symbols total =
|
||||
256 literals + 60 distance + 28 length).
|
||||
|
||||
3. **Compressed data**: Huffman-coded literals and distance/length pairs.
|
||||
|
||||
4. **End-of-block marker**: distance = 64001 with length = 3.
|
||||
|
||||
Distance Encoding
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
60 distance symbols in 4 tiers:
|
||||
|
||||
- Tier 0: distances 1--15 (0 extra bits)
|
||||
- Tier 1: distances 16--255 (4 extra bits)
|
||||
- Tier 2: distances 256--4095 (8 extra bits)
|
||||
- Tier 3: distances 4096--64000 (12 extra bits)
|
||||
|
||||
Length Encoding
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
28 length symbols with varying extra bits, covering lengths 3--35482.
|
||||
|
||||
Delta-Coded Trees
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Symbol code lengths are delta-coded against the previous block's
|
||||
lengths using the ``vval`` lookup table. The first block's default
|
||||
lengths are hard-coded. The delta stream uses 14 delta codes (0--13)
|
||||
plus a repeat code for RLE compression.
|
||||
|
||||
Fletcher Checksum
|
||||
-----------------
|
||||
|
||||
UC2 uses an XOR-based Fletcher checksum (initial value 0xA55A) for
|
||||
both file data integrity and central directory validation. Bytes are
|
||||
processed in little-endian 16-bit words with a carry flag for
|
||||
odd-length data.
|
||||
35
docs/history.rst
Normal file
35
docs/history.rst
Normal file
@@ -0,0 +1,35 @@
|
||||
History
|
||||
=======
|
||||
|
||||
.. list-table::
|
||||
:widths: 20 80
|
||||
|
||||
* - **1992--1996**
|
||||
- UltraCompressor II created by Nico de Vries. Releases v1.0
|
||||
through v2.3 for DOS. Notable features: LZ77+Huffman
|
||||
compression, master-block deduplication, file versioning,
|
||||
and multi-volume spanning.
|
||||
|
||||
* - **2015**
|
||||
- Source code released under LGPL-3.0 by Danny Bezemer.
|
||||
|
||||
* - **2020--2021**
|
||||
- Jan Bobrowski writes `unuc2/libunuc2
|
||||
<http://torinak.com/~jb/unuc2/>`_, a clean-room portable
|
||||
decompressor in C.
|
||||
|
||||
* - **2026**
|
||||
- UC2 v3.0.0: cross-platform revival by Eremey Valetov.
|
||||
CMake build system, Linux/macOS/Windows/DOS targets,
|
||||
original compression engine reimplemented, Sphinx
|
||||
documentation, CI/CD pipeline.
|
||||
|
||||
Credits
|
||||
-------
|
||||
|
||||
- **Nico de Vries** — original UC2 author
|
||||
- **Danny Bezemer** — source code release
|
||||
- **Jan Bobrowski** — portable decompressor (libunuc2)
|
||||
- **Jan-Pieter Cornet** — testing and format documentation
|
||||
- **Vladislav Sagunov** — UC2 resources and documentation
|
||||
- **Eremey Valetov** — v3.0.0 revival and ongoing development
|
||||
22
docs/index.rst
Normal file
22
docs/index.rst
Normal file
@@ -0,0 +1,22 @@
|
||||
UC2 — UltraCompressor II
|
||||
========================
|
||||
|
||||
A cross-platform revival of UltraCompressor II, the DOS-era archiver by
|
||||
Nico de Vries (1992--1996). UC2 was notable for its advanced
|
||||
deduplication ("master blocks"), file versioning, and competitive
|
||||
compression ratios on the hardware of its day.
|
||||
|
||||
UC2 v3 brings it back as a modern, portable C99 tool with both
|
||||
decompression and compression, targeting Linux, macOS, Windows, and DOS.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents
|
||||
|
||||
quickstart
|
||||
usage
|
||||
library
|
||||
format
|
||||
building
|
||||
history
|
||||
roadmap
|
||||
174
docs/library.rst
Normal file
174
docs/library.rst
Normal file
@@ -0,0 +1,174 @@
|
||||
Library API (libuc2)
|
||||
====================
|
||||
|
||||
``libuc2`` provides C99 functions for reading, extracting, and
|
||||
compressing UC2 archives. The library is callback-based: callers supply
|
||||
I/O and memory callbacks, making it suitable for embedded, DOS, and
|
||||
freestanding environments.
|
||||
|
||||
Header: ``<uc2/libuc2.h>``
|
||||
|
||||
Archive Reading
|
||||
---------------
|
||||
|
||||
.. c:function:: int uc2_identify(void *magic, unsigned magic_size)
|
||||
|
||||
Check whether a buffer contains a UC2 archive header.
|
||||
|
||||
:param magic: Pointer to the first 4--21 bytes of the file.
|
||||
:param magic_size: Number of bytes available.
|
||||
:returns: ``1`` if UC2, ``0`` if not, ``-1`` if more bytes needed.
|
||||
|
||||
.. c:function:: uc2_handle uc2_open(struct uc2_io *io, void *io_ctx)
|
||||
|
||||
Open a UC2 archive. The caller provides I/O callbacks via
|
||||
:c:type:`uc2_io`. Returns ``NULL`` on allocation failure.
|
||||
|
||||
.. c:function:: uc2_handle uc2_close(uc2_handle h)
|
||||
|
||||
Close the archive and free all resources. Always returns ``NULL``.
|
||||
|
||||
Directory Enumeration
|
||||
---------------------
|
||||
|
||||
.. c:function:: int uc2_read_cdir(uc2_handle h, struct uc2_entry *entry)
|
||||
|
||||
Read the next central directory entry.
|
||||
|
||||
:returns:
|
||||
- ``UC2_End`` (0): end of directory, *entry* not filled.
|
||||
- ``UC2_BareEntry`` (1): entry filled, no tags.
|
||||
- ``UC2_TaggedEntry`` (3): entry filled, call :c:func:`uc2_get_tag`
|
||||
to read tags (long filename, etc.).
|
||||
- Negative value on error.
|
||||
|
||||
Directories appear before their contents. Duplicate filenames are
|
||||
listed oldest-first.
|
||||
|
||||
.. c:function:: int uc2_get_tag(uc2_handle h, struct uc2_entry *entry, char **tag, void **data, unsigned *data_len)
|
||||
|
||||
Read a tag from a tagged entry. Call repeatedly until it returns
|
||||
``UC2_End``.
|
||||
|
||||
.. c:function:: int uc2_finish_cdir(uc2_handle h, char label[12])
|
||||
|
||||
Read the archive tail and retrieve the volume label.
|
||||
|
||||
Extraction
|
||||
----------
|
||||
|
||||
.. c:function:: int uc2_extract(uc2_handle h, struct uc2_xinfo *xi, unsigned size, int (*write)(void *ctx, const void *ptr, unsigned len), void *ctx)
|
||||
|
||||
Decompress a file entry. Call only after the entire central
|
||||
directory has been read. The *write* callback receives decompressed
|
||||
data in chunks.
|
||||
|
||||
Compression
|
||||
-----------
|
||||
|
||||
.. c:function:: int uc2_compress(int level, int (*read)(void *ctx, void *buf, unsigned len), void *read_ctx, int (*write)(void *ctx, const void *ptr, unsigned len), void *write_ctx, unsigned size, unsigned short *checksum_out, unsigned *compressed_size_out)
|
||||
|
||||
Compress raw data into a UC2 bitstream (no archive framing).
|
||||
|
||||
:param level: Compression level: 2 = Fast, 3 = Normal, 4 = Tight
|
||||
(default), 5 = Ultra.
|
||||
:param read: Callback returning bytes read (0 at EOF, <0 on error).
|
||||
:param write: Callback returning <0 on error.
|
||||
:param size: Total input size in bytes.
|
||||
:param checksum_out: Receives the Fletcher checksum of the input.
|
||||
:param compressed_size_out: Receives the compressed size.
|
||||
:returns: 0 on success, negative ``UC2_*`` error code on failure.
|
||||
|
||||
.. c:function:: int uc2_compress_ex(int level, const void *master, unsigned master_size, int (*read)(void *ctx, void *buf, unsigned len), void *read_ctx, int (*write)(void *ctx, const void *ptr, unsigned len), void *write_ctx, unsigned size, unsigned short *checksum_out, unsigned *compressed_size_out)
|
||||
|
||||
Compress with a master-block dictionary prefix. The master data
|
||||
pre-fills the LZ77 sliding window, allowing back-references into
|
||||
the master for cross-file deduplication. Pass ``NULL`` / ``0`` for
|
||||
no master (equivalent to :c:func:`uc2_compress`).
|
||||
|
||||
The CLI uses the built-in SuperMaster (49 KB) by default.
|
||||
|
||||
.. c:function:: int uc2_get_supermaster(void *buf, unsigned buf_size)
|
||||
|
||||
Decompress the built-in SuperMaster into *buf* (must be at least
|
||||
49152 bytes). Returns ``49152`` on success, negative error code on
|
||||
failure.
|
||||
|
||||
I/O Callbacks
|
||||
-------------
|
||||
|
||||
.. c:struct:: uc2_io
|
||||
|
||||
.. c:member:: int (*read)(void *io_ctx, unsigned pos, void *buf, unsigned len)
|
||||
|
||||
Read *len* bytes from the archive at offset *pos* into *buf*.
|
||||
Return number of bytes read (less if EOF), or negative on error.
|
||||
|
||||
.. c:member:: void *(*alloc)(void *io_ctx, unsigned size)
|
||||
|
||||
Allocate memory. Return ``NULL`` on failure.
|
||||
|
||||
.. c:member:: void (*free)(void *io_ctx, void *ptr)
|
||||
|
||||
Free memory.
|
||||
|
||||
.. c:member:: void (*warn)(void *io_ctx, char *fmt, ...)
|
||||
|
||||
Optional warning callback.
|
||||
|
||||
Data Structures
|
||||
---------------
|
||||
|
||||
.. c:struct:: uc2_entry
|
||||
|
||||
A directory entry.
|
||||
|
||||
.. c:member:: unsigned dirid
|
||||
|
||||
Parent directory (0 = root).
|
||||
|
||||
.. c:member:: unsigned id
|
||||
|
||||
Directory index (directories only).
|
||||
|
||||
.. c:member:: unsigned size
|
||||
|
||||
Uncompressed file size.
|
||||
|
||||
.. c:member:: unsigned csize
|
||||
|
||||
Compressed file size.
|
||||
|
||||
.. c:member:: unsigned dos_time
|
||||
|
||||
DOS-format timestamp.
|
||||
|
||||
.. c:member:: unsigned char attr
|
||||
|
||||
DOS file attributes.
|
||||
|
||||
.. c:member:: char name[300]
|
||||
|
||||
Filename (UTF-8, NUL-terminated). Populated after tags are read.
|
||||
|
||||
Error Codes
|
||||
-----------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 30 70
|
||||
|
||||
* - Constant
|
||||
- Meaning
|
||||
* - ``UC2_UserFault`` (-2)
|
||||
- User callback refused to cooperate
|
||||
* - ``UC2_BadState`` (-3)
|
||||
- API called in wrong order
|
||||
* - ``UC2_Damaged`` (-4)
|
||||
- Archive data is corrupt
|
||||
* - ``UC2_Truncated`` (-5)
|
||||
- Unexpected end of data
|
||||
* - ``UC2_Unimplemented`` (-6)
|
||||
- Feature not yet implemented
|
||||
* - ``UC2_InternalError`` (-7)
|
||||
- Internal logic error
|
||||
160
docs/license-audit.md
Normal file
160
docs/license-audit.md
Normal file
@@ -0,0 +1,160 @@
|
||||
# UC2 License Audit
|
||||
|
||||
Status: 2026-05-03. Maintained by Eremey Valetov.
|
||||
|
||||
UC2 v3 builds on three layers of prior work, each released under its
|
||||
own free-software licence. This document records per-file provenance,
|
||||
the LGPL-3.0 -> GPL-3.0 transition rationale, and the SPDX identifiers
|
||||
applied across the source tree.
|
||||
|
||||
## Layer 1: Nico de Vries (1992-1996), released 2015
|
||||
|
||||
Nico de Vries authored UltraCompressor II as proprietary DOS software
|
||||
between 1992 and 1996, with co-development from Danny Bezemer,
|
||||
Jan-Pieter Cornet, and others credited in the original `U_MANUAL.TXT`.
|
||||
In December 2015, Vladislav Sagunov asked de Vries whether the source
|
||||
could be re-released under a free licence; de Vries agreed and
|
||||
personally published the full source under the GNU Lesser General
|
||||
Public License v3 (LGPL-3.0), with a small Borland C/C++ runtime
|
||||
carve-out for DOS-specific code.
|
||||
|
||||
The 2015 release is preserved in this repository under
|
||||
`original/UC2_source/` byte-for-byte unchanged, including its license
|
||||
header (`GNU LESSER GENERAL PUBLIC LICENSE V3.txt`) and the original
|
||||
binaries (`uc2pro.exe`, `uc237b.exe`, `ue.exe`). No file in `lib/` or
|
||||
`cli/` is a verbatim copy of any file in that release. The 2015 source
|
||||
serves as the format specification: it is read for documentation
|
||||
purposes (the on-disk archive layout, the SuperMaster dictionary
|
||||
contents, the Huffman tree encoding) but its C code is not linked in.
|
||||
|
||||
Relicensing impact: none. Layer 1 is preserved under its original
|
||||
LGPL-3.0 licence; nothing is moved upward to GPL-3.0.
|
||||
|
||||
## Layer 2: Jan Bobrowski (2020-2021), libunuc2 / unuc2
|
||||
|
||||
Jan Bobrowski wrote a clean-room portable decompressor for UC2 (v2.x)
|
||||
archives between 2020 and 2021. He released two products:
|
||||
|
||||
- `libunuc2` (decompression library) under LGPL-3.0
|
||||
- `unuc2` (CLI tool) under GPL-3.0-or-later
|
||||
|
||||
The upstream source is preserved in `original/unuc2-0.6/`. The
|
||||
following files in this repository derive from Bobrowski's work and
|
||||
retain his original licence:
|
||||
|
||||
| Current file | Upstream origin | Licence |
|
||||
|---------------------------------------|------------------------------------------|---------------|
|
||||
| `lib/src/decompress.c` | `original/unuc2-0.6/libunuc2.c` | LGPL-3.0-only |
|
||||
| `lib/src/list.h` | `original/unuc2-0.6/list.h` (identical) | LGPL-3.0-only |
|
||||
| `lib/src/super.bin` | `original/unuc2-0.6/super.bin` (identical) | data (de Vries) |
|
||||
| `lib/include/uc2/libuc2.h` | `original/unuc2-0.6/libunuc2.h` | LGPL-3.0-only |
|
||||
| `cli/src/main.c` | `original/unuc2-0.6/unuc2.c` | GPL-3.0-or-later |
|
||||
| `cli/src/compat/compat_win32.c` | `original/unuc2-0.6/compat/compat.c` (Win32 portions) | LGPL-3.0-only |
|
||||
| `cli/src/compat/compat_dos.c` | derived from `compat/compat.c` (DOS adaptation by Valetov) | LGPL-3.0-only |
|
||||
|
||||
Modifications by Valetov in 2026 are released under the same licence
|
||||
as the file's upstream origin (LGPL-3.0 stays LGPL-3.0; GPL-3.0 stays
|
||||
GPL-3.0). No unilateral upgrade from LGPL to GPL has been applied to
|
||||
Bobrowski's work.
|
||||
|
||||
`lib/src/super.bin` is the SuperMaster dictionary block from the 1992
|
||||
distribution. It is data, not code: a fixed binary table used as a
|
||||
compression-context priming dictionary. It is bit-identical to the
|
||||
file in Bobrowski's release and to the data extracted from de Vries's
|
||||
1992 binaries.
|
||||
|
||||
## Layer 3: Eremey Valetov (2026), UC2 v3 revival
|
||||
|
||||
The following files are new work by Valetov, originally authored for
|
||||
the UC2 v3 revival project, released under GPL-3.0-or-later:
|
||||
|
||||
| File | Function |
|
||||
|---------------------------------------|----------------------------------------------------|
|
||||
| `lib/src/compress.c` | LZ77+Huffman compressor (inverse of decompress.c) |
|
||||
| `lib/src/uc2_tables.c` | Huffman delta-coding lookup tables |
|
||||
| `lib/src/uc2_internal.h` | Shared compressor/decompressor types and constants |
|
||||
| `lib/src/uc2_cdc.c` + `.h` | Content-defined chunking (Gear hash) |
|
||||
| `lib/src/uc2_merkle.c` + `.h` | Merkle DAG of deduplicated blocks |
|
||||
| `lib/src/uc2_blockstore.c` + `.h` | Cross-archive content-addressable block store |
|
||||
| `lib/src/uc2_simhash.c` + `.h` | SimHash near-duplicate detection |
|
||||
| `lib/src/uc2_delta.c` + `.h` | Binary delta compression |
|
||||
| `lib/src/uc2_rans.c` + `.h` | rANS entropy coder (method 10) |
|
||||
| `lib/src/uc2_dict.c` + `.h` | Dictionary metadata for cross-archive sharing |
|
||||
| `lib/src/uc2_preprocess.c` + `.h` | BCJ / BWT / delta-filter preprocessing |
|
||||
| `lib/src/uc2_lz4.c` + `.h` | LZ4 ultra-fast compression |
|
||||
| `lib/src/uc2_blake3.c` + `.h` | BLAKE3 cryptographic hashing |
|
||||
| `lib/src/uc2_sha256.c` + `.h` | SHA-256 (FIPS 180-4) |
|
||||
| `lib/src/uc2_ots.c` + `.h` | OpenTimestamps proof parser, walker, trailer |
|
||||
| `cli/src/compat/getopt.c` | Minimal POSIX getopt for MSVC |
|
||||
| `cli/src/main.c` (post-`9525a81` additions) | OTS attach/extract/info, archive creation, scanning, benchmark (GPL-3.0-or-later, matching upstream `unuc2.c`) |
|
||||
|
||||
The Phase 3-7 modules are independent implementations. They reference
|
||||
the UC2 v3 archive format (which is a bitstream layout, not a
|
||||
copyrighted work) and use BLAKE3, SHA-256, LZ4, rANS, etc. from
|
||||
public-domain or self-authored reference implementations. None of
|
||||
these modules link to or derive from Bobrowski's code beyond using
|
||||
shared header types declared in `uc2_internal.h`.
|
||||
|
||||
## Relicensing rationale
|
||||
|
||||
The composite project links Bobrowski's LGPL-3.0 library (`lib/`) into
|
||||
a GPL-3.0-or-later executable (`cli/`). This combination is permitted
|
||||
by LGPL-3.0 sec. 4 (Combined Works): the LGPL library may be used in
|
||||
GPL-licensed work without requiring the library itself to be relicensed.
|
||||
|
||||
No code has been moved from LGPL-3.0 to GPL-3.0 in this project.
|
||||
GPL-3.0 §7 lets a recipient drop the LGPL's additional permissions (a one-way LGPL-to-GPL upgrade), but exercising it
|
||||
is unnecessary here: the LGPL files remain LGPL, the GPL files remain
|
||||
GPL, and the combined work is distributable under GPL-3.0-or-later (as
|
||||
recorded in the project root `LICENSE` file).
|
||||
|
||||
If a downstream user wishes to redistribute `lib/` standalone under
|
||||
LGPL-3.0, the LGPL-3.0 files are individually identifiable via their
|
||||
SPDX-License-Identifier headers.
|
||||
|
||||
## SPDX policy
|
||||
|
||||
All source files in `lib/` and `cli/` carry one of two SPDX
|
||||
identifiers near the top:
|
||||
|
||||
- `SPDX-License-Identifier: LGPL-3.0-only` for files derived from
|
||||
Bobrowski's libunuc2 / compat code.
|
||||
- `SPDX-License-Identifier: GPL-3.0-or-later` for `cli/src/main.c`
|
||||
(matches Bobrowski's original GPL-3.0-or-later choice for the CLI
|
||||
tool) and for all Valetov-authored Phase 2 through Phase 7 work.
|
||||
|
||||
Original copyright lines authored by Bobrowski are preserved verbatim
|
||||
where present. Where Valetov has added substantial new content to a
|
||||
Bobrowski-originated file (notably `cli/src/main.c` and
|
||||
`compat_dos.c`), an additional Valetov copyright line has been added
|
||||
without removing the original.
|
||||
|
||||
The 2015 LGPL-3.0 release in `original/UC2_source/` and the 2020-2021
|
||||
release in `original/unuc2-0.6/` are preserved unchanged and are not
|
||||
subject to this policy: they retain whatever licence headers their
|
||||
authors shipped them with.
|
||||
|
||||
## Audit checklist
|
||||
|
||||
- [x] LGPL-3.0 release by Bezemer/de Vries preserved unchanged in
|
||||
`original/UC2_source/`
|
||||
- [x] LGPL-3.0 / GPL-3.0 release by Bobrowski preserved unchanged in
|
||||
`original/unuc2-0.6/`
|
||||
- [x] Per-file provenance table above
|
||||
- [x] SPDX-License-Identifier on every source file in `lib/` and `cli/`
|
||||
- [x] CREDITS.md attributes Bobrowski specifically for libunuc2-derived
|
||||
files, not as generic "inspiration"
|
||||
- [x] LICENSE-HISTORY summary published as this file
|
||||
(`docs/license-audit.md`)
|
||||
- [x] No silent LGPL-to-GPL upgrade: every Bobrowski-origin file
|
||||
retains LGPL-3.0-only
|
||||
|
||||
## References
|
||||
|
||||
- LGPL-3.0 text: <https://www.gnu.org/licenses/lgpl-3.0.html>
|
||||
- GPL-3.0 text: see `LICENSE` in repository root
|
||||
- Bobrowski upstream: <http://torinak.com/~jb/unuc2/>
|
||||
- de Vries 2015 release notes: `original/UC2_source/Read Me First.txt`
|
||||
- GPL-3.0 sec. 7 (removal of additional permissions, i.e. the one-way LGPL-to-GPL upgrade):
|
||||
<https://www.gnu.org/licenses/gpl-3.0.html#section7>
|
||||
- LGPL-3.0 sec. 4 (Combined Works): same document, sec. 4
|
||||
26
docs/quickstart.rst
Normal file
26
docs/quickstart.rst
Normal file
@@ -0,0 +1,26 @@
|
||||
Quick Start
|
||||
===========
|
||||
|
||||
Building
|
||||
--------
|
||||
|
||||
Requires CMake >= 3.16 and a C99 compiler (GCC, Clang, or MSVC).
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
cmake -B build
|
||||
cmake --build build
|
||||
|
||||
The binary is at ``build/cli/uc2``.
|
||||
|
||||
Basic Usage
|
||||
-----------
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
uc2 -w archive.uc2 file1 file2 # Create archive
|
||||
uc2 archive.uc2 # Extract all files
|
||||
uc2 -l archive.uc2 # List contents
|
||||
uc2 -t archive.uc2 # Test archive integrity
|
||||
uc2 -d /tmp/out archive.uc2 # Extract to directory
|
||||
uc2 -w -L 5 big.uc2 data/* # Create with Ultra compression
|
||||
2
docs/requirements.txt
Normal file
2
docs/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
sphinx>=7.0
|
||||
furo
|
||||
42
docs/roadmap.rst
Normal file
42
docs/roadmap.rst
Normal file
@@ -0,0 +1,42 @@
|
||||
Roadmap
|
||||
=======
|
||||
|
||||
The development roadmap is maintained in ``ROADMAP.md`` at the project
|
||||
root. Key phases:
|
||||
|
||||
1. **Decompression MVP** -- Done. Portable decompressor, CLI tool,
|
||||
CMake build system.
|
||||
|
||||
2. **Original Compression Engine** -- Done. LZ77+Huffman compressor
|
||||
with custom Huffman trees, full backward compatibility, and UC2
|
||||
personality (``-q`` for scripting). Automated DOSBox-X round-trip
|
||||
validates 4+5 files in both directions.
|
||||
|
||||
3. **Modernized Master-Block Deduplication** -- Done.
|
||||
CDC with Gear hash, Merkle DAG with content addressing,
|
||||
cross-archive block store, SimHash near-duplicate detection,
|
||||
and delta compression. All Phase 3 items complete.
|
||||
|
||||
4. **Modern Compression Backends** -- Done. rANS entropy coder (method
|
||||
10), zstd-style dictionary metadata, content-aware preprocessing
|
||||
(BCJ, BWT, delta), LZ4 ultra-fast, benchmark mode (``uc2 -B``).
|
||||
|
||||
5. **Quantum-Resistant Encryption** -- CRYSTALS-Kyber + AES-256-GCM.
|
||||
|
||||
6. **DOS / FreeDOS / Retro-Computing** -- DJGPP toolchain, vintage
|
||||
hardware support, self-extracting archives.
|
||||
|
||||
7. **Cryptographic Integrity & Timestamping** -- BLAKE3 hashing done
|
||||
(pure C, 256-bit digests). Remaining: integration into archive
|
||||
blocks, OpenTimestamps.
|
||||
|
||||
8. **Decentralized & Cloud Integration** -- IPFS pinning,
|
||||
content-addressable dedup, cloud archiving.
|
||||
|
||||
9. **Zero-Knowledge Proofs** -- Privacy-preserving archive verification.
|
||||
|
||||
10. **Ecosystem Integrations** -- libarchive plugin, streaming dedup
|
||||
ingestion, file manager plugins.
|
||||
|
||||
See the full roadmap: `ROADMAP.md on GitHub
|
||||
<https://github.com/evvaletov/uc2/blob/main/ROADMAP.md>`_.
|
||||
93
docs/usage.rst
Normal file
93
docs/usage.rst
Normal file
@@ -0,0 +1,93 @@
|
||||
Command-Line Usage
|
||||
==================
|
||||
|
||||
Synopsis
|
||||
--------
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
uc2 [options] archive.uc2 [patterns...]
|
||||
uc2 -w [-L level] archive.uc2 files...
|
||||
|
||||
Modes
|
||||
-----
|
||||
|
||||
``uc2 archive.uc2``
|
||||
Extract all files to the current directory.
|
||||
|
||||
``uc2 -l archive.uc2``
|
||||
List archive contents.
|
||||
|
||||
``uc2 -t archive.uc2``
|
||||
Test archive integrity (decompress and verify checksums without
|
||||
writing files).
|
||||
|
||||
``uc2 -p archive.uc2 filename``
|
||||
Extract a file to stdout.
|
||||
|
||||
``uc2 -w archive.uc2 files...``
|
||||
Create a new archive from the given files. The original LZ77+Huffman
|
||||
algorithm is used. Compression level defaults to 4 (Tight); use
|
||||
``-L`` to change it.
|
||||
|
||||
The archiver automatically groups similar files using content
|
||||
fingerprinting: files whose first 4096 bytes are identical are assigned a
|
||||
custom master block built from the largest file in the group. This
|
||||
pre-fills the LZ77 sliding window with shared content, improving
|
||||
compression for collections of structurally similar files (e.g. log
|
||||
rotations, versioned configs, same-format data files). Files that
|
||||
don't group (or are smaller than 1 KB) use the built-in 49 KB
|
||||
SuperMaster dictionary.
|
||||
|
||||
Options
|
||||
-------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 15 85
|
||||
|
||||
* - Flag
|
||||
- Description
|
||||
* - ``-l``
|
||||
- List archive contents
|
||||
* - ``-t``
|
||||
- Test archive integrity
|
||||
* - ``-w``
|
||||
- Create archive
|
||||
* - ``-L n``
|
||||
- Compression level: 2 = Fast, 3 = Normal, 4 = Tight (default),
|
||||
5 = Ultra
|
||||
* - ``-a``
|
||||
- Include all file versions (not just latest)
|
||||
* - ``-d path``
|
||||
- Extract to specified directory
|
||||
* - ``-f``
|
||||
- Overwrite existing files
|
||||
* - ``-p``
|
||||
- Extract to stdout
|
||||
* - ``-D``
|
||||
- Skip directory metadata; ``-DD`` also skips file metadata
|
||||
* - ``-T``
|
||||
- Tab-separated output (for scripting)
|
||||
|
||||
Pattern Matching
|
||||
----------------
|
||||
|
||||
File patterns use glob syntax. Only files matching the pattern are
|
||||
listed or extracted:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
uc2 -l archive.uc2 '*.txt' # List only .txt files
|
||||
uc2 archive.uc2 'src/*' # Extract src/ subtree
|
||||
|
||||
Exit Codes
|
||||
----------
|
||||
|
||||
.. list-table::
|
||||
:widths: 15 85
|
||||
|
||||
* - ``0``
|
||||
- Success
|
||||
* - ``1``
|
||||
- Error (damaged archive, I/O failure, etc.)
|
||||
@@ -1,6 +1,6 @@
|
||||
# libuc2 — UC2 compression and decompression library
|
||||
|
||||
set(LIBUC2_SOURCES src/decompress.c)
|
||||
set(LIBUC2_SOURCES src/decompress.c src/compress.c src/uc2_tables.c src/uc2_cdc.c src/uc2_merkle.c src/uc2_blockstore.c src/uc2_simhash.c src/uc2_delta.c src/uc2_rans.c src/uc2_dict.c src/uc2_preprocess.c src/uc2_lz4.c src/uc2_blake3.c src/uc2_sha256.c src/uc2_ots.c src/uc2_ingest.c)
|
||||
|
||||
# Embed super.bin: use .S with .incbin on GCC/Clang, generated C array on MSVC
|
||||
if(MSVC)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
/* SPDX-License-Identifier: LGPL-3.0-only */
|
||||
|
||||
#ifndef LIBUC2_H
|
||||
#define LIBUC2_H
|
||||
|
||||
@@ -83,6 +85,43 @@ UC2_API int uc2_extract(
|
||||
|
||||
UC2_API const char *uc2_message(uc2_handle, int ret);
|
||||
|
||||
/* Compress raw data into a UC2 bitstream (no archive framing).
|
||||
level: 2=Fast, 3=Normal, 4=Tight(default), 5=Ultra.
|
||||
read() should return bytes read (0 at EOF, <0 on error).
|
||||
write() should return <0 on error.
|
||||
Returns 0 on success, negative UC2_* error code on failure. */
|
||||
UC2_API int uc2_compress(
|
||||
int level,
|
||||
int (*read)(void *context, void *buf, unsigned len),
|
||||
void *read_ctx,
|
||||
int (*write)(void *context, const void *ptr, unsigned len),
|
||||
void *write_ctx,
|
||||
unsigned size,
|
||||
unsigned short *checksum_out,
|
||||
unsigned *compressed_size_out
|
||||
);
|
||||
|
||||
/* Compress with a master-block dictionary prefix.
|
||||
The master data pre-fills the LZ77 sliding window, allowing
|
||||
back-references into the master for cross-file deduplication.
|
||||
Set master=NULL, master_size=0 for no master (same as uc2_compress). */
|
||||
UC2_API int uc2_compress_ex(
|
||||
int level,
|
||||
const void *master, unsigned master_size,
|
||||
int (*read)(void *context, void *buf, unsigned len),
|
||||
void *read_ctx,
|
||||
int (*write)(void *context, const void *ptr, unsigned len),
|
||||
void *write_ctx,
|
||||
unsigned size,
|
||||
unsigned short *checksum_out,
|
||||
unsigned *compressed_size_out
|
||||
);
|
||||
|
||||
/* Decompress the built-in SuperMaster (49152 bytes).
|
||||
buf must be at least 49152 bytes.
|
||||
Returns 49152 on success, negative UC2_* error code on failure. */
|
||||
UC2_API int uc2_get_supermaster(void *buf, unsigned buf_size);
|
||||
|
||||
struct uc2_io {
|
||||
/* Read len bytes from the archive at offset pos into buf.
|
||||
Return number of bytes read, or less if eof.
|
||||
|
||||
64
lib/include/uc2/uc2_blake3.h
Normal file
64
lib/include/uc2/uc2_blake3.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* BLAKE3 cryptographic hashing for archive integrity.
|
||||
*
|
||||
* BLAKE3 is a fast cryptographic hash based on the Bao tree hashing
|
||||
* mode and the BLAKE2s compression function. It produces 256-bit
|
||||
* (32-byte) digests suitable for content verification, integrity
|
||||
* checking, and content-addressable storage.
|
||||
*
|
||||
* This is a simplified single-threaded implementation (~300 lines).
|
||||
* For full BLAKE3 features (keyed hashing, KDF, XOF), see the
|
||||
* reference implementation at github.com/BLAKE3-team/BLAKE3.
|
||||
*
|
||||
* Usage:
|
||||
* struct uc2_blake3 ctx;
|
||||
* uc2_blake3_init(&ctx);
|
||||
* uc2_blake3_update(&ctx, data, len);
|
||||
* uint8_t hash[32];
|
||||
* uc2_blake3_final(&ctx, hash);
|
||||
*
|
||||
* // Or one-shot:
|
||||
* uc2_blake3_hash(data, len, hash);
|
||||
*/
|
||||
|
||||
#ifndef UC2_BLAKE3_H
|
||||
#define UC2_BLAKE3_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Sizes in bytes: digest output, compression block, and chunk. */
#define UC2_BLAKE3_OUT_LEN   32
#define UC2_BLAKE3_BLOCK_LEN 64
#define UC2_BLAKE3_CHUNK_LEN 1024

/* Incremental hasher state. Prepare it with uc2_blake3_init(), feed it
 * with uc2_blake3_update(), and read the digest with uc2_blake3_final().
 * NOTE(review): the precise roles of counter, chunk_counter, flags and
 * blocks_compressed are defined by the implementation file -- confirm
 * there before relying on them. */
struct uc2_blake3 {
    uint32_t cv[8];                      /* chaining value */
    uint8_t  buf[UC2_BLAKE3_BLOCK_LEN];  /* one block of buffered input */
    uint8_t  buf_len;                    /* presumably bytes currently held in buf */
    uint64_t counter;
    uint8_t  flags;

    /* Stack of chaining values used for tree hashing. */
    uint32_t cv_stack[8 * 54];           /* sized for the maximum tree depth */
    uint8_t  cv_stack_len;
    uint64_t chunk_counter;
    uint8_t  blocks_compressed;
};
|
||||
|
||||
/* Initialize hasher. */
|
||||
void uc2_blake3_init(struct uc2_blake3 *ctx);
|
||||
|
||||
/* Feed data to the hasher. */
|
||||
void uc2_blake3_update(struct uc2_blake3 *ctx, const void *data, size_t len);
|
||||
|
||||
/* Finalize and produce hash. */
|
||||
void uc2_blake3_final(const struct uc2_blake3 *ctx, uint8_t out[UC2_BLAKE3_OUT_LEN]);
|
||||
|
||||
/* One-shot hash. */
|
||||
void uc2_blake3_hash(const void *data, size_t len, uint8_t out[UC2_BLAKE3_OUT_LEN]);
|
||||
|
||||
/* Compare two hashes (constant-time). Returns 1 if equal. */
|
||||
int uc2_blake3_equal(const uint8_t a[UC2_BLAKE3_OUT_LEN],
|
||||
const uint8_t b[UC2_BLAKE3_OUT_LEN]);
|
||||
|
||||
#endif
|
||||
66
lib/include/uc2/uc2_blockstore.h
Normal file
66
lib/include/uc2/uc2_blockstore.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Cross-archive block store for content-addressable deduplication.
|
||||
*
|
||||
* Stores unique CDC chunks indexed by 64-bit content hash. Multiple
|
||||
* archives can share blocks through the store, enabling cross-archive
|
||||
* and cross-version dedup.
|
||||
*
|
||||
* The store is a directory of chunk files named by their hash. A
|
||||
* manifest maps (archive, file, chunk_index) -> chunk_hash, enabling
|
||||
* reconstruction of any file from its chunk list.
|
||||
*
|
||||
* Usage:
|
||||
* struct uc2_blockstore bs;
|
||||
* uc2_blockstore_open(&bs, "/path/to/store");
|
||||
* uc2_blockstore_ingest(&bs, &merkle_tree, data, len);
|
||||
* // ... later, from a different archive:
|
||||
* int new_chunks = uc2_blockstore_ingest(&bs, &tree2, data2, len2);
|
||||
* // new_chunks < tree2.nchunks means dedup happened
|
||||
* uc2_blockstore_close(&bs);
|
||||
*/
|
||||
|
||||
#ifndef UC2_BLOCKSTORE_H
|
||||
#define UC2_BLOCKSTORE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include "uc2_merkle.h"
|
||||
|
||||
/* Handle for an open block store, filled in by uc2_blockstore_open()
 * and released by uc2_blockstore_close(). */
struct uc2_blockstore {
    char   *path;         /* directory that holds the store */
    int     nblocks;      /* count of unique blocks stored */
    int64_t total_bytes;  /* bytes of unique block data */
    int64_t saved_bytes;  /* bytes that dedup avoided storing */
};
|
||||
|
||||
/* Open or create a block store at the given directory path.
|
||||
* Returns 0 on success, -1 on error. */
|
||||
int uc2_blockstore_open(struct uc2_blockstore *bs, const char *path);
|
||||
|
||||
/* Ingest a file's chunks into the store. Only stores chunks not
|
||||
* already present (dedup). Returns the number of NEW chunks stored
|
||||
* (0 = fully deduplicated). */
|
||||
int uc2_blockstore_ingest(struct uc2_blockstore *bs,
|
||||
const struct uc2_merkle *tree,
|
||||
const uint8_t *data, size_t len);
|
||||
|
||||
/* Check if a chunk exists in the store. */
|
||||
int uc2_blockstore_has(const struct uc2_blockstore *bs, uint64_t hash);
|
||||
|
||||
/* Read a chunk from the store into buf (must be large enough).
|
||||
* Returns chunk length, or -1 on error. */
|
||||
int uc2_blockstore_read(const struct uc2_blockstore *bs,
|
||||
uint64_t hash, uint8_t *buf, size_t buf_size);
|
||||
|
||||
/* Get dedup statistics. */
|
||||
static inline int64_t uc2_blockstore_saved(const struct uc2_blockstore *bs)
|
||||
{
|
||||
return bs->saved_bytes;
|
||||
}
|
||||
|
||||
/* Close the block store (frees internal state, does not delete files). */
|
||||
void uc2_blockstore_close(struct uc2_blockstore *bs);
|
||||
|
||||
#endif
|
||||
55
lib/include/uc2/uc2_cdc.h
Normal file
55
lib/include/uc2/uc2_cdc.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Content-defined chunking (CDC) for UC2 deduplication.
|
||||
*
|
||||
* Uses the Gear rolling hash for fast, content-aware chunk boundary
|
||||
* detection. Gear hash is a simple multiplicative hash that XORs each
|
||||
* byte with a pre-computed random table, giving O(1) per-byte updates.
|
||||
*
|
||||
* Typical usage:
|
||||
* struct uc2_chunker c;
|
||||
* uc2_chunker_init(&c, 13, 0, 0); // avg chunk ~8KB (2^13); 0 = default min/max
|
||||
* while (uc2_chunker_next(&c, data, len, &chunk_off, &chunk_len))
|
||||
* process(data + chunk_off, chunk_len);
|
||||
*/
|
||||
|
||||
#ifndef UC2_CDC_H
|
||||
#define UC2_CDC_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Gear hash: fast rolling hash with O(1) per-byte update. */
|
||||
uint32_t uc2_gear_hash(const uint8_t *data, size_t len);
|
||||
|
||||
/* Content-defined chunker state; configured by uc2_chunker_init() and
 * passed to uc2_chunker_next(). */
struct uc2_chunker {
    uint32_t mask;       /* boundary mask, (1 << bits) - 1 */
    size_t   min_chunk;  /* smallest chunk size allowed */
    size_t   max_chunk;  /* largest chunk size allowed */
    size_t   pos;        /* current position in the data */
};
|
||||
|
||||
/* Initialize chunker.
|
||||
* bits: target chunk size exponent (avg chunk = 2^bits bytes).
|
||||
* Recommended: 13 (8KB), 14 (16KB), or 15 (32KB).
|
||||
* min_chunk: minimum chunk size (0 = bits-2 default)
|
||||
* max_chunk: maximum chunk size (0 = bits+2 default) */
|
||||
void uc2_chunker_init(struct uc2_chunker *c, int bits,
|
||||
size_t min_chunk, size_t max_chunk);
|
||||
|
||||
/* Find the next chunk boundary in [data, data+len).
|
||||
* Returns 1 and sets *chunk_off and *chunk_len if a chunk was found.
|
||||
* Returns 0 when all data has been consumed (final chunk).
|
||||
* Call repeatedly until it returns 0. */
|
||||
int uc2_chunker_next(struct uc2_chunker *c,
|
||||
const uint8_t *data, size_t len,
|
||||
size_t *chunk_off, size_t *chunk_len);
|
||||
|
||||
/* Reset chunker for a new data stream. */
|
||||
void uc2_chunker_reset(struct uc2_chunker *c);
|
||||
|
||||
/* FNV-1a hash for chunk content addressing. */
|
||||
uint32_t uc2_fnv1a(const uint8_t *data, size_t len);
|
||||
|
||||
#endif
|
||||
50
lib/include/uc2/uc2_delta.h
Normal file
50
lib/include/uc2/uc2_delta.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Delta compression for file versioning.
|
||||
*
|
||||
* Computes a compact binary delta between a source (old) and target
|
||||
* (new) file. The delta encodes copy-from-source and insert-new-data
|
||||
* instructions, similar to xdelta/bsdiff.
|
||||
*
|
||||
* The delta can be applied to reconstruct the target from the source.
|
||||
* Combined with master blocks, this enables version-level dedup:
|
||||
* store the first version as a master, subsequent versions as deltas.
|
||||
*
|
||||
* Usage:
|
||||
* uint8_t *delta; size_t delta_len;
|
||||
* uc2_delta_encode(src, src_len, tgt, tgt_len, &delta, &delta_len);
|
||||
* uint8_t *reconstructed; size_t recon_len;
|
||||
* uc2_delta_apply(src, src_len, delta, delta_len, &reconstructed, &recon_len);
|
||||
* // reconstructed == tgt
|
||||
* free(delta); free(reconstructed);
|
||||
*/
|
||||
|
||||
#ifndef UC2_DELTA_H
|
||||
#define UC2_DELTA_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Encode a delta from source to target.
|
||||
* Allocates *out_delta (caller must free).
|
||||
* Returns 0 on success, -1 on error. */
|
||||
int uc2_delta_encode(const uint8_t *src, size_t src_len,
|
||||
const uint8_t *tgt, size_t tgt_len,
|
||||
uint8_t **out_delta, size_t *out_delta_len);
|
||||
|
||||
/* Apply a delta to source to reconstruct target.
|
||||
* Allocates *out_tgt (caller must free).
|
||||
* Returns 0 on success, -1 on error. */
|
||||
int uc2_delta_apply(const uint8_t *src, size_t src_len,
|
||||
const uint8_t *delta, size_t delta_len,
|
||||
uint8_t **out_tgt, size_t *out_tgt_len);
|
||||
|
||||
/* Delta format:
|
||||
* Header: "UC2D" (4 bytes) + target_len (4 bytes LE)
|
||||
* Instructions:
|
||||
* COPY: 0x01 + offset(4 LE) + length(4 LE) — copy from source
|
||||
* INSERT: 0x02 + length(4 LE) + data[length] — insert new bytes
|
||||
* END: 0x00 — end of delta
|
||||
*/
|
||||
|
||||
#endif
|
||||
76
lib/include/uc2/uc2_dict.h
Normal file
76
lib/include/uc2/uc2_dict.h
Normal file
@@ -0,0 +1,76 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Dictionary management for zstd-inspired dictionary compression.
|
||||
*
|
||||
* Formalizes UC2's master blocks as proper dictionaries with content
|
||||
* hashes (IDs), integrity checksums, and cross-archive sharing.
|
||||
* Combined with the block store (uc2_blockstore.h), this enables
|
||||
* distributed dedup: archives in different locations can reference
|
||||
* shared dictionaries by content hash.
|
||||
*
|
||||
* Usage:
|
||||
* struct uc2_dict dict;
|
||||
* uc2_dict_create(&dict, master_data, master_size);
|
||||
* uint64_t id = uc2_dict_id(&dict);
|
||||
* // Store/share/reference by id...
|
||||
* uc2_dict_verify(&dict); // check integrity
|
||||
* uc2_dict_free(&dict);
|
||||
*/
|
||||
|
||||
#ifndef UC2_DICT_H
|
||||
#define UC2_DICT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Magic number marking a serialized dictionary header (in an archive or
 * block store). Stored little-endian, its four bytes read "UC2D". */
#define UC2_DICT_MAGIC 0x44324355

/* In-memory dictionary: the master data plus the hashes derived from it. */
struct uc2_dict {
    uint64_t id;        /* content hash: 64-bit FNV-1a of data */
    uint32_t checksum;  /* integrity check: 32-bit FNV-1a */
    uint32_t size;      /* size of the dictionary data */
    uint8_t *data;      /* dictionary content, owned by this struct */
};
|
||||
|
||||
/* Header that precedes dictionary data in an archive or block store.
 *
 * The fields add up to 24 bytes (4 + 8 + 4 + 4 + 4), which is the size of
 * the serialized form. On ABIs that align uint64_t to 8 bytes the compiler
 * pads after `magic`, so sizeof(struct uc2_dict_header) is larger than 24
 * there.
 * NOTE(review): this presumably means the serializer writes the fields one
 * by one rather than copying the raw struct -- confirm in uc2_dict.c. */
struct uc2_dict_header {
    uint32_t magic;     /* UC2_DICT_MAGIC */
    uint64_t id;        /* content hash */
    uint32_t checksum;  /* integrity check */
    uint32_t size;      /* size of the data that follows the header */
    uint32_t reserved;  /* for future use */
};
|
||||
|
||||
/* Create a dictionary from raw master data.
|
||||
* Computes id (content hash) and checksum. Copies data (caller
|
||||
* can free the original after this call). */
|
||||
int uc2_dict_create(struct uc2_dict *dict, const uint8_t *data, size_t size);
|
||||
|
||||
/* Get dictionary ID (content hash for cross-archive sharing). */
|
||||
static inline uint64_t uc2_dict_id(const struct uc2_dict *dict)
|
||||
{
|
||||
return dict->id;
|
||||
}
|
||||
|
||||
/* Verify dictionary integrity (returns 1 if valid, 0 if corrupted). */
|
||||
int uc2_dict_verify(const struct uc2_dict *dict);
|
||||
|
||||
/* Serialize dictionary to a buffer (header + data).
|
||||
* Allocates *out (caller must free). Returns total size. */
|
||||
size_t uc2_dict_serialize(const struct uc2_dict *dict, uint8_t **out);
|
||||
|
||||
/* Deserialize dictionary from a buffer.
|
||||
* Returns 0 on success, -1 on error. */
|
||||
int uc2_dict_deserialize(struct uc2_dict *dict, const uint8_t *buf, size_t len);
|
||||
|
||||
/* Check if two dictionaries have the same content (by ID). */
|
||||
static inline int uc2_dict_match(const struct uc2_dict *a, const struct uc2_dict *b)
|
||||
{
|
||||
return a->id == b->id;
|
||||
}
|
||||
|
||||
/* Free dictionary data. */
|
||||
void uc2_dict_free(struct uc2_dict *dict);
|
||||
|
||||
#endif
|
||||
75
lib/include/uc2/uc2_ingest.h
Normal file
75
lib/include/uc2/uc2_ingest.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Streaming dedup ingest for UC2.
|
||||
*
|
||||
* uc2 --ingest <archive> reads a byte stream (typically stdin from
|
||||
* tar / rsync / cp -a), splits it via CDC, deduplicates chunks, and
|
||||
* writes a self-contained archive file. uc2 --ingest-restore <archive>
|
||||
* reverses this.
|
||||
*
|
||||
* Two on-disk formats are supported:
|
||||
*
|
||||
* v1 (legacy): manifest in <archive>, chunk data in a sidecar
|
||||
* blockstore directory at <archive>.blocks/. Cross-archive dedup
|
||||
* works through shared blockstore directories. Read-only now;
|
||||
* writer defaults to v2.
|
||||
*
|
||||
* v2 (default): archive is self-contained -- chunks are stored in
|
||||
* an embedded pool inside the archive itself. No sidecar
|
||||
* directory. Each manifest entry carries its chunk's absolute
|
||||
* file offset; deduplicated chunks share a single offset.
|
||||
*
|
||||
* Manifest layouts (all little-endian):
|
||||
*
|
||||
* v1: +0 8B magic "UC2INGST"
|
||||
* +8 1B version (1)
|
||||
* +9 1B cdc_bits
|
||||
* +10 2B reserved
|
||||
* +12 4B chunk_count
|
||||
* +16 ... chunk_count * 12B: 8B hash, 4B length
|
||||
*
|
||||
* v2: +0 8B magic "UC2INGST"
|
||||
* +8 1B version (2)
|
||||
* +9 1B cdc_bits
|
||||
* +10 2B reserved
|
||||
* +12 4B chunk_count
|
||||
* +16 ... chunk_count * 16B: 8B hash, 4B length, 4B offset
|
||||
* ... chunk pool: unique chunks back-to-back at recorded offsets
|
||||
*
|
||||
* Limitations:
|
||||
* - The whole stream is buffered in memory before chunking. Suits
|
||||
* CDC's locality-of-boundary requirement and is fine for streams
|
||||
* up to a few GB. True streaming is a future revision.
|
||||
* - The format is not yet a UC2 v3 archive consumable by uc2 -x /
|
||||
* -l; integrating with the master-block layout is a follow-up.
|
||||
*/
|
||||
|
||||
#ifndef UC2_INGEST_H
|
||||
#define UC2_INGEST_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* Counters reported through the stats argument of uc2_ingest_write(). */
struct uc2_ingest_stats {
    uint64_t bytes_in;      /* length of the input stream */
    int      chunks_total;  /* chunks found in the input */
    int      chunks_new;    /* chunks stored for the first time */
    int      chunks_dedup;  /* chunks that were already in the block store */
    uint64_t bytes_stored;  /* bytes physically written by this call */
    uint64_t bytes_saved;   /* bytes dedup avoided writing */
};
|
||||
|
||||
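/* Ingest len bytes of data into archive_path. In the default v2
* format the chunk pool is embedded in the archive itself; only legacy
* v1 archives keep their chunks in a sidecar block store at
* <archive_path>.blocks/. cdc_bits selects the average chunk
* size (13 = 8 KiB; 0 picks a sensible default). */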
|
||||
int uc2_ingest_write(const char *archive_path,
|
||||
const uint8_t *data, size_t len,
|
||||
int cdc_bits,
|
||||
struct uc2_ingest_stats *stats);
|
||||
|
||||
/* Restore the byte stream described by an ingest manifest. Reads
* chunks from the archive's embedded pool (v2) or from
* <archive_path>.blocks/ (legacy v1) and writes them in order to out. */
|
||||
int uc2_ingest_restore(const char *archive_path, FILE *out);
|
||||
|
||||
#endif
|
||||
46
lib/include/uc2/uc2_lz4.h
Normal file
46
lib/include/uc2/uc2_lz4.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* LZ4-compatible ultra-fast compression.
|
||||
*
|
||||
* Minimal LZ4-like compressor optimized for speed over ratio.
|
||||
* Uses a single-probe hash table (no chains) for O(1) match finding.
|
||||
* Suitable for real-time or low-resource scenarios where decompression
|
||||
* speed is critical and compression ratio is secondary.
|
||||
*
|
||||
* Format: sequence of literal/match tokens:
|
||||
* [token] [literal_length_ext?] [literals] [offset:16LE] [match_length_ext?]
|
||||
* token = (literal_len:4 << 4) | match_len:4
|
||||
* If literal_len == 15: read additional bytes until < 255
|
||||
* If match_len == 15: read additional bytes until < 255
|
||||
* Match lengths are +4 (minimum match = 4)
|
||||
*
|
||||
* Usage:
|
||||
* size_t bound = uc2_lz4_bound(src_len);
|
||||
* uint8_t *dst = malloc(bound);
|
||||
* size_t clen = uc2_lz4_compress(src, src_len, dst, bound);
|
||||
* size_t dlen = uc2_lz4_decompress(dst, clen, out, out_cap);
|
||||
*/
|
||||
|
||||
#ifndef UC2_LZ4_H
|
||||
#define UC2_LZ4_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Maximum compressed size for a given input length.
 *
 * The bound is src_len plus one byte per 255 input bytes plus a fixed
 * 16-byte allowance.
 *
 * Returns 0 when the bound does not fit in size_t. Without this check the
 * sum wraps around for src_len close to SIZE_MAX and yields a small value,
 * which would lead a caller to allocate an undersized destination buffer. */
static inline size_t uc2_lz4_bound(size_t src_len)
{
    /* src_len / 255 is at most SIZE_MAX / 255, so adding 16 cannot wrap. */
    const size_t overhead = src_len / 255 + 16;

    if (src_len > SIZE_MAX - overhead)
        return 0;
    return src_len + overhead;
}
|
||||
|
||||
/* Compress src into dst. Returns compressed size, or 0 on error.
|
||||
* dst must be at least uc2_lz4_bound(src_len) bytes. */
|
||||
size_t uc2_lz4_compress(const uint8_t *src, size_t src_len,
|
||||
uint8_t *dst, size_t dst_cap);
|
||||
|
||||
/* Decompress src into dst. Returns decompressed size, or 0 on error.
|
||||
* dst must be large enough for the original data. */
|
||||
size_t uc2_lz4_decompress(const uint8_t *src, size_t src_len,
|
||||
uint8_t *dst, size_t dst_cap);
|
||||
|
||||
#endif
|
||||
66
lib/include/uc2/uc2_merkle.h
Normal file
66
lib/include/uc2/uc2_merkle.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Merkle DAG for content-addressable deduplication.
|
||||
*
|
||||
* Builds a Merkle tree from CDC chunks: each file is represented as a
|
||||
* list of chunk hashes. The file's root hash is derived from the
|
||||
* concatenated chunk hashes, enabling structural comparison.
|
||||
*
|
||||
* Usage:
|
||||
* struct uc2_merkle tree;
|
||||
* uc2_merkle_build(&tree, data, len, 13);
|
||||
* uint64_t root = uc2_merkle_root(&tree);
|
||||
* int shared = uc2_merkle_common(&tree_a, &tree_b);
|
||||
* uc2_merkle_free(&tree);
|
||||
*/
|
||||
|
||||
#ifndef UC2_MERKLE_H
|
||||
#define UC2_MERKLE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* One chunk of a file as recorded in its Merkle tree. */
struct uc2_chunk {
    uint64_t hash;    /* content hash of the chunk */
    uint32_t offset;  /* where the chunk starts within the file */
    uint32_t length;  /* chunk length, in bytes */
};
|
||||
|
||||
/* Merkle tree describing a single file: its chunk list and root hash.
 * Filled in by uc2_merkle_build(); release with uc2_merkle_free(). */
struct uc2_merkle {
    struct uc2_chunk *chunks;    /* chunk array */
    int               nchunks;   /* NOTE(review): presumably entries in use -- confirm */
    int               capacity;  /* NOTE(review): presumably entries allocated -- confirm */
    uint64_t          root;      /* root hash: hash of the chunk hash list */
};
|
||||
|
||||
/* Build a Merkle tree from file data.
|
||||
* tree: output tree (caller must call uc2_merkle_free later)
|
||||
* data: file content
|
||||
* len: file length
|
||||
* bits: CDC chunk size exponent (13 = avg 8KB) */
|
||||
void uc2_merkle_build(struct uc2_merkle *tree,
|
||||
const uint8_t *data, size_t len, int bits);
|
||||
|
||||
/* Get the root hash of a Merkle tree. */
|
||||
static inline uint64_t uc2_merkle_root(const struct uc2_merkle *tree)
|
||||
{
|
||||
return tree->root;
|
||||
}
|
||||
|
||||
/* Count chunks shared between two Merkle trees (by hash). */
|
||||
int uc2_merkle_common(const struct uc2_merkle *a, const struct uc2_merkle *b);
|
||||
|
||||
/* Compute the fraction of bytes in tree A covered by shared chunks with B.
|
||||
* Returns 0.0 (no overlap) to 1.0 (identical content). */
|
||||
double uc2_merkle_similarity(const struct uc2_merkle *a,
|
||||
const struct uc2_merkle *b);
|
||||
|
||||
/* Free a Merkle tree's chunk array. */
|
||||
void uc2_merkle_free(struct uc2_merkle *tree);
|
||||
|
||||
/* 64-bit content hash (FNV-1a 64-bit). */
|
||||
uint64_t uc2_hash64(const uint8_t *data, size_t len);
|
||||
|
||||
#endif
|
||||
168
lib/include/uc2/uc2_ots.h
Normal file
168
lib/include/uc2/uc2_ots.h
Normal file
@@ -0,0 +1,168 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* OpenTimestamps integration.
|
||||
*
|
||||
* UC2 stores an OpenTimestamps proof in a magic-bracketed sidecar
|
||||
* trailer appended after the regular UC2 archive bytes. The trailer
|
||||
* does not affect compatibility with the original UC2 Pro reader,
|
||||
* which uses the front header's recorded length.
|
||||
*
|
||||
* The proof itself is the standard `.ots` binary: a 31-byte header
|
||||
* magic + version + file-hash op + leaf digest + serialized timestamp.
|
||||
* Callers can extract the proof verbatim and run the standard
|
||||
* `ots verify` tool on it.
|
||||
*
|
||||
* Local verification covers structural validity and the calendar-path
|
||||
* subset of opcodes (APPEND, PREPEND, SHA256). Proofs that use other
|
||||
* crypto ops (SHA1, RIPEMD160, KECCAK256) are accepted as structurally
|
||||
* valid but reported as not locally cryptographically verified;
|
||||
* the standard `ots verify` should be used for full validation. */
|
||||
|
||||
#ifndef UC2_OTS_H
|
||||
#define UC2_OTS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* OTS opcode and marker byte values, grouped by kind. */
enum {
    /* Unary hash ops. */
    UC2_OTS_OP_SHA1      = 0x02,
    UC2_OTS_OP_RIPEMD160 = 0x03,
    UC2_OTS_OP_SHA256    = 0x08,  /* also the file-hash op */
    UC2_OTS_OP_KECCAK256 = 0x67,

    /* Binary ops: each takes a varbytes operand. */
    UC2_OTS_OP_APPEND    = 0xf0,  /* append the operand */
    UC2_OTS_OP_PREPEND   = 0xf1,  /* prepend the operand */

    /* Other unary ops. */
    UC2_OTS_OP_REVERSE   = 0xf2,  /* deprecated */
    UC2_OTS_OP_HEXLIFY   = 0xf3,

    /* Branch and attestation markers. */
    UC2_OTS_ATTESTATION  = 0x00,
    UC2_OTS_BRANCH       = 0xff
};
|
||||
|
||||
/* Header magic that opens a .ots proof file, its length in bytes (the
 * literal contains embedded NULs, so strlen() must not be used on it),
 * and the proof format version. */
#define UC2_OTS_HEADER_MAGIC \
    "\x00OpenTimestamps\x00\x00Proof\x00\xbf\x89\xe2\xe8\x84\xe8\x92\x94"
#define UC2_OTS_HEADER_MAGIC_LEN 31
#define UC2_OTS_VERSION 0x01

/* Attestation tags; every tag is UC2_OTS_TAG_LEN (8) bytes long. */
#define UC2_OTS_TAG_PENDING  "\x83\xdf\xe3\x0d\x2e\xf9\x0c\x8e"
#define UC2_OTS_TAG_BITCOIN  "\x05\x88\x96\x0d\x73\xd7\x19\x01"
#define UC2_OTS_TAG_LITECOIN "\x06\x86\x9a\x0d\x73\xd7\x1b\x45"
#define UC2_OTS_TAG_LEN 8

/* Hard ceilings that bound parser cost on hostile input. */
#define UC2_OTS_MAX_DIGEST_LEN 64
#define UC2_OTS_MAX_VARBYTES 8192
#define UC2_OTS_MAX_DEPTH 32
#define UC2_OTS_MAX_VARINT 0xffffffffu
|
||||
|
||||
/* Status codes for the uc2_ots_* functions: UC2_OTS_OK is zero and every
 * error is a distinct negative value. */
enum {
    UC2_OTS_OK               =  0,
    UC2_OTS_ERR_TRUNCATED    = -1,
    UC2_OTS_ERR_NONCANONICAL = -2,
    UC2_OTS_ERR_OVERFLOW     = -3,
    UC2_OTS_ERR_BAD_MAGIC    = -4,
    UC2_OTS_ERR_BAD_VERSION  = -5,
    UC2_OTS_ERR_BAD_HASH_OP  = -6,
    UC2_OTS_ERR_DEPTH        = -7,
    UC2_OTS_ERR_TOO_LARGE    = -8,
    UC2_OTS_ERR_BAD_OP       = -9
};
|
||||
|
||||
/* Outcomes uc2_ots_walk reports when the proof parses successfully
 * (all positive, so they cannot be confused with the negative error codes). */
enum {
    /* The leaf reaches every attestation using supported ops only. */
    UC2_OTS_RESULT_VERIFIED      = 1,
    /* The proof parses cleanly but contains unsupported ops. */
    UC2_OTS_RESULT_STRUCTURAL    = 2,
    /* The shape is fine but the leaf digest does not match the input. */
    UC2_OTS_RESULT_LEAF_MISMATCH = 3
};
|
||||
|
||||
/* Attestation summary callback. Called once per attestation reached.
|
||||
* `digest` is the digest at the leaf where the attestation was emitted.
|
||||
* Return non-zero to abort the walk.
|
||||
*
|
||||
* Note: the digest is only meaningful when uc2_ots_walk returns
|
||||
* UC2_OTS_RESULT_VERIFIED. When the walker returns
|
||||
* UC2_OTS_RESULT_STRUCTURAL the proof contains unsupported unary ops
|
||||
* (SHA1, RIPEMD160, KECCAK256, REVERSE, HEXLIFY) which leave the digest
|
||||
* unchanged for structural traversal; the digest passed to the callback
|
||||
* does not represent the cryptographic state at that leaf. */
|
||||
typedef int (*uc2_ots_attest_cb)(void *ctx,
|
||||
const uint8_t *tag /* 8 bytes */,
|
||||
const uint8_t *payload, size_t payload_len,
|
||||
const uint8_t *digest, size_t digest_len);
|
||||
|
||||
/* OTS varint codec. *out_value is set on success; *consumed is the
|
||||
* number of input bytes read. */
|
||||
int uc2_ots_varint_decode(const uint8_t *in, size_t in_len,
|
||||
uint64_t *out_value, size_t *consumed);
|
||||
size_t uc2_ots_varint_encode(uint64_t value, uint8_t out[10]);
|
||||
|
||||
/* Parse the .ots file envelope (header magic + version + file-hash op +
|
||||
* leaf digest + timestamp body). Sets out_* pointers into the input
|
||||
* buffer; no allocation. */
|
||||
int uc2_ots_parse_file(const uint8_t *file, size_t file_len,
|
||||
uint8_t *out_hash_op,
|
||||
const uint8_t **out_leaf_digest,
|
||||
size_t *out_leaf_digest_len,
|
||||
const uint8_t **out_body,
|
||||
size_t *out_body_len);
|
||||
|
||||
/* Build a .ots file from a leaf digest and a serialized timestamp body.
|
||||
* Returns total bytes written, or a negative error code. */
|
||||
int uc2_ots_serialize_file(uint8_t hash_op,
|
||||
const uint8_t *leaf_digest, size_t leaf_digest_len,
|
||||
const uint8_t *body, size_t body_len,
|
||||
uint8_t *out, size_t out_cap);
|
||||
|
||||
/* Walk a serialized timestamp body from `leaf_digest`, applying ops and
|
||||
* invoking `cb` for each attestation reached. Returns one of
|
||||
* UC2_OTS_RESULT_* on structural success, or a negative error code. */
|
||||
int uc2_ots_walk(const uint8_t *body, size_t body_len,
|
||||
const uint8_t *leaf_digest, size_t leaf_digest_len,
|
||||
uc2_ots_attest_cb cb, void *ctx);
|
||||
|
||||
/* UC2 OTS trailer.
|
||||
*
|
||||
* Layout (all integers little-endian, 32-bit unsigned):
|
||||
*
|
||||
* [archive bytes ...]
|
||||
* "UC2-OTS\0" (8 bytes, front magic)
|
||||
* u32 version (= 1)
|
||||
* u32 archive_len (length of preceding archive bytes)
|
||||
* u32 proof_len
|
||||
* bytes proof (proof_len bytes, raw .ots file)
|
||||
* u32 proof_len (duplicate, for reverse-scan)
|
||||
* "UC2-OTS\0" (8 bytes, back magic)
|
||||
*/
|
||||
|
||||
#define UC2_OTS_TRAILER_MAGIC "UC2-OTS\0"
|
||||
#define UC2_OTS_TRAILER_MAGIC_LEN 8
|
||||
#define UC2_OTS_TRAILER_VERSION 1u
|
||||
#define UC2_OTS_TRAILER_HEAD_LEN (UC2_OTS_TRAILER_MAGIC_LEN + 4 + 4 + 4)
|
||||
#define UC2_OTS_TRAILER_TAIL_LEN (4 + UC2_OTS_TRAILER_MAGIC_LEN)
|
||||
#define UC2_OTS_TRAILER_OVERHEAD (UC2_OTS_TRAILER_HEAD_LEN + UC2_OTS_TRAILER_TAIL_LEN)
|
||||
#define UC2_OTS_TRAILER_MAX_PROOF (1u << 20)
|
||||
|
||||
/* Build a trailer for an existing archive of length archive_len.
|
||||
* Writes [front magic | version | archive_len | proof_len | proof | proof_len | back magic]
|
||||
* to out. Returns total bytes written, or negative on error. */
|
||||
int uc2_ots_trailer_build(uint32_t archive_len,
|
||||
const uint8_t *proof, size_t proof_len,
|
||||
uint8_t *out, size_t out_cap);
|
||||
|
||||
/* Read a trailer from the end of a file image. On success sets
|
||||
* *out_archive_len = length of preceding archive (the SHA-256 region)
|
||||
* *out_proof, *out_proof_len = pointer/length of proof inside `file`
|
||||
* Returns:
|
||||
* UC2_OTS_OK if a well-formed trailer is present,
|
||||
* 1 if no trailer (back magic absent),
|
||||
* negative error code if the back magic is present but the trailer is malformed. */
|
||||
int uc2_ots_trailer_parse(const uint8_t *file, size_t file_len,
|
||||
uint32_t *out_archive_len,
|
||||
const uint8_t **out_proof, size_t *out_proof_len);
|
||||
|
||||
/* Convenience: get a human-readable name for a known attestation tag,
|
||||
* or NULL if unknown. */
|
||||
const char *uc2_ots_attest_name(const uint8_t tag[UC2_OTS_TAG_LEN]);
|
||||
|
||||
#endif
|
||||
67
lib/include/uc2/uc2_preprocess.h
Normal file
67
lib/include/uc2/uc2_preprocess.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Content-aware preprocessing filters for improved compression.
|
||||
*
|
||||
* These transforms are applied BEFORE compression to expose redundancy
|
||||
* that LZ77+entropy coding can exploit more efficiently. Each filter
|
||||
* is reversible (apply/revert) and content-type specific.
|
||||
*
|
||||
* Filters:
|
||||
* BCJ — x86 branch/call/jump address normalization (E8/E9 transform)
|
||||
* BWT — Burrows-Wheeler transform for text (groups similar contexts)
|
||||
* Delta — byte-wise delta encoding for structured/tabular data
|
||||
*/
|
||||
|
||||
#ifndef UC2_PREPROCESS_H
|
||||
#define UC2_PREPROCESS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* --- BCJ (Branch/Call/Jump) filter for x86 executables --- */
|
||||
|
||||
/* Convert relative x86 CALL/JMP addresses to absolute.
|
||||
* This makes the same function called from different locations produce
|
||||
* identical byte sequences, improving LZ77 matching.
|
||||
* Operates in-place. Returns 0 on success. */
|
||||
int uc2_bcj_apply(uint8_t *data, size_t len);
|
||||
|
||||
/* Revert BCJ transform (absolute → relative). */
|
||||
int uc2_bcj_revert(uint8_t *data, size_t len);
|
||||
|
||||
/* --- BWT (Burrows-Wheeler Transform) for text --- */
|
||||
|
||||
/* Apply BWT to data. Allocates *out (caller must free).
|
||||
* Sets *primary_index to the BWT primary index (needed for revert).
|
||||
* Returns 0 on success. */
|
||||
int uc2_bwt_apply(const uint8_t *data, size_t len,
|
||||
uint8_t **out, uint32_t *primary_index);
|
||||
|
||||
/* Revert BWT. Allocates *out (caller must free).
|
||||
* Returns 0 on success. */
|
||||
int uc2_bwt_revert(const uint8_t *data, size_t len,
|
||||
uint32_t primary_index, uint8_t **out);
|
||||
|
||||
/* --- Delta filter for structured data --- */
|
||||
|
||||
/* Apply byte-wise delta encoding (each byte = current - previous).
|
||||
* Operates in-place. Stride controls the delta distance (1 = adjacent
|
||||
* bytes, 2 = every other byte, etc.). Stride 1 is best for sequential
|
||||
* data; stride 2+ for interleaved multi-channel data. */
|
||||
void uc2_delta_filter_apply(uint8_t *data, size_t len, int stride);
|
||||
|
||||
/* Revert byte-wise delta encoding. Operates in-place. */
|
||||
void uc2_delta_filter_revert(uint8_t *data, size_t len, int stride);
|
||||
|
||||
/* --- Content detection --- */
|
||||
|
||||
/* Detect likely content type for automatic filter selection.
|
||||
* Returns one of the UC2_CONTENT_* constants. */
|
||||
#define UC2_CONTENT_BINARY 0 /* generic binary / unknown */
|
||||
#define UC2_CONTENT_TEXT 1 /* text (high ASCII printable ratio) */
|
||||
#define UC2_CONTENT_X86 2 /* x86 executable (MZ/PE/ELF header) */
|
||||
#define UC2_CONTENT_STRUCT 3 /* structured/tabular (regular patterns) */
|
||||
|
||||
int uc2_detect_content(const uint8_t *data, size_t len);
|
||||
|
||||
#endif
|
||||
92
lib/include/uc2/uc2_rans.h
Normal file
92
lib/include/uc2/uc2_rans.h
Normal file
@@ -0,0 +1,92 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* rANS (range Asymmetric Numeral Systems) entropy coder.
|
||||
*
|
||||
* Drop-in replacement for Huffman coding with ~5-15% better compression
|
||||
* on skewed distributions. Uses table-based rANS with 32-bit state
|
||||
* and frequencies normalized to a power of 2.
|
||||
*
|
||||
* Usage:
|
||||
* struct uc2_rans_table tab;
|
||||
* uc2_rans_build_table(&tab, raw_freq, nsym);
|
||||
*
|
||||
* struct uc2_rans_enc enc;
|
||||
* uc2_rans_enc_init(&enc, &tab);
|
||||
* for each symbol, last to first: uc2_rans_encode(&enc, sym);
|
||||
* size_t out_len = uc2_rans_enc_finish(&enc, &out_buf); (caller frees out_buf)
|
||||
* uc2_rans_enc_free(&enc);
|
||||
*
|
||||
* struct uc2_rans_dec dec;
|
||||
* uc2_rans_dec_init(&dec, &tab, in_buf, in_len);
|
||||
* for each symbol: int sym = uc2_rans_decode(&dec);
|
||||
*/
|
||||
|
||||
#ifndef UC2_RANS_H
|
||||
#define UC2_RANS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Frequency table precision: frequencies sum to 1 << PROB_BITS */
|
||||
#define UC2_RANS_PROB_BITS 12
|
||||
#define UC2_RANS_PROB_SCALE (1 << UC2_RANS_PROB_BITS)
|
||||
|
||||
/* Maximum symbols supported */
|
||||
#define UC2_RANS_MAX_SYMS 344
|
||||
|
||||
/* Normalized frequency table. */
|
||||
struct uc2_rans_table {
|
||||
uint16_t freq[UC2_RANS_MAX_SYMS]; /* normalized frequencies */
|
||||
uint16_t cumfreq[UC2_RANS_MAX_SYMS]; /* cumulative frequencies */
|
||||
int nsym;
|
||||
};
|
||||
|
||||
/* Build normalized frequency table from raw counts.
|
||||
* Frequencies are scaled to sum to UC2_RANS_PROB_SCALE. */
|
||||
void uc2_rans_build_table(struct uc2_rans_table *tab,
|
||||
const uint32_t *raw_freq, int nsym);
|
||||
|
||||
/* --- Encoder --- */
|
||||
|
||||
struct uc2_rans_enc {
|
||||
uint32_t state;
|
||||
const struct uc2_rans_table *tab;
|
||||
/* Reverse buffer: rANS encodes in reverse order */
|
||||
uint8_t *rev_buf;
|
||||
size_t rev_pos;
|
||||
size_t rev_cap;
|
||||
};
|
||||
|
||||
/* Initialize encoder. */
|
||||
void uc2_rans_enc_init(struct uc2_rans_enc *enc,
|
||||
const struct uc2_rans_table *tab);
|
||||
|
||||
/* Encode one symbol. Symbols must be encoded in REVERSE order
|
||||
* (last symbol first). Use uc2_rans_enc_finish to finalize. */
|
||||
void uc2_rans_encode(struct uc2_rans_enc *enc, int sym);
|
||||
|
||||
/* Finalize encoding: write state and return the compressed data.
|
||||
* Caller must free *out_data. Returns compressed size. */
|
||||
size_t uc2_rans_enc_finish(struct uc2_rans_enc *enc,
|
||||
uint8_t **out_data);
|
||||
|
||||
/* Free encoder resources. */
|
||||
void uc2_rans_enc_free(struct uc2_rans_enc *enc);
|
||||
|
||||
/* --- Decoder --- */
|
||||
|
||||
struct uc2_rans_dec {
|
||||
uint32_t state;
|
||||
const struct uc2_rans_table *tab;
|
||||
const uint8_t *data;
|
||||
size_t pos;
|
||||
size_t len;
|
||||
/* Reverse lookup: cumfreq → symbol (for fast decoding) */
|
||||
uint16_t lookup[UC2_RANS_PROB_SCALE];
|
||||
};
|
||||
|
||||
/* Initialize decoder from compressed data. */
|
||||
void uc2_rans_dec_init(struct uc2_rans_dec *dec,
|
||||
const struct uc2_rans_table *tab,
|
||||
const uint8_t *data, size_t len);
|
||||
|
||||
/* Decode one symbol. */
|
||||
int uc2_rans_decode(struct uc2_rans_dec *dec);
|
||||
|
||||
#endif
|
||||
29
lib/include/uc2/uc2_sha256.h
Normal file
29
lib/include/uc2/uc2_sha256.h
Normal file
@@ -0,0 +1,29 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* SHA-256 (FIPS 180-4) -- pure C implementation.
|
||||
*
|
||||
* Used by the OpenTimestamps integration; calendars accept SHA-256
|
||||
* digests as proof leaves. */
|
||||
|
||||
#ifndef UC2_SHA256_H
|
||||
#define UC2_SHA256_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define UC2_SHA256_OUT_LEN 32
|
||||
#define UC2_SHA256_BLOCK_LEN 64
|
||||
|
||||
struct uc2_sha256 {
|
||||
uint32_t state[8];
|
||||
uint64_t bitcount;
|
||||
uint8_t buf[UC2_SHA256_BLOCK_LEN];
|
||||
size_t buf_len;
|
||||
};
|
||||
|
||||
void uc2_sha256_init(struct uc2_sha256 *ctx);
|
||||
void uc2_sha256_update(struct uc2_sha256 *ctx, const void *data, size_t len);
|
||||
void uc2_sha256_final(struct uc2_sha256 *ctx, uint8_t out[UC2_SHA256_OUT_LEN]);
|
||||
void uc2_sha256_hash(const void *data, size_t len, uint8_t out[UC2_SHA256_OUT_LEN]);
|
||||
|
||||
#endif
|
||||
44
lib/include/uc2/uc2_simhash.h
Normal file
44
lib/include/uc2/uc2_simhash.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Near-duplicate detection via SimHash.
|
||||
*
|
||||
* SimHash produces a fixed-size fingerprint where similar documents
|
||||
* have fingerprints with small Hamming distance. Two files are
|
||||
* "near-duplicates" if their SimHash fingerprints differ in fewer
|
||||
* than a threshold number of bits.
|
||||
*
|
||||
* This detects patched executables, slightly edited documents, and
|
||||
* minor revisions — cases where CDC chunks might not align but the
|
||||
* overall content is structurally similar.
|
||||
*
|
||||
* Usage:
|
||||
* uint64_t h1 = uc2_simhash(data1, len1);
|
||||
* uint64_t h2 = uc2_simhash(data2, len2);
|
||||
* int dist = uc2_hamming(h1, h2);
|
||||
* if (dist <= 10) // near-duplicates
|
||||
*/
|
||||
|
||||
#ifndef UC2_SIMHASH_H
|
||||
#define UC2_SIMHASH_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Compute a 64-bit SimHash fingerprint.
|
||||
* Uses 4-byte shingles hashed with FNV-1a, accumulated into a
|
||||
* 64-bit vector where each bit is the majority vote of all
|
||||
* shingle hash bits. */
|
||||
uint64_t uc2_simhash(const uint8_t *data, size_t len);
|
||||
|
||||
/* Hamming distance between two SimHash fingerprints (0-64). */
|
||||
int uc2_hamming(uint64_t a, uint64_t b);
|
||||
|
||||
/* Check if two fingerprints are near-duplicates.
|
||||
* threshold: max Hamming distance (recommended: 6-10 for text,
|
||||
* 3-6 for binary). */
|
||||
static inline int uc2_is_near_dup(uint64_t a, uint64_t b, int threshold)
|
||||
{
|
||||
return uc2_hamming(a, b) <= threshold;
|
||||
}
|
||||
|
||||
#endif
|
||||
1110
lib/src/compress.c
Normal file
1110
lib/src/compress.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,5 @@
|
||||
/* SPDX-License-Identifier: LGPL-3.0-only */
|
||||
|
||||
/* UltraCompressor II decompression library.
|
||||
Copyright © Jan Bobrowski 2020, 2021
|
||||
torinak.com/~jb/unuc2/
|
||||
@@ -11,11 +13,13 @@
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "uc2/libuc2.h"
|
||||
#include "uc2/uc2_rans.h"
|
||||
|
||||
#if !defined NDEBUG && !defined NDIAG
|
||||
#include <stdio.h>
|
||||
@@ -174,7 +178,9 @@ struct range {
|
||||
u8 *ptr, *end;
|
||||
};
|
||||
|
||||
static unsigned range_len(struct range *r) {return (unsigned)(r->end - r->ptr);}
|
||||
/* Defensive: a never-set or stale end (end < ptr) must report an empty
|
||||
range so range_get() refuses rather than handing out wild pointers. */
|
||||
static unsigned range_len(struct range *r) {return r->end > r->ptr ? (unsigned)(r->end - r->ptr) : 0;}
|
||||
|
||||
struct uc2_context {
|
||||
char *message;
|
||||
@@ -308,6 +314,7 @@ struct bits {
|
||||
u32 bits;
|
||||
unsigned have_bits;
|
||||
unsigned head, tail;
|
||||
int err;
|
||||
struct reader *rd;
|
||||
u8 buffer[4 << 10];
|
||||
};
|
||||
@@ -318,6 +325,7 @@ static int bits_init(struct bits *bi, struct reader *rd)
|
||||
bi->tail = 0;
|
||||
bi->bits = 0;
|
||||
bi->have_bits = 0;
|
||||
bi->err = 0;
|
||||
bi->rd = rd;
|
||||
return 0;
|
||||
}
|
||||
@@ -331,16 +339,23 @@ static void bits_skip(struct bits *bi, unsigned n)
|
||||
static int bits_feed(struct bits *bi, unsigned n)
|
||||
{
|
||||
assert(n <= 16);
|
||||
if (bi->err)
|
||||
return bi->err;
|
||||
if (bi->have_bits < n) {
|
||||
unsigned have = bi->tail - bi->head;
|
||||
if (have <= 1) {
|
||||
if (have == 1)
|
||||
bi->buffer[0] = bi->buffer[bi->tail - 1];
|
||||
/* The stream is consumed two bytes at a time; keep reading until
|
||||
at least a full pair is buffered (a reader may legally return
|
||||
short counts, including a single byte). */
|
||||
while (bi->tail - bi->head < 2) {
|
||||
unsigned have = bi->tail - bi->head;
|
||||
if (have && bi->head)
|
||||
bi->buffer[0] = bi->buffer[bi->head];
|
||||
bi->head = 0;
|
||||
bi->tail = have;
|
||||
int r = bi->rd->read(bi->rd->context, bi->buffer + have, sizeof bi->buffer - have);
|
||||
if (r <= 0)
|
||||
return r ? r : UC2_Truncated;
|
||||
bi->head = 0;
|
||||
if (r <= 0) {
|
||||
bi->err = r ? r : UC2_Truncated;
|
||||
return bi->err;
|
||||
}
|
||||
bi->tail += r;
|
||||
}
|
||||
bi->bits = bi->bits << 16 | bi->buffer[bi->head] | bi->buffer[bi->head + 1] << 8;
|
||||
@@ -646,6 +661,30 @@ static int use_master(struct uc2_context *uc2, u8 buffer[65535], u32 id)
|
||||
|
||||
static int cdir_damaged(struct uc2_context *uc2);
|
||||
|
||||
/* Writer for the central-directory decode that also enforces a
|
||||
compression-ratio ceiling. A tiny crafted cdir stream can expand via
|
||||
long matches into tens of megabytes (a decompression bomb), turning a
|
||||
few-hundred-byte archive into a multi-second decode. Abort once the
|
||||
output far outgrows the compressed bytes consumed. */
|
||||
struct cdir_writer {
|
||||
struct range out;
|
||||
struct archive_ctx *src; /* reader context, for bytes consumed */
|
||||
unsigned base; /* src->offset at decode start */
|
||||
unsigned long produced;
|
||||
};
|
||||
|
||||
/* Writer callback used while decoding the central directory.
   `context` is a struct cdir_writer; `ptr`/`size` describe the chunk of
   decompressed output being delivered.
   Adds `size` to the running output total and returns UC2_Damaged once
   that total exceeds 65536 + 64 * (reader offset - base); otherwise the
   chunk is handed to buf_write() on the output range and its result is
   returned. */
static int cdir_write(void *context, const void *ptr, unsigned size)
|
||||
{
|
||||
struct cdir_writer *w = context;
|
||||
/* count the chunk before the ratio check so the check covers it */
w->produced += size;
|
||||
/* reader offset now minus the offset recorded in `base` */
unsigned consumed = w->src->offset - w->base;
|
||||
/* Real cdir metadata compresses well under ~20:1; 64:1 with a
|
||||
64 KiB floor leaves ample headroom while stopping bombs. */
|
||||
if (w->produced > 65536 + 64ul * consumed)
|
||||
return UC2_Damaged;
|
||||
return buf_write(&w->out, ptr, size);
|
||||
}
|
||||
|
||||
static int decompress_cdir(struct uc2_context *uc2, u32 offset, u16 csum)
|
||||
{
|
||||
assert(!uc2->cdir_buf);
|
||||
@@ -673,15 +712,20 @@ static int decompress_cdir(struct uc2_context *uc2, u32 offset, u16 csum)
|
||||
|
||||
struct archive_ctx ar = {.offset = offset, .uc2 = uc2};
|
||||
struct reader rd = {.read = archive_read, .context = &ar};
|
||||
struct range wrctx = {.ptr = uc2->cdir_buf, .end = uc2->cdir_buf + size};
|
||||
struct writer wr = {.write = buf_write, .context = &wrctx};
|
||||
struct cdir_writer wctx = {
|
||||
.out = {uc2->cdir_buf, uc2->cdir_buf + size},
|
||||
.src = &ar, .base = offset
|
||||
};
|
||||
struct writer wr = {.write = cdir_write, .context = &wctx};
|
||||
u16 cs;
|
||||
ret = decompressor(uc2, get16(c.method), &rd, &wr, NoMaster, 100000000, &cs);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto fail;
|
||||
|
||||
if (cs != csum)
|
||||
return cdir_damaged(uc2);
|
||||
if (cs != csum) {
|
||||
ret = cdir_damaged(uc2);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if ((unsigned)ret <= size)
|
||||
break;
|
||||
@@ -691,8 +735,20 @@ static int decompress_cdir(struct uc2_context *uc2, u32 offset, u16 csum)
|
||||
uc2->cdir_buf = u_free(uc2, uc2->cdir_buf);
|
||||
}
|
||||
|
||||
uc2->cdir_range.end = uc2->cdir_buf + size;
|
||||
/* Bound the walk to the bytes actually decompressed, not the
|
||||
allocation. A damaged cdir that passes the 16-bit checksum by
|
||||
chance would otherwise be parsed into uninitialised heap between
|
||||
the real end and the buffer end. */
|
||||
uc2->cdir_range.end = uc2->cdir_buf + (unsigned)ret;
|
||||
return 0;
|
||||
|
||||
/* On error, free cdir_buf and leave it NULL so the invariant
|
||||
"cdir_buf != NULL iff cdir_range is fully valid" holds; otherwise
|
||||
a later uc2_read_cdir / uc2_finish_cdir would walk a range whose
|
||||
end was never set, handing out wild pointers. */
|
||||
fail:
|
||||
uc2->cdir_buf = u_free(uc2, uc2->cdir_buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int start_read(struct uc2_context *uc2);
|
||||
@@ -934,7 +990,10 @@ static int cdir_damaged(struct uc2_context *uc2)
|
||||
struct delta {
|
||||
u8 size;
|
||||
u8 index;
|
||||
u8 val[8];
|
||||
/* size is the delta stride; decompressor() accepts methods up to 49,
|
||||
giving strides up to 10, so val[] must cover that (was [8], which
|
||||
both read out of bounds and mis-decoded strides 9-10). */
|
||||
u8 val[16];
|
||||
};
|
||||
|
||||
static void delta_init(struct delta *db, u8 type)
|
||||
@@ -1000,6 +1059,7 @@ int uc2_extract(
|
||||
/* decompress */
|
||||
|
||||
static int decompressor_ultra(struct uc2_context *uc2, unsigned master, unsigned delta, struct reader *rd, struct writer *wr, unsigned limit, u16 *csum);
|
||||
static int decompressor_rans(struct uc2_context *uc2, unsigned master_id, struct reader *rd, struct writer *wr, unsigned limit, u16 *csum);
|
||||
|
||||
static int decompressor(struct uc2_context *uc2, int method, struct reader *rd, struct writer *wr, unsigned master, unsigned len, u16 *csum)
|
||||
{
|
||||
@@ -1021,6 +1081,8 @@ ultra:
|
||||
} else if (method >= 21 && method <= 29) {
|
||||
delta = 1;
|
||||
goto ultra;
|
||||
} else if (method == 10) {
|
||||
ret = decompressor_rans(uc2, master, rd, wr, len, csum);
|
||||
} else if (method == 80) {
|
||||
uc2->message = "Turbo compression not implemented";
|
||||
ret = UC2_Unimplemented;
|
||||
@@ -1029,11 +1091,146 @@ ultra:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Flush the unwritten window region [*wpos, tail) in ring order.
   Each pass checksums and writes the longest run that does not cross the
   linear end of the 0x10000-byte window, then advances *wpos (wrapping
   as a u16), so a region that wraps is emitted in two pieces.
   Returns 0 once *wpos == tail, or UC2_UserFault if the writer callback
   returns a negative value. */
|
||||
static int rans_flush(struct writer *wr, struct csum *cs, const u8 *buf,
|
||||
u16 *wpos, u16 tail)
|
||||
{
|
||||
while (*wpos != tail) {
|
||||
/* pending byte count, computed modulo 0x10000 */
unsigned n = (u16)(tail - *wpos);
|
||||
/* bytes available before the linear end of the window */
unsigned lin = 0x10000u - *wpos;
|
||||
if (n > lin) n = lin;
|
||||
/* the checksum is updated before the write is attempted */
csum_update(cs, buf + *wpos, n);
|
||||
if (wr->write(wr->context, buf + *wpos, n) < 0)
|
||||
return UC2_UserFault;
|
||||
*wpos = (u16)(*wpos + n);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
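/* rANS decompressor (method 10).
   Stream layout as read by this function: a 1-bit block-present flag;
   for each block a 16-bit symbol count and a 16-bit rANS payload length
   (each sent as two 8-bit fields, high byte first), 344 frequencies of
   12 bits each, then the payload bytes. Decoded symbols below 256 are
   literals; symbols 256..315 select a match distance and are followed by
   a length symbol (offset by 316); extra distance/length bits are taken
   from the bit reader. A distance equal to 64001 ends the block.
   Returns the number of bytes produced (limit minus what was left), or a
   negative UC2_* code; *csum receives the output checksum when csum is
   non-NULL. */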
|
||||
static int decompressor_rans(struct uc2_context *uc2, unsigned master_id,
|
||||
struct reader *rd, struct writer *wr,
|
||||
unsigned limit, u16 *csum)
|
||||
{
|
||||
const unsigned EOB = 64001;
|
||||
int ret;
|
||||
|
||||
u8 *buf = u_alloc(uc2, 65536);
|
||||
if (!buf) return UC2_UserFault;
|
||||
|
||||
ret = use_master(uc2, buf, master_id);
|
||||
if (ret < 0) { u_free(uc2, buf); return ret; }
|
||||
u16 tail = (u16)ret;
|
||||
u16 wpos = tail; /* window position of the next unwritten output byte */
|
||||
/* Bytes written into the 64KB window so far (master fill + output),
|
||||
saturated at the window size. A match distance must not exceed it,
|
||||
else (u16)(tail - dist) would reference unwritten window bytes. */
|
||||
unsigned produced = (unsigned)ret;
|
||||
struct csum cs;
|
||||
csum_init(&cs);
|
||||
unsigned remaining = limit;
|
||||
|
||||
struct bits bi;
|
||||
ret = bits_init(&bi, rd);
|
||||
if (ret < 0) { u_free(uc2, buf); return ret; }
|
||||
|
||||
while (bits_get(&bi, 1) == 1) { /* block-present */
|
||||
unsigned nsyms = (unsigned)(bits_get(&bi, 8) & 0xff) << 8;
|
||||
nsyms |= (unsigned)(bits_get(&bi, 8) & 0xff);
|
||||
unsigned rlen = (unsigned)(bits_get(&bi, 8) & 0xff) << 8;
|
||||
rlen |= (unsigned)(bits_get(&bi, 8) & 0xff);
|
||||
if (bi.err) break;
|
||||
|
||||
u32 freqs[344];
|
||||
for (int i = 0; i < 344; i++)
|
||||
freqs[i] = (u32)(bits_get(&bi, 12) & 0xfff);
|
||||
if (bi.err) break;
|
||||
|
||||
struct uc2_rans_table tab;
|
||||
uc2_rans_build_table(&tab, freqs, 344);
|
||||
|
||||
u8 *rdata = u_alloc(uc2, rlen ? rlen : 1);
|
||||
if (!rdata) { bits_destroy(&bi); u_free(uc2, buf); return UC2_UserFault; }
|
||||
for (unsigned i = 0; i < rlen; i++)
|
||||
rdata[i] = (u8)bits_get(&bi, 8);
|
||||
if (bi.err) { u_free(uc2, rdata); break; }
|
||||
|
||||
struct uc2_rans_dec dec;
|
||||
uc2_rans_dec_init(&dec, &tab, rdata, rlen);
|
||||
|
||||
/* Decode all nsyms symbols, including the trailing EOB pair and
|
||||
its extra bits: stopping at remaining == 0 would leave the bit
|
||||
cursor mid-block and desynchronize the next block-present bit. */
|
||||
for (unsigned s = 0; s < nsyms; s++) {
|
||||
int sym = uc2_rans_decode(&dec);
|
||||
if (sym < 256) {
|
||||
if (remaining) {
|
||||
buf[tail++] = (u8)sym;
|
||||
remaining--;
|
||||
if (produced < 65536) produced++;
|
||||
if ((u16)(tail - wpos) >= 0x8000) {
|
||||
ret = rans_flush(wr, &cs, buf, &wpos, tail);
|
||||
if (ret < 0) { bi.err = ret; break; }
|
||||
}
|
||||
}
|
||||
} else if (sym < 316) {
|
||||
int ds = sym - 256;
|
||||
unsigned dist = (ds < 15) ? ds + 1 :
|
||||
(ds < 30) ? (ds-15+1)*16 + (bits_get(&bi, 4) & 0xf) :
|
||||
(ds < 45) ? (ds-30+1)*256 + (bits_get(&bi, 8) & 0xff) :
|
||||
(ds-45+1)*4096 + (bits_get(&bi, 12) & 0xfff);
|
||||
if (bi.err) break;
|
||||
if (dist == EOB) { s++; if (s < nsyms) uc2_rans_decode(&dec); break; }
|
||||
s++;
|
||||
if (s >= nsyms) break;
|
||||
int ls = uc2_rans_decode(&dec) - 316;
|
||||
if (ls < 0) ls = 0;
|
||||
unsigned length = (ls < 8) ? ls + 3 :
|
||||
(ls < 16) ? (ls-8)*2+11+(bits_get(&bi,1) & 0x1) :
|
||||
(ls < 24) ? (ls-16)*8+27+(bits_get(&bi,3) & 0x7) :
|
||||
(ls == 24) ? 91+(bits_get(&bi,6) & 0x3f) :
|
||||
(ls == 25) ? 155+(bits_get(&bi,9) & 0x1ff) :
|
||||
(ls == 26) ? 667+(bits_get(&bi,11) & 0x7ff) :
|
||||
2715+(bits_get(&bi,15) & 0x7fff);
|
||||
if (bi.err) break;
|
||||
if (dist > produced) { bi.err = UC2_Damaged; break; }
|
||||
for (unsigned j = 0; j < length && remaining > 0; j++) {
|
||||
buf[tail] = buf[(u16)(tail - dist)];
|
||||
tail++; remaining--;
|
||||
if (produced < 65536) produced++;
|
||||
if ((u16)(tail - wpos) >= 0x8000) {
|
||||
ret = rans_flush(wr, &cs, buf, &wpos, tail);
|
||||
if (ret < 0) { bi.err = ret; break; }
|
||||
}
|
||||
}
|
||||
if (bi.err) break;
|
||||
}
|
||||
}
|
||||
u_free(uc2, rdata);
|
||||
if (bi.err) break;
|
||||
}
|
||||
if (bi.err) {
|
||||
bits_destroy(&bi);
|
||||
u_free(uc2, buf);
|
||||
return bi.err;
|
||||
}
|
||||
|
||||
/* Flush remaining output */
|
||||
ret = rans_flush(wr, &cs, buf, &wpos, tail);
|
||||
if (ret < 0) { bits_destroy(&bi); u_free(uc2, buf); return ret; }
|
||||
|
||||
bits_destroy(&bi);
|
||||
u_free(uc2, buf);
|
||||
if (csum) *csum = csum_get(&cs);
|
||||
return limit - remaining;
|
||||
}
|
||||
|
||||
/* cbuf */
|
||||
|
||||
struct cbuffer {
|
||||
u16 head, tail;
|
||||
unsigned limit;
|
||||
unsigned produced; /* bytes written to the window (master + output), <= 0x10000 */
|
||||
struct csum csum;
|
||||
u8 data[0x10000];
|
||||
};
|
||||
@@ -1117,7 +1314,7 @@ enum {
|
||||
NumLenCodes = NumDeltaCodes + NumExtraCodes,
|
||||
};
|
||||
|
||||
const u8 vval[NumDeltaCodes][NumDeltaCodes] = {
|
||||
static const u8 vval[NumDeltaCodes][NumDeltaCodes] = {
|
||||
{ 0,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1},
|
||||
{ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0},
|
||||
{ 2, 1, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0},
|
||||
@@ -1205,6 +1402,8 @@ static int ht_dec(u8 lengths[NumSymbols], struct dcinfo *dc, struct bits *bi, u3
|
||||
if (c < 0)
|
||||
return c;
|
||||
int n = c + MinRepeat - 1;
|
||||
if (n > (int)(syme - symp))
|
||||
return UC2_Damaged; /* malformed tree overruns stream[] */
|
||||
for (; n > 0; n--)
|
||||
*symp++ = val;
|
||||
} else {
|
||||
@@ -1301,6 +1500,7 @@ static int decompressor_ultra(struct uc2_context *uc2, unsigned master, unsigned
|
||||
goto ret;
|
||||
ultra->cb.limit = limit;
|
||||
ultra->cb.head = ultra->cb.tail = ret;
|
||||
ultra->cb.produced = ret;
|
||||
csum_init(&ultra->cb.csum);
|
||||
|
||||
u8 *dbuf = 0;
|
||||
@@ -1396,9 +1596,10 @@ static int decompress_block(struct ultra *ultra)
|
||||
int c = huff(ultra->bd_table, &ultra->bi);
|
||||
if (c < 0)
|
||||
return c;
|
||||
if (!(c & 1<<16))
|
||||
if (!(c & 1<<16)) {
|
||||
ultra->cb.data[ultra->cb.tail++] = (u8)c;
|
||||
else {
|
||||
if (ultra->cb.produced < 65536) ultra->cb.produced++;
|
||||
} else {
|
||||
unsigned dist = c & 0xffff;
|
||||
c = c >> 20 & 0xf;
|
||||
if (c)
|
||||
@@ -1415,10 +1616,24 @@ static int decompress_block(struct ultra *ultra)
|
||||
c = c >> 20 & 0xf;
|
||||
if (c)
|
||||
len += bits_get(&ultra->bi, c);
|
||||
assert(cbuf_space(&ultra->cb) >= len);
|
||||
/* On valid data the loop guard below keeps len within the
|
||||
window (<= 35482 <= cbuf_space at block entry). A
|
||||
corrupt or truncated stream can underflow len (a short
|
||||
bits_get returns negative); the original assert caught
|
||||
that only in debug builds, so NDEBUG would let the copy
|
||||
overrun cb.data. Bail cleanly instead -- the checksum
|
||||
path then reports the damage. */
|
||||
if (len > cbuf_space(&ultra->cb))
|
||||
return UC2_Damaged;
|
||||
/* dist must reference already-written history; a too-large
|
||||
dist (or a negative bits_get above wrapping it huge) would
|
||||
read unwritten/uninitialised window bytes into the output. */
|
||||
if (dist == 0 || dist > ultra->cb.produced)
|
||||
return UC2_Damaged;
|
||||
do {
|
||||
ultra->cb.data[ultra->cb.tail] = ultra->cb.data[(u16)(ultra->cb.tail - dist)];
|
||||
ultra->cb.tail++;
|
||||
if (ultra->cb.produced < 65536) ultra->cb.produced++;
|
||||
} while (--len);
|
||||
}
|
||||
|
||||
@@ -1519,3 +1734,26 @@ const char *uc2_message(struct uc2_context *uc2, int ret)
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Allocator shims for the temporary context opened by
   uc2_get_supermaster(): thin wrappers over malloc()/free() that ignore
   the context argument. */
|
||||
static void *sm_alloc(void *ctx, unsigned size) { (void)ctx; return malloc(size); }
|
||||
static void sm_free(void *ctx, void *ptr) { (void)ctx; free(ptr); }
|
||||
|
||||
/* Decompress the built-in SuperMaster (49152 bytes) into caller's buffer.
   buf      - destination; must have room for at least 49152 bytes.
   buf_size - capacity of buf; values below 49152 are rejected.
   Returns 49152 on success, UC2_UserFault if buf_size is too small or
   the temporary context cannot be opened, otherwise the negative UC2_*
   code returned by resolve_master(). The temporary context is closed
   before returning. */
int uc2_get_supermaster(void *buf, unsigned buf_size)
|
||||
{
|
||||
if (buf_size < 49152)
|
||||
return UC2_UserFault;
|
||||
|
||||
/* only the allocator callbacks are supplied; the second uc2_open
   argument is NULL */
struct uc2_io io = { .alloc = sm_alloc, .free = sm_free };
|
||||
struct uc2_context *uc2 = uc2_open(&io, NULL);
|
||||
if (!uc2)
|
||||
return UC2_UserFault;
|
||||
|
||||
int ret = resolve_master(uc2, SuperMaster);
|
||||
/* copy out only when resolve_master() reported success */
if (ret >= 0)
|
||||
memcpy(buf, uc2->supermaster, 49152);
|
||||
|
||||
uc2_close(uc2);
|
||||
return ret < 0 ? ret : 49152;
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
/* SPDX-License-Identifier: LGPL-3.0-only */
|
||||
|
||||
/* list.h by Jan Bobrowski. Inspired by list.h from Linux */
|
||||
|
||||
#ifndef LIST_H
|
||||
|
||||
275
lib/src/uc2_blake3.c
Normal file
275
lib/src/uc2_blake3.c
Normal file
@@ -0,0 +1,275 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* BLAKE3 cryptographic hashing — simplified single-threaded implementation.
|
||||
*
|
||||
* Based on the BLAKE3 specification (github.com/BLAKE3-team/BLAKE3).
|
||||
* Uses the BLAKE2s round function with Bao tree structure.
|
||||
*
|
||||
* This implementation handles the common case (single chunk, sequential
|
||||
* hashing) and supports the tree structure for inputs > 1024 bytes. */
|
||||
|
||||
#include "uc2/uc2_blake3.h"
|
||||
#include <string.h>
|
||||
|
||||
/* BLAKE3 IV (same as BLAKE2s) */
|
||||
static const uint32_t IV[8] = {
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
||||
};
|
||||
|
||||
/* Flags */
|
||||
enum {
|
||||
CHUNK_START = 1 << 0,
|
||||
CHUNK_END = 1 << 1,
|
||||
PARENT = 1 << 2,
|
||||
ROOT = 1 << 3,
|
||||
};
|
||||
|
||||
/* Message schedule (BLAKE3 permutation) */
|
||||
static const uint8_t MSG_SCHEDULE[7][16] = {
|
||||
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
|
||||
{2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8},
|
||||
{3,4,10,12,13,2,7,14,6,5,9,0,11,15,8,1},
|
||||
{10,7,12,9,14,3,13,15,4,0,11,2,5,8,1,6},
|
||||
{12,13,9,11,15,10,14,8,7,2,5,3,0,1,6,4},
|
||||
{9,14,11,5,8,12,15,1,13,3,0,10,2,6,4,7},
|
||||
{11,15,5,0,1,9,8,6,14,10,2,12,3,4,7,13},
|
||||
};
|
||||
|
||||
static uint32_t rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
|
||||
|
||||
static void g(uint32_t *s, int a, int b, int c, int d, uint32_t mx, uint32_t my)
|
||||
{
|
||||
s[a] = s[a] + s[b] + mx; s[d] = rotr(s[d] ^ s[a], 16);
|
||||
s[c] = s[c] + s[d]; s[b] = rotr(s[b] ^ s[c], 12);
|
||||
s[a] = s[a] + s[b] + my; s[d] = rotr(s[d] ^ s[a], 8);
|
||||
s[c] = s[c] + s[d]; s[b] = rotr(s[b] ^ s[c], 7);
|
||||
}
|
||||
|
||||
static void round_fn(uint32_t *s, const uint32_t *m)
|
||||
{
|
||||
g(s,0,4, 8,12,m[0],m[1]); g(s,1,5, 9,13,m[2],m[3]);
|
||||
g(s,2,6,10,14,m[4],m[5]); g(s,3,7,11,15,m[6],m[7]);
|
||||
g(s,0,5,10,15,m[8],m[9]); g(s,1,6,11,12,m[10],m[11]);
|
||||
g(s,2,7, 8,13,m[12],m[13]); g(s,3,4,9,14,m[14],m[15]);
|
||||
}
|
||||
|
||||
static void compress(const uint32_t cv[8], const uint8_t block[64],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint32_t out[16])
|
||||
{
|
||||
uint32_t m[16];
|
||||
for (int i = 0; i < 16; i++)
|
||||
m[i] = (uint32_t)block[i*4] | ((uint32_t)block[i*4+1]<<8) |
|
||||
((uint32_t)block[i*4+2]<<16) | ((uint32_t)block[i*4+3]<<24);
|
||||
|
||||
uint32_t s[16] = {
|
||||
cv[0],cv[1],cv[2],cv[3],cv[4],cv[5],cv[6],cv[7],
|
||||
IV[0],IV[1],IV[2],IV[3],
|
||||
(uint32_t)counter, (uint32_t)(counter>>32),
|
||||
block_len, flags
|
||||
};
|
||||
|
||||
for (int r = 0; r < 7; r++) {
|
||||
uint32_t pm[16];
|
||||
for (int i = 0; i < 16; i++) pm[i] = m[MSG_SCHEDULE[r][i]];
|
||||
round_fn(s, pm);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 8; i++) out[i] = s[i] ^ s[i+8];
|
||||
for (int i = 8; i < 16; i++) out[i] = s[i] ^ cv[i-8];
|
||||
}
|
||||
|
||||
/* Extract a chaining value: copy the first eight words of a compress()
 * result into cv. */
static void cv_from_out(const uint32_t out[16], uint32_t cv[8])
{
    int word = 0;

    while (word < 8) {
        cv[word] = out[word];
        word++;
    }
}
|
||||
|
||||
/* Process one block within a chunk */
|
||||
static void chunk_block(struct uc2_blake3 *ctx, const uint8_t block[64],
|
||||
uint8_t block_len, uint8_t extra_flags)
|
||||
{
|
||||
uint8_t flags = ctx->flags | extra_flags;
|
||||
if (ctx->blocks_compressed == 0) flags |= CHUNK_START;
|
||||
|
||||
uint32_t out[16];
|
||||
compress(ctx->cv, block, block_len, ctx->chunk_counter, flags, out);
|
||||
cv_from_out(out, ctx->cv);
|
||||
ctx->blocks_compressed++;
|
||||
}
|
||||
|
||||
/* Finalize a chunk: compress the last block with CHUNK_END */
|
||||
static void chunk_finalize(struct uc2_blake3 *ctx, uint32_t cv_out[8])
|
||||
{
|
||||
uint8_t flags = ctx->flags | CHUNK_END;
|
||||
if (ctx->blocks_compressed == 0) flags |= CHUNK_START;
|
||||
|
||||
uint8_t block[64];
|
||||
memset(block, 0, 64);
|
||||
memcpy(block, ctx->buf, ctx->buf_len);
|
||||
|
||||
uint32_t out[16];
|
||||
compress(ctx->cv, block, ctx->buf_len, ctx->chunk_counter, flags, out);
|
||||
cv_from_out(out, cv_out);
|
||||
}
|
||||
|
||||
/* Merge two chaining values as a parent node */
|
||||
static void parent_cv(const uint32_t left[8], const uint32_t right[8],
|
||||
uint32_t out_cv[8])
|
||||
{
|
||||
uint8_t block[64];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
block[i*4] = (uint8_t)(left[i]);
|
||||
block[i*4+1] = (uint8_t)(left[i]>>8);
|
||||
block[i*4+2] = (uint8_t)(left[i]>>16);
|
||||
block[i*4+3] = (uint8_t)(left[i]>>24);
|
||||
}
|
||||
for (int i = 0; i < 8; i++) {
|
||||
block[32+i*4] = (uint8_t)(right[i]);
|
||||
block[32+i*4+1] = (uint8_t)(right[i]>>8);
|
||||
block[32+i*4+2] = (uint8_t)(right[i]>>16);
|
||||
block[32+i*4+3] = (uint8_t)(right[i]>>24);
|
||||
}
|
||||
uint32_t out[16];
|
||||
compress(IV, block, 64, 0, PARENT, out);
|
||||
cv_from_out(out, out_cv);
|
||||
}
|
||||
|
||||
static void push_cv(struct uc2_blake3 *ctx, const uint32_t cv[8])
|
||||
{
|
||||
/* Merge with stack entries that have matching tree levels */
|
||||
uint32_t new_cv[8];
|
||||
memcpy(new_cv, cv, 32);
|
||||
uint64_t total = ctx->chunk_counter;
|
||||
while (total & 1) {
|
||||
ctx->cv_stack_len--;
|
||||
parent_cv(&ctx->cv_stack[ctx->cv_stack_len * 8], new_cv, new_cv);
|
||||
total >>= 1;
|
||||
}
|
||||
memcpy(&ctx->cv_stack[ctx->cv_stack_len * 8], new_cv, 32);
|
||||
ctx->cv_stack_len++;
|
||||
}
|
||||
|
||||
void uc2_blake3_init(struct uc2_blake3 *ctx)
|
||||
{
|
||||
memset(ctx, 0, sizeof *ctx);
|
||||
memcpy(ctx->cv, IV, 32);
|
||||
}
|
||||
|
||||
void uc2_blake3_update(struct uc2_blake3 *ctx, const void *data, size_t len)
|
||||
{
|
||||
const uint8_t *p = data;
|
||||
while (len > 0) {
|
||||
/* If buffer has a full block, process it */
|
||||
if (ctx->buf_len == 64) {
|
||||
chunk_block(ctx, ctx->buf, 64, 0);
|
||||
ctx->buf_len = 0;
|
||||
|
||||
/* If we've filled a full chunk (1024 bytes = 16 blocks),
|
||||
finalize this chunk and start a new one */
|
||||
if (ctx->blocks_compressed == 16) {
|
||||
/* This was the 16th block; we need to finalize with the
|
||||
PREVIOUS block as the last, and this leftover starts a
|
||||
new chunk. Actually, we process blocks as they come
|
||||
and finalize when the chunk is complete. */
|
||||
}
|
||||
}
|
||||
|
||||
/* Check if we're at a chunk boundary */
|
||||
size_t chunk_bytes = (size_t)ctx->blocks_compressed * 64 + ctx->buf_len;
|
||||
if (chunk_bytes >= UC2_BLAKE3_CHUNK_LEN && ctx->buf_len == 0 &&
|
||||
ctx->blocks_compressed > 0) {
|
||||
/* Finalize current chunk — but we've already processed all
|
||||
blocks. The last block was a full block, so re-compress
|
||||
it with CHUNK_END. */
|
||||
/* Start new chunk */
|
||||
uint32_t chunk_cv[8];
|
||||
/* Recompute final block with CHUNK_END */
|
||||
chunk_finalize(ctx, chunk_cv);
|
||||
push_cv(ctx, chunk_cv);
|
||||
|
||||
ctx->chunk_counter++;
|
||||
memcpy(ctx->cv, IV, 32);
|
||||
ctx->blocks_compressed = 0;
|
||||
ctx->flags = 0;
|
||||
}
|
||||
|
||||
size_t take = 64 - ctx->buf_len;
|
||||
if (take > len) take = len;
|
||||
memcpy(ctx->buf + ctx->buf_len, p, take);
|
||||
ctx->buf_len += (uint8_t)take;
|
||||
p += take;
|
||||
len -= take;
|
||||
}
|
||||
}
|
||||
|
||||
void uc2_blake3_final(const struct uc2_blake3 *ctx, uint8_t out[UC2_BLAKE3_OUT_LEN])
|
||||
{
|
||||
/* Finalize current chunk */
|
||||
uint32_t chunk_cv[8];
|
||||
struct uc2_blake3 tmp = *ctx;
|
||||
|
||||
/* If this is the only chunk, it gets ROOT flag */
|
||||
if (tmp.chunk_counter == 0 && tmp.cv_stack_len == 0) {
|
||||
uint8_t flags = tmp.flags | CHUNK_START | CHUNK_END | ROOT;
|
||||
uint8_t block[64];
|
||||
memset(block, 0, 64);
|
||||
memcpy(block, tmp.buf, tmp.buf_len);
|
||||
uint32_t result[16];
|
||||
compress(tmp.cv, block, tmp.buf_len, 0, flags, result);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out[i*4] = (uint8_t)(result[i]);
|
||||
out[i*4+1] = (uint8_t)(result[i]>>8);
|
||||
out[i*4+2] = (uint8_t)(result[i]>>16);
|
||||
out[i*4+3] = (uint8_t)(result[i]>>24);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Multi-chunk: finalize current chunk */
|
||||
chunk_finalize(&tmp, chunk_cv);
|
||||
|
||||
/* Merge with stack */
|
||||
uint32_t cv[8];
|
||||
memcpy(cv, chunk_cv, 32);
|
||||
for (int i = (int)tmp.cv_stack_len - 1; i >= 0; i--) {
|
||||
uint32_t merged[8];
|
||||
parent_cv(&tmp.cv_stack[i * 8], cv, merged);
|
||||
memcpy(cv, merged, 32);
|
||||
}
|
||||
|
||||
/* Output with ROOT flag */
|
||||
uint8_t block[64];
|
||||
memset(block, 0, 64);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
block[i*4] = (uint8_t)(cv[i]);
|
||||
block[i*4+1] = (uint8_t)(cv[i]>>8);
|
||||
block[i*4+2] = (uint8_t)(cv[i]>>16);
|
||||
block[i*4+3] = (uint8_t)(cv[i]>>24);
|
||||
}
|
||||
uint32_t result[16];
|
||||
compress(IV, block, 32, 0, PARENT | ROOT, result);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out[i*4] = (uint8_t)(result[i]);
|
||||
out[i*4+1] = (uint8_t)(result[i]>>8);
|
||||
out[i*4+2] = (uint8_t)(result[i]>>16);
|
||||
out[i*4+3] = (uint8_t)(result[i]>>24);
|
||||
}
|
||||
}
|
||||
|
||||
void uc2_blake3_hash(const void *data, size_t len, uint8_t out[UC2_BLAKE3_OUT_LEN])
|
||||
{
|
||||
struct uc2_blake3 ctx;
|
||||
uc2_blake3_init(&ctx);
|
||||
uc2_blake3_update(&ctx, data, len);
|
||||
uc2_blake3_final(&ctx, out);
|
||||
}
|
||||
|
||||
int uc2_blake3_equal(const uint8_t a[UC2_BLAKE3_OUT_LEN],
|
||||
const uint8_t b[UC2_BLAKE3_OUT_LEN])
|
||||
{
|
||||
uint8_t diff = 0;
|
||||
for (int i = 0; i < UC2_BLAKE3_OUT_LEN; i++)
|
||||
diff |= a[i] ^ b[i];
|
||||
return diff == 0;
|
||||
}
|
||||
105
lib/src/uc2_blockstore.c
Normal file
105
lib/src/uc2_blockstore.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Cross-archive block store for content-addressable deduplication.
|
||||
*
|
||||
* Chunks are stored as individual files named by their 64-bit hash
|
||||
* (hex encoded). This is simple and portable -- no database needed.
|
||||
* For large stores, a two-level directory structure (first 2 hex chars
|
||||
* as subdirectory) prevents filesystem performance issues. */
|
||||
|
||||
#include "uc2/uc2_blockstore.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <errno.h>
|
||||
|
||||
static void hash_path(const struct uc2_blockstore *bs, uint64_t hash,
|
||||
char *buf, size_t buf_size)
|
||||
{
|
||||
/* Two-level: store/AB/ABCDEF0123456789 */
|
||||
snprintf(buf, buf_size, "%s/%02x/%016llx",
|
||||
bs->path, (unsigned)(hash >> 56) & 0xFF,
|
||||
(unsigned long long)hash);
|
||||
}
|
||||
|
||||
static void ensure_subdir(const struct uc2_blockstore *bs, uint64_t hash)
|
||||
{
|
||||
char dir[4096];
|
||||
snprintf(dir, sizeof dir, "%s/%02x",
|
||||
bs->path, (unsigned)(hash >> 56) & 0xFF);
|
||||
mkdir(dir, 0755);
|
||||
}
|
||||
|
||||
int uc2_blockstore_open(struct uc2_blockstore *bs, const char *path)
|
||||
{
|
||||
memset(bs, 0, sizeof *bs);
|
||||
bs->path = strdup(path);
|
||||
if (!bs->path) return -1;
|
||||
|
||||
/* Create store directory if it doesn't exist */
|
||||
if (mkdir(path, 0755) < 0 && errno != EEXIST) {
|
||||
free(bs->path);
|
||||
bs->path = NULL;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns 1 when a file for the given chunk hash exists in the store
 * (stat succeeds on its path), 0 otherwise. */
int uc2_blockstore_has(const struct uc2_blockstore *bs, uint64_t hash)
{
    char chunk_file[4096];
    struct stat info;

    hash_path(bs, hash, chunk_file, sizeof chunk_file);
    if (stat(chunk_file, &info) != 0)
        return 0;
    return 1;
}
|
||||
|
||||
int uc2_blockstore_ingest(struct uc2_blockstore *bs,
|
||||
const struct uc2_merkle *tree,
|
||||
const uint8_t *data, size_t len)
|
||||
{
|
||||
int new_chunks = 0;
|
||||
for (int i = 0; i < tree->nchunks; i++) {
|
||||
uint64_t h = tree->chunks[i].hash;
|
||||
uint32_t off = tree->chunks[i].offset;
|
||||
uint32_t clen = tree->chunks[i].length;
|
||||
|
||||
if (off > len || clen > len - off) continue; /* overflow-safe */
|
||||
|
||||
if (uc2_blockstore_has(bs, h)) {
|
||||
bs->saved_bytes += clen;
|
||||
continue;
|
||||
}
|
||||
|
||||
ensure_subdir(bs, h);
|
||||
char fpath[4096];
|
||||
hash_path(bs, h, fpath, sizeof fpath);
|
||||
FILE *f = fopen(fpath, "wb");
|
||||
if (!f) continue;
|
||||
fwrite(data + off, 1, clen, f);
|
||||
fclose(f);
|
||||
|
||||
bs->nblocks++;
|
||||
bs->total_bytes += clen;
|
||||
new_chunks++;
|
||||
}
|
||||
return new_chunks;
|
||||
}
|
||||
|
||||
/* Read the chunk stored under hash into buf (at most buf_size bytes).
 *
 * Returns the number of bytes read, or -1 when the chunk file cannot be
 * opened or the read fails.  A stored chunk larger than buf_size is
 * returned truncated to buf_size bytes. */
int uc2_blockstore_read(const struct uc2_blockstore *bs,
                        uint64_t hash, uint8_t *buf, size_t buf_size)
{
    char fpath[4096];
    hash_path(bs, hash, fpath, sizeof fpath);

    FILE *f = fopen(fpath, "rb");
    if (!f) return -1;

    size_t got = fread(buf, 1, buf_size, f);
    int read_failed = ferror(f);   /* distinguish I/O error from short file */
    fclose(f);

    if (read_failed) return -1;
    return (int)got;
}
|
||||
|
||||
void uc2_blockstore_close(struct uc2_blockstore *bs)
|
||||
{
|
||||
free(bs->path);
|
||||
memset(bs, 0, sizeof *bs);
|
||||
}
|
||||
124
lib/src/uc2_cdc.c
Normal file
124
lib/src/uc2_cdc.c
Normal file
@@ -0,0 +1,124 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Content-defined chunking (CDC) for UC2 deduplication.
|
||||
*
|
||||
* Gear hash: each byte updates the hash by shifting left and adding
|
||||
* a pre-computed random table entry. This gives uniform
|
||||
* distribution and O(1) per-byte cost. A chunk boundary is detected
|
||||
* when (hash & mask) == 0, giving an average chunk size of 2^bits.
|
||||
*
|
||||
* Reference: "A Framework for Analyzing and Improving Content-Based
|
||||
* Chunking Algorithms" (Eshghi & Tang, HP Labs, 2005); the gear hash itself is described by Xia et al. ("Ddelta", 2014).
|
||||
*/
|
||||
|
||||
#include "uc2/uc2_cdc.h"
|
||||
#include <string.h>
|
||||
|
||||
/* Gear hash lookup table: 256 random 32-bit values.
|
||||
Generated from a PRNG seeded with the string "UC2 Gear CDC". */
|
||||
static const uint32_t gear_table[256] = {
|
||||
0x5c27b2e4, 0x8a3b9c01, 0xf7e52d9f, 0x3d14a867, 0xc6f893b2, 0x91d047e5, 0x2e6b1fa8, 0xe4a37c63,
|
||||
0x7f582b1d, 0xb90c64f6, 0x46d1e823, 0x13a95f7b, 0xd87e24c9, 0xa5430168, 0x6c9fb3d4, 0x028e7a1f,
|
||||
0xfb614d93, 0x3742c856, 0x84b50fea, 0xc1d6937e, 0x590a2eb1, 0xaef41c67, 0x67c385d2, 0x0dbf694a,
|
||||
0xe2984513, 0x76ab3dc8, 0x4517e29f, 0xb86a0c54, 0x1e23f7b6, 0xd3c58e41, 0x8a71b02d, 0xf09d43e8,
|
||||
0x2b06d175, 0x9f48a623, 0xc3e71bdf, 0x54b2f906, 0x1d65c48a, 0xe83a074b, 0x72196ed3, 0xa4de8b17,
|
||||
0x3fac5264, 0xd10738b9, 0x6ec4a1f5, 0x8593d642, 0x4a7f1d8e, 0xf6b2e071, 0x2748bc3a, 0xc981459d,
|
||||
0x50f37e26, 0xbe269ac3, 0x13da4587, 0x9c07b1f4, 0x614ed368, 0xa7bc2f15, 0xd4f56c89, 0x38a19047,
|
||||
0x876cb5e2, 0xe53d48ab, 0x42801d76, 0xfc17a93c, 0x0b9e62d1, 0x7654cf08, 0xcda37b94, 0x19e80e5f,
|
||||
0xab3c91d7, 0x6271f4a6, 0xd8bf2843, 0x3506de71, 0xf94a637b, 0x8ed5b02c, 0x471c89e5, 0x0a63d4f9,
|
||||
0xc4982e17, 0x7db15a8c, 0x12ef4360, 0xb637c9a5, 0x5f740ed8, 0xe1a8b524, 0x28c96f13, 0x93014876,
|
||||
0xdae27b9d, 0x3d8f15c2, 0x815ca04e, 0xf47e6d39, 0x4b93d2f7, 0xa620be81, 0x69d7014a, 0xc5b4f836,
|
||||
0x1c486aeb, 0x70a5931d, 0xef12dc64, 0x8279b508, 0xb6c34a9f, 0x57e82173, 0x0a1f7dc6, 0xde64c952,
|
||||
0x43b0a819, 0xad5e37e4, 0x6897cb71, 0xf1240f9c, 0x342bc6a5, 0x9d1852e8, 0xc7fa9b34, 0x586d4e07,
|
||||
0xb2a1d3f6, 0x2536ec89, 0x7ecb1047, 0xe408a5bd, 0x0f957e62, 0xd3ca81a0, 0x917f2d14, 0xfa42b6d9,
|
||||
0x45d968b3, 0xbbe50c37, 0x1274f1e5, 0x6a9e3db8, 0xcf538241, 0x87a0c96f, 0x5eb75423, 0x31dc0fa7,
|
||||
0xa41b63c4, 0xd96fae58, 0x4cd2891e, 0xf5863072, 0x0b17e4a6, 0x7c60bd9d, 0xe39845c1, 0xb85e2f17,
|
||||
0x21a37689, 0x9e4fc153, 0xd702dba4, 0x5384e96f, 0xaf51067c, 0x64c83db1, 0xc2e7f548, 0x3a198c24,
|
||||
0xf06b47d2, 0x85d2a19e, 0x4f3e5c63, 0x19c78b07, 0xe6a402db, 0x7b59d3f4, 0xbd146ea5, 0x0e82c917,
|
||||
0xc3f01b76, 0x5da564a9, 0x32b9f852, 0xa847201c, 0x6e9cb7e3, 0x81635d38, 0x470ad1bf, 0xfc718946,
|
||||
0x16ce3fa2, 0x9ab045e7, 0xd52c6814, 0x43f9bc79, 0xb8e213a6, 0x2f174e51, 0x657d90cd, 0xcda4f738,
|
||||
0x0198269b, 0x7e3cdb54, 0xe26f8013, 0x39c154e7, 0xa45db39c, 0xd792e841, 0x58067f2b, 0xb3adc466,
|
||||
0x1b41a5d0, 0x76e83917, 0xcf250b74, 0x84b7d2a8, 0x4dc69e53, 0xf01a47bf, 0x28f361c4, 0x93758c19,
|
||||
0xe5c24037, 0x3a8ef956, 0x7e51b682, 0xc107da4f, 0x5269031d, 0xad84c7e6, 0x6eb3589a, 0x0f4ea143,
|
||||
0xd8356fd7, 0x417c9e2b, 0xba20d364, 0x25f745a8, 0xf6c11e79, 0x7db8a30c, 0x830f52b4, 0x49617cd9,
|
||||
0x1cda0e63, 0xa7b23148, 0xde46c5f2, 0x63895db7, 0xb21ea481, 0x574c6f0e, 0x0a8392c5, 0xc5f7b84a,
|
||||
0x380e41d6, 0xed72d923, 0x91c5a687, 0x4a19f054, 0xf4a83b19, 0x673d8ec2, 0xbce1470b, 0x01567da4,
|
||||
0xd8abc196, 0x2490534e, 0x7de7bf83, 0xc3348217, 0x5f629ed5, 0xa6b70468, 0x1c43d7a9, 0x89f56b30,
|
||||
0x4508cfe1, 0xf27a1694, 0xb81e5d47, 0x05a9c3ba, 0xdac28f62, 0x61b740d5, 0x9e3f254c, 0x37d4a8e1,
|
||||
0x8b612c97, 0xc419f035, 0x5d8e7ba6, 0xa2f3d14c, 0x16458db9, 0xeb27c673, 0x70da0e28, 0xbf9c53e4,
|
||||
0x42a1679f, 0xde38b102, 0x95c42f56, 0x037bd8a1, 0xfc1645ed, 0x69ea9cb3, 0xad5f0374, 0x3487e1c9,
|
||||
0xc0b29d15, 0x5e617a48, 0x8714c6bf, 0x1da93273, 0xf2d5e804, 0x764b5f96, 0xab86031d, 0x41c8b4e2,
|
||||
0xd53a6927, 0x0f91dc83, 0xe8450b5a, 0x72f7a1c6, 0xbc234d90, 0x2dbe7641, 0x960cf5bd, 0x5b618a49,
|
||||
};
|
||||
|
||||
uint32_t uc2_gear_hash(const uint8_t *data, size_t len)
|
||||
{
|
||||
uint32_t h = 0;
|
||||
for (size_t i = 0; i < len; i++)
|
||||
h = (h << 1) + gear_table[data[i]];
|
||||
return h;
|
||||
}
|
||||
|
||||
void uc2_chunker_init(struct uc2_chunker *c, int bits,
|
||||
size_t min_chunk, size_t max_chunk)
|
||||
{
|
||||
if (bits < 8) bits = 8;
|
||||
if (bits > 20) bits = 20;
|
||||
c->mask = ((uint32_t)1 << bits) - 1;
|
||||
c->min_chunk = min_chunk ? min_chunk : ((size_t)1 << (bits - 2));
|
||||
c->max_chunk = max_chunk ? max_chunk : ((size_t)1 << (bits + 2));
|
||||
c->pos = 0;
|
||||
}
|
||||
|
||||
void uc2_chunker_reset(struct uc2_chunker *c)
|
||||
{
|
||||
c->pos = 0;
|
||||
}
|
||||
|
||||
int uc2_chunker_next(struct uc2_chunker *c,
|
||||
const uint8_t *data, size_t len,
|
||||
size_t *chunk_off, size_t *chunk_len)
|
||||
{
|
||||
if (c->pos >= len)
|
||||
return 0;
|
||||
|
||||
size_t start = c->pos;
|
||||
size_t end = start + c->max_chunk;
|
||||
if (end > len) end = len;
|
||||
|
||||
/* Skip minimum chunk size before checking boundaries */
|
||||
size_t scan = start + c->min_chunk;
|
||||
if (scan > end) scan = end;
|
||||
|
||||
uint32_t h = 0;
|
||||
/* Prime the hash over the min_chunk prefix */
|
||||
for (size_t i = start; i < scan; i++)
|
||||
h = (h << 1) + gear_table[data[i]];
|
||||
|
||||
/* Scan for boundary: (hash & mask) == 0 */
|
||||
for (size_t i = scan; i < end; i++) {
|
||||
h = (h << 1) + gear_table[data[i]];
|
||||
if ((h & c->mask) == 0) {
|
||||
*chunk_off = start;
|
||||
*chunk_len = i + 1 - start;
|
||||
c->pos = i + 1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* No boundary found: emit max_chunk or remaining data */
|
||||
*chunk_off = start;
|
||||
*chunk_len = end - start;
|
||||
c->pos = end;
|
||||
return (c->pos < len) ? 1 : 0;
|
||||
}
|
||||
|
||||
/* 32-bit FNV-1a: start from the offset basis 2166136261, and for each
 * byte XOR it in and then multiply by the prime 16777619 (mod 2^32).
 * Returns the offset basis for an empty buffer. */
uint32_t uc2_fnv1a(const uint8_t *data, size_t len)
{
    const uint32_t prime = 16777619u;
    uint32_t hash = 2166136261u;
    size_t idx = 0;

    while (idx < len) {
        hash = (hash ^ data[idx]) * prime;
        idx++;
    }
    return hash;
}
|
||||
209
lib/src/uc2_delta.c
Normal file
209
lib/src/uc2_delta.c
Normal file
@@ -0,0 +1,209 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Delta compression for file versioning.
|
||||
*
|
||||
* Uses a hash-based matching approach: hash all 4-byte windows in the
|
||||
* source, then scan the target looking for matching regions. Matched
|
||||
* regions become COPY instructions, unmatched regions become INSERT.
|
||||
*
|
||||
* This is a simplified version of the vcdiff/xdelta algorithm. */
|
||||
|
||||
#include "uc2/uc2_delta.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define HASH_SIZE 65536
|
||||
#define WINDOW 4
|
||||
#define MIN_MATCH 8
|
||||
|
||||
/* Window fingerprint: the four bytes at p read as one little-endian
 * 32-bit word (p[0] is the least significant byte). */
static uint32_t roll_hash(const uint8_t *p)
{
    uint32_t word = 0;

    for (int k = 3; k >= 0; k--)
        word = (word << 8) | p[k];
    return word;
}
|
||||
|
||||
/* Store v at p as four bytes, least significant byte first. */
static void put32(uint8_t *p, uint32_t v)
{
    for (int k = 0; k < 4; k++)
        p[k] = (uint8_t)((v >> (8 * k)) & 0xFF);
}
|
||||
|
||||
/* Load a 32-bit value stored at p least significant byte first. */
static uint32_t get32(const uint8_t *p)
{
    uint32_t value = 0;

    for (int k = 0; k < 4; k++)
        value |= (uint32_t)p[k] << (8 * k);
    return value;
}
|
||||
|
||||
/* Growable output buffer: data holds cap allocated bytes, of which the
 * first len are in use.  A zero-initialized struct is a valid empty
 * buffer; data is released by the owner with free(). */
struct obuf {
    uint8_t *data;
    size_t len;
    size_t cap;
};

/* Append len bytes from data to the buffer, growing it as needed.
 *
 * Capacity starts at 256 bytes and doubles until the new contents fit.
 * Returns 0 on success and -1 if the required size is not representable
 * in size_t or the allocation fails; on failure the buffer is left
 * unchanged.  Appending zero bytes always succeeds and does not touch
 * data (so a NULL source is acceptable in that case). */
static int obuf_append(struct obuf *o, const void *data, size_t len)
{
    if (len == 0)
        return 0;
    if (len > SIZE_MAX - o->len)
        return -1;                     /* o->len + len would wrap */

    const size_t needed = o->len + len;

    if (needed > o->cap) {
        size_t newcap = o->cap ? o->cap : 256;

        while (newcap < needed) {
            if (newcap > SIZE_MAX / 2) {
                newcap = needed;       /* doubling would wrap: take exact size */
                break;
            }
            newcap *= 2;
        }

        uint8_t *grown = realloc(o->data, newcap);
        if (!grown)
            return -1;                 /* o->data is still valid */
        o->data = grown;
        o->cap = newcap;
    }

    memcpy(o->data + o->len, data, len);
    o->len = needed;
    return 0;
}
|
||||
|
||||
/* Append a COPY instruction: opcode 0x01, then the source offset and the
 * length, each as a 32-bit little-endian value (9 bytes in total).
 * Returns the result of obuf_append(). */
static int emit_copy(struct obuf *o, uint32_t offset, uint32_t length)
{
    uint8_t record[9];

    record[0] = 0x01;
    put32(&record[1], offset);
    put32(&record[5], length);

    return obuf_append(o, record, 9);
}
|
||||
|
||||
/* Append an INSERT instruction: opcode 0x02, the literal length as a
 * 32-bit little-endian value, then the literal bytes themselves.
 * Returns -1 if the 5-byte header cannot be appended, otherwise the
 * result of appending the payload. */
static int emit_insert(struct obuf *o, const uint8_t *data, uint32_t length)
{
    uint8_t header[5];
    int header_rc;

    header[0] = 0x02;
    put32(&header[1], length);

    header_rc = obuf_append(o, header, 5);
    if (header_rc < 0)
        return -1;

    return obuf_append(o, data, length);
}
|
||||
|
||||
int uc2_delta_encode(const uint8_t *src, size_t src_len,
|
||||
const uint8_t *tgt, size_t tgt_len,
|
||||
uint8_t **out_delta, size_t *out_delta_len)
|
||||
{
|
||||
*out_delta = NULL;
|
||||
*out_delta_len = 0;
|
||||
|
||||
/* Build hash table of source positions */
|
||||
int32_t *htab = calloc(HASH_SIZE, sizeof(int32_t));
|
||||
if (!htab) return -1;
|
||||
for (size_t i = 0; i < HASH_SIZE; i++) htab[i] = -1;
|
||||
|
||||
if (src_len >= WINDOW) {
|
||||
for (size_t i = 0; i <= src_len - WINDOW; i++) {
|
||||
uint32_t h = roll_hash(src + i) & (HASH_SIZE - 1);
|
||||
htab[h] = (int32_t)i; /* last occurrence wins */
|
||||
}
|
||||
}
|
||||
|
||||
struct obuf out = {0};
|
||||
|
||||
/* Header */
|
||||
uint8_t hdr[8] = {'U','C','2','D', 0,0,0,0};
|
||||
put32(hdr + 4, (uint32_t)tgt_len);
|
||||
obuf_append(&out, hdr, 8);
|
||||
|
||||
/* Scan target, emit COPY or INSERT */
|
||||
size_t tpos = 0;
|
||||
size_t insert_start = 0;
|
||||
int in_insert = 0;
|
||||
|
||||
while (tpos + WINDOW <= tgt_len) {
|
||||
uint32_t h = roll_hash(tgt + tpos) & (HASH_SIZE - 1);
|
||||
int32_t spos = htab[h];
|
||||
|
||||
if (spos >= 0 && (size_t)spos + WINDOW <= src_len &&
|
||||
memcmp(src + spos, tgt + tpos, WINDOW) == 0) {
|
||||
/* Extend match forward */
|
||||
size_t match_len = WINDOW;
|
||||
while (tpos + match_len < tgt_len &&
|
||||
(size_t)spos + match_len < src_len &&
|
||||
src[spos + match_len] == tgt[tpos + match_len])
|
||||
match_len++;
|
||||
|
||||
if (match_len >= MIN_MATCH) {
|
||||
/* Flush pending insert */
|
||||
if (in_insert && tpos > insert_start)
|
||||
emit_insert(&out, tgt + insert_start,
|
||||
(uint32_t)(tpos - insert_start));
|
||||
in_insert = 0;
|
||||
|
||||
emit_copy(&out, (uint32_t)spos, (uint32_t)match_len);
|
||||
tpos += match_len;
|
||||
insert_start = tpos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!in_insert) {
|
||||
insert_start = tpos;
|
||||
in_insert = 1;
|
||||
}
|
||||
tpos++;
|
||||
}
|
||||
|
||||
/* Trailing bytes */
|
||||
size_t trailing = tgt_len - tpos;
|
||||
if (in_insert || trailing > 0) {
|
||||
size_t ins_start = in_insert ? insert_start : tpos;
|
||||
size_t ins_len = tgt_len - ins_start;
|
||||
if (ins_len > 0)
|
||||
emit_insert(&out, tgt + ins_start, (uint32_t)ins_len);
|
||||
}
|
||||
|
||||
/* End marker */
|
||||
uint8_t end = 0x00;
|
||||
obuf_append(&out, &end, 1);
|
||||
|
||||
free(htab);
|
||||
*out_delta = out.data;
|
||||
*out_delta_len = out.len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Rebuild a target from src and a delta produced by uc2_delta_encode().
 *
 * The delta must start with "UC2D" and a 32-bit little-endian target
 * length, followed by instructions: 0x01 COPY (offset, length into src),
 * 0x02 INSERT (length, literal bytes), 0x00 END.
 *
 * Every instruction is bounds-checked against src, the delta and the
 * declared target length; the checks are written as subtractions so they
 * cannot wrap even where size_t is 32 bits wide.  The reconstructed size
 * must equal the declared target length, otherwise the delta is rejected.
 *
 * On success returns 0 and hands a malloc'ed buffer to the caller through
 * out_tgt / out_tgt_len (release with free()).  Returns -1 on a malformed
 * delta or allocation failure, with *out_tgt NULL and *out_tgt_len 0. */
int uc2_delta_apply(const uint8_t *src, size_t src_len,
                    const uint8_t *delta, size_t delta_len,
                    uint8_t **out_tgt, size_t *out_tgt_len)
{
    *out_tgt = NULL;
    *out_tgt_len = 0;

    if (delta_len < 9 || memcmp(delta, "UC2D", 4) != 0)
        return -1;

    uint32_t tgt_len = get32(delta + 4);
    /* malloc(0) is implementation-defined; ensure at least one byte
     * so the returned pointer is canonical and free()-safe. */
    uint8_t *tgt = malloc(tgt_len ? tgt_len : 1);
    if (!tgt) return -1;

    size_t dpos = 8;   /* after header */
    size_t tpos = 0;   /* invariant: tpos <= tgt_len */

    while (dpos < delta_len) {
        uint8_t op = delta[dpos++];   /* dpos <= delta_len afterwards */
        if (op == 0x00) break;        /* END */

        if (op == 0x01) {             /* COPY */
            if (delta_len - dpos < 8) goto err;
            uint32_t offset = get32(delta + dpos); dpos += 4;
            uint32_t length = get32(delta + dpos); dpos += 4;
            if (offset > src_len || length > src_len - offset) goto err;
            if (length > tgt_len - tpos) goto err;
            if (length > 0)
                memcpy(tgt + tpos, src + offset, length);
            tpos += length;
        } else if (op == 0x02) {      /* INSERT */
            if (delta_len - dpos < 4) goto err;
            uint32_t length = get32(delta + dpos); dpos += 4;
            if (length > delta_len - dpos) goto err;
            if (length > tgt_len - tpos) goto err;
            if (length > 0)
                memcpy(tgt + tpos, delta + dpos, length);
            dpos += length;
            tpos += length;
        } else goto err;
    }

    /* The header promised tgt_len bytes; anything else is a damaged delta. */
    if (tpos != tgt_len) goto err;

    *out_tgt = tgt;
    *out_tgt_len = tpos;
    return 0;

err:
    free(tgt);
    return -1;
}
|
||||
85
lib/src/uc2_dict.c
Normal file
85
lib/src/uc2_dict.c
Normal file
@@ -0,0 +1,85 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Dictionary management for zstd-inspired dictionary compression. */
|
||||
|
||||
#include "uc2/uc2_dict.h"
|
||||
#include "uc2/uc2_merkle.h"
|
||||
#include "uc2/uc2_cdc.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Serialization helpers (little-endian, alignment-safe) */
|
||||
/* Write v to p as four bytes in little-endian order. */
static void put32(uint8_t *p, uint32_t v)
{
    for (int k = 0; k < 4; k++)
        p[k] = (uint8_t)(v >> (k * 8));
}
|
||||
/* Write v to p as eight bytes in little-endian order. */
static void put64(uint8_t *p, uint64_t v)
{
    int k = 0;

    while (k < 8) {
        p[k] = (uint8_t)(v >> (k * 8));
        k++;
    }
}
|
||||
/* Read a little-endian 32-bit value from the four bytes at p. */
static uint32_t get32(const uint8_t *p)
{
    uint32_t value = 0;

    for (int k = 3; k >= 0; k--)
        value = (value << 8) | p[k];
    return value;
}
|
||||
/* Read a little-endian 64-bit value from the eight bytes at p. */
static uint64_t get64(const uint8_t *p)
{
    uint64_t value = 0;

    for (int k = 0; k < 8; k++)
        value |= (uint64_t)p[k] << (k * 8);
    return value;
}
|
||||
|
||||
int uc2_dict_create(struct uc2_dict *dict, const uint8_t *data, size_t size)
|
||||
{
|
||||
memset(dict, 0, sizeof *dict);
|
||||
if (!data || size == 0) return -1;
|
||||
dict->data = malloc(size);
|
||||
if (!dict->data) return -1;
|
||||
memcpy(dict->data, data, size);
|
||||
dict->size = (uint32_t)size;
|
||||
dict->id = uc2_hash64(data, size);
|
||||
dict->checksum = uc2_fnv1a(data, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int uc2_dict_verify(const struct uc2_dict *dict)
|
||||
{
|
||||
if (!dict->data || dict->size == 0) return 0;
|
||||
return uc2_fnv1a(dict->data, dict->size) == dict->checksum;
|
||||
}
|
||||
|
||||
/* Serialized format: magic(4) + id(8) + checksum(4) + size(4) + reserved(4) = 24 bytes */
|
||||
#define HDR_SIZE 24
|
||||
|
||||
size_t uc2_dict_serialize(const struct uc2_dict *dict, uint8_t **out)
|
||||
{
|
||||
if (dict->size > (1u << 30)) { *out = NULL; return 0; } /* sane cap; no wrap */
|
||||
size_t total = HDR_SIZE + dict->size;
|
||||
uint8_t *buf = malloc(total);
|
||||
if (!buf) { *out = NULL; return 0; }
|
||||
put32(buf, UC2_DICT_MAGIC);
|
||||
put64(buf + 4, dict->id);
|
||||
put32(buf + 12, dict->checksum);
|
||||
put32(buf + 16, dict->size);
|
||||
put32(buf + 20, 0);
|
||||
memcpy(buf + HDR_SIZE, dict->data, dict->size);
|
||||
*out = buf;
|
||||
return total;
|
||||
}
|
||||
|
||||
int uc2_dict_deserialize(struct uc2_dict *dict, const uint8_t *buf, size_t len)
|
||||
{
|
||||
memset(dict, 0, sizeof *dict);
|
||||
if (len < HDR_SIZE) return -1;
|
||||
if (get32(buf) != UC2_DICT_MAGIC) return -1;
|
||||
uint32_t size = get32(buf + 16);
|
||||
if (HDR_SIZE + size > len) return -1;
|
||||
dict->id = get64(buf + 4);
|
||||
dict->checksum = get32(buf + 12);
|
||||
dict->size = size;
|
||||
dict->data = malloc(size);
|
||||
if (!dict->data) return -1;
|
||||
memcpy(dict->data, buf + HDR_SIZE, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void uc2_dict_free(struct uc2_dict *dict)
|
||||
{
|
||||
free(dict->data);
|
||||
memset(dict, 0, sizeof *dict);
|
||||
}
|
||||
443
lib/src/uc2_ingest.c
Normal file
443
lib/src/uc2_ingest.c
Normal file
@@ -0,0 +1,443 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
#include "uc2/uc2_ingest.h"
|
||||
#include "uc2/uc2_blockstore.h"
|
||||
#include "uc2/uc2_merkle.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static const char INGEST_MAGIC[8] = { 'U','C','2','I','N','G','S','T' };
|
||||
#define INGEST_VERSION_V1 1
|
||||
#define INGEST_VERSION_V2 2
|
||||
#define DEFAULT_CDC_BITS 13
|
||||
|
||||
#define ENTRY_SIZE_V1 12u /* 8B hash + 4B length */
|
||||
#define ENTRY_SIZE_V2 16u /* 8B hash + 4B length + 4B offset */
|
||||
#define HEADER_SIZE 16u
|
||||
|
||||
/* Return a newly malloc'ed string holding archive_path with ".blocks"
 * appended, or NULL if the allocation fails.  The caller frees it. */
static char *make_blocks_path(const char *archive_path)
{
    static const char suffix[] = ".blocks";   /* sizeof includes the NUL */
    const size_t base_len = strlen(archive_path);
    char *result = malloc(base_len + sizeof suffix);

    if (result == NULL)
        return NULL;

    memcpy(result, archive_path, base_len);
    memcpy(result + base_len, suffix, sizeof suffix);
    return result;
}
|
||||
|
||||
/* Store v at p as four little-endian bytes. */
static void put_le32(uint8_t *p, uint32_t v)
{
    for (int k = 0; k < 4; k++)
        p[k] = (uint8_t)(v >> (8 * k));
}
|
||||
|
||||
/* Store v at p as eight little-endian bytes. */
static void put_le64(uint8_t *p, uint64_t v)
{
    uint8_t *end = p + 8;

    while (p != end) {
        *p++ = (uint8_t)v;
        v >>= 8;
    }
}
|
||||
|
||||
/* Load a 32-bit little-endian value from the four bytes at p. */
static uint32_t get_le32(const uint8_t *p)
{
    uint32_t value = 0;

    for (int k = 0; k < 4; k++)
        value |= (uint32_t)p[k] << (8 * k);
    return value;
}
|
||||
|
||||
/* Load a 64-bit little-endian value from the eight bytes at p. */
static uint64_t get_le64(const uint8_t *p)
{
    uint64_t value = 0;

    for (int k = 7; k >= 0; k--)
        value = (value << 8) | p[k];
    return value;
}
|
||||
|
||||
/* Open-addressing hash map with linear probing, mapping a chunk hash to
 * the file offset of that chunk.  The writer uses it so that repeated
 * occurrences of one hash share a single stored copy.
 *
 * keys and offsets are parallel arrays of cap slots; a key of 0 marks an
 * empty slot.  len counts the occupied slots. */
struct dedup_map {
    uint64_t *keys;      /* slot keys; 0 = empty */
    uint32_t *offsets;   /* archive offset for the key in the same slot */
    int cap;             /* number of slots */
    int len;             /* number of occupied slots */
};
|
||||
|
||||
static int dedup_map_init(struct dedup_map *m, int initial_cap)
|
||||
{
|
||||
/* Round up to power of two; mask-based probing requires it. */
|
||||
int cap = 16;
|
||||
while (cap < initial_cap) cap *= 2;
|
||||
m->keys = calloc((size_t)cap, sizeof *m->keys);
|
||||
m->offsets = calloc((size_t)cap, sizeof *m->offsets);
|
||||
if (!m->keys || !m->offsets) {
|
||||
free(m->keys); free(m->offsets);
|
||||
m->keys = NULL; m->offsets = NULL;
|
||||
return -1;
|
||||
}
|
||||
m->cap = cap;
|
||||
m->len = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void dedup_map_free(struct dedup_map *m)
|
||||
{
|
||||
free(m->keys);
|
||||
free(m->offsets);
|
||||
m->keys = NULL;
|
||||
m->offsets = NULL;
|
||||
}
|
||||
|
||||
static int dedup_map_grow(struct dedup_map *m);
|
||||
|
||||
/* Look up hash in map. If present, return its offset via *out_off
|
||||
* and return 1. Else return 0 (caller may insert). */
|
||||
static int dedup_map_get(const struct dedup_map *m, uint64_t hash,
|
||||
uint32_t *out_off)
|
||||
{
|
||||
if (m->cap == 0) return 0;
|
||||
uint64_t mask = (uint64_t)m->cap - 1;
|
||||
uint64_t i = hash & mask;
|
||||
for (int probe = 0; probe < m->cap; probe++) {
|
||||
if (m->keys[i] == 0) return 0;
|
||||
if (m->keys[i] == hash) {
|
||||
*out_off = m->offsets[i];
|
||||
return 1;
|
||||
}
|
||||
i = (i + 1) & mask;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dedup_map_put(struct dedup_map *m, uint64_t hash, uint32_t off)
|
||||
{
|
||||
if (hash == 0) hash = 1; /* sentinel collision: shift to 1 */
|
||||
if ((m->len + 1) * 2 > m->cap) {
|
||||
if (dedup_map_grow(m) != 0) return -1;
|
||||
}
|
||||
uint64_t mask = (uint64_t)m->cap - 1;
|
||||
uint64_t i = hash & mask;
|
||||
while (m->keys[i] != 0) {
|
||||
if (m->keys[i] == hash) return 0; /* already inserted */
|
||||
i = (i + 1) & mask;
|
||||
}
|
||||
m->keys[i] = hash;
|
||||
m->offsets[i] = off;
|
||||
m->len++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dedup_map_grow(struct dedup_map *m)
|
||||
{
|
||||
int ncap = m->cap ? m->cap * 2 : 16;
|
||||
uint64_t *nkeys = calloc((size_t)ncap, sizeof *nkeys);
|
||||
uint32_t *noffs = calloc((size_t)ncap, sizeof *noffs);
|
||||
if (!nkeys || !noffs) {
|
||||
free(nkeys); free(noffs);
|
||||
return -1;
|
||||
}
|
||||
uint64_t mask = (uint64_t)ncap - 1;
|
||||
for (int j = 0; j < m->cap; j++) {
|
||||
uint64_t k = m->keys[j];
|
||||
if (k == 0) continue;
|
||||
uint64_t i = k & mask;
|
||||
while (nkeys[i] != 0) i = (i + 1) & mask;
|
||||
nkeys[i] = k;
|
||||
noffs[i] = m->offsets[j];
|
||||
}
|
||||
free(m->keys);
|
||||
free(m->offsets);
|
||||
m->keys = nkeys;
|
||||
m->offsets = noffs;
|
||||
m->cap = ncap;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int uc2_ingest_write(const char *archive_path,
|
||||
const uint8_t *data, size_t len,
|
||||
int cdc_bits,
|
||||
struct uc2_ingest_stats *stats)
|
||||
{
|
||||
if (!archive_path)
|
||||
return -1;
|
||||
if (cdc_bits <= 0)
|
||||
cdc_bits = DEFAULT_CDC_BITS;
|
||||
|
||||
struct uc2_merkle tree;
|
||||
uc2_merkle_build(&tree, data, len, cdc_bits);
|
||||
|
||||
FILE *f = fopen(archive_path, "wb");
|
||||
if (!f) {
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Header */
|
||||
uint8_t hdr[HEADER_SIZE];
|
||||
memcpy(hdr, INGEST_MAGIC, 8);
|
||||
hdr[8] = INGEST_VERSION_V2;
|
||||
hdr[9] = (uint8_t)cdc_bits;
|
||||
hdr[10] = 0;
|
||||
hdr[11] = 0;
|
||||
put_le32(hdr + 12, (uint32_t)tree.nchunks);
|
||||
if (fwrite(hdr, 1, sizeof hdr, f) != sizeof hdr) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Reserve manifest entry table; we'll backfill offsets after
|
||||
* appending the chunk pool. */
|
||||
long manifest_off = ftell(f);
|
||||
if (manifest_off < 0) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
if (tree.nchunks < 0 || tree.nchunks > (1 << 24)) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
size_t manifest_size = (size_t)tree.nchunks * ENTRY_SIZE_V2;
|
||||
if (tree.nchunks > 0) {
|
||||
uint8_t *zero = calloc(manifest_size, 1);
|
||||
if (!zero) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
size_t w = fwrite(zero, 1, manifest_size, f);
|
||||
free(zero);
|
||||
if (w != manifest_size) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Append unique chunks; record offset per hash. */
|
||||
struct dedup_map dmap;
|
||||
if (dedup_map_init(&dmap, tree.nchunks > 16 ? tree.nchunks * 2 : 16) != 0) {
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint32_t *entry_offsets = calloc((size_t)tree.nchunks, sizeof *entry_offsets);
|
||||
if (tree.nchunks > 0 && !entry_offsets) {
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int new_chunks = 0;
|
||||
uint64_t bytes_appended = 0;
|
||||
uint64_t bytes_saved = 0;
|
||||
for (int i = 0; i < tree.nchunks; i++) {
|
||||
uint64_t h = tree.chunks[i].hash;
|
||||
uint32_t clen = tree.chunks[i].length;
|
||||
uint32_t off;
|
||||
if (dedup_map_get(&dmap, h, &off)) {
|
||||
entry_offsets[i] = off;
|
||||
bytes_saved += clen;
|
||||
continue;
|
||||
}
|
||||
long here = ftell(f);
|
||||
if (here < 0 || (uint64_t)here > 0xFFFFFFFFu) {
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
off = (uint32_t)here;
|
||||
entry_offsets[i] = off;
|
||||
if (dedup_map_put(&dmap, h, off) != 0) {
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
size_t w = fwrite(data + tree.chunks[i].offset, 1, clen, f);
|
||||
if (w != clen) {
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
bytes_appended += clen;
|
||||
new_chunks++;
|
||||
}
|
||||
|
||||
/* Backfill manifest entries. */
|
||||
if (tree.nchunks > 0) {
|
||||
if (fseek(f, manifest_off, SEEK_SET) != 0) {
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
for (int i = 0; i < tree.nchunks; i++) {
|
||||
uint8_t rec[ENTRY_SIZE_V2];
|
||||
put_le64(rec, tree.chunks[i].hash);
|
||||
put_le32(rec + 8, tree.chunks[i].length);
|
||||
put_le32(rec + 12, entry_offsets[i]);
|
||||
if (fwrite(rec, 1, sizeof rec, f) != sizeof rec) {
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
fclose(f);
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(entry_offsets);
|
||||
dedup_map_free(&dmap);
|
||||
|
||||
if (fclose(f) != 0) {
|
||||
uc2_merkle_free(&tree);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (stats) {
|
||||
stats->bytes_in = (uint64_t)len;
|
||||
stats->chunks_total = tree.nchunks;
|
||||
stats->chunks_new = new_chunks;
|
||||
stats->chunks_dedup = tree.nchunks - new_chunks;
|
||||
stats->bytes_stored = bytes_appended;
|
||||
stats->bytes_saved = bytes_saved;
|
||||
}
|
||||
|
||||
uc2_merkle_free(&tree);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* v1 restore: read manifest, fetch chunks from sidecar blockstore. */
|
||||
static int restore_v1(FILE *f, uint32_t nchunks, const char *archive_path,
|
||||
FILE *out)
|
||||
{
|
||||
char *blocks_path = make_blocks_path(archive_path);
|
||||
if (!blocks_path) return -1;
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
if (uc2_blockstore_open(&bs, blocks_path) != 0) {
|
||||
free(blocks_path);
|
||||
return -1;
|
||||
}
|
||||
free(blocks_path);
|
||||
|
||||
uint8_t *buf = NULL;
|
||||
size_t buf_cap = 0;
|
||||
int rc = 0;
|
||||
|
||||
for (uint32_t i = 0; i < nchunks; i++) {
|
||||
uint8_t rec[ENTRY_SIZE_V1];
|
||||
if (fread(rec, 1, sizeof rec, f) != sizeof rec) { rc = -1; break; }
|
||||
uint64_t hash = get_le64(rec);
|
||||
uint32_t clen = get_le32(rec + 8);
|
||||
|
||||
if (clen > buf_cap) {
|
||||
uint8_t *p = realloc(buf, clen);
|
||||
if (!p) { rc = -1; break; }
|
||||
buf = p;
|
||||
buf_cap = clen;
|
||||
}
|
||||
|
||||
int n = uc2_blockstore_read(&bs, hash, buf, clen);
|
||||
if (n != (int)clen) { rc = -1; break; }
|
||||
if (fwrite(buf, 1, clen, out) != clen) { rc = -1; break; }
|
||||
}
|
||||
|
||||
free(buf);
|
||||
uc2_blockstore_close(&bs);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* v2 restore: chunk pool is in the same file; manifest entries
|
||||
* carry absolute offsets. */
|
||||
static int restore_v2(FILE *f, uint32_t nchunks, FILE *out)
|
||||
{
|
||||
/* Read full manifest table first, then seek to each chunk. */
|
||||
if (nchunks == 0)
|
||||
return 0;
|
||||
|
||||
/* nchunks comes from the (untrusted) archive header; cap it so the
|
||||
manifest size cannot wrap (notably on 32-bit) and to bound memory.
|
||||
16M chunks exceeds any archive within the 4 GiB container limit. */
|
||||
if (nchunks > (1u << 24))
|
||||
return -1;
|
||||
uint8_t *manifest = malloc((size_t)nchunks * ENTRY_SIZE_V2);
|
||||
if (!manifest) return -1;
|
||||
if (fread(manifest, 1, (size_t)nchunks * ENTRY_SIZE_V2, f)
|
||||
!= (size_t)nchunks * ENTRY_SIZE_V2) {
|
||||
free(manifest);
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t *buf = NULL;
|
||||
size_t buf_cap = 0;
|
||||
int rc = 0;
|
||||
for (uint32_t i = 0; i < nchunks; i++) {
|
||||
const uint8_t *rec = manifest + (size_t)i * ENTRY_SIZE_V2;
|
||||
uint32_t clen = get_le32(rec + 8);
|
||||
uint32_t off = get_le32(rec + 12);
|
||||
|
||||
if (clen > buf_cap) {
|
||||
uint8_t *p = realloc(buf, clen);
|
||||
if (!p) { rc = -1; break; }
|
||||
buf = p;
|
||||
buf_cap = clen;
|
||||
}
|
||||
|
||||
if (fseek(f, (long)off, SEEK_SET) != 0) { rc = -1; break; }
|
||||
if (fread(buf, 1, clen, f) != clen) { rc = -1; break; }
|
||||
if (fwrite(buf, 1, clen, out) != clen) { rc = -1; break; }
|
||||
}
|
||||
|
||||
free(buf);
|
||||
free(manifest);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int uc2_ingest_restore(const char *archive_path, FILE *out)
|
||||
{
|
||||
if (!archive_path || !out)
|
||||
return -1;
|
||||
|
||||
FILE *f = fopen(archive_path, "rb");
|
||||
if (!f)
|
||||
return -1;
|
||||
|
||||
uint8_t hdr[HEADER_SIZE];
|
||||
if (fread(hdr, 1, sizeof hdr, f) != sizeof hdr) {
|
||||
fclose(f);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(hdr, INGEST_MAGIC, 8) != 0) {
|
||||
fclose(f);
|
||||
return -1;
|
||||
}
|
||||
uint32_t nchunks = get_le32(hdr + 12);
|
||||
|
||||
int rc;
|
||||
if (hdr[8] == INGEST_VERSION_V2) {
|
||||
rc = restore_v2(f, nchunks, out);
|
||||
} else if (hdr[8] == INGEST_VERSION_V1) {
|
||||
rc = restore_v1(f, nchunks, archive_path, out);
|
||||
} else {
|
||||
rc = -1;
|
||||
}
|
||||
|
||||
fclose(f);
|
||||
return rc;
|
||||
}
|
||||
105
lib/src/uc2_internal.h
Normal file
105
lib/src/uc2_internal.h
Normal file
@@ -0,0 +1,105 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* UC2 format constants and shared types.
|
||||
Used by both the compressor and decompressor. */
|
||||
|
||||
#ifndef UC2_INTERNAL_H
#define UC2_INTERNAL_H

#include <stdint.h>

/* Short aliases for the fixed-width unsigned types. */
typedef uint8_t  u8;
typedef uint16_t u16;
typedef uint32_t u32;

/* Huffman tree parameters */
enum {
    MaxCodeBits   = 13,
    LookupSize    = 1 << MaxCodeBits,                     /* 8192 */

    NumByteSym    = 256,
    NumDistSym    = 60,
    NumLenSym     = 28,
    NumSymbols    = NumByteSym + NumDistSym + NumLenSym,  /* 344 */

    NumLoAsciiSym = 28,                 /* symbols 4..31 (0-3 are control) */
    NumHiByteSym  = 128,                /* symbols 128..255 */

    NumDeltaCodes = MaxCodeBits + 1,    /* 14 (code lengths 0..13) */
    NumExtraCodes = 1,                  /* repeat code */
    NumLenCodes   = NumDeltaCodes + NumExtraCodes,        /* 15 */
    RepeatCode    = MaxCodeBits + 1,    /* 14 */
    MinRepeat     = 6,
};

/* LZ77 parameters */
enum {
    UC2_MAX_DIST  = 125 * 512,      /* 64000 */
    UC2_READ_SIZE = 512,
    UC2_BUF_SIZE  = 65536,          /* circular buffer: u16 index wraps */
    UC2_EOB_MARK  = 125 * 512 + 1,  /* 64001: end-of-block distance */
    UC2_MIN_MATCH = 3,
    UC2_MAX_LEN   = 200,            /* direct match limit */
    UC2_MAX_XLEN  = 32760,          /* extended match limit */
};

/* Distance encoding: 60 codes in 4 tiers of 15 codes each.
     tier 0: dist 1..15        (0 extra bits)
     tier 1: dist 16..255      (4 extra bits)
     tier 2: dist 256..4095    (8 extra bits)
     tier 3: dist 4096..64000  (12 extra bits) */

/* Length encoding: 28 codes.
     0..7:   len 3..10       (0 extra bits)
     8..15:  len 11..26      (1 extra bit)
     16..23: len 27..90      (3 extra bits)
     24:     len 91..154     (6 extra bits)
     25:     len 155..666    (9 extra bits)
     26:     len 667..2714   (11 extra bits)
     27:     len 2715..35482 (15 extra bits) */

/* Delta-to-absolute table for tree decoding (from decompress.c).
   vval[prev_length][delta_code] = absolute_length */
extern const u8 vval[NumDeltaCodes][NumDeltaCodes];

/* Inverse: absolute-to-delta table for tree encoding.
   ivval[prev_length][abs_length] = delta_code */
extern const u8 ivval[NumDeltaCodes][NumDeltaCodes];

/* Default Huffman code lengths for the first block */
void uc2_default_lengths(u8 d[NumSymbols]);

/* Little-endian record types: raw bytes in file order, so structs built
   from them carry no alignment or host-endianness assumptions. */
typedef struct u16le { u8 b[2]; } u16le;
typedef struct u32le { u8 b[4]; } u32le;

/* Decode a 16-bit little-endian field. */
static inline u16 get16(u16le v)
{
    return (u16)((unsigned)v.b[0] | ((unsigned)v.b[1] << 8));
}

/* Decode a 32-bit little-endian field. */
static inline u32 get32(u32le v)
{
    return (u32)v.b[0]
         | ((u32)v.b[1] << 8)
         | ((u32)v.b[2] << 16)
         | ((u32)v.b[3] << 24);
}

/* Encode a 16-bit value as little-endian bytes. */
static inline u16le put16(u16 v)
{
    u16le r;
    r.b[0] = (u8)(v & 0xff);
    r.b[1] = (u8)(v >> 8);
    return r;
}

/* Encode a 32-bit value as little-endian bytes. */
static inline u32le put32(u32 v)
{
    u32le r;
    r.b[0] = (u8)(v & 0xff);
    r.b[1] = (u8)((v >> 8) & 0xff);
    r.b[2] = (u8)((v >> 16) & 0xff);
    r.b[3] = (u8)(v >> 24);
    return r;
}

/* UC2 checksum (the original sources call it a Fletcher checksum; what
   this code computes is an XOR fold).  The data stream is treated as
   little-endian 16-bit words that are XORed into the low 16 bits of
   `value`.  Bit 16 is a carry flag: it is set when the bytes consumed so
   far end on the low half of a word, so the next update knows its first
   byte is the high half. */
struct csum { u32 value; };

/* Start a new checksum (seed 0xA55A, no pending half word). */
static inline void csum_init(struct csum *cs) { cs->value = 0xA55A; }

/* Fold `n` bytes at `p` into the running checksum.  May be called
   repeatedly with arbitrary split points; n == 0 is a no-op. */
static inline void csum_update(struct csum *cs, const u8 *p, unsigned n)
{
    if (n == 0)
        return;

    u32 acc = cs->value;
    const u8 *last = p + (n - 1);   /* address of the final input byte */

    /* Previous update ended mid-word: this byte is that word's high half. */
    if ((acc >> 16) != 0)
        acc ^= (u32)*p++ << 8;

    /* Whole words. */
    for (; p < last; p += 2)
        acc ^= (u32)p[0] | ((u32)p[1] << 8);

    acc &= 0xffff;                  /* clear the carry flag */

    /* One byte left over: fold it in as a low half and set the flag. */
    if (p == last)
        acc ^= (u32)*p | 0x10000;

    cs->value = acc;
}

/* Current 16-bit checksum value. */
static inline u16 csum_get(struct csum *cs) { return (u16)cs->value; }

#endif
|
||||
185
lib/src/uc2_lz4.c
Normal file
185
lib/src/uc2_lz4.c
Normal file
@@ -0,0 +1,185 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* LZ4-compatible ultra-fast compression.
|
||||
*
|
||||
* Single-probe hash table with 4-byte match minimum. No hash chains —
|
||||
* each hash slot holds only the most recent position, giving O(1)
|
||||
* match finding at the cost of missing some matches. This trades
|
||||
* compression ratio for extreme speed. */
|
||||
|
||||
#include "uc2/uc2_lz4.h"
|
||||
#include <string.h>
|
||||
|
||||
#define HASH_BITS 16
#define HASH_SIZE (1 << HASH_BITS)
#define MIN_MATCH 4
#define ML_BITS 4
#define ML_MASK ((1 << ML_BITS) - 1)
#define RUN_BITS 4
#define RUN_MASK ((1 << RUN_BITS) - 1)
#define MAX_OFFSET 65535 /* largest distance a 16-bit offset can express */

/* Hash the 4 bytes at `p` to a table index in [0, HASH_SIZE). */
static uint32_t lz4_hash(const uint8_t *p)
{
    uint32_t v = p[0] | ((uint32_t)p[1] << 8) |
                 ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    return (v * 2654435761U) >> (32 - HASH_BITS);
}

/* Emit the extension bytes of a length field: a run of 255s followed by a
 * final byte < 255.  `len` is the amount above the 15 held in the token.
 * Advances *dst; the caller has already checked there is room. */
static void write_len(uint8_t **dst, size_t len)
{
    while (len >= 255) {
        *(*dst)++ = 255;
        len -= 255;
    }
    *(*dst)++ = (uint8_t)len;
}

/* Upper bound on the extension bytes write_len() emits for a field whose
 * full value is `len` (0 when the value fits in the token nibble). */
static size_t len_ext_bound(size_t len)
{
    return len > 14 ? len / 255 + 1 : 0;
}

/* Compress `src_len` bytes from `src` into `dst` (capacity `dst_cap`).
 *
 * Output is a series of sequences: token, optional literal-length
 * extension, literals, 16-bit little-endian match offset, optional
 * match-length extension.  The stream always ends with a literal-only
 * sequence (possibly with zero literals).
 *
 * Returns the number of bytes written, or 0 if the input is empty or the
 * output does not fit.
 *
 * All capacity checks compare sizes against the space that is left rather
 * than building `op + n` pointers, so an oversized requirement can never
 * form an out-of-range pointer. */
size_t uc2_lz4_compress(const uint8_t *src, size_t src_len,
                        uint8_t *dst, size_t dst_cap)
{
    if (src_len == 0 || dst_cap < 1) return 0;

    /* Most recent position seen for each hash.  Zero-filled, so an unused
     * slot reads as position 0; every candidate is verified with memcmp
     * before use, which makes that harmless. */
    uint32_t htab[HASH_SIZE];
    memset(htab, 0, sizeof htab);

    const uint8_t *const iend = src + src_len;
    const uint8_t *anchor = src;        /* start of pending literals */
    uint8_t *op = dst;
    uint8_t *const oend = dst + dst_cap;

    /* Inputs of MIN_MATCH bytes or fewer are emitted as literals only.
     * The match limit is computed inside this branch so that
     * `iend - MIN_MATCH` is never formed for shorter inputs. */
    if (src_len > MIN_MATCH) {
        const uint8_t *const mflimit = iend - MIN_MATCH;
        const uint8_t *ip = src + 1;    /* first byte can't be a match ref */

        while (ip <= mflimit) {
            /* Single probe: look up, then replace, the slot for ip. */
            const uint32_t h = lz4_hash(ip);
            const uint8_t *ref = src + htab[h];
            htab[h] = (uint32_t)(ip - src);

            if (ip - ref <= 0 || ip - ref > MAX_OFFSET ||
                memcmp(ref, ip, MIN_MATCH) != 0) {
                ip++;
                continue;
            }

            /* Extend the match forward as far as the input allows. */
            const size_t lit_len = (size_t)(ip - anchor);
            size_t match_len = MIN_MATCH;
            while (ip + match_len < iend && ref[match_len] == ip[match_len])
                match_len++;
            const size_t ml_code = match_len - MIN_MATCH;

            const size_t need = 1 + len_ext_bound(lit_len) + lit_len + 2 +
                                len_ext_bound(ml_code);
            if (need > (size_t)(oend - op))
                return 0; /* output overflow */

            /* Token: literal length in the high nibble, match length
             * (minus MIN_MATCH) in the low nibble, each saturating at 15. */
            uint8_t *token = op++;
            const size_t ll = lit_len < 15 ? lit_len : 15;
            const size_t ml = ml_code < 15 ? ml_code : 15;
            *token = (uint8_t)((ll << 4) | ml);

            if (lit_len >= 15)
                write_len(&op, lit_len - 15);
            memcpy(op, anchor, lit_len);
            op += lit_len;

            /* Offset (16-bit LE) */
            const uint16_t offset = (uint16_t)(ip - ref);
            *op++ = (uint8_t)(offset & 0xFF);
            *op++ = (uint8_t)(offset >> 8);

            if (ml_code >= 15)
                write_len(&op, ml_code - 15);

            ip += match_len;
            anchor = ip;
            if (ip > mflimit)
                break;

            /* Hash one of the positions we skipped */
            htab[lz4_hash(ip - 2)] = (uint32_t)(ip - 2 - src);
        }
    }

    /* Emit final literal run (token low nibble 0: no match follows). */
    const size_t last_lit = (size_t)(iend - anchor);
    if (1 + len_ext_bound(last_lit) + last_lit > (size_t)(oend - op))
        return 0;

    uint8_t *token = op++;
    const size_t ll = last_lit < 15 ? last_lit : 15;
    *token = (uint8_t)(ll << 4);
    if (last_lit >= 15)
        write_len(&op, last_lit - 15);
    memcpy(op, anchor, last_lit);
    op += last_lit;

    return (size_t)(op - dst);
}

/* Decompress a stream produced by uc2_lz4_compress.
 *
 * Returns the number of bytes written to `dst`, or 0 if the stream is
 * malformed or the output does not fit (an empty input also yields 0).
 *
 * The input is treated as untrusted: every length is compared against the
 * bytes that remain instead of being added to a pointer first, and the
 * length accumulators are capped so they cannot wrap. */
size_t uc2_lz4_decompress(const uint8_t *src, size_t src_len,
                          uint8_t *dst, size_t dst_cap)
{
    const uint8_t *ip = src;
    const uint8_t *const iend = src + src_len;
    uint8_t *op = dst;
    uint8_t *const oend = dst + dst_cap;

    while (ip < iend) {
        const uint8_t token = *ip++;

        /* Literal length */
        size_t lit_len = token >> 4;
        if (lit_len == RUN_MASK) {
            uint8_t b;
            do {
                if (ip >= iend) return 0;
                b = *ip++;
                lit_len += b;
                /* Literals come from the input, so a valid run can never
                 * be longer than the input itself. */
                if (lit_len > src_len) return 0;
            } while (b == 255);
        }

        /* Copy literals */
        if (lit_len > (size_t)(iend - ip) || lit_len > (size_t)(oend - op))
            return 0;
        if (lit_len > 0)
            memcpy(op, ip, lit_len);
        ip += lit_len;
        op += lit_len;

        if (ip >= iend) break; /* end of stream (last token has no match) */

        /* Match offset */
        if ((size_t)(iend - ip) < 2) return 0;
        const uint16_t offset = (uint16_t)(ip[0] | ((uint16_t)ip[1] << 8));
        ip += 2;
        if (offset == 0 || (size_t)(op - dst) < offset) return 0;

        /* Match length */
        size_t match_len = (size_t)(token & ML_MASK) + MIN_MATCH;
        if ((token & ML_MASK) == ML_MASK) {
            uint8_t b;
            do {
                if (ip >= iend) return 0;
                b = *ip++;
                match_len += b;
                /* A match longer than the whole output can never fit. */
                if (match_len > dst_cap) return 0;
            } while (b == 255);
        }

        /* Copy match */
        if (match_len > (size_t)(oend - op)) return 0;
        const uint8_t *ref = op - offset;
        for (size_t i = 0; i < match_len; i++)
            op[i] = ref[i]; /* byte-by-byte for overlapping matches */
        op += match_len;
    }

    return (size_t)(op - dst);
}
|
||||
114
lib/src/uc2_merkle.c
Normal file
114
lib/src/uc2_merkle.c
Normal file
@@ -0,0 +1,114 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Merkle DAG for content-addressable deduplication.
|
||||
*
|
||||
* Each file is split into CDC chunks (Gear hash), each chunk hashed
|
||||
* with FNV-1a 64-bit. The file's root hash is computed from the
|
||||
* concatenated chunk hashes, forming a single-level Merkle tree.
|
||||
*
|
||||
* Comparison operations find shared chunks between trees, enabling
|
||||
* dedup decisions based on structural content similarity rather than
|
||||
* simple byte-prefix matching. */
|
||||
|
||||
#include "uc2/uc2_merkle.h"
|
||||
#include "uc2/uc2_cdc.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* 64-bit FNV-1a hash of `len` bytes at `data`.
 *
 * Starts from the FNV offset basis; for each byte, XORs it in and then
 * multiplies by the FNV prime.  An empty input returns the offset basis. */
uint64_t uc2_hash64(const uint8_t *data, size_t len)
{
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t acc = 14695981039346656037ULL; /* offset basis */

    while (len-- > 0) {
        acc ^= *data++;
        acc *= fnv_prime;
    }
    return acc;
}
|
||||
|
||||
void uc2_merkle_build(struct uc2_merkle *tree,
|
||||
const uint8_t *data, size_t len, int bits)
|
||||
{
|
||||
tree->chunks = NULL;
|
||||
tree->nchunks = 0;
|
||||
tree->capacity = 0;
|
||||
tree->root = 0;
|
||||
|
||||
if (!data || len == 0)
|
||||
return;
|
||||
|
||||
struct uc2_chunker chunker;
|
||||
uc2_chunker_init(&chunker, bits, 0, 0);
|
||||
|
||||
size_t off, clen;
|
||||
int more = 1;
|
||||
while (more) {
|
||||
more = uc2_chunker_next(&chunker, data, len, &off, &clen);
|
||||
if (clen == 0) break;
|
||||
|
||||
if (tree->nchunks >= tree->capacity) {
|
||||
int ncap = tree->capacity ? tree->capacity * 2 : 16;
|
||||
struct uc2_chunk *nc = realloc(tree->chunks,
|
||||
(size_t)ncap * sizeof *tree->chunks);
|
||||
if (!nc)
|
||||
break; /* out of memory: keep chunks gathered so far */
|
||||
tree->chunks = nc;
|
||||
tree->capacity = ncap;
|
||||
}
|
||||
struct uc2_chunk *c = &tree->chunks[tree->nchunks++];
|
||||
c->hash = uc2_hash64(data + off, clen);
|
||||
c->offset = (uint32_t)off;
|
||||
c->length = (uint32_t)clen;
|
||||
}
|
||||
|
||||
/* Root hash = hash of concatenated chunk hashes */
|
||||
if (tree->nchunks > 0) {
|
||||
uint8_t *hashbuf = malloc((size_t)tree->nchunks * 8);
|
||||
if (hashbuf) {
|
||||
for (int i = 0; i < tree->nchunks; i++) {
|
||||
uint64_t h = tree->chunks[i].hash;
|
||||
for (int j = 0; j < 8; j++)
|
||||
hashbuf[i * 8 + j] = (uint8_t)(h >> (j * 8));
|
||||
}
|
||||
tree->root = uc2_hash64(hashbuf, (size_t)tree->nchunks * 8);
|
||||
free(hashbuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int uc2_merkle_common(const struct uc2_merkle *a, const struct uc2_merkle *b)
|
||||
{
|
||||
int count = 0;
|
||||
for (int i = 0; i < a->nchunks; i++)
|
||||
for (int j = 0; j < b->nchunks; j++)
|
||||
if (a->chunks[i].hash == b->chunks[j].hash) {
|
||||
count++;
|
||||
break; /* count each A chunk at most once */
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
double uc2_merkle_similarity(const struct uc2_merkle *a,
|
||||
const struct uc2_merkle *b)
|
||||
{
|
||||
if (a->nchunks == 0) return 0.0;
|
||||
|
||||
uint32_t shared_bytes = 0;
|
||||
uint32_t total_bytes = 0;
|
||||
for (int i = 0; i < a->nchunks; i++) {
|
||||
total_bytes += a->chunks[i].length;
|
||||
for (int j = 0; j < b->nchunks; j++)
|
||||
if (a->chunks[i].hash == b->chunks[j].hash) {
|
||||
shared_bytes += a->chunks[i].length;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return total_bytes > 0 ? (double)shared_bytes / total_bytes : 0.0;
|
||||
}
|
||||
|
||||
void uc2_merkle_free(struct uc2_merkle *tree)
|
||||
{
|
||||
free(tree->chunks);
|
||||
tree->chunks = NULL;
|
||||
tree->nchunks = 0;
|
||||
tree->capacity = 0;
|
||||
}
|
||||
343
lib/src/uc2_ots.c
Normal file
343
lib/src/uc2_ots.c
Normal file
@@ -0,0 +1,343 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* OpenTimestamps proof parser, serializer, walker, and UC2 trailer.
|
||||
*
|
||||
* The walker supports the calendar-path subset of opcodes (APPEND,
|
||||
* PREPEND, SHA256) directly. Other unary crypto ops (SHA1, RIPEMD160,
|
||||
* KECCAK256) are accepted as structurally valid but flagged as not
|
||||
* locally cryptographically verified; for full validation, extract
|
||||
* the proof and run the standard `ots verify` tool. */
|
||||
|
||||
#include "uc2/uc2_ots.h"
|
||||
#include "uc2/uc2_sha256.h"
|
||||
#include <string.h>
|
||||
|
||||
/* Read a 32-bit little-endian value from the 4 bytes at `p`. */
static uint32_t r32le(const uint8_t *p)
{
    uint32_t v = 0;

    /* Start with the most significant byte and shift it up as the lower
     * ones are merged in. */
    for (int i = 3; i >= 0; i--)
        v = (v << 8) | p[i];
    return v;
}
|
||||
|
||||
/* Store `v` as 4 little-endian bytes at `p`. */
static void w32le(uint8_t *p, uint32_t v)
{
    for (int i = 0; i < 4; i++) {
        p[i] = (uint8_t)(v & 0xff); /* least significant byte first */
        v >>= 8;
    }
}
|
||||
|
||||
int uc2_ots_varint_decode(const uint8_t *in, size_t in_len,
|
||||
uint64_t *out_value, size_t *consumed)
|
||||
{
|
||||
uint64_t v = 0;
|
||||
int shift = 0;
|
||||
size_t i = 0;
|
||||
for (;;) {
|
||||
if (i >= in_len) return UC2_OTS_ERR_TRUNCATED;
|
||||
if (shift >= 64) return UC2_OTS_ERR_OVERFLOW;
|
||||
uint8_t b = in[i++];
|
||||
uint8_t group = b & 0x7f;
|
||||
/* At shift == 63 only payloads of 0 or 1 fit in 64 bits;
|
||||
* anything larger would silently lose its high bits. */
|
||||
if (shift == 63 && group > 1)
|
||||
return UC2_OTS_ERR_OVERFLOW;
|
||||
v |= (uint64_t)group << shift;
|
||||
if (!(b & 0x80)) {
|
||||
/* Canonical: a multi-byte encoding must not have a zero
|
||||
* high group, i.e. the last byte cannot be 0x00 unless
|
||||
* the value is zero in a single byte. */
|
||||
if (i > 1 && b == 0)
|
||||
return UC2_OTS_ERR_NONCANONICAL;
|
||||
*out_value = v;
|
||||
*consumed = i;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
}
|
||||
|
||||
/* Encode `value` as a varint (7 bits per byte, least significant group
 * first, bit 7 set on every byte but the last) into `out`.
 *
 * Returns the number of bytes written: 1 for values below 0x80, up to 10
 * for a full 64-bit value. */
size_t uc2_ots_varint_encode(uint64_t value, uint8_t out[10])
{
    size_t n = 0;

    for (;;) {
        const uint8_t group = (uint8_t)(value & 0x7f);
        value >>= 7;
        if (value == 0) {
            out[n++] = group; /* last byte: continuation bit clear */
            return n;
        }
        out[n++] = (uint8_t)(group | 0x80);
    }
}
|
||||
|
||||
/* Read a "varbytes" field: varint length + that many bytes. */
|
||||
static int read_varbytes(const uint8_t *p, size_t len,
|
||||
const uint8_t **out_data, size_t *out_data_len,
|
||||
size_t *consumed)
|
||||
{
|
||||
uint64_t n;
|
||||
size_t lc;
|
||||
int rc = uc2_ots_varint_decode(p, len, &n, &lc);
|
||||
if (rc < 0) return rc;
|
||||
if (n > UC2_OTS_MAX_VARBYTES) return UC2_OTS_ERR_TOO_LARGE;
|
||||
if (n > len - lc) return UC2_OTS_ERR_TRUNCATED;
|
||||
*out_data = p + lc;
|
||||
*out_data_len = (size_t)n;
|
||||
*consumed = lc + (size_t)n;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
|
||||
int uc2_ots_parse_file(const uint8_t *file, size_t file_len,
|
||||
uint8_t *out_hash_op,
|
||||
const uint8_t **out_leaf_digest,
|
||||
size_t *out_leaf_digest_len,
|
||||
const uint8_t **out_body,
|
||||
size_t *out_body_len)
|
||||
{
|
||||
if (file_len < UC2_OTS_HEADER_MAGIC_LEN + 1 + 1 + 32)
|
||||
return UC2_OTS_ERR_TRUNCATED;
|
||||
if (memcmp(file, UC2_OTS_HEADER_MAGIC, UC2_OTS_HEADER_MAGIC_LEN) != 0)
|
||||
return UC2_OTS_ERR_BAD_MAGIC;
|
||||
size_t off = UC2_OTS_HEADER_MAGIC_LEN;
|
||||
if (file[off++] != UC2_OTS_VERSION)
|
||||
return UC2_OTS_ERR_BAD_VERSION;
|
||||
uint8_t hash_op = file[off++];
|
||||
size_t digest_len;
|
||||
switch (hash_op) {
|
||||
case UC2_OTS_OP_SHA1: digest_len = 20; break;
|
||||
case UC2_OTS_OP_RIPEMD160: digest_len = 20; break;
|
||||
case UC2_OTS_OP_SHA256: digest_len = 32; break;
|
||||
case UC2_OTS_OP_KECCAK256: digest_len = 32; break;
|
||||
default: return UC2_OTS_ERR_BAD_HASH_OP;
|
||||
}
|
||||
if (file_len - off < digest_len)
|
||||
return UC2_OTS_ERR_TRUNCATED;
|
||||
*out_hash_op = hash_op;
|
||||
*out_leaf_digest = file + off;
|
||||
*out_leaf_digest_len = digest_len;
|
||||
off += digest_len;
|
||||
*out_body = file + off;
|
||||
*out_body_len = file_len - off;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
|
||||
int uc2_ots_serialize_file(uint8_t hash_op,
|
||||
const uint8_t *leaf_digest, size_t leaf_digest_len,
|
||||
const uint8_t *body, size_t body_len,
|
||||
uint8_t *out, size_t out_cap)
|
||||
{
|
||||
size_t want_len;
|
||||
switch (hash_op) {
|
||||
case UC2_OTS_OP_SHA1: want_len = 20; break;
|
||||
case UC2_OTS_OP_RIPEMD160: want_len = 20; break;
|
||||
case UC2_OTS_OP_SHA256: want_len = 32; break;
|
||||
case UC2_OTS_OP_KECCAK256: want_len = 32; break;
|
||||
default: return UC2_OTS_ERR_BAD_HASH_OP;
|
||||
}
|
||||
if (leaf_digest_len != want_len) return UC2_OTS_ERR_BAD_HASH_OP;
|
||||
size_t need = UC2_OTS_HEADER_MAGIC_LEN + 1 + 1 + leaf_digest_len + body_len;
|
||||
if (need > out_cap) return UC2_OTS_ERR_TRUNCATED;
|
||||
uint8_t *p = out;
|
||||
memcpy(p, UC2_OTS_HEADER_MAGIC, UC2_OTS_HEADER_MAGIC_LEN);
|
||||
p += UC2_OTS_HEADER_MAGIC_LEN;
|
||||
*p++ = UC2_OTS_VERSION;
|
||||
*p++ = hash_op;
|
||||
memcpy(p, leaf_digest, leaf_digest_len);
|
||||
p += leaf_digest_len;
|
||||
memcpy(p, body, body_len);
|
||||
p += body_len;
|
||||
return (int)(p - out);
|
||||
}
|
||||
|
||||
/* A serialized timestamp is a sequence of "items"; each item is either
|
||||
* (attestation) 0x00 + tag(8) + varbytes(payload)
|
||||
* (op) op-byte + (varbytes operand for binary ops) + child-timestamp
|
||||
*
|
||||
* Within one timestamp node, items are separated by 0xff: every item
|
||||
* except the LAST is preceded by 0xff. Children timestamps recurse
|
||||
* the same structure with the digest produced by their parent op. */
|
||||
|
||||
struct walker {
|
||||
const uint8_t *p, *end;
|
||||
uc2_ots_attest_cb cb;
|
||||
void *ctx;
|
||||
int has_unsupported_op;
|
||||
};
|
||||
|
||||
/* Apply an op to `digest`, consuming a varbytes operand for binary ops.
|
||||
* Supported ops update the digest in place; unsupported unary ops set
|
||||
* has_unsupported_op and leave the digest unchanged so the structural
|
||||
* walk can continue. */
|
||||
static int apply_op(struct walker *w, uint8_t op,
|
||||
uint8_t *digest, size_t *digest_len)
|
||||
{
|
||||
switch (op) {
|
||||
case UC2_OTS_OP_APPEND:
|
||||
case UC2_OTS_OP_PREPEND: {
|
||||
const uint8_t *operand;
|
||||
size_t operand_len, consumed;
|
||||
int rc = read_varbytes(w->p, (size_t)(w->end - w->p),
|
||||
&operand, &operand_len, &consumed);
|
||||
if (rc < 0) return rc;
|
||||
w->p += consumed;
|
||||
if (*digest_len + operand_len > UC2_OTS_MAX_DIGEST_LEN)
|
||||
return UC2_OTS_ERR_TOO_LARGE;
|
||||
if (op == UC2_OTS_OP_APPEND) {
|
||||
memcpy(digest + *digest_len, operand, operand_len);
|
||||
} else {
|
||||
memmove(digest + operand_len, digest, *digest_len);
|
||||
memcpy(digest, operand, operand_len);
|
||||
}
|
||||
*digest_len += operand_len;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
case UC2_OTS_OP_SHA256: {
|
||||
uint8_t out[UC2_SHA256_OUT_LEN];
|
||||
uc2_sha256_hash(digest, *digest_len, out);
|
||||
memcpy(digest, out, UC2_SHA256_OUT_LEN);
|
||||
*digest_len = UC2_SHA256_OUT_LEN;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
case UC2_OTS_OP_SHA1:
|
||||
case UC2_OTS_OP_RIPEMD160:
|
||||
case UC2_OTS_OP_KECCAK256:
|
||||
case UC2_OTS_OP_REVERSE:
|
||||
case UC2_OTS_OP_HEXLIFY:
|
||||
w->has_unsupported_op = 1;
|
||||
return UC2_OTS_OK;
|
||||
default:
|
||||
return UC2_OTS_ERR_BAD_OP;
|
||||
}
|
||||
}
|
||||
|
||||
static int walk_attestation(struct walker *w,
|
||||
const uint8_t *digest, size_t digest_len)
|
||||
{
|
||||
if (w->end - w->p < UC2_OTS_TAG_LEN) return UC2_OTS_ERR_TRUNCATED;
|
||||
const uint8_t *tag = w->p;
|
||||
w->p += UC2_OTS_TAG_LEN;
|
||||
const uint8_t *payload;
|
||||
size_t payload_len, consumed;
|
||||
int rc = read_varbytes(w->p, (size_t)(w->end - w->p),
|
||||
&payload, &payload_len, &consumed);
|
||||
if (rc < 0) return rc;
|
||||
w->p += consumed;
|
||||
if (w->cb && w->cb(w->ctx, tag, payload, payload_len, digest, digest_len))
|
||||
return UC2_OTS_ERR_OVERFLOW;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
|
||||
static int walk_node(struct walker *w,
|
||||
const uint8_t *digest_in, size_t digest_in_len,
|
||||
int depth)
|
||||
{
|
||||
if (depth >= UC2_OTS_MAX_DEPTH) return UC2_OTS_ERR_DEPTH;
|
||||
|
||||
for (;;) {
|
||||
if (w->p >= w->end) return UC2_OTS_ERR_TRUNCATED;
|
||||
uint8_t b = *w->p++;
|
||||
int is_last = (b != UC2_OTS_BRANCH);
|
||||
if (!is_last) {
|
||||
if (w->p >= w->end) return UC2_OTS_ERR_TRUNCATED;
|
||||
b = *w->p++;
|
||||
}
|
||||
|
||||
if (b == UC2_OTS_ATTESTATION) {
|
||||
int rc = walk_attestation(w, digest_in, digest_in_len);
|
||||
if (rc < 0) return rc;
|
||||
} else {
|
||||
/* Op item: snapshot digest into a local buffer (siblings
|
||||
* within the same node share the parent digest), apply
|
||||
* the op, recurse into the sub-timestamp. */
|
||||
uint8_t mut[UC2_OTS_MAX_DIGEST_LEN];
|
||||
size_t mut_len = digest_in_len;
|
||||
memcpy(mut, digest_in, digest_in_len);
|
||||
int rc = apply_op(w, b, mut, &mut_len);
|
||||
if (rc < 0) return rc;
|
||||
rc = walk_node(w, mut, mut_len, depth + 1);
|
||||
if (rc < 0) return rc;
|
||||
}
|
||||
|
||||
if (is_last) return UC2_OTS_OK;
|
||||
}
|
||||
}
|
||||
|
||||
int uc2_ots_walk(const uint8_t *body, size_t body_len,
|
||||
const uint8_t *leaf_digest, size_t leaf_digest_len,
|
||||
uc2_ots_attest_cb cb, void *ctx)
|
||||
{
|
||||
if (leaf_digest_len > UC2_OTS_MAX_DIGEST_LEN)
|
||||
return UC2_OTS_ERR_TOO_LARGE;
|
||||
|
||||
struct walker w = { body, body + body_len, cb, ctx, 0 };
|
||||
int rc = walk_node(&w, leaf_digest, leaf_digest_len, 0);
|
||||
if (rc < 0) return rc;
|
||||
if (w.p != w.end) return UC2_OTS_ERR_OVERFLOW;
|
||||
return w.has_unsupported_op ? UC2_OTS_RESULT_STRUCTURAL
|
||||
: UC2_OTS_RESULT_VERIFIED;
|
||||
}
|
||||
|
||||
const char *uc2_ots_attest_name(const uint8_t tag[UC2_OTS_TAG_LEN])
|
||||
{
|
||||
if (memcmp(tag, UC2_OTS_TAG_PENDING, UC2_OTS_TAG_LEN) == 0)
|
||||
return "pending";
|
||||
if (memcmp(tag, UC2_OTS_TAG_BITCOIN, UC2_OTS_TAG_LEN) == 0)
|
||||
return "Bitcoin";
|
||||
if (memcmp(tag, UC2_OTS_TAG_LITECOIN, UC2_OTS_TAG_LEN) == 0)
|
||||
return "Litecoin";
|
||||
return 0;
|
||||
}
|
||||
|
||||
int uc2_ots_trailer_build(uint32_t archive_len,
|
||||
const uint8_t *proof, size_t proof_len,
|
||||
uint8_t *out, size_t out_cap)
|
||||
{
|
||||
if (proof_len > UC2_OTS_TRAILER_MAX_PROOF)
|
||||
return UC2_OTS_ERR_TOO_LARGE;
|
||||
size_t total = UC2_OTS_TRAILER_OVERHEAD + proof_len;
|
||||
if (total > out_cap) return UC2_OTS_ERR_TRUNCATED;
|
||||
uint8_t *p = out;
|
||||
memcpy(p, UC2_OTS_TRAILER_MAGIC, UC2_OTS_TRAILER_MAGIC_LEN);
|
||||
p += UC2_OTS_TRAILER_MAGIC_LEN;
|
||||
w32le(p, UC2_OTS_TRAILER_VERSION); p += 4;
|
||||
w32le(p, archive_len); p += 4;
|
||||
w32le(p, (uint32_t)proof_len); p += 4;
|
||||
memcpy(p, proof, proof_len); p += proof_len;
|
||||
w32le(p, (uint32_t)proof_len); p += 4;
|
||||
memcpy(p, UC2_OTS_TRAILER_MAGIC, UC2_OTS_TRAILER_MAGIC_LEN);
|
||||
p += UC2_OTS_TRAILER_MAGIC_LEN;
|
||||
return (int)(p - out);
|
||||
}
|
||||
|
||||
int uc2_ots_trailer_parse(const uint8_t *file, size_t file_len,
|
||||
uint32_t *out_archive_len,
|
||||
const uint8_t **out_proof, size_t *out_proof_len)
|
||||
{
|
||||
if (file_len < UC2_OTS_TRAILER_TAIL_LEN) return 1;
|
||||
const uint8_t *back = file + file_len - UC2_OTS_TRAILER_MAGIC_LEN;
|
||||
if (memcmp(back, UC2_OTS_TRAILER_MAGIC, UC2_OTS_TRAILER_MAGIC_LEN) != 0)
|
||||
return 1;
|
||||
|
||||
/* Back magic present: from here on, every check is hard-failed. */
|
||||
uint32_t back_proof_len = r32le(file + file_len - UC2_OTS_TRAILER_TAIL_LEN);
|
||||
if (back_proof_len > UC2_OTS_TRAILER_MAX_PROOF)
|
||||
return UC2_OTS_ERR_TOO_LARGE;
|
||||
|
||||
size_t total = UC2_OTS_TRAILER_OVERHEAD + back_proof_len;
|
||||
if (total > file_len) return UC2_OTS_ERR_TRUNCATED;
|
||||
const uint8_t *trailer_start = file + file_len - total;
|
||||
|
||||
if (memcmp(trailer_start, UC2_OTS_TRAILER_MAGIC, UC2_OTS_TRAILER_MAGIC_LEN) != 0)
|
||||
return UC2_OTS_ERR_BAD_MAGIC;
|
||||
|
||||
uint32_t version = r32le(trailer_start + UC2_OTS_TRAILER_MAGIC_LEN);
|
||||
uint32_t archive_ln = r32le(trailer_start + UC2_OTS_TRAILER_MAGIC_LEN + 4);
|
||||
uint32_t front_pl = r32le(trailer_start + UC2_OTS_TRAILER_MAGIC_LEN + 8);
|
||||
|
||||
if (version != UC2_OTS_TRAILER_VERSION) return UC2_OTS_ERR_BAD_VERSION;
|
||||
if (front_pl != back_proof_len) return UC2_OTS_ERR_NONCANONICAL;
|
||||
if ((size_t)archive_ln != (size_t)(trailer_start - file))
|
||||
return UC2_OTS_ERR_OVERFLOW;
|
||||
|
||||
*out_archive_len = archive_ln;
|
||||
*out_proof = trailer_start + UC2_OTS_TRAILER_HEAD_LEN;
|
||||
*out_proof_len = back_proof_len;
|
||||
return UC2_OTS_OK;
|
||||
}
|
||||
195
lib/src/uc2_preprocess.c
Normal file
195
lib/src/uc2_preprocess.c
Normal file
@@ -0,0 +1,195 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Content-aware preprocessing filters. */
|
||||
|
||||
#include "uc2/uc2_preprocess.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* --- BCJ (E8/E9 transform for x86) --- */
|
||||
|
||||
/* Convert relative CALL (E8) and JMP (E9) addresses to absolute.
|
||||
* The 4-byte displacement after E8/E9 is replaced with an absolute
|
||||
* address relative to position 0. This normalizes calls to the same
|
||||
* function from different locations, improving LZ77 matching. */
|
||||
|
||||
/* Forward E8/E9 filter, applied in place.
 *
 * Scans `data` for 0xE8 / 0xE9 opcode bytes that are followed by at
 * least four more bytes, reads the little-endian 32-bit operand,
 * adds the offset of the byte after the operand (i + 5) and stores the
 * result back little-endian. The four operand bytes are then skipped,
 * so an 0xE8 inside an operand is not treated as an opcode.
 *
 * All arithmetic is unsigned modulo 2^32: signed int32_t addition would
 * be undefined behaviour when the sum overflows, and right-shifting a
 * negative value is implementation-defined.
 *
 * Always returns 0. Buffers shorter than 5 bytes are left untouched. */
int uc2_bcj_apply(uint8_t *data, size_t len)
{
    if (len < 5) return 0;

    for (size_t i = 0; i + 4 < len; i++) {
        if (data[i] != 0xE8 && data[i] != 0xE9)
            continue;

        uint32_t rel = (uint32_t)data[i + 1] |
                       ((uint32_t)data[i + 2] << 8) |
                       ((uint32_t)data[i + 3] << 16) |
                       ((uint32_t)data[i + 4] << 24);
        uint32_t abs_addr = rel + (uint32_t)(i + 5);

        data[i + 1] = (uint8_t)(abs_addr);
        data[i + 2] = (uint8_t)(abs_addr >> 8);
        data[i + 3] = (uint8_t)(abs_addr >> 16);
        data[i + 4] = (uint8_t)(abs_addr >> 24);

        i += 4; /* skip the operand bytes */
    }
    return 0;
}
|
||||
|
||||
/* Inverse E8/E9 filter, applied in place.
 *
 * Visits the same 0xE8 / 0xE9 positions as the forward filter (the
 * opcode bytes are never modified and operands are skipped the same
 * way) and subtracts (i + 5) from each little-endian 32-bit operand.
 *
 * The subtraction is done on uint32_t so it wraps modulo 2^32 instead
 * of invoking signed-overflow undefined behaviour.
 *
 * Always returns 0. Buffers shorter than 5 bytes are left untouched. */
int uc2_bcj_revert(uint8_t *data, size_t len)
{
    if (len < 5) return 0;

    for (size_t i = 0; i + 4 < len; i++) {
        if (data[i] != 0xE8 && data[i] != 0xE9)
            continue;

        uint32_t abs_addr = (uint32_t)data[i + 1] |
                            ((uint32_t)data[i + 2] << 8) |
                            ((uint32_t)data[i + 3] << 16) |
                            ((uint32_t)data[i + 4] << 24);
        uint32_t rel = abs_addr - (uint32_t)(i + 5);

        data[i + 1] = (uint8_t)(rel);
        data[i + 2] = (uint8_t)(rel >> 8);
        data[i + 3] = (uint8_t)(rel >> 16);
        data[i + 4] = (uint8_t)(rel >> 24);

        i += 4; /* skip the operand bytes */
    }
    return 0;
}
|
||||
|
||||
/* --- BWT (Burrows-Wheeler Transform) --- */
|
||||
|
||||
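/* Simple BWT: sorts the indices of all cyclic rotations with qsort.
 * Each comparison may scan the whole input, so the sort is
 * O(n^2 log n) in the worst case (e.g. highly repetitive data). */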
|
||||
|
||||
/* Input shared with the qsort comparator. qsort has no user-data
 * argument, so the buffer is passed through file-scope state; this
 * makes uc2_bwt_apply() non-reentrant and unsafe to call from two
 * threads at once. */
static const uint8_t *bwt_data;
static size_t bwt_len;

/* Order two rotation start indices by the cyclic rotation of bwt_data
 * they denote. Identical rotations (periodic input) are ordered by
 * index so the result does not depend on how the platform's qsort
 * treats equal elements. */
static int bwt_cmp(const void *a, const void *b)
{
    uint32_t ia = *(const uint32_t *)a;
    uint32_t ib = *(const uint32_t *)b;
    for (size_t k = 0; k < bwt_len; k++) {
        uint8_t ca = bwt_data[(ia + k) % bwt_len];
        uint8_t cb = bwt_data[(ib + k) % bwt_len];
        if (ca != cb) return (int)ca - (int)cb;
    }
    return (ia > ib) - (ia < ib);
}

/* Forward Burrows-Wheeler transform.
 *
 * data, len       input bytes
 * out             receives a malloc()ed buffer of `len` bytes holding
 *                 the last column of the sorted rotation matrix; the
 *                 function never frees it, so the caller owns it.
 *                 Set to NULL for empty input.
 * primary_index   receives the row of the sorted matrix that holds the
 *                 unrotated input
 *
 * Returns 0 on success and -1 when allocation fails or `len` cannot be
 * represented in the 32-bit rotation indices; on -1 the outputs are
 * not written. */
int uc2_bwt_apply(const uint8_t *data, size_t len,
                  uint8_t **out, uint32_t *primary_index)
{
    if (len == 0) { *out = NULL; *primary_index = 0; return 0; }

    /* Rotation indices are uint32_t, and the index array size must not
       wrap size_t (matters on 32-bit hosts). */
    if (len > UINT32_MAX || len > ((size_t)-1) / sizeof(uint32_t))
        return -1;

    uint32_t *sa = malloc(len * sizeof *sa);
    uint8_t *result = malloc(len);
    if (!sa || !result) { free(sa); free(result); return -1; }

    for (size_t i = 0; i < len; i++) sa[i] = (uint32_t)i;
    bwt_data = data;
    bwt_len = len;
    qsort(sa, len, sizeof *sa, bwt_cmp);

    *primary_index = 0;
    for (size_t i = 0; i < len; i++) {
        if (sa[i] == 0) *primary_index = (uint32_t)i;
        /* Last column: the byte cyclically preceding the rotation start. */
        result[i] = data[(sa[i] + len - 1) % len];
    }

    free(sa);
    *out = result;
    return 0;
}
|
||||
|
||||
/* Inverse Burrows-Wheeler transform.
 *
 * data, len       last column produced by the forward transform
 * primary_index   row of the unrotated input; must be < len
 * out             receives a malloc()ed buffer of `len` bytes with the
 *                 reconstructed input (NULL for empty input); the
 *                 function never frees it, so the caller owns it
 *
 * Returns 0 on success, -1 on an out-of-range primary_index, a length
 * the 32-bit mapping cannot represent, or allocation failure; on -1
 * *out is not written. */
int uc2_bwt_revert(const uint8_t *data, size_t len,
                   uint32_t primary_index, uint8_t **out)
{
    if (len == 0) { *out = NULL; return 0; }

    /* primary_index indexes data[]/T[] and can come from an untrusted
       stream, so reject out-of-range values. The counters and T[] are
       32-bit, so longer inputs would wrap silently; the T[] allocation
       multiply must also not wrap size_t on 32-bit hosts. */
    if (primary_index >= len ||
        len > UINT32_MAX ||
        len > ((size_t)-1) / sizeof(uint32_t))
        return -1;

    uint8_t *result = malloc(len);
    uint32_t *T = malloc(len * sizeof *T);
    if (!result || !T) { free(result); free(T); return -1; }

    /* Histogram of byte values in the last column. */
    uint32_t count[256];
    memset(count, 0, sizeof count);
    for (size_t i = 0; i < len; i++) count[data[i]]++;

    /* start[c] = first row of the sorted first column that holds c. */
    uint32_t sum = 0;
    uint32_t start[256];
    for (int c = 0; c < 256; c++) {
        start[c] = sum;
        sum += count[c];
    }

    /* LF-mapping: T[i] is the first-column row matching last-column
       row i (the k-th occurrence of a byte maps to the k-th row of
       that byte's range). count[] is reused as the running rank. */
    memset(count, 0, sizeof count);
    for (size_t i = 0; i < len; i++) {
        T[i] = start[data[i]] + count[data[i]];
        count[data[i]]++;
    }

    /* Follow T from primary_index, filling the output back to front. */
    uint32_t idx = primary_index;
    for (size_t i = len; i > 0; i--) {
        result[i - 1] = data[idx];
        idx = T[idx];
    }

    free(T);
    *out = result;
    return 0;
}
|
||||
|
||||
/* --- Delta filter --- */
|
||||
|
||||
/* In-place delta encoding: every byte at index >= stride is replaced by
 * its difference (mod 256) from the byte `stride` positions earlier.
 * The first `stride` bytes are left as they are. A stride below 1 is
 * treated as 1; a buffer no longer than the stride is unchanged. */
void uc2_delta_filter_apply(uint8_t *data, size_t len, int stride)
{
    const size_t step = (stride < 1) ? 1u : (size_t)stride;

    if (len <= step) {
        return;
    }

    /* Walk backwards so each predecessor is still the original value
       when it is read. */
    for (size_t pos = len - 1; pos >= step; pos--) {
        const uint8_t prev = data[pos - step];
        data[pos] = (uint8_t)(data[pos] - prev);
    }
}
|
||||
|
||||
/* In-place delta decoding: every byte at index >= stride has the
 * (already restored) byte `stride` positions earlier added to it,
 * mod 256. A stride below 1 is treated as 1. */
void uc2_delta_filter_revert(uint8_t *data, size_t len, int stride)
{
    const size_t step = (stride < 1) ? 1u : (size_t)stride;
    size_t pos = step;

    /* Forward order: each predecessor must be decoded before use. */
    while (pos < len) {
        const uint8_t prev = data[pos - step];
        data[pos] = (uint8_t)(data[pos] + prev);
        pos++;
    }
}
|
||||
|
||||
/* --- Content detection --- */
|
||||
|
||||
int uc2_detect_content(const uint8_t *data, size_t len)
|
||||
{
|
||||
if (len < 4) return UC2_CONTENT_BINARY;
|
||||
|
||||
/* Check for x86 executable signatures */
|
||||
if (data[0] == 'M' && data[1] == 'Z')
|
||||
return UC2_CONTENT_X86; /* DOS/PE executable */
|
||||
if (data[0] == 0x7F && data[1] == 'E' && data[2] == 'L' && data[3] == 'F')
|
||||
return UC2_CONTENT_X86; /* ELF executable */
|
||||
|
||||
/* Count printable ASCII characters */
|
||||
size_t check = len > 4096 ? 4096 : len;
|
||||
size_t printable = 0;
|
||||
for (size_t i = 0; i < check; i++)
|
||||
if ((data[i] >= 32 && data[i] <= 126) ||
|
||||
data[i] == '\n' || data[i] == '\r' || data[i] == '\t')
|
||||
printable++;
|
||||
|
||||
if (printable * 100 / check > 85)
|
||||
return UC2_CONTENT_TEXT;
|
||||
|
||||
/* Check for structured data: regular byte-value patterns */
|
||||
if (len >= 64) {
|
||||
size_t zeros = 0;
|
||||
for (size_t i = 0; i < check; i++)
|
||||
if (data[i] == 0) zeros++;
|
||||
if (zeros * 100 / check > 20)
|
||||
return UC2_CONTENT_STRUCT;
|
||||
}
|
||||
|
||||
return UC2_CONTENT_BINARY;
|
||||
}
|
||||
185
lib/src/uc2_rans.c
Normal file
185
lib/src/uc2_rans.c
Normal file
@@ -0,0 +1,185 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* rANS (range Asymmetric Numeral Systems) entropy coder.
|
||||
*
|
||||
* Table-based rANS with 32-bit state. The state represents a position
|
||||
* in a virtual number line partitioned proportionally to symbol
|
||||
* frequencies. Encoding maps the state forward (growing), decoding
|
||||
* maps it backward (shrinking).
|
||||
*
|
||||
* Key properties vs Huffman:
|
||||
* - Fractional bit costs: symbols can use e.g. 2.3 bits (not rounded to 3)
|
||||
* - 5-15% better on skewed distributions (many symbols with freq < 2^-N)
|
||||
* - Same O(1) encode/decode per symbol with lookup tables
|
||||
*
|
||||
* Reference: Duda, "Asymmetric Numeral Systems" (2009). */
|
||||
|
||||
#include "uc2/uc2_rans.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define RANS_L (1u << 23) /* lower bound of state range */
|
||||
|
||||
void uc2_rans_build_table(struct uc2_rans_table *tab,
|
||||
const uint32_t *raw_freq, int nsym)
|
||||
{
|
||||
if (nsym > UC2_RANS_MAX_SYMS)
|
||||
nsym = UC2_RANS_MAX_SYMS;
|
||||
tab->nsym = nsym;
|
||||
|
||||
/* Sum raw frequencies */
|
||||
uint64_t total = 0;
|
||||
for (int i = 0; i < nsym; i++)
|
||||
total += raw_freq[i];
|
||||
|
||||
if (total == 0) {
|
||||
memset(tab->freq, 0, sizeof tab->freq);
|
||||
memset(tab->cumfreq, 0, sizeof tab->cumfreq);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Scale to PROB_SCALE, ensuring every non-zero symbol gets freq >= 1 */
|
||||
uint32_t assigned = 0;
|
||||
for (int i = 0; i < nsym; i++) {
|
||||
if (raw_freq[i] == 0) {
|
||||
tab->freq[i] = 0;
|
||||
} else {
|
||||
uint32_t f = (uint32_t)((uint64_t)raw_freq[i] * UC2_RANS_PROB_SCALE / total);
|
||||
if (f == 0) f = 1;
|
||||
tab->freq[i] = (uint16_t)f;
|
||||
assigned += f;
|
||||
}
|
||||
}
|
||||
|
||||
/* Adjust to hit exactly PROB_SCALE: add/remove from largest symbol */
|
||||
if (assigned != UC2_RANS_PROB_SCALE) {
|
||||
int largest = 0;
|
||||
for (int i = 1; i < nsym; i++)
|
||||
if (tab->freq[i] > tab->freq[largest])
|
||||
largest = i;
|
||||
int32_t diff = (int32_t)UC2_RANS_PROB_SCALE - (int32_t)assigned;
|
||||
tab->freq[largest] = (uint16_t)((int32_t)tab->freq[largest] + diff);
|
||||
}
|
||||
|
||||
/* Build cumulative frequencies */
|
||||
tab->cumfreq[0] = 0;
|
||||
for (int i = 1; i < nsym; i++)
|
||||
tab->cumfreq[i] = tab->cumfreq[i - 1] + tab->freq[i - 1];
|
||||
}
|
||||
|
||||
/* --- Encoder --- */
|
||||
|
||||
static void enc_grow(struct uc2_rans_enc *enc)
|
||||
{
|
||||
size_t newcap = enc->rev_cap ? enc->rev_cap * 2 : 4096;
|
||||
enc->rev_buf = realloc(enc->rev_buf, newcap);
|
||||
enc->rev_cap = newcap;
|
||||
}
|
||||
|
||||
static void enc_put_byte(struct uc2_rans_enc *enc, uint8_t b)
|
||||
{
|
||||
if (enc->rev_pos >= enc->rev_cap)
|
||||
enc_grow(enc);
|
||||
enc->rev_buf[enc->rev_pos++] = b;
|
||||
}
|
||||
|
||||
void uc2_rans_enc_init(struct uc2_rans_enc *enc,
|
||||
const struct uc2_rans_table *tab)
|
||||
{
|
||||
enc->state = RANS_L;
|
||||
enc->tab = tab;
|
||||
enc->rev_buf = NULL;
|
||||
enc->rev_pos = 0;
|
||||
enc->rev_cap = 0;
|
||||
}
|
||||
|
||||
void uc2_rans_encode(struct uc2_rans_enc *enc, int sym)
|
||||
{
|
||||
uint32_t freq = enc->tab->freq[sym];
|
||||
if (freq == 0) return; /* skip zero-freq symbols */
|
||||
|
||||
/* Renormalize: output bytes until state is in range */
|
||||
uint32_t upper = ((RANS_L >> UC2_RANS_PROB_BITS) << 8) * freq;
|
||||
while (enc->state >= upper) {
|
||||
enc_put_byte(enc, (uint8_t)(enc->state & 0xFF));
|
||||
enc->state >>= 8;
|
||||
}
|
||||
|
||||
/* Encode: state = (state / freq) * PROB_SCALE + cumfreq + (state % freq) */
|
||||
uint32_t cumfreq = enc->tab->cumfreq[sym];
|
||||
enc->state = ((enc->state / freq) << UC2_RANS_PROB_BITS) +
|
||||
cumfreq + (enc->state % freq);
|
||||
}
|
||||
|
||||
size_t uc2_rans_enc_finish(struct uc2_rans_enc *enc, uint8_t **out_data)
|
||||
{
|
||||
/* Write final state (4 bytes, little-endian) */
|
||||
for (int i = 0; i < 4; i++) {
|
||||
enc_put_byte(enc, (uint8_t)(enc->state & 0xFF));
|
||||
enc->state >>= 8;
|
||||
}
|
||||
|
||||
/* Reverse the buffer (rANS produces output in reverse) */
|
||||
size_t len = enc->rev_pos;
|
||||
uint8_t *out = malloc(len);
|
||||
if (out) {
|
||||
for (size_t i = 0; i < len; i++)
|
||||
out[i] = enc->rev_buf[len - 1 - i];
|
||||
}
|
||||
|
||||
*out_data = out;
|
||||
return len;
|
||||
}
|
||||
|
||||
void uc2_rans_enc_free(struct uc2_rans_enc *enc)
|
||||
{
|
||||
free(enc->rev_buf);
|
||||
enc->rev_buf = NULL;
|
||||
enc->rev_pos = 0;
|
||||
}
|
||||
|
||||
/* --- Decoder --- */
|
||||
|
||||
void uc2_rans_dec_init(struct uc2_rans_dec *dec,
|
||||
const struct uc2_rans_table *tab,
|
||||
const uint8_t *data, size_t len)
|
||||
{
|
||||
dec->tab = tab;
|
||||
dec->data = data;
|
||||
dec->len = len;
|
||||
dec->pos = 0;
|
||||
|
||||
/* Build reverse lookup table: cumfreq → symbol */
|
||||
memset(dec->lookup, 0, sizeof dec->lookup);
|
||||
for (int s = 0; s < tab->nsym; s++)
|
||||
for (uint32_t i = tab->cumfreq[s];
|
||||
i < tab->cumfreq[s] + tab->freq[s] && i < UC2_RANS_PROB_SCALE; i++)
|
||||
dec->lookup[i] = (uint16_t)s;
|
||||
|
||||
/* Read initial state (4 bytes, little-endian) */
|
||||
dec->state = 0;
|
||||
for (int i = 3; i >= 0; i--) {
|
||||
dec->state <<= 8;
|
||||
if (dec->pos < len)
|
||||
dec->state |= data[dec->pos++];
|
||||
}
|
||||
}
|
||||
|
||||
int uc2_rans_decode(struct uc2_rans_dec *dec)
|
||||
{
|
||||
/* Find symbol from state */
|
||||
uint32_t slot = dec->state & (UC2_RANS_PROB_SCALE - 1);
|
||||
int sym = dec->lookup[slot];
|
||||
uint32_t freq = dec->tab->freq[sym];
|
||||
uint32_t cumfreq = dec->tab->cumfreq[sym];
|
||||
|
||||
/* Update state: state = freq * (state >> PROB_BITS) + slot - cumfreq */
|
||||
dec->state = freq * (dec->state >> UC2_RANS_PROB_BITS) + slot - cumfreq;
|
||||
|
||||
/* Renormalize: read bytes to keep state in range */
|
||||
while (dec->state < RANS_L && dec->pos < dec->len) {
|
||||
dec->state = (dec->state << 8) | dec->data[dec->pos++];
|
||||
}
|
||||
|
||||
return sym;
|
||||
}
|
||||
133
lib/src/uc2_sha256.c
Normal file
133
lib/src/uc2_sha256.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* SHA-256 (FIPS 180-4). Reference textbook implementation. */
|
||||
|
||||
#include "uc2/uc2_sha256.h"
|
||||
#include <string.h>
|
||||
|
||||
/* SHA-256 round constants (FIPS 180-4, section 4.2.2). */
static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

/* Rotate right by n bits; n must be in 1..31 (every caller passes a
   constant in that range). */
static uint32_t rotr32(uint32_t x, int n)
{
    return (x >> n) | (x << (32 - n));
}

/* Read a big-endian 32-bit word from p[0..3]. */
static uint32_t r32be(const uint8_t *p)
{
    uint32_t word = 0;
    for (int k = 0; k < 4; k++)
        word = (word << 8) | (uint32_t)p[k];
    return word;
}

/* Store v as a big-endian 32-bit word at p[0..3]. */
static void w32be(uint8_t *p, uint32_t v)
{
    for (int k = 3; k >= 0; k--) {
        p[k] = (uint8_t)v;
        v >>= 8;
    }
}

/* Fold one 64-byte block into the eight-word chaining state. */
static void compress(uint32_t state[8], const uint8_t block[64])
{
    uint32_t sched[64];
    uint32_t v[8]; /* working variables a..h as v[0]..v[7] */
    int t;

    /* Message schedule: 16 words from the block, 48 derived. */
    for (t = 0; t < 16; t++)
        sched[t] = r32be(&block[4 * t]);
    for (; t < 64; t++) {
        const uint32_t x = sched[t - 15];
        const uint32_t y = sched[t - 2];
        const uint32_t sig0 = rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
        const uint32_t sig1 = rotr32(y, 17) ^ rotr32(y, 19) ^ (y >> 10);
        sched[t] = sched[t - 16] + sig0 + sched[t - 7] + sig1;
    }

    for (t = 0; t < 8; t++)
        v[t] = state[t];

    for (t = 0; t < 64; t++) {
        const uint32_t sum1   = rotr32(v[4], 6) ^ rotr32(v[4], 11) ^ rotr32(v[4], 25);
        const uint32_t choose = (v[4] & v[5]) ^ (~v[4] & v[6]);
        const uint32_t tmp1   = v[7] + sum1 + choose + K[t] + sched[t];
        const uint32_t sum0   = rotr32(v[0], 2) ^ rotr32(v[0], 13) ^ rotr32(v[0], 22);
        const uint32_t major  = (v[0] & v[1]) ^ (v[0] & v[2]) ^ (v[1] & v[2]);
        const uint32_t tmp2   = sum0 + major;

        /* Shift a..g down one slot (h is dropped), then patch in the
           new e (= old d + tmp1) and the new a. */
        memmove(&v[1], &v[0], 7 * sizeof v[0]);
        v[4] += tmp1;
        v[0] = tmp1 + tmp2;
    }

    for (t = 0; t < 8; t++)
        state[t] += v[t];
}
|
||||
|
||||
void uc2_sha256_init(struct uc2_sha256 *ctx)
|
||||
{
|
||||
ctx->state[0] = 0x6a09e667; ctx->state[1] = 0xbb67ae85;
|
||||
ctx->state[2] = 0x3c6ef372; ctx->state[3] = 0xa54ff53a;
|
||||
ctx->state[4] = 0x510e527f; ctx->state[5] = 0x9b05688c;
|
||||
ctx->state[6] = 0x1f83d9ab; ctx->state[7] = 0x5be0cd19;
|
||||
ctx->bitcount = 0;
|
||||
ctx->buf_len = 0;
|
||||
}
|
||||
|
||||
void uc2_sha256_update(struct uc2_sha256 *ctx, const void *data, size_t len)
|
||||
{
|
||||
const uint8_t *p = data;
|
||||
ctx->bitcount += (uint64_t)len * 8;
|
||||
if (ctx->buf_len) {
|
||||
size_t take = UC2_SHA256_BLOCK_LEN - ctx->buf_len;
|
||||
if (take > len) take = len;
|
||||
memcpy(ctx->buf + ctx->buf_len, p, take);
|
||||
ctx->buf_len += take;
|
||||
p += take;
|
||||
len -= take;
|
||||
if (ctx->buf_len == UC2_SHA256_BLOCK_LEN) {
|
||||
compress(ctx->state, ctx->buf);
|
||||
ctx->buf_len = 0;
|
||||
}
|
||||
}
|
||||
while (len >= UC2_SHA256_BLOCK_LEN) {
|
||||
compress(ctx->state, p);
|
||||
p += UC2_SHA256_BLOCK_LEN;
|
||||
len -= UC2_SHA256_BLOCK_LEN;
|
||||
}
|
||||
if (len) {
|
||||
memcpy(ctx->buf, p, len);
|
||||
ctx->buf_len = len;
|
||||
}
|
||||
}
|
||||
|
||||
void uc2_sha256_final(struct uc2_sha256 *ctx, uint8_t out[UC2_SHA256_OUT_LEN])
|
||||
{
|
||||
uint64_t bits = ctx->bitcount;
|
||||
ctx->buf[ctx->buf_len++] = 0x80;
|
||||
if (ctx->buf_len > 56) {
|
||||
memset(ctx->buf + ctx->buf_len, 0, UC2_SHA256_BLOCK_LEN - ctx->buf_len);
|
||||
compress(ctx->state, ctx->buf);
|
||||
ctx->buf_len = 0;
|
||||
}
|
||||
memset(ctx->buf + ctx->buf_len, 0, 56 - ctx->buf_len);
|
||||
for (int i = 0; i < 8; i++)
|
||||
ctx->buf[56 + i] = (uint8_t)(bits >> (56 - 8 * i));
|
||||
compress(ctx->state, ctx->buf);
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
w32be(out + 4 * i, ctx->state[i]);
|
||||
}
|
||||
|
||||
void uc2_sha256_hash(const void *data, size_t len, uint8_t out[UC2_SHA256_OUT_LEN])
|
||||
{
|
||||
struct uc2_sha256 ctx;
|
||||
uc2_sha256_init(&ctx);
|
||||
uc2_sha256_update(&ctx, data, len);
|
||||
uc2_sha256_final(&ctx, out);
|
||||
}
|
||||
67
lib/src/uc2_simhash.c
Normal file
67
lib/src/uc2_simhash.c
Normal file
@@ -0,0 +1,67 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* Near-duplicate detection via SimHash.
|
||||
*
|
||||
* Algorithm: extract overlapping 4-byte shingles from the input,
|
||||
* hash each with FNV-1a 64-bit, then for each bit position, count
|
||||
* how many hashes have that bit set vs clear. The final SimHash
|
||||
* bit is 1 if the majority of shingle hashes had 1 in that position.
|
||||
*
|
||||
* This gives a locality-sensitive hash: similar inputs produce
|
||||
* fingerprints with small Hamming distance. */
|
||||
|
||||
#include "uc2/uc2_simhash.h"
|
||||
|
||||
/* FNV-1a, 64-bit: XOR each byte into the hash, then multiply by the
 * FNV prime. Returns the offset basis for empty input. */
static uint64_t fnv1a_64(const uint8_t *data, size_t len)
{
    uint64_t h = 14695981039346656037ULL;
    for (size_t i = 0; i < len; i++) {
        h ^= data[i];
        h *= 1099511628211ULL;
    }
    return h;
}

/* 64-bit SimHash over overlapping 4-byte shingles.
 *
 * Each of the len - 3 shingles is hashed with FNV-1a; bit b of the
 * result is set when strictly more than half of the shingle hashes
 * have bit b set. Inputs shorter than 4 bytes have no shingles and are
 * hashed directly with FNV-1a.
 *
 * Set bits are counted in size_t counters, which cannot exceed the
 * number of shingles. Signed +1/-1 tallies in int32_t would overflow
 * for inputs of 2 GiB or more. */
uint64_t uc2_simhash(const uint8_t *data, size_t len)
{
    if (len < 4) {
        /* Too short for shingles: just hash directly */
        return fnv1a_64(data, len);
    }

    size_t ones[64] = { 0 };
    const size_t nshingles = len - 3;

    /* Slide 4-byte shingles */
    for (size_t i = 0; i < nshingles; i++) {
        uint64_t h = fnv1a_64(data + i, 4);
        for (int b = 0; b < 64; b++)
            ones[b] += (size_t)((h >> b) & 1u);
    }

    /* Majority vote: more ones than zeros (a tie leaves the bit clear) */
    uint64_t result = 0;
    for (int b = 0; b < 64; b++)
        if (ones[b] > nshingles - ones[b])
            result |= (uint64_t)1 << b;

    return result;
}
|
||||
|
||||
/* Hamming distance between two 64-bit values: the number of bit
 * positions in which they differ (0..64). */
int uc2_hamming(uint64_t a, uint64_t b)
{
    int differing = 0;

    /* Examine the XOR one bit at a time from the low end. */
    for (uint64_t diff = a ^ b; diff != 0; diff >>= 1) {
        differing += (int)(diff & 1u);
    }
    return differing;
}
|
||||
60
lib/src/uc2_tables.c
Normal file
60
lib/src/uc2_tables.c
Normal file
@@ -0,0 +1,60 @@
|
||||
/* SPDX-License-Identifier: GPL-3.0-or-later */
|
||||
|
||||
/* UC2 shared tables: Huffman delta coding and default tree lengths. */
|
||||
|
||||
#include "uc2_internal.h"
|
||||
|
||||
/* Delta-to-absolute lookup: vval[prev][delta_code] = absolute_length.
|
||||
Used by the decompressor's ht_dec(). */
|
||||
const u8 vval[NumDeltaCodes][NumDeltaCodes] = {
|
||||
{ 0,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1},
|
||||
{ 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0},
|
||||
{ 2, 1, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0},
|
||||
{ 3, 2, 4, 1, 5, 6, 7, 8, 9,10,11,12,13, 0},
|
||||
{ 4, 3, 5, 2, 6, 1, 7, 8, 9,10,11,12,13, 0},
|
||||
{ 5, 4, 6, 3, 7, 2, 8, 1, 9,10,11,12,13, 0},
|
||||
{ 6, 5, 7, 4, 8, 3, 9, 2,10, 1,11,12,13, 0},
|
||||
{ 7, 6, 8, 5, 9, 4,10, 3,11, 2,12, 1,13, 0},
|
||||
{ 8, 7, 9, 6,10, 5,11, 4,12, 3,13, 2, 0, 1},
|
||||
{ 9, 8,10, 7,11, 6,12, 5,13, 4, 0, 3, 2, 1},
|
||||
{10, 9,11, 8,12, 7,13, 6, 0, 5, 4, 3, 2, 1},
|
||||
{11,10,12, 9,13, 8, 0, 7, 6, 5, 4, 3, 2, 1},
|
||||
{12,11,13,10, 0, 9, 8, 7, 6, 5, 4, 3, 2, 1},
|
||||
{13,12, 0,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1}
|
||||
};
|
||||
|
||||
/* Inverse delta table: ivval[prev][absolute] = delta_code.
|
||||
Used by the compressor's tree encoder. */
|
||||
const u8 ivval[NumDeltaCodes][NumDeltaCodes] = {
|
||||
{ 0,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1},
|
||||
{13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12},
|
||||
{13, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12},
|
||||
{13, 3, 1, 0, 2, 4, 5, 6, 7, 8, 9,10,11,12},
|
||||
{13, 5, 3, 1, 0, 2, 4, 6, 7, 8, 9,10,11,12},
|
||||
{13, 7, 5, 3, 1, 0, 2, 4, 6, 8, 9,10,11,12},
|
||||
{13, 9, 7, 5, 3, 1, 0, 2, 4, 6, 8,10,11,12},
|
||||
{13,11, 9, 7, 5, 3, 1, 0, 2, 4, 6, 8,10,12},
|
||||
{12,13,11, 9, 7, 5, 3, 1, 0, 2, 4, 6, 8,10},
|
||||
{10,13,12,11, 9, 7, 5, 3, 1, 0, 2, 4, 6, 8},
|
||||
{ 8,13,12,11,10, 9, 7, 5, 3, 1, 0, 2, 4, 6},
|
||||
{ 6,13,12,11,10, 9, 8, 7, 5, 3, 1, 0, 2, 4},
|
||||
{ 4,13,12,11,10, 9, 8, 7, 6, 5, 3, 1, 0, 2},
|
||||
{ 2,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 1, 0}
|
||||
};
|
||||
|
||||
/* Fill d[] with the default code lengths.
 *
 * The lengths are stored as (run length, value) byte pairs terminated
 * by a run length of 0; the runs add up to 344 entries, written to d[]
 * consecutively from d[0]. */
void uc2_default_lengths(u8 d[NumSymbols])
{
    static const u8 runs[] = {
        10,9, 1,7, 1,9, 1,7, 19,9, 1,7, 13,8, 1,7, 11,8, 1,7,
        33,8, 1,7, 35,8, 128,10, 16,6, 12,7, 6,8, 10,9, 16,10,
        9,4, 9,5, 10,6, 0
    };

    u8 *dst = d;
    for (const u8 *pair = runs; pair[0] != 0; pair += 2) {
        const u8 value = pair[1];
        for (u8 left = pair[0]; left != 0; left--) {
            *dst++ = value;
        }
    }
}
|
||||
188
tests/CMakeLists.txt
Normal file
188
tests/CMakeLists.txt
Normal file
@@ -0,0 +1,188 @@
|
||||
# UC2 tests

# uc2_add_unit_test(<name> [NEEDS_LIBM] [ARGS <arg>...])
#
# Builds src/test_<name>.c into the executable test_<name>, links it
# against uc2 (plus libm on non-Windows hosts when NEEDS_LIBM is given),
# adds the generated-header directory to its include path, requires C99,
# and registers it with CTest under <name>. ARGS are passed to the test
# executable on its command line.
function(uc2_add_unit_test name)
    cmake_parse_arguments(UT "NEEDS_LIBM" "" "ARGS" ${ARGN})
    set(exe "test_${name}")
    add_executable(${exe} "src/${exe}.c")
    target_link_libraries(${exe} PRIVATE uc2)
    if(UT_NEEDS_LIBM AND NOT WIN32)
        target_link_libraries(${exe} PRIVATE m)
    endif()
    target_include_directories(${exe} PRIVATE "${PROJECT_BINARY_DIR}/lib")
    target_compile_features(${exe} PRIVATE c_std_99)
    add_test(NAME ${name} COMMAND ${exe} ${UT_ARGS})
endfunction()

# uc2_add_cli_script_test(<name> <workdir> <script>)
#
# Registers a CTest test that runs <script> (relative to this directory)
# in CMake script mode with UC2_CLI set to the built uc2-cli binary and
# TEST_DIR set to <workdir> under the current binary directory.
function(uc2_add_cli_script_test name workdir script)
    add_test(NAME ${name}
        COMMAND ${CMAKE_COMMAND}
            -DUC2_CLI=$<TARGET_FILE:uc2-cli>
            -DTEST_DIR=${CMAKE_CURRENT_BINARY_DIR}/${workdir}
            -P ${CMAKE_CURRENT_SOURCE_DIR}/${script}
    )
endfunction()

uc2_add_unit_test(identify)
uc2_add_unit_test(extract ARGS
    "${CMAKE_CURRENT_SOURCE_DIR}/archives"
    "${CMAKE_CURRENT_SOURCE_DIR}/corpus"
)
uc2_add_unit_test(roundtrip)

# CLI create/extract round-trip test
uc2_add_cli_script_test(cli_create cli_test test_cli_create.cmake)

# CLI master-block deduplication round-trip test
uc2_add_cli_script_test(cli_master cli_master_test test_cli_master.cmake)

# CLI >64KB round-trip test (sliding-window edge regression, git-bug d747658)
uc2_add_cli_script_test(cli_bigfile cli_bigfile_test test_cli_bigfile.cmake)

# CLI directory archival round-trip test
uc2_add_cli_script_test(cli_dirs cli_dirs_test test_cli_dirs.cmake)

uc2_add_unit_test(cdc)
uc2_add_unit_test(merkle NEEDS_LIBM)
uc2_add_unit_test(blockstore)
uc2_add_unit_test(simhash)
uc2_add_unit_test(delta)
uc2_add_unit_test(rans NEEDS_LIBM)
uc2_add_unit_test(dict)
uc2_add_unit_test(preprocess)
uc2_add_unit_test(lz4)
uc2_add_unit_test(blake3)
uc2_add_unit_test(sha256)
uc2_add_unit_test(ots)
uc2_add_unit_test(ingest)

# Optional cross-check: validates uc2 .ots output against the python-opentimestamps
# reference parser. Skipped (return code 77) when opentimestamps is not installed.
find_package(Python3 COMPONENTS Interpreter)
if(Python3_Interpreter_FOUND)
    add_test(NAME ots_cross_check
        COMMAND ${Python3_EXECUTABLE}
            ${CMAKE_CURRENT_SOURCE_DIR}/scripts/cross_check_ots.py
            $<TARGET_FILE:uc2-cli>
            ${CMAKE_CURRENT_BINARY_DIR}/ots_cross_check
    )
    set_tests_properties(ots_cross_check PROPERTIES
        SKIP_RETURN_CODE 77
        LABELS "optional"
    )
endif()

# libarchive plugin round-trip. Needs -DUC2_BUILD_LIBARCHIVE_PLUGIN=ON,
# -DLIBARCHIVE_SOURCE_DIR=<source tree>, and -DLIBARCHIVE_LIBRARY=<built
# libarchive.a> (a deps-disabled static build is enough; see docs).
if(TARGET uc2_libarchive AND DEFINED LIBARCHIVE_LIBRARY
   AND DEFINED LIBARCHIVE_SOURCE_DIR)
    add_executable(test_libarchive_uc2 src/test_libarchive_uc2.c)
    target_include_directories(test_libarchive_uc2 PRIVATE
        "${LIBARCHIVE_SOURCE_DIR}/libarchive")
    target_link_libraries(test_libarchive_uc2 PRIVATE
        uc2_libarchive "${LIBARCHIVE_LIBRARY}" uc2)
    target_compile_features(test_libarchive_uc2 PRIVATE c_std_99)
    add_test(NAME libarchive_roundtrip
        COMMAND ${CMAKE_COMMAND}
            -DUC2_CLI=$<TARGET_FILE:uc2-cli>
            -DLA_TEST=$<TARGET_FILE:test_libarchive_uc2>
            -DTEST_DIR=${CMAKE_CURRENT_BINARY_DIR}/libarchive_test
            -P ${CMAKE_CURRENT_SOURCE_DIR}/test_cli_libarchive.cmake
    )
endif()

# Cross-tool round-trip: UC2 v3 <-> original uc2pro.exe via DOSBox-X
add_test(NAME roundtrip_dosbox
    COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/scripts/roundtrip_dosbox.sh
        $<TARGET_FILE:uc2-cli>
        ${CMAKE_CURRENT_SOURCE_DIR}/../original/UC2_source/uc2pro.exe
        ${CMAKE_CURRENT_SOURCE_DIR}/corpus
)
set_tests_properties(roundtrip_dosbox PROPERTIES
    LABELS "dosbox"
    TIMEOUT 1200
)
|
||||
BIN
tests/archives/basic.uc2
Normal file
BIN
tests/archives/basic.uc2
Normal file
Binary file not shown.
BIN
tests/archives/empty.uc2
Normal file
BIN
tests/archives/empty.uc2
Normal file
Binary file not shown.
BIN
tests/archives/random.uc2
Normal file
BIN
tests/archives/random.uc2
Normal file
Binary file not shown.
BIN
tests/archives/single.uc2
Normal file
BIN
tests/archives/single.uc2
Normal file
Binary file not shown.
BIN
tests/archives/zeros.uc2
Normal file
BIN
tests/archives/zeros.uc2
Normal file
Binary file not shown.
BIN
tests/corpus/allbytes.bin
Normal file
BIN
tests/corpus/allbytes.bin
Normal file
Binary file not shown.
0
tests/corpus/empty.dat
Normal file
0
tests/corpus/empty.dat
Normal file
1
tests/corpus/hello.txt
Normal file
1
tests/corpus/hello.txt
Normal file
@@ -0,0 +1 @@
|
||||
Hello, World!
|
||||
BIN
tests/corpus/random.bin
Normal file
BIN
tests/corpus/random.bin
Normal file
Binary file not shown.
30
tests/corpus/textfile.txt
Normal file
30
tests/corpus/textfile.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
UltraCompressor II was a DOS-era archiver created by Nico de Vries between
|
||||
1992 and 1996. It was notable for its advanced deduplication system called
|
||||
"master blocks", file versioning within archives, and competitive compression
|
||||
ratios on the hardware of its day.
|
||||
|
||||
The archiver used an LZ77 sliding-window compressor with Huffman entropy
|
||||
coding. The algorithm operates on a 64KB circular buffer with hash-chain
|
||||
match finding. Matches of 3 to 32760 bytes are supported, with lazy
|
||||
evaluation to find better matches at adjacent positions.
|
||||
|
||||
Huffman trees are serialized using delta coding against the previous block's
|
||||
tree, with a nested Huffman code for the delta symbols. This is remarkably
|
||||
efficient for typical data where consecutive blocks have similar symbol
|
||||
distributions.
|
||||
|
||||
The deduplication system works by identifying common data blocks across files
|
||||
and storing them only once as "master blocks". When a file's compressed data
|
||||
matches an existing master, only a reference is stored. This was ahead of its
|
||||
time -- modern tools like borg and restic use similar content-defined chunking.
|
||||
|
||||
UC2 v3.0.0 is a cross-platform revival of this archiver, built on Jan
|
||||
Bobrowski's clean-room portable decompressor (libunuc2). The project brings
|
||||
UC2 back as a modern, portable C99 tool that runs on Linux, macOS, Windows,
|
||||
and even DOS via DJGPP cross-compilation.
|
||||
|
||||
This text file serves as part of the test corpus for verifying the extraction
|
||||
pipeline. It contains enough English prose to exercise the typical symbol
|
||||
distribution paths in the decompressor, including the Huffman tree generation
|
||||
and the LZ77 back-reference matching for repeated phrases like "master blocks"
|
||||
and "compression" which appear multiple times.
|
||||
BIN
tests/corpus/zeros.bin
Normal file
BIN
tests/corpus/zeros.bin
Normal file
Binary file not shown.
42
tests/fuzz/README.md
Normal file
42
tests/fuzz/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Fuzzing the UC2 reader
|
||||
|
||||
`fuzz_extract.c` is a libFuzzer harness that drives the full read path
|
||||
(`uc2_open` -> `uc2_read_cdir` -> `uc2_finish_cdir` -> `uc2_extract`)
|
||||
over arbitrary bytes with an in-memory reader and a discard writer. It
|
||||
targets the code that parses **untrusted** `.uc2` archives.
|
||||
|
||||
It is intentionally **not** part of the CMake build or CI: libFuzzer
|
||||
needs a Clang toolchain, and a fuzz run is open-ended rather than
|
||||
pass/fail. Build and run it by hand.
|
||||
|
||||
## Build
|
||||
|
||||
Compile the harness together with the library sources and the embedded
|
||||
super-master, against a configured build tree (for `uc2_version.h` and
|
||||
`super_data.S`):
|
||||
|
||||
```sh
|
||||
cmake -B build-asan -DCMAKE_BUILD_TYPE=Debug # any tree works; provides the generated files
|
||||
clang -fsanitize=fuzzer,address -O1 -g \
|
||||
-Ilib/include -Ilib/src -Ibuild-asan/lib \
|
||||
tests/fuzz/fuzz_extract.c $(ls lib/src/*.c) build-asan/lib/super_data.S \
|
||||
-lm -o fuzz_extract
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```sh
|
||||
mkdir -p corpus && cp tests/archives/*.uc2 corpus/
|
||||
./fuzz_extract -max_len=65536 -timeout=25 corpus/
|
||||
```
|
||||
|
||||
ASan flags any out-of-bounds access; libFuzzer writes a `crash-*` (or
|
||||
`timeout-*`) artifact for each finding. Re-run a single artifact with
|
||||
`./fuzz_extract <artifact>`.
|
||||
|
||||
## Status
|
||||
|
||||
Memory-safety: clean over sustained runs after the 2026-06-13 cdir
|
||||
hardening (git-bug 69e8e52). A residual slow-input (decompression-bomb)
|
||||
timeout is tracked separately; it is a bounded-CPU issue, not a
|
||||
memory-safety one.
|
||||
78
tests/fuzz/fuzz_extract.c
Normal file
78
tests/fuzz/fuzz_extract.c
Normal file
@@ -0,0 +1,78 @@
|
||||
/* libFuzzer harness for the UC2 read path.
|
||||
*
|
||||
* Feeds the fuzzer-provided bytes as a .uc2 archive through the full
|
||||
* open -> read_cdir -> finish_cdir -> extract flow with an in-memory
|
||||
* reader and a discard writer. The decoder must never read or write
|
||||
* out of bounds on any input.
|
||||
*
|
||||
* Build (clang):
|
||||
* clang -fsanitize=fuzzer,address -O1 -g -Ilib/include -Ilib/src \
|
||||
* -I<builddir>/lib tests/fuzz/fuzz_extract.c lib/src/*.c \
|
||||
* <builddir>/lib/super_data.S -o fuzz_extract
|
||||
* Run: ./fuzz_extract -max_len=65536 corpus/
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <uc2/libuc2.h>
|
||||
|
||||
/* In-memory archive image handed to the reader callback. */
struct mem {
	const uint8_t *data;	/* first byte of the image */
	unsigned avail;		/* number of valid bytes at data */
};

/*
 * Reader callback: copy up to len bytes starting at offset pos of the
 * image into buf.  Returns the number of bytes copied, which is 0 when
 * pos lies at or beyond the end of the image.
 */
static int mem_read(void *ctx, unsigned pos, void *buf, unsigned len)
{
	const struct mem *image = ctx;
	unsigned remaining;
	unsigned count;

	if (image->avail <= pos)
		return 0;

	/* Clamp the request to what is left after pos. */
	remaining = image->avail - pos;
	count = remaining < len ? remaining : len;

	memcpy(buf, image->data + pos, count);
	return (int)count;
}
|
||||
|
||||
/* Allocator callback: plain malloc; the context pointer is unused. */
static void *mem_alloc(void *ctx, unsigned size)
{
	(void)ctx;
	return malloc(size);
}
|
||||
/* Deallocator callback: plain free; the context pointer is unused. */
static void mem_free(void *ctx, void *ptr)
{
	(void)ctx;
	free(ptr);
}
|
||||
/* Writer callback that throws the extracted bytes away and returns 0. */
static int discard(void *ctx, const void *p, unsigned len)
{
	(void)ctx;
	(void)p;
	(void)len;
	return 0;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
||||
{
|
||||
if (size > (1u << 20)) /* bound work; the format is small anyway */
|
||||
return 0;
|
||||
|
||||
struct uc2_io io = { .read = mem_read, .alloc = mem_alloc, .free = mem_free };
|
||||
struct mem m = { .data = data, .avail = (unsigned)size };
|
||||
|
||||
uc2_handle h = uc2_open(&io, &m);
|
||||
if (!h)
|
||||
return 0;
|
||||
|
||||
struct uc2_entry entries[64];
|
||||
int n = 0;
|
||||
for (int guard = 0; guard < 100000; guard++) {
|
||||
struct uc2_entry e;
|
||||
int ret = uc2_read_cdir(h, &e);
|
||||
if (ret == UC2_End || ret < 0)
|
||||
break;
|
||||
while (ret == UC2_TaggedEntry) {
|
||||
char *tag; void *d; unsigned sz;
|
||||
ret = uc2_get_tag(h, &e, &tag, &d, &sz);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
if (ret < 0)
|
||||
break;
|
||||
if (!e.is_dir && n < (int)(sizeof entries / sizeof *entries))
|
||||
entries[n++] = e;
|
||||
}
|
||||
|
||||
char label[12];
|
||||
uc2_finish_cdir(h, label);
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
uc2_extract(h, &entries[i].xi, entries[i].size, discard, 0);
|
||||
|
||||
uc2_close(h);
|
||||
return 0;
|
||||
}
|
||||
392
tests/scripts/bitdump.py
Normal file
392
tests/scripts/bitdump.py
Normal file
@@ -0,0 +1,392 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Decode and annotate UC2 compressed bitstreams for comparison.
|
||||
|
||||
Reads a UC2 archive, locates either the cdir or a file data section,
|
||||
and decodes the LZ77+Huffman bitstream symbol by symbol using the
|
||||
exact same algorithm as the Bobrowski decompressor.
|
||||
|
||||
Usage:
|
||||
python3 bitdump.py archive.uc2 [--cdir | --file] [--max-symbols 100]
|
||||
"""
|
||||
|
||||
import struct
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
# UC2 constants
MaxCodeBits = 13
NumByteSym = 256
NumDistSym = 60
NumLenSym = 28
NumSymbols = NumByteSym + NumDistSym + NumLenSym  # 344
NumDeltaCodes = 14
RepeatCode = 14
MinRepeat = 6
EOB_MARK = 64001

# Default tree lengths (BasePrev from TREEENC.CPP), written as inclusive
# (first, last, code length) runs followed by the individual exceptions.
DEFAULT_LENGTHS = [0] * NumSymbols
for first, last, bits in (
    (0, 31, 9),
    (32, 32, 7),
    (33, 127, 8),
    (128, 255, 10),
    (256, 271, 6),
    (272, 283, 7),
    (284, 289, 8),
    (290, 299, 9),
    (300, 315, 10),
    (316, 324, 4),
    (325, 333, 5),
    (334, 343, 6),
):
    DEFAULT_LENGTHS[first:last + 1] = [bits] * (last - first + 1)
# Symbols inside the runs above that use a 7-bit code instead.
for sym in (10, 12, 46, 58, 92):
    DEFAULT_LENGTHS[sym] = 7

# vval table (delta-to-absolute): VVAL[previous length][delta] -> new length
VVAL = [
    [0,13,12,11,10,9,8,7,6,5,4,3,2,1],
    [1,2,3,4,5,6,7,8,9,10,11,12,13,0],
    [2,1,3,4,5,6,7,8,9,10,11,12,13,0],
    [3,2,4,1,5,6,7,8,9,10,11,12,13,0],
    [4,3,5,2,6,1,7,8,9,10,11,12,13,0],
    [5,4,6,3,7,2,8,1,9,10,11,12,13,0],
    [6,5,7,4,8,3,9,2,10,1,11,12,13,0],
    [7,6,8,5,9,4,10,3,11,2,12,1,13,0],
    [8,7,9,6,10,5,11,4,12,3,13,2,0,1],
    [9,8,10,7,11,6,12,5,13,4,0,3,2,1],
    [10,9,11,8,12,7,13,6,0,5,4,3,2,1],
    [11,10,12,9,13,8,0,7,6,5,4,3,2,1],
    [12,11,13,10,0,9,8,7,6,5,4,3,2,1],
    [13,12,0,11,10,9,8,7,6,5,4,3,2,1],
]

# Distance decoding table: (base distance, extra bits) per distance symbol.
# Four groups of 15 symbols; bases step by 1, 16, 256 and 4096 and carry
# 0, 4, 8 and 12 extra bits respectively.
DIST_CODES = [
    (step * n, extra)
    for step, extra in ((1, 0), (16, 4), (256, 8), (4096, 12))
    for n in range(1, 16)
]

# Length decoding table: (base length, extra bits) per length symbol.
LEN_CODES = (
    [(3 + k, 0) for k in range(8)]
    + [(11 + 2 * k, 1) for k in range(8)]
    + [(27 + 8 * k, 3) for k in range(8)]
    + [(91, 6), (155, 9), (667, 11), (2715, 15)]
)
|
||||
|
||||
|
||||
class BitReader:
    """Bit reader over a byte buffer, refilled 16 bits at a time.

    Each refill consumes two bytes as a little-endian 16-bit word and
    appends it below the bits already buffered; bits are then handed out
    from the most significant end of the buffer.

    Attributes:
        data: the whole byte buffer being read.
        byte_pos: index of the next unread byte in ``data``.
        bits: integer holding the buffered bits.
        have: number of unread bits in ``bits`` (goes negative once reads
            continue past the end of the data).
        total_bits_read: running count of bits handed out by ``get``.
        exhausted: True once a refill found fewer than two bytes left.
    """

    def __init__(self, data, offset):
        self.data = data
        self.byte_pos = offset
        self.bits = 0
        self.have = 0
        self.total_bits_read = 0
        self.exhausted = False

    def _fill(self):
        """Append the next 16-bit word, or mark the reader exhausted."""
        if self.byte_pos + 1 < len(self.data):
            lo = self.data[self.byte_pos]
            hi = self.data[self.byte_pos + 1]
            # Drop bits that were already consumed before shifting, so the
            # accumulator stays a few words wide instead of growing with
            # every refill.  peek() masks its result, so values are unchanged.
            pending = self.bits & ((1 << self.have) - 1)
            self.bits = (pending << 16) | lo | (hi << 8)
            self.have += 16
            self.byte_pos += 2
        else:
            self.exhausted = True

    def peek(self, n):
        """Return the next ``n`` bits without consuming them.

        Past the end of the data the missing low bits read as zero.
        """
        mask = (1 << n) - 1
        while self.have < n:
            if self.exhausted:
                return (self.bits << (n - self.have)) & mask
            self._fill()
        return (self.bits >> (self.have - n)) & mask

    def get(self, n):
        """Return the next ``n`` bits and consume them."""
        v = self.peek(n)
        self.have -= n
        self.total_bits_read += n
        return v
|
||||
|
||||
|
||||
def build_decode_table(lengths, nsym):
    """Build the MaxCodeBits-wide lookup table for a set of code lengths.

    Codes are assigned in order of increasing length and, within one
    length, increasing symbol number.  Every table slot whose leading bits
    equal a symbol's code holds ``(symbol, code length)``; slots that no
    code covers stay ``None``.
    """
    table = [None] * (1 << MaxCodeBits)
    next_code = 0
    for width in range(1, MaxCodeBits + 1):
        # Number of table slots that share one code of this width.
        span = 1 << (MaxCodeBits - width)
        for sym in range(nsym):
            if lengths[sym] != width:
                continue
            first_slot = next_code * span
            entry = (sym, width)
            for slot in range(first_slot, first_slot + span):
                table[slot] = entry
            next_code += 1
        next_code <<= 1
    return table
|
||||
|
||||
|
||||
def huff_decode(br, table):
    """Decode one Huffman symbol.

    Returns ``(symbol, code length)`` and consumes the code's bits, or
    ``(None, 0)`` without consuming anything when the upcoming bits do not
    map to any code in ``table``.
    """
    entry = table[br.peek(MaxCodeBits)]
    if entry is None:
        return None, 0
    symbol, width = entry
    br.get(width)
    return symbol, width
|
||||
|
||||
|
||||
def decode_tree(br, symprev):
    """Decode Huffman tree from bitstream.

    Reads one flag bit.  When it is clear the default lengths are used;
    otherwise a 2-bit type, fifteen 3-bit meta-tree lengths and a
    run-length coded delta stream follow, and each delta is mapped through
    VVAL against the previous length of the same symbol.

    Args:
        br: BitReader positioned at the tree description.
        symprev: list of the previous block's code lengths; overwritten in
            place with the lengths decoded here.

    Returns:
        ``(lengths, description)``: the NumSymbols code lengths and either
        ``"default"`` or ``"custom(t=<type>)"``.

    Raises:
        ValueError: the bitstream holds a bit pattern that is not a valid
            meta-tree code.
    """
    tree_changed = br.get(1)
    if not tree_changed:
        lengths = list(DEFAULT_LENGTHS)
        for i, bits in enumerate(lengths):
            symprev[i] = bits
        return lengths, "default"

    t = br.get(2)
    has_lo = t & 1
    has_hi = (t >> 1) & 1

    # Read tree-encoding tree (15 x 3 bits)
    tlengths = [br.get(3) for _ in range(15)]
    meta_table = build_decode_table(tlengths, 15)

    def next_meta(what):
        # huff_decode returns None without consuming bits on an invalid
        # code; carrying on would only fail later with an unrelated error.
        sym, _ = huff_decode(br, meta_table)
        if sym is None:
            raise ValueError(
                f"invalid {what} code in tree description "
                f"at bit {br.total_bits_read}")
        return sym

    # Symbols whose lengths are present in the delta stream, in stream order.
    order = list(range(32)) if has_lo else [9, 10, 12, 13]
    order.extend(range(32, 128))
    if has_hi:
        order.extend(range(128, 256))
    order.extend(range(256, NumSymbols))
    stream_size = len(order)

    # Decode delta stream with RLE: RepeatCode is followed by a count symbol
    # and repeats the last explicit delta.
    stream = []
    val = 0
    while len(stream) < stream_size:
        sym = next_meta("delta")
        if sym == RepeatCode:
            count = next_meta("repeat-count") + MinRepeat - 1
            stream.extend([val] * count)
        else:
            val = sym
            stream.append(sym)

    # Convert delta to absolute lengths; symbols not in the stream stay 0.
    lengths = [0] * NumSymbols
    for sym_index, delta in zip(order, stream):
        lengths[sym_index] = VVAL[symprev[sym_index]][delta]

    for i in range(NumSymbols):
        symprev[i] = lengths[i]

    return lengths, f"custom(t={t})"
|
||||
|
||||
|
||||
def decode_block(br, bd_table, l_table, max_symbols):
    """Decode LZ77 symbols from one block.

    Args:
        br: BitReader positioned at the first symbol of the block.
        bd_table: decode table for the byte + distance symbols.
        l_table: decode table for the length symbols.
        max_symbols: stop after this many entries have been collected.

    Returns:
        A list of ``(kind, value)`` tuples where kind is ``"LIT"``,
        ``"MATCH"``, ``"EOB"`` or ``"ERROR"``.  Decoding stops at the first
        EOB or ERROR entry.
    """
    symbols = []
    # Safety limit: the bits still available (whole unread bytes plus the
    # bits already buffered) with some slack.  br.have is already in bits.
    unread_bits = (len(br.data) - br.byte_pos) * 8 + br.have
    max_bits = unread_bits + 1000
    start_bits = br.total_bits_read
    while len(symbols) < max_symbols:
        if br.total_bits_read - start_bits > max_bits:
            symbols.append(("ERROR", "exceeded bit limit"))
            break
        sym, _ = huff_decode(br, bd_table)
        if sym is None:
            symbols.append(("ERROR", f"invalid Huffman code at bit {br.total_bits_read}"))
            break

        if sym < NumByteSym:
            symbols.append(("LIT", sym))
            continue

        dsym = sym - NumByteSym
        if dsym >= NumDistSym:
            symbols.append(("ERROR", f"dist sym {dsym} out of range"))
            break
        base, extra_bits = DIST_CODES[dsym]
        dist = base
        if extra_bits:
            dist += br.get(extra_bits)

        # A length follows every distance, including the end-of-block
        # marker, so validate it once for both cases.
        lsym, _ = huff_decode(br, l_table)
        if lsym is None:
            symbols.append(("ERROR", "invalid length Huffman code"))
            break
        lbase, lextra = LEN_CODES[lsym]
        length = lbase + (br.get(lextra) if lextra else 0)

        if dist == EOB_MARK:
            symbols.append(("EOB", f"dist={dist} len={length}"))
            break
        symbols.append(("MATCH", f"dist={dist} len={length}"))

    return symbols
|
||||
|
||||
|
||||
def analyze_archive(path, section, max_symbols):
    """Print the header of a UC2 archive and an annotated symbol dump.

    Args:
        path: archive file to read.
        section: ``'cdir'`` to start at the compressed central directory;
            any other value starts at the file data (offset 29).
        max_symbols: total number of symbols to decode across all blocks.

    Prints a message and returns early when the file is not a UC2 archive,
    when the header is truncated, or when the central-directory offset
    points outside the file.
    """
    with open(path, "rb") as f:
        data = f.read()

    magic = data[0:4]
    if magic != b'UC2\x1a':
        print(f"Not a UC2 archive: {magic}")
        return

    # The last header field read below is the 16-bit value at offset 26.
    if len(data) < 28:
        print(f"Truncated UC2 header: only {len(data)} bytes")
        return

    complen = struct.unpack_from('<I', data, 4)[0]
    cdir_off = struct.unpack_from('<I', data, 17)[0]
    fletch = struct.unpack_from('<H', data, 21)[0]
    ver_made = struct.unpack_from('<H', data, 24)[0]
    ver_need = struct.unpack_from('<H', data, 26)[0]

    print(f"Archive: {path} ({len(data)} bytes)")
    print(f" complen={complen}, total={complen+13}")
    print(f" cdir_offset={cdir_off}, fletcher={fletch:#06x}")
    print(f" versionMade={ver_made}, versionNeeded={ver_need}")

    if section == 'cdir':
        # The COMPRESS record in front of the cdir stream is 10 bytes (<IHI).
        if cdir_off + 10 > len(data):
            print(f" cdir_offset {cdir_off} is outside the file")
            return
        crec = data[cdir_off:cdir_off + 10]
        csize, method, master = struct.unpack_from('<IHI', crec)
        print(f" cdir COMPRESS: csize={csize}, method={method}, master={master}")
        stream_start = cdir_off + 10
    else:
        stream_start = 29
        print(f" File data starts at offset {stream_start}")

    print()

    br = BitReader(data, stream_start)
    symprev = list(DEFAULT_LENGTHS)
    total_decoded = 0

    block_num = 0
    while total_decoded < max_symbols:
        bit_pos = br.total_bits_read
        block_present = br.get(1)
        print(f"Block {block_num} at bit {bit_pos}: present={block_present}")
        if not block_present:
            print(" End of stream")
            break

        lengths, tree_desc = decode_tree(br, symprev)
        # Exclude the block-present bit from the tree size.
        tree_bits = br.total_bits_read - bit_pos - 1
        print(f" Tree: {tree_desc} ({tree_bits} bits)")

        nonzero = sum(1 for l in lengths if l > 0)
        print(f" Non-zero lengths: {nonzero}/{NumSymbols}")

        bd_table = build_decode_table(lengths[:NumByteSym + NumDistSym],
                                      NumByteSym + NumDistSym)
        l_table = build_decode_table(lengths[NumByteSym + NumDistSym:],
                                     NumLenSym)

        # Empty slots mean some bit patterns decode to nothing.
        bd_none = sum(1 for x in bd_table if x is None)
        l_none = sum(1 for x in l_table if x is None)
        if bd_none:
            print(f" WARNING: {bd_none}/{len(bd_table)} BD table entries are None")
        if l_none:
            print(f" WARNING: {l_none}/{len(l_table)} LEN table entries are None")

        # Decode until EOB or error (no per-block symbol limit)
        remaining = max_symbols - total_decoded
        symbols = decode_block(br, bd_table, l_table, remaining)
        total_decoded += len(symbols)

        truncated = len(symbols) >= remaining and symbols[-1][0] not in ("EOB", "ERROR")
        print(f" Decoded {len(symbols)} symbols{' (truncated)' if truncated else ''}:")
        for i, (kind, val) in enumerate(symbols):
            if kind == "LIT":
                ch = chr(val) if 32 <= val < 127 else f"\\x{val:02x}"
                print(f" [{i:3d}] LIT {val:3d} '{ch}'")
            elif kind == "MATCH":
                print(f" [{i:3d}] {val}")
            elif kind == "EOB":
                print(f" [{i:3d}] EOB ({val})")
            elif kind == "ERROR":
                print(f" [{i:3d}] ERROR: {val}")

        data_bits = br.total_bits_read - bit_pos - 1 - tree_bits
        print(f" Data: {data_bits} bits")
        print()

        if truncated or (symbols and symbols[-1][0] in ("ERROR",)):
            break
        block_num += 1
|
||||
|
||||
|
||||
def main():
    """Parse the command line and dump the selected section.

    ``--cdir`` selects the central directory; without it the file data
    section is analyzed, so ``--file`` only spells out the default.
    """
    parser = argparse.ArgumentParser(description='UC2 bitstream analyzer')
    parser.add_argument('archive', help='UC2 archive file')
    parser.add_argument('--cdir', action='store_true', help='Analyze cdir section')
    parser.add_argument('--file', action='store_true',
                        help='Analyze file data section (default)')
    # The limit is applied to the running total across blocks, not per block.
    parser.add_argument('--max-symbols', type=int, default=200,
                        help='Max symbols to decode in total, across all blocks')
    args = parser.parse_args()

    section = 'cdir' if args.cdir else 'file'
    analyze_archive(args.archive, section, args.max_symbols)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
86
tests/scripts/check_assert_side_effects.py
Executable file
86
tests/scripts/check_assert_side_effects.py
Executable file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fails when assert(...) wraps a function call with side effects.
|
||||
#
|
||||
# Background: under -DNDEBUG (CMake's default for Release) the assert macro
|
||||
# expands to ((void)0) and the wrapped expression is not evaluated. Any work
|
||||
# done inside assert() is silently dropped. This has cost the project two
|
||||
# CI rounds:
|
||||
# - dae8a50: int-truncation in test_merkle / test_dict Debug builds
|
||||
# - 6d8087f: test_delta double-free under Release / Windows MSVC
|
||||
#
|
||||
# Rule: tests must capture the call result first, then assert on it:
|
||||
# int rc = call(...);
|
||||
# assert(rc == EXPECTED);
|
||||
#
|
||||
# This script detects the dangerous form by matching assert() that wraps a
|
||||
# call to a function whose name contains a side-effect verb. Pure queries
|
||||
# (_equal, _match, _verify, _has_, _is_, _root, _id, _hash, _attest_name,
|
||||
# memcmp, strcmp, ...) are allowed.
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Underscore-delimited words in a function name that mark it as doing work.
SIDE_EFFECT_VERBS = (
    "encode", "decode", "parse", "serialize", "deserialize",
    "build", "init", "write", "read_file", "attach", "extract",
    "compress", "decompress", "create", "destroy", "open", "close",
    "flush", "push", "pop", "append", "insert", "remove", "update",
    "store", "load", "put", "finalize", "process", "run", "step",
    "alloc", "free", "register", "submit", "commit", "rollback",
)

# Directories (relative to the repository root) whose *.c files are scanned.
SCAN_DIRS = ("tests/src", "lib/src", "cli/src", "src")

# assert( <identifier> (  -- captures the name of the call that opens the
# asserted expression.
assert_call_re = re.compile(
    r"assert\s*\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*\("
)

# A verb counts only as a whole underscore-delimited word of the identifier.
verb_re = re.compile(
    r"(?:^|_)(" + "|".join(SIDE_EFFECT_VERBS) + r")(?:_|$)"
)


def scan(root: Path) -> list[tuple[Path, int, str, str]]:
    """Find assert() calls that wrap a side-effecting function call.

    Args:
        root: repository root; each entry of SCAN_DIRS is resolved against it
            and missing directories are skipped.

    Returns:
        One ``(path, line number, function name, source line)`` tuple per
        offending assert, in sorted-file then line order.  A line holding
        several asserts yields one tuple per offending assert.
    """
    findings = []
    for d in SCAN_DIRS:
        base = root / d
        if not base.is_dir():
            continue
        for path in sorted(base.rglob("*.c")):
            text = path.read_text(encoding="utf-8", errors="replace")
            for lineno, line in enumerate(text.splitlines(), start=1):
                # Skip comments quickly. Not perfect but adequate here.
                stripped = line.lstrip()
                if stripped.startswith(("//", "/*", "*")):
                    continue
                # Inspect every assert on the line, not only the first one.
                for m in assert_call_re.finditer(line):
                    ident = m.group(1)
                    if verb_re.search(ident):
                        findings.append((path, lineno, ident, line.rstrip()))
    return findings
|
||||
|
||||
|
||||
def main() -> int:
    """Scan the repository and report offending asserts.

    The repository root is taken to be two directories above this script.
    Returns 0 when nothing was found, 1 after listing the findings on
    stderr.
    """
    repo_root = Path(__file__).resolve().parents[2]
    findings = scan(repo_root)

    if findings:
        explanation = (
            "ERROR: assert() must not wrap calls with side effects.",
            "Under -DNDEBUG (Release builds) the call is dropped, leaving",
            "output parameters uninitialised and the test silently no-op.",
            "Convert to: int rc = call(...); assert(rc == EXPECTED);",
        )
        for text in explanation:
            print(text, file=sys.stderr)
        print(file=sys.stderr)

        for path, lineno, ident, line in findings:
            rel = path.relative_to(repo_root)
            print(f"{rel}:{lineno}: {ident}", file=sys.stderr)
            print(f" {line.strip()}", file=sys.stderr)
        return 1

    print("OK: no side-effecting asserts found.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
104
tests/scripts/create_archives.sh
Executable file
104
tests/scripts/create_archives.sh
Executable file
@@ -0,0 +1,104 @@
|
||||
#!/bin/bash
|
||||
# Create reference UC2 archives from the test corpus using the original
|
||||
# UC2 Pro (UC.EXE) in DOSBox-X.
|
||||
#
|
||||
# Run from the UC2 project root: bash tests/scripts/create_archives.sh
|
||||
#
|
||||
# uc2pro.exe is a UCEXE-compressed self-extracting archive containing the
|
||||
# UC2 Pro distribution. We first extract it to get UC.EXE, then use
|
||||
# UC.EXE to create the reference archives.
|
||||
|
||||
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
ARCHIVE_DIR="$PROJECT_DIR/tests/archives"

# DOSBox-X flatpak only has filesystem=home access, so the scratch
# directory has to live under $HOME.  mktemp -d does not create missing
# parent directories, so make sure ~/.cache exists first.
CACHE_DIR="$HOME/.cache"
mkdir -p "$CACHE_DIR"
WORK_DIR="$(mktemp -d "$CACHE_DIR/uc2-create-archives.XXXXXX")"
trap 'rm -rf "$WORK_DIR"' EXIT

echo "Working in: $WORK_DIR"

# Stage the SFX and the corpus inside the directory DOSBox mounts as C:.
cp "$PROJECT_DIR/original/UC2_source/uc2pro.exe" "$WORK_DIR/"
mkdir -p "$WORK_DIR/corpus" "$WORK_DIR/out"
cp "$PROJECT_DIR/tests/corpus/"* "$WORK_DIR/corpus/"
|
||||
|
||||
# Session 1: Extract UC2 Pro distribution from SFX
echo "Extracting UC2 Pro tools from uc2pro.exe (this takes ~60s in DOSBox)..."
# The mount path is quoted so a work directory containing spaces still mounts.
cat > "$WORK_DIR/dosbox.conf" <<DOSBOXCFG
[sdl]
output=none
fullscreen=false
[dosbox]
memsize=16
machine=svga_s3
[cpu]
cycles=max
[autoexec]
mount c: "$WORK_DIR"
c:
uc2pro UC2DIST
exit
DOSBOXCFG

# DOSBox's exit status is deliberately ignored; success is judged by whether
# UC.EXE was produced.
timeout 180 flatpak run com.dosbox_x.DOSBox-X \
    -conf "$WORK_DIR/dosbox.conf" -nopromptfolder 2>/dev/null || true

if [ ! -f "$WORK_DIR/UC2DIST/UC.EXE" ]; then
    echo "ERROR: SFX extraction failed (UC.EXE not found)"
    exit 1
fi
|
||||
|
||||
# Session 2: Create reference archives
echo "Creating reference archives..."
# The mount path is quoted so a work directory containing spaces still mounts.
# The heredoc is unquoted, so each \\ below reaches DOSBox as a single backslash.
cat > "$WORK_DIR/dosbox.conf" <<DOSBOXCFG
[sdl]
output=none
fullscreen=false
[dosbox]
memsize=16
machine=svga_s3
[cpu]
cycles=max
[autoexec]
mount c: "$WORK_DIR"
c:
cd C:\\UC2DIST

rem Basic archive: all corpus files, Method 4 (Ultra, default)
UC a C:\\OUT\\BASIC C:\\CORPUS\\*.*

rem Empty file only
UC a C:\\OUT\\EMPTY C:\\CORPUS\\EMPTY.DAT

rem Single text file
UC a C:\\OUT\\SINGLE C:\\CORPUS\\HELLO.TXT

rem Large compressible file
UC a C:\\OUT\\ZEROS C:\\CORPUS\\ZEROS.BIN

rem Incompressible data
UC a C:\\OUT\\RANDOM C:\\CORPUS\\RANDOM.BIN

echo DONE > C:\\DONE.TXT
exit
DOSBOXCFG

# DOSBox's exit status is deliberately ignored; the copy step that follows
# checks whether any archives were produced.
timeout 600 flatpak run com.dosbox_x.DOSBox-X \
    -conf "$WORK_DIR/dosbox.conf" -nopromptfolder 2>/dev/null || true
|
||||
|
||||
# Copy generated archives to the project, lower-casing the DOS file names.
if ! ls "$WORK_DIR/out/"*.UC2 >/dev/null 2>&1; then
    echo "ERROR: No archives were generated. Check DOSBox output."
    exit 1
fi

mkdir -p "$ARCHIVE_DIR"
for generated in "$WORK_DIR/out/"*.UC2; do
    target_name="$(basename "$generated" | tr '[:upper:]' '[:lower:]')"
    cp "$generated" "$ARCHIVE_DIR/$target_name"
done
echo "Archives created in $ARCHIVE_DIR:"
ls -la "$ARCHIVE_DIR/"*.uc2
|
||||
119
tests/scripts/cross_check_ots.py
Normal file
119
tests/scripts/cross_check_ots.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Cross-check uc2 OTS output against the python-opentimestamps reference.
|
||||
|
||||
Usage: cross_check_ots.py <uc2-binary> <work-dir>
|
||||
|
||||
Builds a tiny archive, attaches a hand-crafted OTS proof, then:
|
||||
1. Extracts via `uc2 --ots-extract`
|
||||
2. Round-trips the .ots through python-opentimestamps
|
||||
3. Confirms the proof's leaf digest equals SHA-256 of the attested archive prefix
|
||||
|
||||
Exits 0 on success, 1 on mismatch, 77 (autotools "skip" code) if the
|
||||
opentimestamps library isn't installed.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import struct
|
||||
import subprocess
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
try:
|
||||
from opentimestamps.core.timestamp import DetachedTimestampFile
|
||||
from opentimestamps.core.serialize import StreamDeserializationContext
|
||||
except ModuleNotFoundError:
|
||||
print("opentimestamps library not installed; skipping cross-check.")
|
||||
sys.exit(77)
|
||||
|
||||
|
||||
HEADER_MAGIC = (b"\x00OpenTimestamps\x00\x00Proof\x00"
|
||||
b"\xbf\x89\xe2\xe8\x84\xe8\x92\x94")
|
||||
PENDING_TAG = b"\x83\xdf\xe3\x0d\x2e\xf9\x0c\x8e"
|
||||
TRAILER_MAGIC = b"UC2-OTS\x00"
|
||||
|
||||
|
||||
def varint(n):
    """Encode ``n`` as a little-endian base-128 varint.

    Seven bits are emitted per byte, lowest group first; every byte except
    the last has its high bit set.
    """
    encoded = bytearray()
    while n >= 0x80:
        encoded.append((n & 0x7f) | 0x80)
        n >>= 7
    encoded.append(n)
    return bytes(encoded)
|
||||
|
||||
|
||||
def varbytes(b):
    """Return ``b`` prefixed with its length encoded as a varint."""
    length_prefix = varint(len(b))
    return length_prefix + b
|
||||
|
||||
|
||||
def build_proof(leaf):
    """Build a minimal detached proof for ``leaf`` with one pending attestation."""
    # Pending attestation payload is itself varbytes(uri) per the OTS spec,
    # wrapped in the outer varbytes(serialized_attestation) layer.
    uri = b"https://example.com/digest"
    attestation = PENDING_TAG + varbytes(varbytes(uri))
    # NOTE(review): \x01 and \x08 are presumably the format version and the
    # SHA-256 op tag, and \x00 the attestation marker -- confirm against the
    # OTS serialization spec.
    pieces = (HEADER_MAGIC, b"\x01", b"\x08", leaf, b"\x00", attestation)
    return b"".join(pieces)
|
||||
|
||||
|
||||
def main():
    """Run the cross-check; returns the process exit code.

    Steps: create two small files and archive them with the uc2 binary,
    attach a hand-built proof whose leaf is the SHA-256 of the archive as it
    was before attaching, extract the proof again, parse it with
    python-opentimestamps and compare the leaf, require at least one
    attestation, require ``uc2 --ots-info`` to report a matching leaf, and
    require that attaching made the archive larger.

    Returns 0 on success and 1 on a usage error or any mismatch.  A failing
    uc2 invocation raises ``subprocess.CalledProcessError``.
    """
    if len(sys.argv) != 3:
        print("usage: cross_check_ots.py <uc2-binary> <work-dir>", file=sys.stderr)
        return 1
    uc2 = sys.argv[1]
    work = sys.argv[2]
    os.makedirs(work, exist_ok=True)

    a = os.path.join(work, "a.txt")
    b = os.path.join(work, "b.txt")
    with open(a, "w") as f:
        f.write("hello uc2 ots cross-check\n")
    with open(b, "w") as f:
        f.write("second file\n")

    arc = os.path.join(work, "test.uc2")
    subprocess.check_call([uc2, "-w", "-q", arc, a, b])

    # Size and digest of the archive before the proof is attached.
    archive_size = os.path.getsize(arc)
    with open(arc, "rb") as f:
        archive_bytes = f.read()
    leaf = hashlib.sha256(archive_bytes).digest()

    proof_path = os.path.join(work, "proof.ots")
    with open(proof_path, "wb") as f:
        f.write(build_proof(leaf))

    subprocess.check_call([uc2, "--ots-attach", proof_path, arc])

    extracted = os.path.join(work, "extracted.ots")
    subprocess.check_call([uc2, "--ots-extract", arc, extracted])

    with open(extracted, "rb") as f:
        ots_bytes = f.read()
    ctx = StreamDeserializationContext(BytesIO(ots_bytes))
    detached = DetachedTimestampFile.deserialize(ctx)

    py_leaf = bytes(detached.timestamp.msg)
    if py_leaf != leaf:
        print("LEAF MISMATCH", file=sys.stderr)
        print(f" hand-computed: {leaf.hex()}", file=sys.stderr)
        print(f" python-ots: {py_leaf.hex()}", file=sys.stderr)
        return 1

    attestations = list(detached.timestamp.all_attestations())
    if not attestations:
        print("no attestations parsed by python-opentimestamps", file=sys.stderr)
        return 1

    info = subprocess.check_output(
        [uc2, "--ots-info", arc], stderr=subprocess.STDOUT, text=True)
    if "leaf matches archive: yes" not in info:
        print("uc2 --ots-info reports leaf mismatch:", file=sys.stderr)
        print(info, file=sys.stderr)
        return 1

    # Attaching a proof must make the archive larger.  The previous form of
    # this check compared the sizes but took no action on the result.
    attested_size = os.path.getsize(arc)
    if attested_size <= archive_size:
        print("archive did not grow after --ots-attach: "
              f"before={archive_size}, after={attested_size}", file=sys.stderr)
        return 1

    print(f"cross-check OK: archive_size={archive_size}, proof_len={len(ots_bytes)}, "
          f"attestations={len(attestations)}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
109
tests/scripts/dos_smoke.sh
Executable file
109
tests/scripts/dos_smoke.sh
Executable file
@@ -0,0 +1,109 @@
|
||||
#!/bin/bash
# Smoke test for the DJGPP-built uc2.exe via DOSBox-X.
#
# Verifies the cross-compiled DOS binary actually loads under a real
# DPMI host and produces expected output for `uc2 -h` and `uc2 -l`.
#
# Usage:
#   tests/scripts/dos_smoke.sh <uc2.exe> <CWSDPMI.EXE> [<list-archive>]
#
# Where:
#   <uc2.exe>       DJGPP-built DOS binary
#                   (e.g. build-djgpp/cli/uc2.exe)
#   <CWSDPMI.EXE>   DPMI extender from csdpmi7b.zip
#                   (http://www.delorie.com/pub/djgpp/current/v2misc/csdpmi7b.zip)
#   <list-archive>  Optional: small UC2 archive to test 'uc2 -l' against
#                   (e.g. tests/archives/basic.uc2)
#
# Exit status: 0 on pass or when a prerequisite is missing (SKIP),
# 1 when a check fails (the work directory is then kept for inspection).

set -euo pipefail

USAGE='usage: dos_smoke.sh <uc2.exe> <CWSDPMI.EXE> [<list-archive>]'
UC2_EXE="${1:?$USAGE}"
CWSDPMI="${2:?$USAGE}"
LIST_ARCHIVE="${3:-}"

if [ ! -f "$UC2_EXE" ]; then
    echo "SKIP: uc2.exe not found at $UC2_EXE (run the DJGPP build first)"
    exit 0
fi
if [ ! -f "$CWSDPMI" ]; then
    echo "SKIP: CWSDPMI.EXE not found at $CWSDPMI"
    exit 0
fi
if ! flatpak info com.dosbox_x.DOSBox-X &>/dev/null; then
    echo "SKIP: DOSBox-X not installed (flatpak com.dosbox_x.DOSBox-X)"
    exit 0
fi

# mktemp -d does not create missing parent directories; on a fresh
# account ~/.cache may not exist yet.
mkdir -p "$HOME/.cache"
WORK="$(mktemp -d "$HOME/.cache/uc2-dos-smoke.XXXXXX")"
trap 'rm -rf "$WORK"' EXIT

cp "$UC2_EXE" "$WORK/UC2.EXE"
cp "$CWSDPMI" "$WORK/CWSDPMI.EXE"

ARCHIVE_NAME=""
if [ -n "$LIST_ARCHIVE" ] && [ -f "$LIST_ARCHIVE" ]; then
    ARCHIVE_NAME="$(basename "$LIST_ARCHIVE" | tr '[:lower:]' '[:upper:]')"
    cp "$LIST_ARCHIVE" "$WORK/$ARCHIVE_NAME"
elif [ -n "$LIST_ARCHIVE" ]; then
    # An archive was requested but is missing: say so instead of quietly
    # dropping the 'uc2 -l' check.
    echo "NOTE: list archive not found at $LIST_ARCHIVE; skipping uc2 -l check"
fi

# The heredoc delimiter is unquoted on purpose: $WORK and the optional
# ${ARCHIVE_NAME:+...} lines are expanded by bash while writing the config.
cat > "$WORK/dosbox.conf" <<DOSBOXCFG
[sdl]
output=none
fullscreen=false
[dosbox]
memsize=16
machine=svga_s3
[cpu]
cycles=max
[autoexec]
mount c: $WORK
c:
UC2 -h > HELP.TXT
echo HELPDONE > HELPMRK.TXT
${ARCHIVE_NAME:+UC2 -l $ARCHIVE_NAME > LIST.TXT}
${ARCHIVE_NAME:+echo LISTDONE > LISTMRK.TXT}
exit
DOSBOXCFG

echo "=== Running uc2.exe under DOSBox-X ==="
# A timeout or non-zero emulator exit is tolerated here; success is judged
# from the marker/output files written inside the session.
timeout 60 flatpak run com.dosbox_x.DOSBox-X \
    -conf "$WORK/dosbox.conf" -nopromptfolder 2>/dev/null || true

FAIL=0

# --- Validate uc2 -h ---
if [ ! -f "$WORK/HELPMRK.TXT" ]; then
    echo "  FAIL: DOSBox session did not complete (no HELPMRK.TXT)"
    FAIL=1
elif [ ! -f "$WORK/HELP.TXT" ]; then
    echo "  FAIL: uc2 -h produced no output"
    FAIL=1
elif ! grep -qiE 'UltraCompressor|UC2' "$WORK/HELP.TXT"; then
    # -E alternation is POSIX; "\|" inside a basic regex is a GNU extension.
    echo "  FAIL: uc2 -h output missing expected text"
    head -20 "$WORK/HELP.TXT"
    FAIL=1
else
    echo "  OK: uc2 -h"
fi

# --- Optional: validate uc2 -l ---
if [ -n "$ARCHIVE_NAME" ]; then
    if [ ! -f "$WORK/LISTMRK.TXT" ]; then
        echo "  FAIL: uc2 -l did not complete"
        FAIL=1
    elif [ ! -s "$WORK/LIST.TXT" ]; then
        echo "  FAIL: uc2 -l produced empty output"
        FAIL=1
    else
        echo "  OK: uc2 -l $ARCHIVE_NAME"
    fi
fi

if [ $FAIL -ne 0 ]; then
    echo "FAILED: DOS smoke test"
    echo "Work directory preserved at: $WORK"
    trap - EXIT
    exit 1
fi

echo "PASSED: DJGPP-built uc2.exe runs under DOSBox-X"
|
||||
184
tests/scripts/roundtrip_dosbox.sh
Executable file
184
tests/scripts/roundtrip_dosbox.sh
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/bin/bash
# Cross-tool round-trip test between UC2 v3 and the original UC2 Pro,
# with the original tool running under DOSBox-X.
#
# Tests both directions: UC2 v3 creates a multi-file archive that the
# original extracts (Direction 1), and the original creates an archive
# that UC2 v3 extracts (Direction 2). Multi-file Direction 1 has worked
# since the custom-Huffman-tree fix; an earlier version of this comment
# documented a hang that no longer reproduces.
#
# Usage: roundtrip_dosbox.sh <uc2-cli> <uc2pro.exe> <corpus-dir>
#
# Exit status: 0 on pass or when DOSBox-X is not installed (SKIP),
# non-zero on failure.

set -euo pipefail

USAGE='usage: roundtrip_dosbox.sh <uc2-cli> <uc2pro.exe> <corpus-dir>'
UC2_CLI="${1:?$USAGE}"
UC2PRO="${2:?$USAGE}"
CORPUS="${3:?$USAGE}"

# Files archived by the original tool (Direction 2) ...
FILES=(hello.txt textfile.txt allbytes.bin random.bin zeros.bin)
# ... and files archived by UC2 v3 for the original to extract (Direction 1).
DIR1_FILES=(hello.txt textfile.txt allbytes.bin random.bin)

if ! flatpak info com.dosbox_x.DOSBox-X &>/dev/null; then
    echo "SKIP: DOSBox-X not installed (flatpak com.dosbox_x.DOSBox-X)"
    exit 0
fi

# mktemp -d does not create missing parent directories.
mkdir -p "$HOME/.cache"
WORK="$(mktemp -d "$HOME/.cache/uc2-dosbox-test.XXXXXX")"
trap 'rm -rf "$WORK"' EXIT

# run_dosbox <timeout-seconds> <dos-command>...
# Writes a DOSBox-X config whose [autoexec] mounts $WORK as C:, runs the
# given DOS command lines in order, then exits; launches the emulator with
# that config. A timeout or emulator failure is tolerated: callers judge
# success from marker files written inside the session.
run_dosbox() {
    local limit="$1"
    shift
    {
        cat <<DOSBOXCFG
[sdl]
output=none
fullscreen=false
[dosbox]
memsize=16
machine=svga_s3
[cpu]
cycles=max
[autoexec]
mount c: $WORK
c:
DOSBOXCFG
        printf '%s\n' "$@" exit
    } > "$WORK/dosbox.conf"

    timeout "$limit" flatpak run com.dosbox_x.DOSBox-X \
        -conf "$WORK/dosbox.conf" -nopromptfolder 2>/dev/null || true
}

# first_existing <path>...
# Prints the first argument that is a regular file, or nothing. Always
# returns 0 so it is safe inside $(...) under `set -e`.
first_existing() {
    local candidate
    for candidate in "$@"; do
        if [ -f "$candidate" ]; then
            printf '%s\n' "$candidate"
            return 0
        fi
    done
    return 0
}

mkdir -p "$WORK/corpus" "$WORK/out" "$WORK/output"
for f in "${FILES[@]}"; do
    cp "$CORPUS/$f" "$WORK/corpus/"
done
cp "$UC2PRO" "$WORK/uc2pro.exe"

# --- Session 1: Extract UC2 Pro distribution from SFX ---
echo "=== Session 1: Extracting UC2 Pro tools from SFX ==="
# SFX decompression takes 3-8 minutes depending on host CPU speed
run_dosbox 600 'uc2pro UC2DIST'

# Count only when the directory exists: under `set -e -o pipefail` a failing
# `ls` inside $(...) would abort the script before the diagnostic below.
UC2DIST_COUNT=0
if [ -d "$WORK/UC2DIST" ]; then
    UC2DIST_COUNT=$(ls "$WORK/UC2DIST/" | wc -l)
fi
if [ ! -f "$WORK/UC2DIST/UC.EXE" ] || [ "$UC2DIST_COUNT" -lt 22 ]; then
    echo "FAIL: UC2 Pro SFX extraction incomplete ($UC2DIST_COUNT/22 files)"
    exit 1
fi
echo "  UC.EXE extracted ($(wc -c < "$WORK/UC2DIST/UC.EXE") bytes, $UC2DIST_COUNT files)"

# --- Direction 1: UC2 v3 creates, original extracts (multi-file) ---
echo "=== Direction 1: UC2 v3 creates -> original extracts ==="
dir1_inputs=()
for f in "${DIR1_FILES[@]}"; do
    dir1_inputs+=("$WORK/corpus/$f")
done
"$UC2_CLI" -w "$WORK/v3multi.uc2" "${dir1_inputs[@]}"
mkdir -p "$WORK/dir1_out"
run_dosbox 120 \
    'cd C:\DIR1_OUT' \
    'C:\UC2DIST\UC eF C:\V3MULTI *.*' \
    'echo DIR1 > C:\DIR1.TXT'

# --- Session 2: original creates archive ---
echo "=== Session 2 (Direction 2): UC2 Pro creates archive ==="
run_dosbox 300 \
    'cd C:\UC2DIST' \
    'UC a C:\OUT\DOSTEST C:\CORPUS\*.*' \
    'echo DONE > C:\MARKER.TXT'

if [ ! -f "$WORK/MARKER.TXT" ]; then
    echo "FAIL: DOSBox session did not complete"
    exit 1
fi

# The host-side name may surface in either case.
DOS_ARCHIVE="$(first_existing "$WORK/out/DOSTEST.UC2" "$WORK/out/dostest.uc2")"
if [ -z "$DOS_ARCHIVE" ]; then
    echo "FAIL: UC2 Pro did not create DOSTEST.UC2"
    exit 1
fi
echo "  Archive created: $(wc -c < "$DOS_ARCHIVE") bytes"

# --- Extract with UC2 v3 and verify ---
echo "=== Extracting with UC2 v3 ==="
"$UC2_CLI" -d "$WORK/output" "$DOS_ARCHIVE"

FAIL=0
for f in "${FILES[@]}"; do
    upper=$(echo "$f" | tr '[:lower:]' '[:upper:]')
    extracted="$(first_existing "$WORK/output/$f" "$WORK/output/$upper")"
    if [ -z "$extracted" ]; then
        echo "  FAIL: $f not extracted"
        FAIL=1
        continue
    fi
    if cmp -s "$CORPUS/$f" "$extracted"; then
        echo "  OK: $f"
    else
        echo "  FAIL: $f content mismatch"
        FAIL=1
    fi
done

# --- Verify Direction 1 (multi-file) ---
echo "--- Verifying Direction 1 (UC2 v3 -> original) ---"
if [ -f "$WORK/DIR1.TXT" ]; then
    for f in "${DIR1_FILES[@]}"; do
        upper=$(echo "$f" | tr '[:lower:]' '[:upper:]')
        extracted="$(first_existing "$WORK/dir1_out/$upper" "$WORK/dir1_out/$f")"
        if [ -z "$extracted" ]; then
            echo "  FAIL: $f not extracted by original (Direction 1)"
            FAIL=1
        elif cmp -s "$CORPUS/$f" "$extracted"; then
            echo "  OK: $f (Direction 1)"
        else
            echo "  FAIL: $f content mismatch (Direction 1)"
            FAIL=1
        fi
    done
else
    echo "  FAIL: Direction 1 DOSBox session incomplete"
    FAIL=1
fi

if [ $FAIL -ne 0 ]; then
    echo "FAILED: some files did not survive cross-tool round-trip"
    echo "Work directory preserved at: $WORK"
    trap - EXIT
    exit 1
fi

echo "PASSED: all files verified (both directions)"
|
||||
127
tests/src/test_blake3.c
Normal file
127
tests/src/test_blake3.c
Normal file
@@ -0,0 +1,127 @@
|
||||
/* Tests for BLAKE3 cryptographic hashing. */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <uc2/uc2_blake3.h>
|
||||
|
||||
static int tests_run = 0, tests_passed = 0;
|
||||
#define TEST(name) do { tests_run++; printf(" %s: ", #name); name(); tests_passed++; printf("OK\n"); } while (0)
|
||||
|
||||
/* Write the first n bytes of h into out as lowercase hex digits.
 * out needs room for 2*n + 1 chars: a terminating NUL follows the digits
 * when n > 0; for n <= 0 nothing is written. */
static void hex(const uint8_t *h, int n, char *out)
{
    static const char digits[] = "0123456789abcdef";
    char *dst = out;

    for (int k = 0; k < n; k++) {
        *dst++ = digits[h[k] >> 4];
        *dst++ = digits[h[k] & 0x0f];
    }
    if (n > 0)
        *dst = '\0';
}
|
||||
|
||||
/* Hashing the empty input must give a non-zero digest, and the same
 * digest on every call. The digest is printed for manual inspection. */
static void test_empty(void)
{
    uint8_t hash[32];
    uc2_blake3_hash("", 0, hash);

    /* NOTE(review): BLAKE3("") is a published constant; pinning it here
     * would be stronger -- confirm uc2_blake3 is bit-compatible first. */
    char h[65];
    hex(hash, 32, h);
    h[64] = 0;
    printf("(%s) ", h);

    /* Non-zero: at least one digest byte must be set. */
    int nonzero = 0;
    for (int i = 0; i < 32; i++)
        nonzero |= hash[i];
    (void)nonzero; /* keeps NDEBUG builds warning-free */
    assert(nonzero != 0);

    /* Deterministic: a second run over the same input matches. */
    uint8_t hash2[32];
    uc2_blake3_hash("", 0, hash2);
    assert(uc2_blake3_equal(hash, hash2));
}
|
||||
|
||||
/* Two one-shot hashes of the same message must compare equal. */
static void test_deterministic(void)
{
    uint8_t msg[] = "Hello, BLAKE3!";
    const size_t msg_len = sizeof msg - 1; /* drop the trailing NUL */
    uint8_t first[32];
    uint8_t second[32];

    uc2_blake3_hash(msg, msg_len, first);
    uc2_blake3_hash(msg, msg_len, second);

    assert(uc2_blake3_equal(first, second));
}
|
||||
|
||||
/* Distinct inputs of equal length must not hash to the same digest. */
static void test_differs(void)
{
    uint8_t digest_a[32];
    uint8_t digest_b[32];

    uc2_blake3_hash("AAA", 3, digest_a);
    uc2_blake3_hash("BBB", 3, digest_b);

    assert(!uc2_blake3_equal(digest_a, digest_b));
}
|
||||
|
||||
static void test_incremental(void)
|
||||
{
|
||||
/* Incremental update should match one-shot */
|
||||
uint8_t data[] = "The quick brown fox jumps over the lazy dog";
|
||||
size_t len = sizeof data - 1;
|
||||
|
||||
uint8_t oneshot[32];
|
||||
uc2_blake3_hash(data, len, oneshot);
|
||||
|
||||
struct uc2_blake3 ctx;
|
||||
uc2_blake3_init(&ctx);
|
||||
uc2_blake3_update(&ctx, data, 10);
|
||||
uc2_blake3_update(&ctx, data + 10, len - 10);
|
||||
uint8_t incremental[32];
|
||||
uc2_blake3_final(&ctx, incremental);
|
||||
|
||||
assert(uc2_blake3_equal(oneshot, incremental));
|
||||
}
|
||||
|
||||
static void test_single_byte_updates(void)
|
||||
{
|
||||
uint8_t data[] = "ABCDEFGH";
|
||||
size_t len = 8;
|
||||
|
||||
uint8_t oneshot[32];
|
||||
uc2_blake3_hash(data, len, oneshot);
|
||||
|
||||
struct uc2_blake3 ctx;
|
||||
uc2_blake3_init(&ctx);
|
||||
for (size_t i = 0; i < len; i++)
|
||||
uc2_blake3_update(&ctx, data + i, 1);
|
||||
uint8_t piecemeal[32];
|
||||
uc2_blake3_final(&ctx, piecemeal);
|
||||
|
||||
assert(uc2_blake3_equal(oneshot, piecemeal));
|
||||
}
|
||||
|
||||
/* Avalanche check: two 64-byte inputs that differ in a single bit should
 * produce digests differing in roughly half of their 256 bits. The
 * accepted window is 81..175 differing bits. */
static void test_avalanche(void)
{
    uint8_t base[64] = {0};
    uint8_t flipped[64] = {0};
    flipped[0] = 1; /* the only differing input bit */

    uint8_t digest_base[32];
    uint8_t digest_flipped[32];
    uc2_blake3_hash(base, 64, digest_base);
    uc2_blake3_hash(flipped, 64, digest_flipped);

    /* Hamming distance between the two digests. */
    int distance = 0;
    for (int byte = 0; byte < 32; byte++) {
        uint8_t delta = digest_base[byte] ^ digest_flipped[byte];
        for (int bit = 0; bit < 8; bit++)
            distance += (delta >> bit) & 1;
    }

    printf("(%d/256 bits differ) ", distance);
    assert(80 < distance && distance < 176);
}
|
||||
|
||||
/* uc2_blake3_equal accepts identical buffers and rejects buffers that
 * differ only in the last byte. Only the result is checked here; timing
 * behaviour is not measured. */
static void test_equal_constant_time(void)
{
    uint8_t lhs[32];
    uint8_t rhs[32];

    memset(lhs, 0xAA, sizeof lhs);
    memset(rhs, 0xAA, sizeof rhs);
    assert(uc2_blake3_equal(lhs, rhs));

    rhs[31] ^= 1; /* flip one bit in the final byte */
    assert(!uc2_blake3_equal(lhs, rhs));
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("BLAKE3 tests:\n");
|
||||
TEST(test_empty);
|
||||
TEST(test_deterministic);
|
||||
TEST(test_differs);
|
||||
TEST(test_incremental);
|
||||
TEST(test_single_byte_updates);
|
||||
TEST(test_avalanche);
|
||||
TEST(test_equal_constant_time);
|
||||
printf("%d/%d tests passed\n", tests_passed, tests_run);
|
||||
return tests_passed == tests_run ? 0 : 1;
|
||||
}
|
||||
255
tests/src/test_blockstore.c
Normal file
255
tests/src/test_blockstore.c
Normal file
@@ -0,0 +1,255 @@
|
||||
/* Tests for cross-archive block store. */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <sys/stat.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <process.h>
|
||||
#include <io.h>
|
||||
#include <direct.h>
|
||||
#define getpid _getpid
|
||||
#define rmdir _rmdir
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <dirent.h>
|
||||
#endif
|
||||
#include <uc2/uc2_blockstore.h>
|
||||
#include <uc2/uc2_merkle.h>
|
||||
|
||||
static int tests_run = 0, tests_passed = 0;
|
||||
#define TEST(name) do { tests_run++; printf(" %s: ", #name); name(); tests_passed++; printf("OK\n"); } while (0)
|
||||
|
||||
static char store_path[256];
|
||||
|
||||
/* Base directory for temporary files.
 * Windows: the value of TEMP, else TMP, else "." when neither is set.
 * Elsewhere: always "/tmp". */
static const char *tmpdir(void)
{
#ifdef _WIN32
    static const char *const vars[] = { "TEMP", "TMP" };

    for (size_t k = 0; k < sizeof vars / sizeof vars[0]; k++) {
        const char *value = getenv(vars[k]);
        if (value)
            return value;
    }
    return ".";
#else
    return "/tmp";
#endif
}
|
||||
|
||||
/* Fill buf[0..len) with reproducible pseudo-random bytes.
 * A 32-bit linear congruential generator is stepped once per byte,
 * starting from seed; each output byte is bits 16..23 of the new state. */
static void fill_random(uint8_t *buf, size_t len, uint32_t seed)
{
    uint32_t state = seed;
    uint8_t *out = buf;

    for (size_t left = len; left > 0; left--) {
        state = state * 1103515245u + 12345u;
        *out++ = (uint8_t)(state >> 16);
    }
}
|
||||
|
||||
/* Portable recursive removal for the store's two-level layout.
 *
 * Deletes every entry below path (recursing into subdirectories), then
 * removes path itself. Best effort: failures of remove()/rmdir() are
 * ignored, and a missing path is not an error.
 *
 * Child paths are built in a fixed 512-byte buffer. An entry whose full
 * path does not fit is skipped rather than handed to remove() in
 * truncated form, which could name an unrelated file. */
static void rmtree(const char *path)
{
#ifdef _MSC_VER
    char pattern[512];
    struct _finddata_t fd;
    int plen = snprintf(pattern, sizeof pattern, "%s/*", path);
    if (plen > 0 && (size_t)plen < sizeof pattern) {
        intptr_t h = _findfirst(pattern, &fd);
        if (h != -1) {
            do {
                if (strcmp(fd.name, ".") == 0 || strcmp(fd.name, "..") == 0)
                    continue;
                char sub[512];
                int n = snprintf(sub, sizeof sub, "%s/%s", path, fd.name);
                if (n < 0 || (size_t)n >= sizeof sub)
                    continue; /* truncated: never act on a partial path */
                if (fd.attrib & _A_SUBDIR)
                    rmtree(sub);
                else
                    remove(sub);
            } while (_findnext(h, &fd) == 0);
            _findclose(h);
        }
    }
#else
    DIR *d = opendir(path);
    if (d) {
        struct dirent *e;
        while ((e = readdir(d))) {
            if (strcmp(e->d_name, ".") == 0 || strcmp(e->d_name, "..") == 0)
                continue;
            char sub[512];
            int n = snprintf(sub, sizeof sub, "%s/%s", path, e->d_name);
            if (n < 0 || (size_t)n >= sizeof sub)
                continue; /* truncated: never act on a partial path */
            struct stat st;
            if (stat(sub, &st) == 0 && S_ISDIR(st.st_mode))
                rmtree(sub);
            else
                remove(sub);
        }
        closedir(d);
    }
#endif
    /* Fails harmlessly when entries were skipped or path does not exist. */
    rmdir(path);
}
|
||||
|
||||
static void test_open_close(void)
|
||||
{
|
||||
struct uc2_blockstore bs;
|
||||
{ int _r = uc2_blockstore_open(&bs, store_path); (void)_r; assert(_r == 0); }
|
||||
assert(bs.nblocks == 0);
|
||||
assert(bs.total_bytes == 0);
|
||||
assert(bs.saved_bytes == 0);
|
||||
uc2_blockstore_close(&bs);
|
||||
}
|
||||
|
||||
static void test_ingest_single(void)
|
||||
{
|
||||
uint8_t data[4096];
|
||||
fill_random(data, sizeof data, 0xABCD);
|
||||
|
||||
struct uc2_merkle tree;
|
||||
uc2_merkle_build(&tree, data, sizeof data, 12);
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
uc2_blockstore_open(&bs, store_path);
|
||||
int new_chunks = uc2_blockstore_ingest(&bs, &tree, data, sizeof data);
|
||||
assert(new_chunks == tree.nchunks);
|
||||
assert(bs.nblocks == tree.nchunks);
|
||||
assert(bs.total_bytes == sizeof data);
|
||||
assert(bs.saved_bytes == 0);
|
||||
uc2_blockstore_close(&bs);
|
||||
uc2_merkle_free(&tree);
|
||||
}
|
||||
|
||||
static void test_dedup_identical(void)
|
||||
{
|
||||
/* Ingest same data twice: second ingest should store 0 new chunks */
|
||||
uint8_t data[8192];
|
||||
fill_random(data, sizeof data, 0x1234);
|
||||
|
||||
struct uc2_merkle tree;
|
||||
uc2_merkle_build(&tree, data, sizeof data, 12);
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
uc2_blockstore_open(&bs, store_path);
|
||||
|
||||
int n1 = uc2_blockstore_ingest(&bs, &tree, data, sizeof data);
|
||||
assert(n1 == tree.nchunks);
|
||||
|
||||
int n2 = uc2_blockstore_ingest(&bs, &tree, data, sizeof data);
|
||||
assert(n2 == 0); /* fully deduplicated */
|
||||
assert(bs.saved_bytes == sizeof data);
|
||||
|
||||
printf("(%d chunks, %lld saved) ", tree.nchunks, (long long)bs.saved_bytes);
|
||||
uc2_blockstore_close(&bs);
|
||||
uc2_merkle_free(&tree);
|
||||
}
|
||||
|
||||
static void test_read_back(void)
|
||||
{
|
||||
uint8_t data[2048];
|
||||
fill_random(data, sizeof data, 0x5678);
|
||||
|
||||
struct uc2_merkle tree;
|
||||
uc2_merkle_build(&tree, data, sizeof data, 12);
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
uc2_blockstore_open(&bs, store_path);
|
||||
uc2_blockstore_ingest(&bs, &tree, data, sizeof data);
|
||||
|
||||
/* Read each chunk back and verify */
|
||||
for (int i = 0; i < tree.nchunks; i++) {
|
||||
uint8_t buf[65536];
|
||||
int n = uc2_blockstore_read(&bs, tree.chunks[i].hash, buf, sizeof buf);
|
||||
assert(n == (int)tree.chunks[i].length);
|
||||
{ int _r = memcmp(buf, data + tree.chunks[i].offset, n); (void)_r; assert(_r == 0); }
|
||||
}
|
||||
|
||||
uc2_blockstore_close(&bs);
|
||||
uc2_merkle_free(&tree);
|
||||
}
|
||||
|
||||
static void test_cross_archive_dedup(void)
|
||||
{
|
||||
/* Simulate two archives with shared content */
|
||||
size_t shared_len = 32 * 1024;
|
||||
uint8_t *shared = malloc(shared_len);
|
||||
fill_random(shared, shared_len, 0xFEED);
|
||||
|
||||
/* Archive 1: [shared] */
|
||||
struct uc2_merkle t1;
|
||||
uc2_merkle_build(&t1, shared, shared_len, 12);
|
||||
|
||||
/* Archive 2: [shared + unique(8KB)] */
|
||||
size_t f2_len = shared_len + 8192;
|
||||
uint8_t *f2 = malloc(f2_len);
|
||||
memcpy(f2, shared, shared_len);
|
||||
fill_random(f2 + shared_len, 8192, 0xBEEF);
|
||||
struct uc2_merkle t2;
|
||||
uc2_merkle_build(&t2, f2, f2_len, 12);
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
uc2_blockstore_open(&bs, store_path);
|
||||
|
||||
/* Ingest archive 1 */
|
||||
int n1 = uc2_blockstore_ingest(&bs, &t1, shared, shared_len);
|
||||
int64_t bytes1 = bs.total_bytes;
|
||||
|
||||
/* Ingest archive 2: shared chunks should dedup */
|
||||
int n2 = uc2_blockstore_ingest(&bs, &t2, f2, f2_len);
|
||||
int64_t saved = bs.saved_bytes;
|
||||
|
||||
printf("(a1=%d new, a2=%d new, saved=%lld) ", n1, n2, (long long)saved);
|
||||
assert(n2 < t2.nchunks); /* some chunks deduplicated */
|
||||
assert(saved > 0); /* bytes saved */
|
||||
|
||||
uc2_blockstore_close(&bs);
|
||||
uc2_merkle_free(&t1);
|
||||
uc2_merkle_free(&t2);
|
||||
free(shared);
|
||||
free(f2);
|
||||
}
|
||||
|
||||
static void test_has(void)
|
||||
{
|
||||
uint8_t data[1024];
|
||||
fill_random(data, sizeof data, 0x9999);
|
||||
|
||||
struct uc2_merkle tree;
|
||||
uc2_merkle_build(&tree, data, sizeof data, 12);
|
||||
|
||||
struct uc2_blockstore bs;
|
||||
uc2_blockstore_open(&bs, store_path);
|
||||
|
||||
/* Before ingest: chunk should not exist */
|
||||
assert(!uc2_blockstore_has(&bs, tree.chunks[0].hash));
|
||||
|
||||
uc2_blockstore_ingest(&bs, &tree, data, sizeof data);
|
||||
|
||||
/* After ingest: chunk should exist */
|
||||
assert(uc2_blockstore_has(&bs, tree.chunks[0].hash));
|
||||
|
||||
/* Random hash: should not exist */
|
||||
assert(!uc2_blockstore_has(&bs, 0x1234567890ABCDEFULL));
|
||||
|
||||
uc2_blockstore_close(&bs);
|
||||
uc2_merkle_free(&tree);
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
snprintf(store_path, sizeof store_path, "%s/uc2_blockstore_test_%d",
|
||||
tmpdir(), (int)getpid());
|
||||
|
||||
printf("Block store tests:\n");
|
||||
rmtree(store_path); /* clean start */
|
||||
|
||||
TEST(test_open_close);
|
||||
rmtree(store_path);
|
||||
TEST(test_ingest_single);
|
||||
rmtree(store_path);
|
||||
TEST(test_dedup_identical);
|
||||
rmtree(store_path);
|
||||
TEST(test_read_back);
|
||||
rmtree(store_path);
|
||||
TEST(test_cross_archive_dedup);
|
||||
rmtree(store_path);
|
||||
TEST(test_has);
|
||||
rmtree(store_path);
|
||||
|
||||
printf("%d/%d tests passed\n", tests_passed, tests_run);
|
||||
return tests_passed == tests_run ? 0 : 1;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user