diff --git a/ROADMAP.md b/ROADMAP.md index 9e9cda6..71d982b 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -45,8 +45,11 @@ something no mainstream archiver offers. - [x] MASMETA central directory records with full metadata - [x] Masters compressed with SuperMaster, files compressed with custom master - [x] CLI integration test validating master deduplication round-trip -- [ ] Content-defined chunking (CDC) with rolling hashes (Gear or Rabin-Karp) - replacing fixed-block exact matching +- [x] Content-defined chunking (CDC) library with Gear rolling hash + (`uc2_cdc.h`): chunker with configurable min/max/target sizes, + FNV-1a content addressing, 8 unit tests including dedup detection +- [ ] Integrate CDC into archive creation (replace fixed-block file + grouping with chunk-level dedup) - [ ] Merkle DAG of deduplicated blocks (Git pack-style content addressing) - [ ] Cross-archive and cross-version dedup via shared block stores - [ ] Near-duplicate detection via simhash/minhash for fuzzy dedup diff --git a/docs/roadmap.rst b/docs/roadmap.rst index e6e4263..6684aad 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -13,9 +13,9 @@ root. Key phases: in both directions. Automated DOSBox-X test validates 4+5 files. 3. **Modernized Master-Block Deduplication** — In progress. - Content-fingerprint grouping and custom master-block generation done - (FNV-1a sampling, MASMETA cdir records, SuperMaster-compressed - masters). Remaining: content-defined chunking, Merkle DAG, + Content-fingerprint grouping and custom master-block generation done. + Content-defined chunking (CDC) library added with Gear rolling hash. + Remaining: CDC integration into archive creation, Merkle DAG, cross-archive dedup, near-duplicate detection. 4. **Modern Compression Backends** — ANS entropy coding, diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c325909..bbfe525 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,6 +1,6 @@ # libuc2 — UC2 decompression library -set(LIBUC2_SOURCES src/decompress.c src/compress.c src/uc2_tables.c) +set(LIBUC2_SOURCES src/decompress.c src/compress.c src/uc2_tables.c src/uc2_cdc.c) # Embed super.bin: use .S with .incbin on GCC/Clang, generated C array on MSVC if(MSVC) diff --git a/lib/include/uc2/uc2_cdc.h b/lib/include/uc2/uc2_cdc.h new file mode 100644 index 0000000..6161ade --- /dev/null +++ b/lib/include/uc2/uc2_cdc.h @@ -0,0 +1,53 @@ +/* Content-defined chunking (CDC) for UC2 deduplication. + * + * Uses the Gear rolling hash for fast, content-aware chunk boundary + * detection. Gear hash is a simple multiplicative hash that XORs each + * byte with a pre-computed random table, giving O(1) per-byte updates. + * + * Typical usage: + * struct uc2_chunker c; + * uc2_chunker_init(&c, 13); // avg chunk ~8KB (2^13) + * while (uc2_chunker_next(&c, data, len, &chunk_off, &chunk_len)) + * process(data + chunk_off, chunk_len); + */ + +#ifndef UC2_CDC_H +#define UC2_CDC_H + +#include +#include + +/* Gear hash: fast rolling hash with O(1) per-byte update. */ +uint32_t uc2_gear_hash(const uint8_t *data, size_t len); + +/* CDC chunker state. */ +struct uc2_chunker { + uint32_t mask; /* boundary mask: (1 << bits) - 1 */ + size_t min_chunk; /* minimum chunk size */ + size_t max_chunk; /* maximum chunk size */ + size_t pos; /* current position in data */ +}; + +/* Initialize chunker. + * bits: target chunk size exponent (avg chunk = 2^bits bytes). + * Recommended: 13 (8KB), 14 (16KB), or 15 (32KB). + * min_chunk: minimum chunk size (0 = bits-2 default) + * max_chunk: maximum chunk size (0 = bits+2 default) */ +void uc2_chunker_init(struct uc2_chunker *c, int bits, + size_t min_chunk, size_t max_chunk); + +/* Find the next chunk boundary in [data, data+len). + * Returns 1 and sets *chunk_len if a chunk was found. + * Returns 0 when all data has been consumed (final chunk). + * Call repeatedly until it returns 0. */ +int uc2_chunker_next(struct uc2_chunker *c, + const uint8_t *data, size_t len, + size_t *chunk_off, size_t *chunk_len); + +/* Reset chunker for a new data stream. */ +void uc2_chunker_reset(struct uc2_chunker *c); + +/* FNV-1a hash for chunk content addressing. */ +uint32_t uc2_fnv1a(const uint8_t *data, size_t len); + +#endif diff --git a/lib/src/uc2_cdc.c b/lib/src/uc2_cdc.c new file mode 100644 index 0000000..43fdc84 --- /dev/null +++ b/lib/src/uc2_cdc.c @@ -0,0 +1,122 @@ +/* Content-defined chunking (CDC) for UC2 deduplication. + * + * Gear hash: each byte updates the hash by shifting left and XORing + * with a pre-computed random table entry. This gives uniform + * distribution and O(1) per-byte cost. A chunk boundary is detected + * when (hash & mask) == 0, giving an average chunk size of 2^bits. + * + * Reference: "A Framework for Analyzing and Improving Content-Based + * Chunking Algorithms" (Xia et al., HP Labs, 2005). + */ + +#include "uc2/uc2_cdc.h" +#include + +/* Gear hash lookup table: 256 random 32-bit values. + Generated from a PRNG seeded with the string "UC2 Gear CDC". */ +static const uint32_t gear_table[256] = { + 0x5c27b2e4, 0x8a3b9c01, 0xf7e52d9f, 0x3d14a867, 0xc6f893b2, 0x91d047e5, 0x2e6b1fa8, 0xe4a37c63, + 0x7f582b1d, 0xb90c64f6, 0x46d1e823, 0x13a95f7b, 0xd87e24c9, 0xa5430168, 0x6c9fb3d4, 0x028e7a1f, + 0xfb614d93, 0x3742c856, 0x84b50fea, 0xc1d6937e, 0x590a2eb1, 0xaef41c67, 0x67c385d2, 0x0dbf694a, + 0xe2984513, 0x76ab3dc8, 0x4517e29f, 0xb86a0c54, 0x1e23f7b6, 0xd3c58e41, 0x8a71b02d, 0xf09d43e8, + 0x2b06d175, 0x9f48a623, 0xc3e71bdf, 0x54b2f906, 0x1d65c48a, 0xe83a074b, 0x72196ed3, 0xa4de8b17, + 0x3fac5264, 0xd10738b9, 0x6ec4a1f5, 0x8593d642, 0x4a7f1d8e, 0xf6b2e071, 0x2748bc3a, 0xc981459d, + 0x50f37e26, 0xbe269ac3, 0x13da4587, 0x9c07b1f4, 0x614ed368, 0xa7bc2f15, 0xd4f56c89, 0x38a19047, + 0x876cb5e2, 0xe53d48ab, 0x42801d76, 0xfc17a93c, 0x0b9e62d1, 0x7654cf08, 0xcda37b94, 0x19e80e5f, + 0xab3c91d7, 0x6271f4a6, 0xd8bf2843, 0x3506de71, 0xf94a637b, 0x8ed5b02c, 0x471c89e5, 0x0a63d4f9, + 0xc4982e17, 0x7db15a8c, 0x12ef4360, 0xb637c9a5, 0x5f740ed8, 0xe1a8b524, 0x28c96f13, 0x93014876, + 0xdae27b9d, 0x3d8f15c2, 0x815ca04e, 0xf47e6d39, 0x4b93d2f7, 0xa620be81, 0x69d7014a, 0xc5b4f836, + 0x1c486aeb, 0x70a5931d, 0xef12dc64, 0x8279b508, 0xb6c34a9f, 0x57e82173, 0x0a1f7dc6, 0xde64c952, + 0x43b0a819, 0xad5e37e4, 0x6897cb71, 0xf1240f9c, 0x342bc6a5, 0x9d1852e8, 0xc7fa9b34, 0x586d4e07, + 0xb2a1d3f6, 0x2536ec89, 0x7ecb1047, 0xe408a5bd, 0x0f957e62, 0xd3ca81a0, 0x917f2d14, 0xfa42b6d9, + 0x45d968b3, 0xbbe50c37, 0x1274f1e5, 0x6a9e3db8, 0xcf538241, 0x87a0c96f, 0x5eb75423, 0x31dc0fa7, + 0xa41b63c4, 0xd96fae58, 0x4cd2891e, 0xf5863072, 0x0b17e4a6, 0x7c60bd9d, 0xe39845c1, 0xb85e2f17, + 0x21a37689, 0x9e4fc153, 0xd702dba4, 0x5384e96f, 0xaf51067c, 0x64c83db1, 0xc2e7f548, 0x3a198c24, + 0xf06b47d2, 0x85d2a19e, 0x4f3e5c63, 0x19c78b07, 0xe6a402db, 0x7b59d3f4, 0xbd146ea5, 0x0e82c917, + 0xc3f01b76, 0x5da564a9, 0x32b9f852, 0xa847201c, 0x6e9cb7e3, 0x81635d38, 0x470ad1bf, 0xfc718946, + 0x16ce3fa2, 0x9ab045e7, 0xd52c6814, 0x43f9bc79, 0xb8e213a6, 0x2f174e51, 0x657d90cd, 0xcda4f738, + 0x0198269b, 0x7e3cdb54, 0xe26f8013, 0x39c154e7, 0xa45db39c, 0xd792e841, 0x58067f2b, 0xb3adc466, + 0x1b41a5d0, 0x76e83917, 0xcf250b74, 0x84b7d2a8, 0x4dc69e53, 0xf01a47bf, 0x28f361c4, 0x93758c19, + 0xe5c24037, 0x3a8ef956, 0x7e51b682, 0xc107da4f, 0x5269031d, 0xad84c7e6, 0x6eb3589a, 0x0f4ea143, + 0xd8356fd7, 0x417c9e2b, 0xba20d364, 0x25f745a8, 0xf6c11e79, 0x7db8a30c, 0x830f52b4, 0x49617cd9, + 0x1cda0e63, 0xa7b23148, 0xde46c5f2, 0x63895db7, 0xb21ea481, 0x574c6f0e, 0x0a8392c5, 0xc5f7b84a, + 0x380e41d6, 0xed72d923, 0x91c5a687, 0x4a19f054, 0xf4a83b19, 0x673d8ec2, 0xbce1470b, 0x01567da4, + 0xd8abc196, 0x2490534e, 0x7de7bf83, 0xc3348217, 0x5f629ed5, 0xa6b70468, 0x1c43d7a9, 0x89f56b30, + 0x4508cfe1, 0xf27a1694, 0xb81e5d47, 0x05a9c3ba, 0xdac28f62, 0x61b740d5, 0x9e3f254c, 0x37d4a8e1, + 0x8b612c97, 0xc419f035, 0x5d8e7ba6, 0xa2f3d14c, 0x16458db9, 0xeb27c673, 0x70da0e28, 0xbf9c53e4, + 0x42a1679f, 0xde38b102, 0x95c42f56, 0x037bd8a1, 0xfc1645ed, 0x69ea9cb3, 0xad5f0374, 0x3487e1c9, + 0xc0b29d15, 0x5e617a48, 0x8714c6bf, 0x1da93273, 0xf2d5e804, 0x764b5f96, 0xab86031d, 0x41c8b4e2, + 0xd53a6927, 0x0f91dc83, 0xe8450b5a, 0x72f7a1c6, 0xbc234d90, 0x2dbe7641, 0x960cf5bd, 0x5b618a49, +}; + +uint32_t uc2_gear_hash(const uint8_t *data, size_t len) +{ + uint32_t h = 0; + for (size_t i = 0; i < len; i++) + h = (h << 1) + gear_table[data[i]]; + return h; +} + +void uc2_chunker_init(struct uc2_chunker *c, int bits, + size_t min_chunk, size_t max_chunk) +{ + if (bits < 8) bits = 8; + if (bits > 20) bits = 20; + c->mask = ((uint32_t)1 << bits) - 1; + c->min_chunk = min_chunk ? min_chunk : ((size_t)1 << (bits - 2)); + c->max_chunk = max_chunk ? max_chunk : ((size_t)1 << (bits + 2)); + c->pos = 0; +} + +void uc2_chunker_reset(struct uc2_chunker *c) +{ + c->pos = 0; +} + +int uc2_chunker_next(struct uc2_chunker *c, + const uint8_t *data, size_t len, + size_t *chunk_off, size_t *chunk_len) +{ + if (c->pos >= len) + return 0; + + size_t start = c->pos; + size_t end = start + c->max_chunk; + if (end > len) end = len; + + /* Skip minimum chunk size before checking boundaries */ + size_t scan = start + c->min_chunk; + if (scan > end) scan = end; + + uint32_t h = 0; + /* Prime the hash over the min_chunk prefix */ + for (size_t i = start; i < scan; i++) + h = (h << 1) + gear_table[data[i]]; + + /* Scan for boundary: (hash & mask) == 0 */ + for (size_t i = scan; i < end; i++) { + h = (h << 1) + gear_table[data[i]]; + if ((h & c->mask) == 0) { + *chunk_off = start; + *chunk_len = i + 1 - start; + c->pos = i + 1; + return 1; + } + } + + /* No boundary found: emit max_chunk or remaining data */ + *chunk_off = start; + *chunk_len = end - start; + c->pos = end; + return (c->pos < len) ? 1 : 0; +} + +uint32_t uc2_fnv1a(const uint8_t *data, size_t len) +{ + uint32_t h = 2166136261u; + for (size_t i = 0; i < len; i++) { + h ^= data[i]; + h *= 16777619u; + } + return h; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index badca22..2afda91 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -45,6 +45,12 @@ add_test(NAME cli_dirs -P ${CMAKE_CURRENT_SOURCE_DIR}/test_cli_dirs.cmake ) +add_executable(test_cdc src/test_cdc.c) +target_link_libraries(test_cdc PRIVATE uc2) +target_include_directories(test_cdc PRIVATE "${PROJECT_BINARY_DIR}/lib") +target_compile_features(test_cdc PRIVATE c_std_99) +add_test(NAME cdc COMMAND test_cdc) + # Cross-tool round-trip: UC2 v3 <-> original uc2pro.exe via DOSBox-X add_test(NAME roundtrip_dosbox COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/scripts/roundtrip_dosbox.sh diff --git a/tests/src/test_cdc.c b/tests/src/test_cdc.c new file mode 100644 index 0000000..adc4b9f --- /dev/null +++ b/tests/src/test_cdc.c @@ -0,0 +1,242 @@ +/* Tests for content-defined chunking (CDC). */ + +#include +#include +#include +#include +#include + +static int tests_run = 0; +static int tests_passed = 0; + +#define TEST(name) do { \ + tests_run++; \ + printf(" %s: ", #name); \ + name(); \ + tests_passed++; \ + printf("OK\n"); \ +} while (0) + +static void test_gear_hash_deterministic(void) +{ + uint8_t data[] = "Hello, World!"; + uint32_t h1 = uc2_gear_hash(data, sizeof data - 1); + uint32_t h2 = uc2_gear_hash(data, sizeof data - 1); + assert(h1 == h2); + assert(h1 != 0); +} + +static void test_gear_hash_differs(void) +{ + uint8_t a[] = "AAAA"; + uint8_t b[] = "BBBB"; + assert(uc2_gear_hash(a, 4) != uc2_gear_hash(b, 4)); +} + +static void test_fnv1a(void) +{ + uint8_t data[] = "test"; + uint32_t h = uc2_fnv1a(data, 4); + assert(h != 0); + assert(h == uc2_fnv1a(data, 4)); +} + +static void test_chunker_single_small(void) +{ + /* Data smaller than min_chunk: one chunk */ + uint8_t data[100]; + memset(data, 'A', sizeof data); + + struct uc2_chunker c; + uc2_chunker_init(&c, 13, 0, 0); /* avg 8KB, min ~2KB */ + + size_t off, len; + int got = uc2_chunker_next(&c, data, sizeof data, &off, &len); + assert(got == 0); /* final chunk */ + assert(off == 0); + assert(len == sizeof data); +} + +static void test_chunker_covers_all_data(void) +{ + /* Generate pseudo-random data to force boundary detection */ + size_t total = 256 * 1024; /* 256 KB */ + uint8_t *data = malloc(total); + assert(data); + uint32_t rng = 0xDEADBEEF; + for (size_t i = 0; i < total; i++) { + rng = rng * 1103515245 + 12345; + data[i] = (uint8_t)(rng >> 16); + } + + struct uc2_chunker c; + uc2_chunker_init(&c, 13, 0, 0); + + size_t total_chunked = 0; + int chunks = 0; + size_t off, len; + while (uc2_chunker_next(&c, data, total, &off, &len)) { + assert(off == total_chunked); + assert(len > 0); + total_chunked += len; + chunks++; + } + /* Handle the final chunk */ + total_chunked += len; + chunks++; + + assert(total_chunked == total); + assert(chunks > 1); /* 256KB should produce multiple 8KB-ish chunks */ + + free(data); +} + +static void test_chunker_respects_min_max(void) +{ + size_t total = 128 * 1024; + uint8_t *data = malloc(total); + assert(data); + uint32_t rng = 0x12345678; + for (size_t i = 0; i < total; i++) { + rng = rng * 1103515245 + 12345; + data[i] = (uint8_t)(rng >> 16); + } + + size_t min_chunk = 2048; + size_t max_chunk = 32768; + struct uc2_chunker c; + uc2_chunker_init(&c, 13, min_chunk, max_chunk); + + size_t off, len; + while (uc2_chunker_next(&c, data, total, &off, &len)) { + assert(len >= min_chunk || off + len == total); + assert(len <= max_chunk); + } + /* Final chunk can be smaller than min */ + assert(len <= max_chunk); + + free(data); +} + +static void test_chunker_content_defined(void) +{ + /* Same data inserted at different offsets should produce + the same chunk boundaries (shifted by the offset). */ + size_t base_len = 64 * 1024; + uint8_t *base = malloc(base_len); + assert(base); + uint32_t rng = 0xCAFEBABE; + for (size_t i = 0; i < base_len; i++) { + rng = rng * 1103515245 + 12345; + base[i] = (uint8_t)(rng >> 16); + } + + /* Chunk the base data */ + struct uc2_chunker c; + uc2_chunker_init(&c, 12, 0, 0); + + int base_n = 0; + size_t off, len; + while (uc2_chunker_next(&c, base, base_len, &off, &len) && base_n < 99) + base_n++; + base_n++; + + /* Prepend 1000 bytes of garbage, then the same data */ + size_t pad = 1000; + uint8_t *shifted = malloc(pad + base_len); + assert(shifted); + memset(shifted, 0xFF, pad); + memcpy(shifted + pad, base, base_len); + + uc2_chunker_reset(&c); + /* Skip the padded portion's chunks */ + size_t total = 0; + int found_base = 0; + while (uc2_chunker_next(&c, shifted, pad + base_len, &off, &len)) { + total += len; + if (off >= pad && !found_base) { + found_base = 1; + /* After the padding chunk(s), subsequent chunks of the + base data should eventually align */ + } + } + total += len; + assert(total == pad + base_len); + assert(found_base); + + free(base); + free(shifted); +} + +static void test_chunker_dedup_detection(void) +{ + /* Two files with a shared 256KB block: CDC should find matching chunks. + The shared region is large enough that after the Gear hash state + resets (~32 bytes), boundaries align between both files. */ + size_t shared_len = 256 * 1024; + size_t unique_a = 4096; + size_t unique_b = 8192; + + uint8_t *shared = malloc(shared_len); + uint8_t *file_a = malloc(unique_a + shared_len); + uint8_t *file_b = malloc(shared_len + unique_b); + assert(shared && file_a && file_b); + + uint32_t rng = 0xFEEDFACE; + for (size_t i = 0; i < shared_len; i++) { + rng = rng * 1103515245 + 12345; + shared[i] = (uint8_t)(rng >> 16); + } + for (size_t i = 0; i < unique_a; i++) file_a[i] = (uint8_t)i; + memcpy(file_a + unique_a, shared, shared_len); + memcpy(file_b, shared, shared_len); + for (size_t i = 0; i < unique_b; i++) file_b[shared_len + i] = (uint8_t)(i ^ 0xAA); + + struct uc2_chunker c; + uc2_chunker_init(&c, 13, 0, 0); + + /* Hash all chunks from file_a */ + uint32_t hashes_a[200]; + int n_a = 0; + size_t off, len; + while (uc2_chunker_next(&c, file_a, unique_a + shared_len, &off, &len) && n_a < 199) + hashes_a[n_a++] = uc2_fnv1a(file_a + off, len); + hashes_a[n_a++] = uc2_fnv1a(file_a + off, len); + + /* Hash all chunks from file_b */ + uc2_chunker_reset(&c); + uint32_t hashes_b[200]; + int n_b = 0; + while (uc2_chunker_next(&c, file_b, shared_len + unique_b, &off, &len) && n_b < 199) + hashes_b[n_b++] = uc2_fnv1a(file_b + off, len); + hashes_b[n_b++] = uc2_fnv1a(file_b + off, len); + + /* At least one chunk hash should appear in both files */ + int matches = 0; + for (int i = 0; i < n_a; i++) + for (int j = 0; j < n_b; j++) + if (hashes_a[i] == hashes_b[j]) + matches++; + + assert(matches > 0); + printf("(%d chunks A, %d chunks B, %d shared) ", n_a, n_b, matches); + + free(shared); + free(file_a); + free(file_b); +} + +int main(void) +{ + printf("CDC tests:\n"); + TEST(test_gear_hash_deterministic); + TEST(test_gear_hash_differs); + TEST(test_fnv1a); + TEST(test_chunker_single_small); + TEST(test_chunker_covers_all_data); + TEST(test_chunker_respects_min_max); + TEST(test_chunker_content_defined); + TEST(test_chunker_dedup_detection); + printf("%d/%d tests passed\n", tests_passed, tests_run); + return tests_passed == tests_run ? 0 : 1; +}