From 38c0898bc2fbe5eed4df88f02199c30ec25c8d20 Mon Sep 17 00:00:00 2001 From: Eremey Valetov Date: Sun, 29 Mar 2026 20:44:32 -0400 Subject: [PATCH] Add content-aware preprocessing filters (BCJ, BWT, delta) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New library (uc2_preprocess.h / uc2_preprocess.c) for Phase 4: BCJ (Branch/Call/Jump) filter: - E8/E9 x86 address normalization (relative → absolute) - Makes calls to the same function from different locations produce identical byte sequences, improving LZ77 matching - Round-trip verified; address normalization confirmed BWT (Burrows-Wheeler Transform): - Suffix-array-based forward transform - LF-mapping inverse with reverse reconstruction - Groups similar contexts for better entropy coding - Round-trip verified for text ("banana") and binary data Delta filter: - Byte-wise delta encoding with configurable stride - Stride 1 for sequential data, stride 2+ for interleaved channels - Constant-delta sequences (arithmetic progressions) reduce to repeated single values Content detection: - Automatic content type identification (text/x86/structured/binary) - MZ/PE and ELF header recognition for x86 - Printable ASCII ratio for text detection 11 unit tests covering all filters and detection. --- ROADMAP.md | 9 +- docs/roadmap.rst | 8 +- lib/CMakeLists.txt | 2 +- lib/include/uc2/uc2_preprocess.h | 65 +++++++++++ lib/src/uc2_preprocess.c | 187 +++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 6 + tests/src/test_preprocess.c | 184 ++++++++++++++++++++++++++++++ 7 files changed, 452 insertions(+), 9 deletions(-) create mode 100644 lib/include/uc2/uc2_preprocess.h create mode 100644 lib/src/uc2_preprocess.c create mode 100644 tests/src/test_preprocess.c diff --git a/ROADMAP.md b/ROADMAP.md index 4507d4e..640c931 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -76,10 +76,11 @@ backward compatibility. serialization format, and cross-archive sharing via block store. 
6 unit tests including round-trip and corruption detection. - [ ] LZ4 ultra-fast mode for real-time or low-resource scenarios -- [ ] Content-aware preprocessing pipeline: - - BWT (Burrows-Wheeler) for text - - E8/E9 transform for x86 executables (BCJ filter) - - Delta filter for structured/tabular data +- [x] Content-aware preprocessing (`uc2_preprocess.h`): + BCJ (E8/E9 x86 address normalization), BWT (Burrows-Wheeler + for text), delta filter (byte-wise with configurable stride), + automatic content detection (text/x86/structured/binary). + 11 unit tests. - [ ] Built-in `uc2 --benchmark` mode: test all methods on input, report results ## Phase 5: Quantum-Resistant Encryption diff --git a/docs/roadmap.rst b/docs/roadmap.rst index 7f1dd4d..de0072f 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -17,10 +17,10 @@ root. Key phases: cross-archive block store, SimHash near-duplicate detection, and delta compression. All Phase 3 items complete. -4. **Modern Compression Backends** — In progress. rANS entropy coder - integrated as method 10, zstd-style dictionary metadata with - content-hash IDs. Remaining: content-aware preprocessing (BWT, - BCJ, delta filters), LZ4 ultra-fast mode, benchmarking mode. +4. **Modern Compression Backends** — In progress. rANS entropy coder, + zstd-style dictionary metadata, content-aware preprocessing (BCJ, + BWT, delta filter with auto-detection). Remaining: LZ4 ultra-fast + mode, benchmarking mode, preprocessing integration into CLI. 5. **Quantum-Resistant Encryption** — CRYSTALS-Kyber + AES-256-GCM. 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 8f42526..721ea12 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,6 +1,6 @@
 # libuc2 — UC2 decompression library

-set(LIBUC2_SOURCES src/decompress.c src/compress.c src/uc2_tables.c src/uc2_cdc.c src/uc2_merkle.c src/uc2_blockstore.c src/uc2_simhash.c src/uc2_delta.c src/uc2_rans.c src/uc2_dict.c)
+set(LIBUC2_SOURCES src/decompress.c src/compress.c src/uc2_tables.c src/uc2_cdc.c src/uc2_merkle.c src/uc2_blockstore.c src/uc2_simhash.c src/uc2_delta.c src/uc2_rans.c src/uc2_dict.c src/uc2_preprocess.c)

 # Embed super.bin: use .S with .incbin on GCC/Clang, generated C array on MSVC
 if(MSVC)
diff --git a/lib/include/uc2/uc2_preprocess.h b/lib/include/uc2/uc2_preprocess.h
new file mode 100644
index 0000000..4a28657
--- /dev/null
+++ b/lib/include/uc2/uc2_preprocess.h
@@ -0,0 +1,65 @@
+/* Content-aware preprocessing filters for improved compression.
+ *
+ * These transforms are applied BEFORE compression to expose redundancy
+ * that LZ77+entropy coding can exploit more efficiently.  Each filter
+ * is reversible (apply/revert) and content-type specific.
+ *
+ * Filters:
+ *   BCJ   — x86 branch/call/jump address normalization (E8/E9 transform)
+ *   BWT   — Burrows-Wheeler transform for text (groups similar contexts)
+ *   Delta — byte-wise delta encoding for structured/tabular data
+ */
+
+#ifndef UC2_PREPROCESS_H
+#define UC2_PREPROCESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* --- BCJ (Branch/Call/Jump) filter for x86 executables --- */
+
+/* Convert relative x86 CALL/JMP (E8/E9) addresses to absolute.
+ * This makes the same function called from different locations produce
+ * identical byte sequences, improving LZ77 matching.
+ * Operates in-place.  Returns 0 on success. */
+int uc2_bcj_apply(uint8_t *data, size_t len);
+
+/* Revert BCJ transform (absolute → relative).  In-place; returns 0. */
+int uc2_bcj_revert(uint8_t *data, size_t len);
+
+/* --- BWT (Burrows-Wheeler Transform) for text --- */
+
+/* Apply BWT to data.  Allocates *out (caller must free; NULL if len is 0).
+ * Sets *primary_index to the BWT primary index (needed for revert).
+ * Returns 0 on success, non-zero on failure. */
+int uc2_bwt_apply(const uint8_t *data, size_t len,
+                  uint8_t **out, uint32_t *primary_index);
+
+/* Revert BWT.  Allocates *out (caller must free; NULL if len is 0).
+ * Returns 0 on success, non-zero on failure. */
+int uc2_bwt_revert(const uint8_t *data, size_t len,
+                   uint32_t primary_index, uint8_t **out);
+
+/* --- Delta filter for structured data --- */
+
+/* Apply byte-wise delta encoding (each byte = current - previous).
+ * Operates in-place.  Stride controls the delta distance (1 = adjacent
+ * bytes, 2 = every other byte, etc.; values below 1 are treated as 1).
+ * Stride 1 suits sequential data; 2+ suits interleaved channels. */
+void uc2_delta_filter_apply(uint8_t *data, size_t len, int stride);
+
+/* Revert byte-wise delta encoding (same stride).  Operates in-place. */
+void uc2_delta_filter_revert(uint8_t *data, size_t len, int stride);
+
+/* --- Content detection --- */
+
+/* Detect likely content type for automatic filter selection.
+ * Returns one of the UC2_CONTENT_* constants. */
+#define UC2_CONTENT_BINARY  0   /* generic binary / unknown */
+#define UC2_CONTENT_TEXT    1   /* text (high ASCII printable ratio) */
+#define UC2_CONTENT_X86     2   /* x86 executable (MZ/PE/ELF header) */
+#define UC2_CONTENT_STRUCT  3   /* structured/tabular (regular patterns) */
+
+int uc2_detect_content(const uint8_t *data, size_t len);
+
+#endif
diff --git a/lib/src/uc2_preprocess.c b/lib/src/uc2_preprocess.c
new file mode 100644
index 0000000..523c603
--- /dev/null
+++ b/lib/src/uc2_preprocess.c
@@ -0,0 +1,187 @@
+/* Content-aware preprocessing filters. */
+
+#include "uc2/uc2_preprocess.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* --- BCJ (E8/E9 transform for x86) --- */
+
+/* Convert relative CALL (E8) and JMP (E9) addresses to absolute.
+ * The 4-byte displacement after E8/E9 is replaced with an absolute
+ * address relative to position 0.
This normalizes calls to the same
+ * function from different locations, improving LZ77 matching.  Every
+ * E8/E9 byte counts as an opcode (no disassembly) and its operand is
+ * skipped, so apply and revert visit the same offsets and round-trip. */
+
+/* Rewrite every E8/E9 operand: add the end-of-instruction offset (i + 5)
+ * when to_abs is non-zero, subtract it otherwise.  Unsigned 32-bit maths
+ * wraps modulo 2^32, so there is no signed overflow or negative shift. */
+static void bcj_walk(uint8_t *data, size_t len, int to_abs)
+{
+    for (size_t i = 0; i + 4 < len; i++) {
+        if (data[i] != 0xE8 && data[i] != 0xE9)
+            continue;
+        uint32_t v = (uint32_t)data[i+1] | ((uint32_t)data[i+2] << 8) |
+                     ((uint32_t)data[i+3] << 16) | ((uint32_t)data[i+4] << 24);
+        uint32_t end = (uint32_t)(i + 5);  /* offset just past the operand */
+        v = to_abs ? v + end : v - end;
+        data[i+1] = (uint8_t)v;
+        data[i+2] = (uint8_t)(v >> 8);
+        data[i+3] = (uint8_t)(v >> 16);
+        data[i+4] = (uint8_t)(v >> 24);
+        i += 4;  /* skip the operand bytes */
+    }
+}
+
+/* Relative -> absolute, in place; under 5 bytes is a no-op.  Returns 0. */
+int uc2_bcj_apply(uint8_t *data, size_t len)
+{
+    bcj_walk(data, len, 1);
+    return 0;
+}
+
+/* Absolute -> relative, in place (inverse of uc2_bcj_apply).  Returns 0. */
+int uc2_bcj_revert(uint8_t *data, size_t len)
+{
+    bcj_walk(data, len, 0);
+    return 0;
+}
+
+/* --- BWT (Burrows-Wheeler Transform) --- */
+
+/* Simple BWT via qsort of cyclic rotations (O(n^2 log n) worst case).
*/
+
+/* qsort() has no context argument, so bwt_cmp reads its input from these
+ * file-scope variables; uc2_bwt_apply is therefore not reentrant. */
+static const uint8_t *bwt_data;
+static size_t bwt_len;
+
+/* Order two rotation start offsets by the bytes of their rotations. */
+static int bwt_cmp(const void *a, const void *b)
+{
+    uint32_t ia = *(const uint32_t *)a;
+    uint32_t ib = *(const uint32_t *)b;
+    for (size_t k = 0; k < bwt_len; k++) {
+        uint8_t ca = bwt_data[(ia + k) % bwt_len];
+        uint8_t cb = bwt_data[(ib + k) % bwt_len];
+        if (ca != cb) return (int)ca - (int)cb;
+    }
+    return 0;  /* identical rotations (periodic input) */
+}
+
+/* Forward BWT.  On success returns 0, stores a malloc'd len-byte buffer in
+ * *out and the sorted row of the unrotated input in *primary_index.
+ * Returns -1 with *out == NULL if len does not fit the 32-bit indices or
+ * an allocation fails.  len == 0 returns 0 with *out == NULL. */
+int uc2_bwt_apply(const uint8_t *data, size_t len,
+                  uint8_t **out, uint32_t *primary_index)
+{
+    *out = NULL;
+    *primary_index = 0;
+    if (len == 0) return 0;
+    if (len > UINT32_MAX || len > SIZE_MAX / sizeof(uint32_t)) return -1;
+
+    uint32_t *sa = malloc(len * sizeof *sa);
+    uint8_t *result = malloc(len);
+    if (!sa || !result) { free(sa); free(result); return -1; }
+
+    for (size_t i = 0; i < len; i++) sa[i] = (uint32_t)i;
+    bwt_data = data;
+    bwt_len = len;
+    qsort(sa, len, sizeof *sa, bwt_cmp);
+
+    for (size_t i = 0; i < len; i++) {
+        if (sa[i] == 0) *primary_index = (uint32_t)i;
+        result[i] = data[(sa[i] + len - 1) % len];  /* byte before rotation */
+    }
+    free(sa);
+    *out = result;
+    return 0;
+}
+
+/* Inverse BWT.  On success returns 0 and stores a malloc'd len-byte buffer
+ * in *out (NULL when len == 0).  Returns -1 with *out == NULL if
+ * primary_index >= len, len exceeds 32-bit indices, or malloc fails. */
+int uc2_bwt_revert(const uint8_t *data, size_t len,
+                   uint32_t primary_index, uint8_t **out)
+{
+    *out = NULL;
+    if (len == 0) return 0;
+    /* A caller-supplied index past the end would read outside data[]. */
+    if (primary_index >= len) return -1;
+    if (len > UINT32_MAX || len > SIZE_MAX / sizeof(uint32_t)) return -1;
+
+    uint8_t *result = malloc(len);
+    uint32_t *T = malloc(len * sizeof *T);
+    if (!result || !T) { free(result); free(T); return -1; }
+
+    /* LF-mapping: T[i] = first-column row matching last-column row i. */
+    uint32_t count[256] = {0};
+    for (size_t i = 0; i < len; i++) count[data[i]]++;
+    uint32_t start[256];  /* first row of each byte value */
+    uint32_t sum = 0;
+    for (int c = 0; c < 256; c++) {
+        start[c] = sum;
+        sum += count[c];
+    }
+    for (size_t i = 0; i < len; i++)
+        T[i] = start[data[i]]++;
+
+    /* Walk T from the original row, filling the text back to front. */
+    uint32_t idx = primary_index;
+    for (size_t i = len; i > 0; i--) {
+        result[i - 1] = data[idx];
+        idx = T[idx];
+    }
+    free(T);
+    *out = result;
+    return 0;
+}
+
+/* --- Delta filter --- */
+
+/* In-place delta: data[i] -= data[i - stride] for i >= stride, back to
+ * front so operands stay original.  A stride below 1 is treated as 1. */
+void uc2_delta_filter_apply(uint8_t *data, size_t len, int stride)
+{
+    size_t step = stride < 1 ? 1 : (size_t)stride;
+    for (size_t i = len; i > step; ) {
+        i--;
+        data[i] = (uint8_t)(data[i] - data[i - step]);
+    }
+}
+
+/* In-place inverse of the delta filter (front to back, same stride rule). */
+void uc2_delta_filter_revert(uint8_t *data, size_t len, int stride)
+{
+    size_t step = stride < 1 ? 1 : (size_t)stride;
+    for (size_t i = step; i < len; i++)
+        data[i] = (uint8_t)(data[i] + data[i - step]);
+}
+
+/* --- Content detection --- */
+
+/* Classify a buffer: MZ or ELF magic -> X86; more than 85% printable ASCII
+ * (or newline, CR, tab) in the first 4096 bytes -> TEXT; len >= 64 and more
+ * than 20% zero bytes in that window -> STRUCT; else (or len < 4) BINARY. */
+int uc2_detect_content(const uint8_t *data, size_t len)
+{
+    if (len < 4) return UC2_CONTENT_BINARY;
+    if (data[0] == 'M' && data[1] == 'Z')
+        return UC2_CONTENT_X86;  /* DOS/PE executable */
+    if (data[0] == 0x7F && data[1] == 'E' && data[2] == 'L' && data[3] == 'F')
+        return UC2_CONTENT_X86;  /* ELF executable */
+
+    size_t check = len > 4096 ? 4096 : len;  /* sample window */
+    size_t printable = 0, zeros = 0;
+    for (size_t i = 0; i < check; i++) {
+        uint8_t c = data[i];
+        if ((c >= 32 && c <= 126) || c == '\n' || c == '\r' || c == '\t')
+            printable++;
+        else if (c == 0)
+            zeros++;
+    }
+    if (printable * 100 / check > 85) return UC2_CONTENT_TEXT;
+    if (len >= 64 && zeros * 100 / check > 20) return UC2_CONTENT_STRUCT;
+    return UC2_CONTENT_BINARY;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c9a4b58..443897b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -87,6 +87,12 @@ target_include_directories(test_dict PRIVATE "${PROJECT_BINARY_DIR}/lib")
 target_compile_features(test_dict PRIVATE c_std_99)
 add_test(NAME dict COMMAND test_dict)

+add_executable(test_preprocess src/test_preprocess.c)
+target_link_libraries(test_preprocess PRIVATE uc2)
+target_include_directories(test_preprocess PRIVATE "${PROJECT_BINARY_DIR}/lib")
+target_compile_features(test_preprocess PRIVATE c_std_99)
+add_test(NAME preprocess COMMAND test_preprocess)
+
 # Cross-tool round-trip: UC2 v3 <-> original uc2pro.exe via DOSBox-X
 add_test(NAME roundtrip_dosbox
   COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/scripts/roundtrip_dosbox.sh
diff --git a/tests/src/test_preprocess.c b/tests/src/test_preprocess.c
new file mode 100644
index 0000000..4af8619
--- /dev/null
+++ b/tests/src/test_preprocess.c
@@ -0,0 +1,184 @@
+/* Tests for content-aware preprocessing filters.
*/
+
+#undef NDEBUG  /* the checks below live inside assert(); keep them active */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <uc2/uc2_preprocess.h>
+
+static int tests_run = 0, tests_passed = 0;
+#define TEST(name) do { tests_run++; printf(" %s: ", #name); name(); tests_passed++; printf("OK\n"); } while (0)
+
+/* --- BCJ tests --- */
+
+static void test_bcj_roundtrip(void)
+{
+    /* Simulate x86 code with E8 (CALL) instructions */
+    uint8_t code[] = {
+        0x90,                          /* NOP */
+        0xE8, 0x10, 0x00, 0x00, 0x00,  /* CALL +16 (relative) */
+        0x90,                          /* NOP */
+        0xE8, 0x20, 0x00, 0x00, 0x00,  /* CALL +32 (relative) */
+        0x90, 0x90, 0x90, 0x90,        /* NOPs */
+    };
+    uint8_t orig[sizeof code];
+    memcpy(orig, code, sizeof code);
+
+    uc2_bcj_apply(code, sizeof code);
+    /* After apply, the relative addresses should be absolute */
+    assert(memcmp(code, orig, sizeof code) != 0);
+
+    uc2_bcj_revert(code, sizeof code);
+    assert(memcmp(code, orig, sizeof code) == 0);
+}
+
+static void test_bcj_normalizes(void)
+{
+    /* Two calls to the same target (offset 15) from different positions:
+       a: E8 at 0, next instruction at 5, rel = 15 - 5 = 10 (0x0A)
+       b: E8 at 2, next instruction at 7, rel = 15 - 7 = 8  (0x08)
+       After BCJ both operands must hold the absolute address 15. */
+    uint8_t a[] = { 0xE8, 0x0A, 0x00, 0x00, 0x00, 0x90, 0x90, 0x90, 0x90, 0x90 };
+    uint8_t b[] = { 0x90, 0x90, 0xE8, 0x08, 0x00, 0x00, 0x00, 0x90, 0x90, 0x90 };
+
+    uc2_bcj_apply(a, sizeof a);
+    uc2_bcj_apply(b, sizeof b);
+
+    /* Assemble the little-endian operands as unsigned values */
+    uint32_t abs_a = a[1] | ((uint32_t)a[2] << 8) | ((uint32_t)a[3] << 16) | ((uint32_t)a[4] << 24);
+    uint32_t abs_b = b[3] | ((uint32_t)b[4] << 8) | ((uint32_t)b[5] << 16) | ((uint32_t)b[6] << 24);
+    assert(abs_a == 15);
+    assert(abs_b == 15);
+}
+
+static void test_bcj_short_data(void)
+{
+    uint8_t data[] = { 0xE8, 0x01 };
+    uc2_bcj_apply(data, 2);  /* too short, no transform */
+    assert(data[0] == 0xE8 && data[1] == 0x01);
+}
+
+/* --- BWT tests --- */
+
+static void test_bwt_roundtrip(void)
+{
+    uint8_t data[] = "banana";
+    size_t len = 6;
+    uint8_t *bwt;
+    uint32_t pidx;
+    assert(uc2_bwt_apply(data, len, &bwt, &pidx) == 0);
+
+    /* Sorted rotations of "banana": abanan, anaban, ananab, banana,
+       nabana, nanaba -> last column "nnbaaa", original at row 3. */
+    printf("(bwt='%.*s' idx=%u) ", (int)len, (const char *)bwt, (unsigned)pidx);
+    assert(memcmp(bwt, "nnbaaa", len) == 0);
+    assert(pidx == 3);
+
+    uint8_t *orig;
+    assert(uc2_bwt_revert(bwt, len, pidx, &orig) == 0);
+    assert(memcmp(orig, data, len) == 0);
+
+    free(bwt);
+    free(orig);
+}
+
+static void test_bwt_roundtrip_binary(void)
+{
+    size_t len = 256;
+    uint8_t *data = malloc(len);
+    assert(data != NULL);
+    for (size_t i = 0; i < len; i++) data[i] = (uint8_t)(i * 37 + 13);
+
+    uint8_t *bwt;
+    uint32_t pidx;
+    assert(uc2_bwt_apply(data, len, &bwt, &pidx) == 0);
+
+    uint8_t *orig;
+    assert(uc2_bwt_revert(bwt, len, pidx, &orig) == 0);
+    assert(memcmp(orig, data, len) == 0);
+
+    free(data);
+    free(bwt);
+    free(orig);
+}
+
+/* --- Delta filter tests --- */
+
+static void test_delta_roundtrip(void)
+{
+    uint8_t data[] = {10, 12, 14, 16, 18, 20, 22, 24};
+    uint8_t orig[sizeof data];
+    memcpy(orig, data, sizeof data);
+
+    uc2_delta_filter_apply(data, sizeof data, 1);
+    /* The first byte is kept; an arithmetic sequence leaves a constant 2 */
+    assert(data[0] == 10);
+    for (size_t i = 1; i < sizeof data; i++)
+        assert(data[i] == 2);
+
+    uc2_delta_filter_revert(data, sizeof data, 1);
+    assert(memcmp(data, orig, sizeof data) == 0);
+}
+
+static void test_delta_stride2(void)
+{
+    /* Interleaved stereo: L0 R0 L1 R1 L2 R2 ... */
+    uint8_t data[] = {100, 200, 102, 202, 104, 204, 106, 206};
+    uint8_t orig[sizeof data];
+    memcpy(orig, data, sizeof data);
+
+    uc2_delta_filter_apply(data, sizeof data, 2);
+    /* With stride 2: each channel has constant delta of 2 */
+    assert(data[2] == 2 && data[3] == 2);
+    assert(data[4] == 2 && data[5] == 2);
+
+    uc2_delta_filter_revert(data, sizeof data, 2);
+    assert(memcmp(data, orig, sizeof data) == 0);
+}
+
+/* --- Content detection tests --- */
+
+static void test_detect_text(void)
+{
+    uint8_t data[] = "This is plain text content with newlines\n"
+                     "and more text on the second line.\n";
+    assert(uc2_detect_content(data, sizeof data - 1) == UC2_CONTENT_TEXT);
+}
+
+static void test_detect_x86_mz(void)
+{
+    uint8_t data[] = {'M', 'Z', 0x90, 0x00};
+    assert(uc2_detect_content(data, sizeof data) == UC2_CONTENT_X86);
+}
+
+static void test_detect_x86_elf(void)
+{
+    uint8_t data[] = {0x7F, 'E', 'L', 'F', 0x02};
+    assert(uc2_detect_content(data, sizeof data) == UC2_CONTENT_X86);
+}
+
+static void test_detect_binary(void)
+{
+    uint8_t data[64];
+    for (int i = 0; i < 64; i++) data[i] = (uint8_t)(i * 7);
+    assert(uc2_detect_content(data, sizeof data) == UC2_CONTENT_BINARY);
+}
+
+int main(void)
+{
+    printf("Preprocessing filter tests:\n");
+    TEST(test_bcj_roundtrip);
+    TEST(test_bcj_normalizes);
+    TEST(test_bcj_short_data);
+    TEST(test_bwt_roundtrip);
+    TEST(test_bwt_roundtrip_binary);
+    TEST(test_delta_roundtrip);
+    TEST(test_delta_stride2);
+    TEST(test_detect_text);
+    TEST(test_detect_x86_mz);
+    TEST(test_detect_x86_elf);
+    TEST(test_detect_binary);
+    printf("%d/%d tests passed\n", tests_passed, tests_run);
+    return tests_passed == tests_run ? 0 : 1;
+}