From 0d17f8a7e63f51e23a09def416d376e4060d864b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 9 Jul 2020 23:39:58 -0700 Subject: [PATCH] phash: bloat the hashes somewhat, reducing the likelihood of false positives Set the hash size scaling constant to 1.6, signifying 3.2 times the hash load. This both reduces the convergence time and makes it less likely (< 25%) that a non-entry will require a secondary comparison, and after all, in most of our use cases non-entries are by far the more common. Signed-off-by: H. Peter Anvin (Intel) --- asm/pptok.pl | 2 +- perllib/phash.ph | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/asm/pptok.pl b/asm/pptok.pl index 0ac044f4..aa56921f 100755 --- a/asm/pptok.pl +++ b/asm/pptok.pl @@ -224,7 +224,7 @@ if ($what eq 'c') { # Put a large value in unused slots. This makes it extremely unlikely # that any combination that involves unused slot will pass the range test. # This speeds up rejection of unrecognized tokens, i.e. identifiers. - print OUT "#define UNUSED_HASH_ENTRY (65535/3)\n"; + print OUT "\n#define UNUSED_HASH_ENTRY (65535/3)\n"; print OUT "\n\n/* Primary preprocessor token hash */\n\n"; diff --git a/perllib/phash.ph b/perllib/phash.ph index 644e13c7..133a28b0 100644 --- a/perllib/phash.ph +++ b/perllib/phash.ph @@ -145,8 +145,10 @@ sub gen_perfect_hash($) { # Minimal power of 2 value for N with enough wiggle room. # The scaling constant must be larger than 0.5 in order for the - # algorithm to ever terminate. - my $room = int(scalar(@keys)*0.8); + # algorithm to ever terminate. The higher the scaling constant, + # the more space does the hash take up, but the less likely is it + # that an invalid token will require a string comparison. + my $room = int(scalar(@keys)*1.6); $n = 1; while ($n < $room) { $n <<= 1;