String hashing improvements (spread and performance)
Fix gaps in hashing coverage in the middle and end of even fairly short strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact same value but with the patch, there are no collisions among the ~160000 symbols in the Emacs tree. This change also improves average hashing speed by using fewer mixing operations. * src/fns.c (hash_string): Use unit stride for fairly short strings, while retaining the cap of 8 samples for long ones. Always hash the last word to ensure that the end of the string is covered. For strings shorter than a word, use fewer loads and a single reduction step.
This commit is contained in:
parent
decfdd4f1a
commit
3a93e301dd
1 changed files with 37 additions and 12 deletions
49
src/fns.c
49
src/fns.c
|
@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len)
|
|||
EMACS_UINT hash = len;
|
||||
/* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course,
|
||||
* but dividing by 8 is cheaper. */
|
||||
ptrdiff_t step = sizeof hash + ((end - p) >> 3);
|
||||
ptrdiff_t step = max (sizeof hash, ((end - p) >> 3));
|
||||
|
||||
while (p + sizeof hash <= end)
|
||||
if (p + sizeof hash <= end)
|
||||
{
|
||||
do
|
||||
{
|
||||
EMACS_UINT c;
|
||||
/* We presume that the compiler will replace this `memcpy` with
|
||||
a single load/move instruction when applicable. */
|
||||
memcpy (&c, p, sizeof hash);
|
||||
p += step;
|
||||
hash = sxhash_combine (hash, c);
|
||||
}
|
||||
while (p + sizeof hash <= end);
|
||||
/* Hash the last wordful of bytes in the string, because that is
|
||||
is often the part where strings differ. This may cause some
|
||||
bytes to be hashed twice but we assume that's not a big problem. */
|
||||
EMACS_UINT c;
|
||||
/* We presume that the compiler will replace this `memcpy` with
|
||||
a single load/move instruction when applicable. */
|
||||
memcpy (&c, p, sizeof hash);
|
||||
p += step;
|
||||
memcpy (&c, end - sizeof c, sizeof c);
|
||||
hash = sxhash_combine (hash, c);
|
||||
}
|
||||
/* A few last bytes may remain (smaller than an EMACS_UINT). */
|
||||
/* FIXME: We could do this without a loop, but it'd require
|
||||
endian-dependent code :-( */
|
||||
while (p < end)
|
||||
else
|
||||
{
|
||||
unsigned char c = *p++;
|
||||
hash = sxhash_combine (hash, c);
|
||||
/* String is shorter than an EMACS_UINT. Use smaller loads. */
|
||||
eassume (p <= end && end - p < sizeof (EMACS_UINT));
|
||||
EMACS_UINT tail = 0;
|
||||
if (end - p >= 4)
|
||||
{
|
||||
uint32_t c;
|
||||
memcpy (&c, p, sizeof c);
|
||||
tail = (tail << (8 * sizeof c)) + c;
|
||||
p += sizeof c;
|
||||
}
|
||||
if (end - p >= 2)
|
||||
{
|
||||
uint16_t c;
|
||||
memcpy (&c, p, sizeof c);
|
||||
tail = (tail << (8 * sizeof c)) + c;
|
||||
p += sizeof c;
|
||||
}
|
||||
if (p < end)
|
||||
tail = (tail << 8) + (unsigned char)*p;
|
||||
hash = sxhash_combine (hash, tail);
|
||||
}
|
||||
|
||||
return hash;
|
||||
|
|
Loading…
Add table
Reference in a new issue