From fddafed9192359e627efcafa63029bb06f96a469 Mon Sep 17 00:00:00 2001 From: Hendrik Brucker Date: Thu, 23 Jun 2022 18:08:52 +0200 Subject: [PATCH] Optimize HashMap/HashSet using fastmod --- core/templates/hash_map.h | 33 +++++++++++--------- core/templates/hash_set.h | 33 +++++++++++--------- core/templates/hashfuncs.h | 62 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 30 deletions(-) diff --git a/core/templates/hash_map.h b/core/templates/hash_map.h index e5f73171a2f..191f21a3ddf 100644 --- a/core/templates/hash_map.h +++ b/core/templates/hash_map.h @@ -91,9 +91,9 @@ private: return hash; } - _FORCE_INLINE_ uint32_t _get_probe_length(uint32_t p_pos, uint32_t p_hash, uint32_t p_capacity) const { - uint32_t original_pos = p_hash % p_capacity; - return (p_pos - original_pos + p_capacity) % p_capacity; + static _FORCE_INLINE_ uint32_t _get_probe_length(const uint32_t p_pos, const uint32_t p_hash, const uint32_t p_capacity, const uint64_t p_capacity_inv) { + const uint32_t original_pos = fastmod(p_hash, p_capacity_inv, p_capacity); + return fastmod(p_pos - original_pos + p_capacity, p_capacity_inv, p_capacity); } bool _lookup_pos(const TKey &p_key, uint32_t &r_pos) const { @@ -101,9 +101,10 @@ private: return false; // Failed lookups, no elements } - uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; uint32_t hash = _hash(p_key); - uint32_t pos = hash % capacity; + uint32_t pos = fastmod(hash, capacity_inv, capacity); uint32_t distance = 0; while (true) { @@ -111,7 +112,7 @@ private: return false; } - if (distance > _get_probe_length(pos, hashes[pos], capacity)) { + if (distance > _get_probe_length(pos, hashes[pos], capacity, capacity_inv)) { return false; } @@ -120,17 +121,18 @@ private: return true; } - pos = (pos + 1) % capacity; + pos = fastmod((pos + 1), capacity_inv, capacity); distance++; } } void _insert_with_hash(uint32_t p_hash, HashMapElement *p_value) { - uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; uint32_t hash = p_hash; HashMapElement *value = p_value; uint32_t distance = 0; - uint32_t pos = hash % capacity; + uint32_t pos = fastmod(hash, capacity_inv, capacity); while (true) { if (hashes[pos] == EMPTY_HASH) { @@ -143,14 +145,14 @@ private: } // Not an empty slot, let's check the probing length of the existing one. - uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity); + uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity, capacity_inv); if (existing_probe_len < distance) { SWAP(hash, hashes[pos]); SWAP(value, elements[pos]); distance = existing_probe_len; } - pos = (pos + 1) % capacity; + pos = fastmod((pos + 1), capacity_inv, capacity); distance++; } } @@ -316,13 +318,14 @@ public: return false; } - uint32_t capacity = hash_table_size_primes[capacity_index]; - uint32_t next_pos = (pos + 1) % capacity; - while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity) != 0) { + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; + uint32_t next_pos = fastmod((pos + 1), capacity_inv, capacity); + while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity, capacity_inv) != 0) { SWAP(hashes[next_pos], hashes[pos]); SWAP(elements[next_pos], elements[pos]); pos = next_pos; - next_pos = (pos + 1) % capacity; + next_pos = fastmod((pos + 1), capacity_inv, capacity); } hashes[pos] = EMPTY_HASH; diff --git a/core/templates/hash_set.h b/core/templates/hash_set.h index 2318067dccd..7b3a5d46f8a 100644 --- a/core/templates/hash_set.h +++ b/core/templates/hash_set.h @@ -74,9 +74,9 @@ private: return hash; } - _FORCE_INLINE_ uint32_t _get_probe_length(uint32_t p_pos, uint32_t p_hash, uint32_t p_capacity) const { - uint32_t original_pos = p_hash % p_capacity; - return (p_pos - original_pos + p_capacity) % p_capacity; + static _FORCE_INLINE_ uint32_t _get_probe_length(const uint32_t p_pos, const uint32_t p_hash, const uint32_t p_capacity, const uint64_t p_capacity_inv) { + const uint32_t original_pos = fastmod(p_hash, p_capacity_inv, p_capacity); + return fastmod(p_pos - original_pos + p_capacity, p_capacity_inv, p_capacity); } bool _lookup_pos(const TKey &p_key, uint32_t &r_pos) const { @@ -84,9 +84,10 @@ private: return false; // Failed lookups, no elements } - uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; uint32_t hash = _hash(p_key); - uint32_t pos = hash % capacity; + uint32_t pos = fastmod(hash, capacity_inv, capacity); uint32_t distance = 0; while (true) { @@ -94,7 +95,7 @@ private: return false; } - if (distance > _get_probe_length(pos, hashes[pos], capacity)) { + if (distance > _get_probe_length(pos, hashes[pos], capacity, capacity_inv)) { return false; } @@ -103,17 +104,18 @@ private: return true; } - pos = (pos + 1) % capacity; + pos = fastmod(pos + 1, capacity_inv, capacity); distance++; } } uint32_t _insert_with_hash(uint32_t p_hash, uint32_t p_index) { - uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; uint32_t hash = p_hash; uint32_t index = p_index; uint32_t distance = 0; - uint32_t pos = hash % capacity; + uint32_t pos = fastmod(hash, capacity_inv, capacity); while (true) { if (hashes[pos] == EMPTY_HASH) { @@ -124,7 +126,7 @@ private: } // Not an empty slot, let's check the probing length of the existing one. - uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity); + uint32_t existing_probe_len = _get_probe_length(pos, hashes[pos], capacity, capacity_inv); if (existing_probe_len < distance) { key_to_hash[index] = pos; SWAP(hash, hashes[pos]); @@ -132,7 +134,7 @@ private: distance = existing_probe_len; } - pos = (pos + 1) % capacity; + pos = fastmod(pos + 1, capacity_inv, capacity); distance++; } } @@ -265,9 +267,10 @@ public: uint32_t key_pos = pos; pos = key_to_hash[pos]; //make hash pos - uint32_t capacity = hash_table_size_primes[capacity_index]; - uint32_t next_pos = (pos + 1) % capacity; - while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity) != 0) { + const uint32_t capacity = hash_table_size_primes[capacity_index]; + const uint64_t capacity_inv = hash_table_size_primes_inv[capacity_index]; + uint32_t next_pos = fastmod(pos + 1, capacity_inv, capacity); + while (hashes[next_pos] != EMPTY_HASH && _get_probe_length(next_pos, hashes[next_pos], capacity, capacity_inv) != 0) { uint32_t kpos = hash_to_key[pos]; uint32_t kpos_next = hash_to_key[next_pos]; SWAP(key_to_hash[kpos], key_to_hash[kpos_next]); @@ -275,7 +278,7 @@ public: SWAP(hash_to_key[next_pos], hash_to_key[pos]); pos = next_pos; - next_pos = (pos + 1) % capacity; + next_pos = fastmod(pos + 1, capacity_inv, capacity); } hashes[pos] = EMPTY_HASH; diff --git a/core/templates/hashfuncs.h b/core/templates/hashfuncs.h index b0371f2ab5c..547534f26a6 100644 --- a/core/templates/hashfuncs.h +++ b/core/templates/hashfuncs.h @@ -437,4 +437,66 @@ const uint32_t hash_table_size_primes[HASH_TABLE_SIZE_MAX] = { 1610612741, }; +// Computed with elem_i = UINT64_C (0 x FFFFFFFF FFFFFFFF ) / d_i + 1, where d_i is the i-th element of the above array. +const uint64_t hash_table_size_primes_inv[HASH_TABLE_SIZE_MAX] = { + 3689348814741910324, + 1418980313362273202, + 802032351030850071, + 392483916461905354, + 190172619316593316, + 95578984837873325, + 47420935922132524, + 23987963684927896, + 11955116055547344, + 5991147799191151, + 2998982941588287, + 1501077717772769, + 750081082979285, + 375261795343686, + 187625172388393, + 93822606204624, + 46909513691883, + 23456218233098, + 11728086747027, + 5864041509391, + 2932024948977, + 1466014921160, + 733007198436, + 366503839517, + 183251896093, + 91625960335, + 45812983922, + 22906489714, + 11453246088 +}; + +/** + * Fastmod computes ( n mod d ) given the precomputed c much faster than n % d. + * The implementation of fastmod is based on the following paper by Daniel Lemire et al. + * Faster Remainder by Direct Computation: Applications to Compilers and Software Libraries + * https://arxiv.org/abs/1902.01961 + */ +static _FORCE_INLINE_ uint32_t fastmod(const uint32_t n, const uint64_t c, const uint32_t d) { +#if defined(_MSC_VER) + // Returns the upper 64 bits of the product of two 64-bit unsigned integers. + // This intrinsic function is required since MSVC does not support unsigned 128-bit integers. +#if defined(_M_X64) || defined(_M_ARM64) + return __umulh(c * n, d); +#else + // Fallback to the slower method for 32-bit platforms. + return n % d; +#endif // _M_X64 || _M_ARM64 +#else +#ifdef __SIZEOF_INT128__ + // Prevent compiler warning, because we know what we are doing. + uint64_t lowbits = c * n; + __extension__ typedef unsigned __int128 uint128; + return static_cast(((uint128)lowbits * d) >> 64); +#else + // Fallback to the slower method if no 128-bit unsigned integer type is available. + return n % d; +#endif // __SIZEOF_INT128__ +#endif // _MSC_VER +} + #endif // HASHFUNCS_H