Improvements in hashlib

This commit is contained in:
Clifford Wolf 2014-12-30 13:22:33 +01:00
parent c64b1de11d
commit 3857e1cb66
1 changed files with 72 additions and 83 deletions

View File

@ -17,7 +17,8 @@
namespace hashlib { namespace hashlib {
const int config_size_factor = 3; const int hashtable_size_trigger = 2;
const int hashtable_size_factor = 3;
// The XOR version of DJB2 // The XOR version of DJB2
// (traditionally 5381 is used as starting value for the djb2 hash) // (traditionally 5381 is used as starting value for the djb2 hash)
@ -121,37 +122,29 @@ struct hash_obj_ops {
} }
}; };
inline int hashtable_size(int old_size) inline int hashtable_size(int min_size)
{ {
// prime numbers, approx. in powers of two static std::vector<int> primes = {
if (old_size < 53) return 53; 23, 29, 37, 47, 59, 79, 101, 127, 163, 211, 269, 337, 431, 541, 677,
if (old_size < 113) return 113; 853, 1069, 1361, 1709, 2137, 2677, 3347, 4201, 5261, 6577, 8231, 10289,
if (old_size < 251) return 251; 12889, 16127, 20161, 25219, 31531, 39419, 49277, 61603, 77017, 96281,
if (old_size < 503) return 503; 120371, 150473, 188107, 235159, 293957, 367453, 459317, 574157, 717697,
if (old_size < 1129) return 1129; 897133, 1121423, 1401791, 1752239, 2190299, 2737937, 3422429, 4278037,
if (old_size < 2503) return 2503; 5347553, 6684443, 8355563, 10444457, 13055587, 16319519, 20399411,
if (old_size < 5023) return 5023; 25499291, 31874149, 39842687, 49803361, 62254207, 77817767, 97272239,
if (old_size < 11299) return 11299; 121590311, 151987889, 189984863, 237481091, 296851369, 371064217
if (old_size < 25097) return 25097; };
if (old_size < 50291) return 50291;
if (old_size < 112997) return 112997;
if (old_size < 251003) return 251003;
if (old_size < 503003) return 503003;
if (old_size < 1129991) return 1129991;
if (old_size < 2509993) return 2509993;
if (old_size < 5029991) return 5029991;
if (old_size < 11299997) return 11299997;
if (old_size < 25099999) return 25099999;
if (old_size < 50299999) return 50299999;
if (old_size < 113000009) return 113000009;
if (old_size < 250999999) return 250999999;
if (old_size < 503000009) return 503000009;
if (old_size < 1129999999) return 1129999999;
if (sizeof(old_size) == 4) for (auto p : primes)
throw std::length_error("hash table exceeded maximum size. recompile with -mint64."); if (p > min_size) return p;
return old_size * 2; if (sizeof(int) == 4)
throw std::length_error("hash table exceeded maximum size. use a ILP64 abi for larger tables.");
for (auto p : primes)
if (100129 * p > min_size) return 100129 * p;
throw std::length_error("hash table exceeded maximum size.");
} }
template<typename K, typename T, typename OPS = hash_ops<K>> template<typename K, typename T, typename OPS = hash_ops<K>>
@ -192,14 +185,12 @@ class dict
entries.clear(); entries.clear();
counter = other.size(); counter = other.size();
int new_size = hashtable_size(config_size_factor * counter); begin_n = counter - 1;
hashtable.resize(new_size); entries.reserve(counter);
new_size = new_size / config_size_factor + 1;
entries.reserve(new_size);
for (auto &it : other) for (auto &it : other)
entries.push_back(entry_t(it)); entries.push_back(entry_t(it));
entries.resize(new_size);
rehash(); rehash();
} }
@ -211,18 +202,12 @@ class dict
return hash; return hash;
} }
void upd_begin_n() void upd_begin_n(bool do_refree = true)
{ {
if (begin_n < -1) { if (begin_n < -1) {
begin_n = -(begin_n+2); begin_n = -(begin_n+2);
if (begin_n > int(entries.size())) while (begin_n >= 0 && entries[begin_n].is_free()) { begin_seek_count++; begin_n--; }
begin_n = int(entries.size()); if (do_refree && begin_seek_count > int(entries.size() / 2)) refree();
do {
if (begin_seek_count++ > int(entries.size()))
refree();
else
begin_n--;
} while (begin_n >= 0 && entries[begin_n].is_free());
} }
} }
@ -250,11 +235,14 @@ class dict
void rehash() void rehash()
{ {
upd_begin_n(false);
entries.resize(begin_n + 1);
free_list = -1; free_list = -1;
begin_n = -1; begin_n = -1;
for (auto &h : hashtable) hashtable.clear();
h = -1; hashtable.resize(hashtable_size(entries.size() * hashtable_size_factor), -1);
int last_free = -1; int last_free = -1;
for (int i = 0; i < int(entries.size()); i++) for (int i = 0; i < int(entries.size()); i++)
@ -319,15 +307,18 @@ class dict
{ {
if (free_list < 0) if (free_list < 0)
{ {
int i = entries.size(); free_list = entries.size();
int new_size = hashtable_size(config_size_factor * entries.size()); entries.push_back(entry_t());
hashtable.resize(new_size);
entries.resize(new_size / config_size_factor + 1); if (entries.size() * hashtable_size_trigger > hashtable.size()) {
entries[i].udata = value; int i = free_list;
entries[i].set_next_used(0); entries[i].udata = value;
counter++; entries[i].set_next_used(0);
rehash(); begin_n = i;
return i; counter++;
rehash();
return i;
}
} }
int i = free_list; int i = free_list;
@ -384,8 +375,7 @@ public:
dict(dict<K, T, OPS> &&other) dict(dict<K, T, OPS> &&other)
{ {
free_list = -1; init();
counter = 0;
swap(other); swap(other);
} }
@ -504,6 +494,7 @@ public:
std::swap(free_list, other.free_list); std::swap(free_list, other.free_list);
std::swap(counter, other.counter); std::swap(counter, other.counter);
std::swap(begin_n, other.begin_n); std::swap(begin_n, other.begin_n);
std::swap(begin_seek_count, other.begin_seek_count);
} }
bool operator==(const dict<K, T, OPS> &other) const { bool operator==(const dict<K, T, OPS> &other) const {
@ -579,14 +570,12 @@ class pool
entries.clear(); entries.clear();
counter = other.size(); counter = other.size();
int new_size = hashtable_size(config_size_factor * counter); begin_n = counter - 1;
hashtable.resize(new_size); entries.reserve(counter);
new_size = new_size / config_size_factor + 1;
entries.reserve(new_size);
for (auto &it : other) for (auto &it : other)
entries.push_back(entry_t(it)); entries.push_back(entry_t(it));
entries.resize(new_size);
rehash(); rehash();
} }
@ -598,18 +587,12 @@ class pool
return hash; return hash;
} }
void upd_begin_n() void upd_begin_n(bool do_refree = true)
{ {
if (begin_n < -1) { if (begin_n < -1) {
begin_n = -(begin_n+2); begin_n = -(begin_n+2);
if (begin_n > int(entries.size())) while (begin_n >= 0 && entries[begin_n].is_free()) { begin_seek_count++; begin_n--; }
begin_n = int(entries.size()); if (do_refree && begin_seek_count > int(entries.size() / 2)) refree();
do {
if (begin_seek_count++ > int(entries.size()))
refree();
else
begin_n--;
} while (begin_n >= 0 && entries[begin_n].is_free());
} }
} }
@ -637,11 +620,14 @@ class pool
void rehash() void rehash()
{ {
upd_begin_n(false);
entries.resize(begin_n + 1);
free_list = -1; free_list = -1;
begin_n = -1; begin_n = -1;
for (auto &h : hashtable) hashtable.clear();
h = -1; hashtable.resize(hashtable_size(entries.size() * hashtable_size_factor), -1);
int last_free = -1; int last_free = -1;
for (int i = 0; i < int(entries.size()); i++) for (int i = 0; i < int(entries.size()); i++)
@ -706,15 +692,18 @@ class pool
{ {
if (free_list < 0) if (free_list < 0)
{ {
int i = entries.size(); free_list = entries.size();
int new_size = hashtable_size(config_size_factor * entries.size()); entries.push_back(entry_t());
hashtable.resize(new_size);
entries.resize(new_size / config_size_factor + 1); if (entries.size() * hashtable_size_trigger > hashtable.size()) {
entries[i].key = key; int i = free_list;
entries[i].set_next_used(0); entries[i].key = key;
counter++; entries[i].set_next_used(0);
rehash(); begin_n = i;
return i; counter++;
rehash();
return i;
}
} }
int i = free_list; int i = free_list;
@ -771,8 +760,7 @@ public:
pool(pool<K, OPS> &&other) pool(pool<K, OPS> &&other)
{ {
free_list = -1; init();
counter = 0;
swap(other); swap(other);
} }
@ -871,6 +859,7 @@ public:
std::swap(free_list, other.free_list); std::swap(free_list, other.free_list);
std::swap(counter, other.counter); std::swap(counter, other.counter);
std::swap(begin_n, other.begin_n); std::swap(begin_n, other.begin_n);
std::swap(begin_seek_count, other.begin_seek_count);
} }
bool operator==(const pool<K, OPS> &other) const { bool operator==(const pool<K, OPS> &other) const {