CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Collections/Hashmaps.h
Views: 1401
#pragma once12#include <cstdint> /* uint32_t */3#include <cstring>4#include <vector>56#include "ext/xxhash.h"7#include "Common/CommonFuncs.h"8#include "Common/Log.h"910// TODO: Try hardware CRC. Unfortunately not available on older Intels or ARM32.11// Seems to be ubiquitous on ARM64 though.12template<class K>13inline uint32_t HashKey(const K &k) {14return XXH3_64bits(&k, sizeof(k)) & 0xFFFFFFFF;15}16template<class K>17inline bool KeyEquals(const K &a, const K &b) {18return !memcmp(&a, &b, sizeof(K));19}2021enum class BucketState : uint8_t {22FREE,23TAKEN,24REMOVED, // for linear probing to work (and removal during deletion) we need tombstones25};2627// Uses linear probing for cache-friendliness. Not segregating values from keys because28// we always use very small values, so it's probably better to have them in the same29// cache-line as the corresponding key.30// Enforces that value are pointers to make sure that combined storage makes sense.31template <class Key, class Value>32class DenseHashMap {33public:34DenseHashMap(int initialCapacity) : capacity_(initialCapacity) {35map.resize(initialCapacity);36state.resize(initialCapacity);37}3839// Returns true if the entry was found, and writes the entry to *value.40// Returns false and does not write to value if no entry was found.41// Note that nulls can be stored.42bool Get(const Key &key, Value *value) const {43uint32_t mask = capacity_ - 1;44uint32_t pos = HashKey(key) & mask;45// No? Let's go into search mode. 
Linear probing.46uint32_t p = pos;47while (true) {48if (state[p] == BucketState::TAKEN && KeyEquals(key, map[p].key)) {49*value = map[p].value;50return true;51} else if (state[p] == BucketState::FREE) {52return false;53}54p = (p + 1) & mask; // If the state is REMOVED, we just keep on walking.55if (p == pos) {56// We looped around the whole map.57_assert_msg_(false, "DenseHashMap: Hit full on Get()");58}59}60return false;61}6263// Only works if Value can be nullptr64Value GetOrNull(const Key &key) const {65Value value;66if (Get(key, &value)) {67return value;68} else {69return (Value)nullptr;70}71}7273bool ContainsKey(const Key &key) const {74// Slightly wasteful, though compiler might optimize it.75Value value;76return Get(key, &value);77}7879// Asserts if we already had the key!80bool Insert(const Key &key, Value value) {81// Check load factor, resize if necessary. We never shrink.82if (count_ > capacity_ / 2) {83Grow(2);84}85uint32_t mask = capacity_ - 1;86uint32_t pos = HashKey(key) & mask;87uint32_t p = pos;88while (true) {89if (state[p] == BucketState::TAKEN) {90if (KeyEquals(key, map[p].key)) {91// Bad! We already got this one. Let's avoid this case.92_assert_msg_(false, "DenseHashMap: Duplicate key of size %d inserted", (int)sizeof(Key));93return false;94}95// continue looking....96} else {97// Got a place, either removed or FREE.98break;99}100p = (p + 1) & mask;101if (p == pos) {102// FULL! Error. Should not happen thanks to Grow().103_assert_msg_(false, "DenseHashMap: Hit full on Insert()");104}105}106if (state[p] == BucketState::REMOVED) {107removedCount_--;108}109state[p] = BucketState::TAKEN;110map[p].key = key;111map[p].value = value;112count_++;113return true;114}115116bool Remove(const Key &key) {117uint32_t mask = capacity_ - 1;118uint32_t pos = HashKey(key) & mask;119uint32_t p = pos;120while (state[p] != BucketState::FREE) {121if (state[p] == BucketState::TAKEN && KeyEquals(key, map[p].key)) {122// Got it! 
Mark it as removed.123state[p] = BucketState::REMOVED;124removedCount_++;125count_--;126return true;127}128p = (p + 1) & mask;129if (p == pos) {130// FULL! Error. Should not happen.131_assert_msg_(false, "DenseHashMap: Hit full on Remove()");132}133}134return false;135}136137// This will never crash if you call it without locking - but, the value might not be right.138size_t size() const {139return count_;140}141142template<class T>143inline void Iterate(T func) const {144for (size_t i = 0; i < map.size(); i++) {145if (state[i] == BucketState::TAKEN) {146func(map[i].key, map[i].value);147}148}149}150151template<class T>152inline void IterateMut(T func) {153for (size_t i = 0; i < map.size(); i++) {154if (state[i] == BucketState::TAKEN) {155func(map[i].key, map[i].value);156}157}158}159160// Note! Does NOT delete any pointed-to data (in case you stored pointers in the map).161void Clear() {162memset(state.data(), (int)BucketState::FREE, state.size());163count_ = 0;164removedCount_ = 0;165}166167void Rebuild() {168Grow(1);169}170171void Maintain() {172// Heuristic173if (removedCount_ >= capacity_ / 4) {174Rebuild();175}176}177178private:179void Grow(int factor) {180// We simply move out the existing data, then we re-insert the old.181// This is extremely non-atomic and will need synchronization.182std::vector<Pair> old = std::move(map);183std::vector<BucketState> oldState = std::move(state);184// Can't assume move will clear, it just may clear.185map.clear();186state.clear();187188int oldCount = count_;189capacity_ *= factor;190map.resize(capacity_);191state.resize(capacity_);192count_ = 0; // Insert will update it.193removedCount_ = 0;194for (size_t i = 0; i < old.size(); i++) {195if (oldState[i] == BucketState::TAKEN) {196Insert(old[i].key, old[i].value);197}198}199_assert_msg_(oldCount == count_, "DenseHashMap: count should not change in Grow()");200}201struct Pair {202Key key;203Value value;204};205std::vector<Pair> map;206std::vector<BucketState> state;207int 
capacity_;208int count_ = 0;209int removedCount_ = 0;210};211212// Like the above, uses linear probing for cache-friendliness.213// Does not perform hashing at all so expects well-distributed keys.214template <class Value>215class PrehashMap {216public:217PrehashMap(int initialCapacity) : capacity_(initialCapacity) {218map.resize(initialCapacity);219state.resize(initialCapacity);220}221222// Returns nullptr if no entry was found.223bool Get(uint32_t hash, Value *value) {224uint32_t mask = capacity_ - 1;225uint32_t pos = hash & mask;226// No? Let's go into search mode. Linear probing.227uint32_t p = pos;228while (true) {229if (state[p] == BucketState::TAKEN && hash == map[p].hash) {230*value = map[p].value;231return true;232} else if (state[p] == BucketState::FREE) {233return false;234}235p = (p + 1) & mask; // If the state is REMOVED, we just keep on walking.236if (p == pos) {237_assert_msg_(false, "PrehashMap: Hit full on Get()");238}239}240return false;241}242243// Returns false if we already had the key! Which is a bit different.244bool Insert(uint32_t hash, Value value) {245// Check load factor, resize if necessary. We never shrink.246if (count_ > capacity_ / 2) {247Grow(2);248}249uint32_t mask = capacity_ - 1;250uint32_t pos = hash & mask;251uint32_t p = pos;252while (state[p] != BucketState::FREE) {253if (state[p] == BucketState::TAKEN) {254if (hash == map[p].hash)255return false; // Bad!256} else {257// Got a place, either removed or FREE.258break;259}260p = (p + 1) & mask;261if (p == pos) {262// FULL! Error. 
Should not happen thanks to Grow().263_assert_msg_(false, "PrehashMap: Hit full on Insert()");264}265}266if (state[p] == BucketState::REMOVED) {267removedCount_--;268}269state[p] = BucketState::TAKEN;270map[p].hash = hash;271map[p].value = value;272count_++;273return true;274}275276bool Remove(uint32_t hash) {277uint32_t mask = capacity_ - 1;278uint32_t pos = hash & mask;279uint32_t p = pos;280while (state[p] != BucketState::FREE) {281if (state[p] == BucketState::TAKEN && hash == map[p].hash) {282// Got it!283state[p] = BucketState::REMOVED;284removedCount_++;285count_--;286return true;287}288p = (p + 1) & mask;289if (p == pos) {290_assert_msg_(false, "PrehashMap: Hit full on Remove()");291}292}293return false;294}295296size_t size() {297return count_;298}299300template<class T>301void Iterate(T func) const {302for (size_t i = 0; i < map.size(); i++) {303if (state[i] == BucketState::TAKEN) {304func(map[i].hash, map[i].value);305}306}307}308309void Clear() {310memset(state.data(), (int)BucketState::FREE, state.size());311count_ = 0;312removedCount_ = 0;313}314315// Gets rid of REMOVED tombstones, making lookups somewhat more efficient.316void Rebuild() {317Grow(1);318}319320void Maintain() {321// Heuristic322if (removedCount_ >= capacity_ / 4) {323Rebuild();324}325}326327private:328void Grow(int factor) {329// We simply move out the existing data, then we re-insert the old.330// This is extremely non-atomic and will need synchronization.331std::vector<Pair> old = std::move(map);332std::vector<BucketState> oldState = std::move(state);333// Can't assume move will clear, it just may clear.334map.clear();335state.clear();336337int oldCount = count_;338int oldCapacity = capacity_;339capacity_ *= factor;340map.resize(capacity_);341state.resize(capacity_);342count_ = 0; // Insert will update it.343removedCount_ = 0;344for (size_t i = 0; i < old.size(); i++) {345if (oldState[i] == BucketState::TAKEN) {346Insert(old[i].hash, old[i].value);347}348}349INFO_LOG(Log::G3D, "Grew 
hashmap capacity from %d to %d", oldCapacity, capacity_);350_assert_msg_(oldCount == count_, "PrehashMap: count should not change in Grow()");351}352struct Pair {353uint32_t hash;354Value value;355};356std::vector<Pair> map;357std::vector<BucketState> state;358int capacity_;359int count_ = 0;360int removedCount_ = 0;361};362363364