#!/usr/bin/env python """ Simple hashtable simulator that simulates fixed-size, stop-the-world resizing, and linear hashtables. """ __author__ = "George V. Reilly " import random class HashResult(object): """Instrument operations""" def __init__(self, value=None): self.value = value self.cost = 0 def add(self, cost=1): self.cost += cost def set(self, value): self.value = value return self def __str__(self): return "{0}: {1}".format(self.value, self.cost) class _HashTable(object): """Base class for all hashtable implementations.""" def __init__(self, num_buckets=4, load_factor=3.0): self.num_buckets = num_buckets self.load_factor = load_factor self.buckets = [[] for i in xrange(self.num_buckets)] self.items = 0 def _bucket_str(self, bucket, fmt='0'): return "[{0}]".format(", ".join([("{"+fmt+"}").format(k) for k in bucket])) def _all_bucket_str(self, fmt='0'): return "; ".join(["{0}: {1}".format( i, self._bucket_str(bucket, fmt)) for i,bucket in enumerate(self.buckets)]) def dump_table(self, additional='', fmt='0'): return "".format( self.num_buckets, self.load_factor, self.items, additional, self._all_bucket_str(fmt)) __str__ = dump_table def bucket(self, key): return self.buckets[key % self.num_buckets] def find(self, key): result = HashResult() # If it weren't for the need to instrument operations, # we could use faster Python features like "in" for k in self.bucket(key): result.add() if key == k: return result.set(key) return result.set(None) def _insert(self, key): result = HashResult() bucket = self.bucket(key) for k in bucket: result.add() if key == k: return result.set(False) self.items += 1 bucket.append(key) result.add() return result.set(True) def insert(self, key): result = self._insert(key) if self.must_expand(): result = self.expand(result) return result def must_expand(self): return False def expand(self, result): return result def _delete(self, key): result = HashResult() bucket = self.bucket(key) for i,k in enumerate(bucket): result.add() if key == k: del bucket[i] self.items -= 1 return result.set(True) return result.set(False) def delete(self, key): result = self._delete(key) if self.must_shrink(): result = self.shrink(result) return result # TODO: shrink as table empties, but use slightly different threshold def must_shrink(self): return False def shrink(self, result): return result class HashTable(_HashTable): """A simple resizable hashtable""" def __init__(self, allow_resize=True, *args, **kwargs): super(HashTable, self).__init__(*args, **kwargs) self.allow_resize = allow_resize def must_expand(self): return self.allow_resize and self.items > self.num_buckets * self.load_factor def expand(self, result): # Stop-the-world resizing num_buckets = self.num_buckets * 2 buckets = [[] for i in xrange(num_buckets * 2)] for b in self.buckets: for k in b: result.add() index = k % num_buckets buckets[index].append(k) self.num_buckets = num_buckets self.buckets = buckets return result class LinearHashTable(_HashTable): def __init__(self, num_buckets=4, load_factor=3.0, *args, **kwargs): self.split_index = 0 # round up num_buckets to next power of 2, unless already a power of 2 self.level = int.bit_length(num_buckets - 1) num_buckets = (1 << self.level) self.h0_mask = num_buckets - 1 self.h1_mask = (self.h0_mask << 1) | 1 super(LinearHashTable, self).__init__( num_buckets=num_buckets, load_factor=load_factor, *args, **kwargs) def dump_table(self, fmt='0'): return super(LinearHashTable, self).dump_table( fmt=fmt, additional="level={}, split_index={}, h0_mask={}, h1_mask={}, ".format( self.level, self.split_index, self.h0_mask, self.h1_mask)) __str__ = dump_table def _index(self, key): index = key & self.h0_mask return key & self.h1_mask if index < self.split_index else index def bucket(self, key): return self.buckets[self._index(key)] def must_expand(self): return self.items > self.load_factor * self.num_buckets def expand(self, result): self.buckets.append([]) self.num_buckets += 1 old_index = self.split_index new_index = (1 << self.level) | old_index old_bucket = self.buckets[old_index] new_bucket = self.buckets[new_index] self.split_index += 1 if self.split_index == (1 << self.level): self.split_index = 0 self.level += 1 self.h0_mask = (self.h0_mask << 1) | 1 self.h1_mask = (self.h0_mask << 1) | 1 # Walk backwards for i in range(len(old_bucket)-1, -1, -1): result.add() k = old_bucket[i] if self._index(k) == new_index: del old_bucket[i] new_bucket.insert(0, k) # prepend to maintain original order return result def print_insertion_cost(ht, numbers): print('"Insertion Cost"') for n in numbers: print "{0}".format(ht.insert(n).cost) def make_numbers(numbers, items, sample): return numbers or random.sample(range(items), sample or items) def print_insert_hash_table(ht, items, sample, numbers): numbers = make_numbers(numbers, items, sample) print_insertion_cost(ht, numbers) def insert_regular_hash_table(allow_resize, num_buckets=20, items=400, sample=None, numbers=None): ht = HashTable(allow_resize=allow_resize, num_buckets=num_buckets) print_insert_hash_table(ht, items, sample, numbers) def _demo_linear_hash_table(numbers, load_factor=3., items=400, sample=None): ht = LinearHashTable(load_factor=3.) numbers = make_numbers(numbers, items, sample) for n in numbers: print "{0:X}: {1}, {2}".format(n, ht.insert(n).cost, ht.dump_table('0:X')) def demo_linear_hash_table(): _demo_linear_hash_table([8, 2, 4, 12, 3, 10, 7, 14, 6, 1, 5, 0, 11, 13, 9, 15]) def insert_linear_hash_table(items=400, sample=None, numbers=None): print_insert_hash_table(LinearHashTable(), items, sample, numbers) demo_linear_hash_table()