8.2. Generalised Hash-Tables¶

File: BetterHashTable.ml

We have briefly considered hash-tables in Section Hash-tables. Given their ubiquity and importance, we are going to further elaborate on their construction in this lecture.

8.2.1. OCaml’s universal hashing¶

As many other mainstream languages (such as Java and C#), OCaml provides a polymorphic function for hashing any values, no matter what type they have:

utop # Hashtbl.hash;;
- : 'a -> int = <fun>

utop # Hashtbl.hash "abc";;
- : int = 767105082

utop # Hashtbl.hash 42;;
- : int = 395478795

This function, unfortunately, has some limitations for particularly deep data structures (such as, e.g., long immutable lists). In particular, for lists beyond certain length there will be collisions:

utop # Hashtbl.hash [1;2;3;4;5;6;7;8;9;0];;
- : int = 67023335

utop # Hashtbl.hash [1;2;3;4;5;6;7;8;9;0;1];;
- : int = 67023335

8.2.2. Redefining hash-table signature¶

Thanks to OCaml’s universal hashing, we no longer have to provide a hashing strategy for the keys of a hash-table, and can simply redefine its signature as follows:

module type HashTable = sig
  type key
  type 'a hash_table
  val mk_new_table : int -> (key * 'v) hash_table
  val insert : (key * 'v) hash_table -> key -> 'v -> unit
  val get : (key * 'v) hash_table -> key -> 'v option
  val remove : (key * 'v) hash_table -> key -> unit
  val print_hash_table :
    (key -> string) ->
    ('v -> string) ->
    (key * 'v) hash_table -> unit
end

For design reasons that will become clear further in this lecture, we still mention the type key of keys separately in the signature. We also add a convenience function print_hash_table to output the contents of the data structure.

8.2.3. A framework for testing hash-tables¶

Before we re-define the hash-table, let us define a module for automated testing of hash-tables. The module starts with the following preamble:

module HashTableTester
    (H : HashTable with type key = int * string) = struct

  module MyHT = H
  open MyHT

  (* More functions will come here. *)
end

Notice that it is a functor that takes an implementation of hash-table module H, but requires the keys to be of a specific type int * string.

The following function will fill the hash-table from an array:

let mk_test_table_from_array_length a m =
  let n = Array.length a in
  let ht = mk_new_table m in
  for i = 0 to n - 1 do
    insert ht a.(i) a.(i)
  done;
  (ht, a)

The next function takes a hash-table ht and an array a used for filling it with elements, and tests that all elements in the array are also in the hash-table (we optimistically assume that an array does not have repetitions):

let test_table_get ht a =
  let len = Array.length a in
  for i = 0 to len - 1 do
    let e = get ht a.(i) in
    assert (e <> None);
    let x = get_exn e in
    assert (x = a.(i))
  done;
  true

8.2.4. A simple list-based hash-table¶

With the new signature at hand, let us now redefine a simple implementation of a list-based hash-table.

Even though not strictly necessary at the moment, we are going to make the type of keys used by the hash-table implementation explicit, and expose in the following signature:

module type KeyType = sig
  type t
end

The reason why we need to do it will become in the next Section Bloom Filters and Their Applications, in which we will need to be able to introspect on the structure of the keys, prior to instantiating a hash-table.

We proceed with the fining our simple hash-table based on lists as previously:

module SimpleListBasedHashTable(K: KeyType) = struct
  type key = K.t

  type 'v hash_table = {
    buckets : 'v list array;
    capacity : int;
  }

  let mk_new_table cap =
    let buckets = Array.make cap [] in
    {buckets = buckets;
     capacity = cap}

  let insert ht k v =
    let hs = Hashtbl.hash k in
    let bnum = hs mod ht.capacity in
    let bucket = ht.buckets.(bnum) in
    let clean_bucket =
      List.filter (fun (k', _) -> k' <> k) bucket in
    ht.buckets.(bnum) <- (k, v) :: clean_bucket

  let get ht k =
    let hs = Hashtbl.hash k in
    let bnum = hs mod ht.capacity in
    let bucket = ht.buckets.(bnum) in
    let res = List.find_opt (fun (k', _) -> k' = k) bucket in
    match res with
    | Some (_, v) -> Some v
    | _ -> None

  (* Slow remove - introduce for completeness *)
  let remove ht k =
    let hs = Hashtbl.hash k in
    let bnum = hs mod ht.capacity in
    let bucket = ht.buckets.(bnum) in
    let clean_bucket =
      List.filter (fun (k', _) -> k' <> k) bucket in
    ht.buckets.(bnum) <- clean_bucket

  (* Another function is coming here *)

end

As the last touch, we add the function to print the contents of the table:

let print_hash_table ppk ppv ht =
  let open Printf in
  print_endline @@ sprintf "Capacity: %d" (ht.capacity);
  print_endline "Buckets:";
  let buckets = (ht.buckets) in
  for i = 0 to (ht.capacity) - 1 do
    let bucket = buckets.(i) in
    if bucket <> [] then (
      (* Print bucket *)
      let s = List.fold_left
          (fun acc (k, v) -> acc ^ (sprintf "(%s, %s); ") (ppk k) (ppv v)) "" bucket in
      printf "%d -> [ %s]\n" i s)
  done

Let us now instantiate the table to use pairs of type int * string as keys, as well as the corresponding testing framework developed above:

module IntKey = struct type t = int end
module SHT = SimpleListBasedHashTable(IntKey)
module SimpleHTTester = HashTableTester(SHT)

let pp_kv (k, v) = Printf.sprintf "(%d, %s)" k v

We can now create a simple hash-table and observe its contents:

utop # let a = generate_key_value_array 15;;
val a : (int * string) array =
  [|(7, "ayqtk"); (12, "kemle"); (6, "kcrtm"); (1, "qxcnk"); (3, "czzva");
    (4, "ayuys"); (6, "cdrhf"); (6, "ukobi"); (10, "hwsjs"); (13, "uyrla");
    (2, "uldju"); (5, "rkolw"); (13, "gnzzo"); (4, "nksfe"); (7, "geevu")|]

utop # let t = SimpleHTTester.mk_test_table_from_array_length a 10;;
val t : (SHT.key * SHT.key) SHT.hash_table = ...

utop # SimpleHTTester.MyHT.print_hash_table pp_kv pp_kv t;;
Capacity: 10
Buckets:
0 -> [ ((7, geevu), (7, geevu)); ((3, czzva), (3, czzva)); ((12, kemle), (12, kemle)); ]
1 -> [ ((7, ayqtk), (7, ayqtk)); ]
2 -> [ ((13, uyrla), (13, uyrla)); ((6, cdrhf), (6, cdrhf)); ]
6 -> [ ((13, gnzzo), (13, gnzzo)); ]
7 -> [ ((5, rkolw), (5, rkolw)); ((6, ukobi), (6, ukobi)); ((1, qxcnk), (1, qxcnk)); ((6, kcrtm), (6, kcrtm)); ]
8 -> [ ((4, ayuys), (4, ayuys)); ]
9 -> [ ((4, nksfe), (4, nksfe)); ((2, uldju), (2, uldju)); ((10, hwsjs), (10, hwsjs)); ]

As we can see, due to hash collisions some buckets are not used at all (e.g., 3), while others hold multiple values (e.g., 9).

8.2.5. Testing a Simple Hash-Table¶

We can also add a number of tests for the implementation of our hash-table. For instance, the following test checks that the hash table stores all (distinct) elements of a randomly generated array:

let%test "ListBasedHashTable insert" =
  let open SimpleHTTester in
  let a = generate_key_value_array 1000 in
  let ht = mk_test_table_from_array_length a 50 in
  test_table_get ht a

8.2.6. A Resizable hash-table¶

Let us change the implementation of a hash-table, so it could grow, as the number of the added elements greatly exceeds the number of buckets. We start from the following definition in the module:

module ResizableListBasedHashTable(K : KeyType) = struct
  type key = K.t

  type 'v hash_table = {
    buckets : 'v list array ref;
    size : int ref;
    capacity : int ref;
  }

  let mk_new_table cap =
    let buckets = Array.make cap [] in
    {buckets = ref buckets;
     capacity = ref cap;
     size = ref 0}

   (* More functions are coming here *)

end

That is, the hash table now includes its own capacity (a number of buckets), along with the size (a number of stored elements). Both are subject of future change, as more elements are added, and the table is resized.

Adding new elements by means of insert can now trigger the growth of the hash-table structure. Since it is convenient to define resizing by means of insertion into a new hash-table, which is going to be then swapped with the previous one, we define those two functions as mutually recursive via OCaml’s let rec ... and ... construct:

let rec insert ht k v =
  let hs = Hashtbl.hash k in
  let bnum = hs mod !(ht.capacity) in
  let bucket = !(ht.buckets).(bnum) in
  let clean_bucket =
    List.filter (fun (k', _) -> k' <> k) bucket in
  let new_bucket = (k, v) :: clean_bucket in
  !(ht.buckets).(bnum) <- new_bucket;
  (* Increase size *)
  (if List.length bucket < List.length new_bucket
  then ht.size := !(ht.size) + 1);
  (* Resize *)
  if !(ht.size) > !(ht.capacity) + 1
  then resize_and_copy ht

and resize_and_copy ht =
  let new_capacity = !(ht.capacity) * 2 in
  let new_buckets = Array.make new_capacity [] in
  let new_ht = {
    buckets = ref new_buckets;
    capacity = ref new_capacity;
    size = ref 0;
  } in
  let old_buckets = !(ht.buckets) in
  let len = Array.length old_buckets in
  for i = 0 to len - 1 do
    let bucket = old_buckets.(i) in
    List.iter (fun (k, v) -> insert new_ht k v) bucket
  done;
  ht.buckets := !(new_ht.buckets);
  ht.capacity := !(new_ht.capacity);
  ht.size := !(new_ht.size)

Fetching elements from a resizable hash-table is not very different from doing so with a simple hash table that does not re-size:

let get ht k =
  let hs = Hashtbl.hash k in
  let bnum = hs mod !(ht.capacity) in
  let bucket = !(ht.buckets).(bnum) in
  let res = List.find_opt (fun (k', _) -> k' = k) bucket in
  match res with
  | Some (_, v) -> Some v
  | _ -> None

Removal of elements requires a bit of care, so the size ht.size of the table (i.e., the number of elements it contains) would be suitably decreased:

(* Slow remove - introduce for completeness *)
let remove ht k =
  let hs = Hashtbl.hash k in
  let bnum = hs mod !(ht.capacity) in
  let bucket = !(ht.buckets).(bnum) in
  let clean_bucket =
    List.filter (fun (k', _) -> k' <> k) bucket in
  !(ht.buckets).(bnum) <- clean_bucket;
  (if List.length bucket > List.length clean_bucket
  then ht.size := !(ht.size) - 1);
  assert (!(ht.size) >= 0)

Finally, printing is defined in almost the same way as before:

let print_hash_table ppk ppv ht =
  let open Printf in
  print_endline @@ sprintf "Capacity: %d" !(ht.capacity);
  print_endline @@ sprintf "Size:     %d" !(ht.size);
  print_endline "Buckets:";
  let buckets = !(ht.buckets) in
  for i = 0 to !(ht.capacity) - 1 do
    let bucket = buckets.(i) in
    if bucket <> [] then (
      (* Print bucket *)
      let s = List.fold_left
          (fun acc (k, v) -> acc ^ (sprintf "(%s, %s); ") (ppk k) (ppv v)) "" bucket in
      printf "%d -> [ %s]\n" i s)
  done

Let us experiment with the resizable implementation by means of defining the following modules:

module RHT = ResizableListBasedHashTable(IntKey)
module ResizableHTTester = HashTableTester(RHT)

Let us see how the table grows:

utop # let a = generate_key_value_array 20;;
val a : (int * string) array =
  [|(17, "hvevv"); (9, "epsxo"); (14, "prasb"); (5, "ozdnt"); (10, "hglck");
    (18, "ayqtk"); (4, "kemle"); (11, "kcrtm"); (14, "qxcnk"); (19, "czzva");
    (4, "ayuys"); (7, "cdrhf"); (5, "ukobi"); (19, "hwsjs"); (3, "uyrla");
    (0, "uldju"); (7, "rkolw"); (6, "gnzzo"); (19, "nksfe"); (4, "geevu")|]

utop # let t = ResizableHTTester.mk_test_table_from_array_length a 5;;
val t : (SHT.key * SHT.key) RHT.hash_table = ...
   size = {contents = 20}; capacity = {contents = 20}}

utop # RHT.print_hash_table pp_kv pp_kv t;;
Capacity: 20
Size:     20
Buckets:
2 -> [ ((14, qxcnk), (14, qxcnk)); ]
3 -> [ ((7, rkolw), (7, rkolw)); ((0, uldju), (0, uldju)); ((19, hwsjs), (19, hwsjs)); ]
4 -> [ ((19, nksfe), (19, nksfe)); ((4, kemle), (4, kemle)); ((18, ayqtk), (18, ayqtk)); ((5, ozdnt), (5, ozdnt)); ]
5 -> [ ((19, czzva), (19, czzva)); ]
6 -> [ ((3, uyrla), (3, uyrla)); ]
8 -> [ ((4, ayuys), (4, ayuys)); ]
9 -> [ ((6, gnzzo), (6, gnzzo)); ]
10 -> [ ((17, hvevv), (17, hvevv)); ((7, cdrhf), (7, cdrhf)); ]
11 -> [ ((14, prasb), (14, prasb)); ]
12 -> [ ((11, kcrtm), (11, kcrtm)); ]
13 -> [ ((5, ukobi), (5, ukobi)); ]
16 -> [ ((9, epsxo), (9, epsxo)); ]
17 -> [ ((4, geevu), (4, geevu)); ((10, hglck), (10, hglck)); ]

To emphasise, even though we have created the table with capacity 5 (via mk_test_table_from_array_length a 5), it has then grew, as more elements were added, so its capacity has quadrupled, becoming 20.

We can also test a resizable implementation of a hash table similarly to how we tested a simple one:

let%test "ResizableHashTable insert" =
  let open ResizableHTTester in
  let a = generate_key_value_array 1000 in
  let ht = mk_test_table_from_array_length a 50 in
  test_table_get ht a

8.2.7. Comparing performance of different implementations¶

Which implementation of a hash-table behaves better in practice? We are going to answer this questions by setting up an experiment. For this, we define the following two functions for stress-testing our two implementations:

let insert_and_get_bulk_simple a m =
  Printf.printf "Creating simple hash table:\n";
  let ht = time (SimpleHTTester.mk_test_table_from_array_length a) m in
  Printf.printf "Fetching from simple hash table on the array of size %d:\n" (Array.length a);
  let _ = time SimpleHTTester.test_table_get ht a in ()

let insert_and_get_bulk_resizable a m =
  Printf.printf "Creating resizable hash table:\n";
  let ht = time (ResizableHTTester.mk_test_table_from_array_length a) m in
  Printf.printf "Fetching from resizable hash table on the array of size %d:\n" (Array.length a);
  let _ = time ResizableHTTester.test_table_get ht a in ()

The next function is going to run both insert_and_get_bulk_simple and insert_and_get_bulk_resizable on the same array (of a given size n), creating two hash-tables of the initial size m and measuring

1. How long does it take to fill up the table, and
1. How long does it take to fetch the elements

This is done as follows:

let compare_hashing_time n m =
  let a = generate_key_value_array n in
  insert_and_get_bulk_simple a m;
  print_endline "";
  insert_and_get_bulk_resizable a m;

When the number of buckets is of the same order of magnitude as the number of items being inserted, the simple hash-table exhibits performance better than the resizable one (as resizing takes considerable amount of time):

utop # compare_hashing_time 10000 1000;;
Creating simple hash table:
Execution elapsed time: 0.005814 sec
Fetching from simple hash table on the array of size 10000:
Execution elapsed time: 0.000000 sec

Creating resizable hash table:
Execution elapsed time: 0.010244 sec
Fetching from resizable hash table on the array of size 10000:
Execution elapsed time: 0.000000 sec

However, for a number of buckets much smaller than the number of elements to be inserted, the benefits of dynamic resizing become clear:

utop # compare_hashing_time 25000 50;;
Creating simple hash table:
Execution elapsed time: 0.477194 sec
Fetching from simple hash table on the array of size 25000:
Execution elapsed time: 0.000002 sec

Creating resizable hash table:
Execution elapsed time: 0.020068 sec
Fetching from resizable hash table on the array of size 25000:
Execution elapsed time: 0.000000 sec