github · itsibitzi · Sep 15, 2025 · aneubeck · Jan 16, 2026 · aneubeck
@@ -3,12 +3,12 @@
 members = [
     "crates/*",
     "crates/bpe/benchmarks",
-    "crates/bpe/tests",
+    "crates/bpe/tests"
 ]
 resolver = "2"
 
 [profile.bench]
 debug = true
 
 [profile.release]
-debug = true
+debug = true
@@ -0,0 +1,12 @@
+[package]
+name = "hriblt"
+version = "0.1.0"
+edition = "2024"
+description = "Algorithm for rateless set reconciliation"
+repository = "https://github.com/github/rust-gems"
+license = "MIT"
+keywords = ["set-reconciliation", "sync", "algorithm", "probabilistic"]
+categories = ["algorithms", "data-structures", "mathematics", "science"]
+
+[dependencies]
+thiserror = "2"
@@ -0,0 +1,55 @@
+# Hierarchical Rateless Bloom Lookup Tables
+
+A novel algorithm for computing the symmetric difference between sets where the amount of data shared is proportional to the size of the difference in the sets rather than proportional to the overall size.
+
+## Usage
+
+Add the library to your `Cargo.toml` file.
+
+```toml
+[dependencies]
+hriblt = "0.1"
+```
+
+Create two encoding sessions, one containing your data, and another containing the counterparty's data. This counterparty data might have been sent to you over a network, for example.
+
+The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has recieved some symbols from "Alice".
-The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has recieved some symbols from "Alice".
+The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has received some symbols from "Alice".
-The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has recieved some symbols from "Alice".
+The following example attempts to reconcile the differences between two such sets of `u64` integers, and is done from the perspective of "Bob", who has received some symbols from "Alice".
-The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has recieved some symbols from "Alice".
+The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has received some symbols from "Alice".
-The following example attempts to reconcile the differences between two sets of `u64` integers, and is done from the perspective of "Bob", who has recieved some symbols from "Alice".
+The following example attempts to reconcile the differences between two such sets of `u64` integers, and is done from the perspective of "Bob", who has received some symbols from "Alice".
+
+```rust
+use hriblt::{DecodingSession, EncodingSession, DefaultHashFunctions};
+// On Alice's computer
+
+// Alice creates an encoding session...
+let mut alice_encoding_session = EncodingSession::<u64, DefaultHashFunctions>::new(DefaultHashFunctions, 0..128);
+
+// And adds her data to that session, in this case the numbers from 0 to 10.
+for i in 0..=10 {
+    alice_encoding_session.insert(i);
+}
+
+// On Bob's computer
+
+// Bob creates his encoding session, note that the range **must** be the same as Alice's
+let mut bob_encoding_session = EncodingSession::<u64, DefaultHashFunctions>::new(DefaultHashFunctions, 0..128);
+
+// Bob adds his data, the numbers from 5 to 15.
+for i in 5..=15 {
+    bob_encoding_session.insert(i);
+}
+
+// "Subtract" Bob's coded symbols from Alice's, the remaining symbols will be the symmetric
+// difference between the two sets, iff we can decode them. This is a commutative function so you
+// could also subtract Alice's symbols from Bob's and it would still work.
+let merged_sessions = alice_encoding_session.merge(bob_encoding_session, true);
+
+let decoding_session = DecodingSession::from_encoding(merged_sessions);
+
+assert!(decoding_session.is_done());
+
+let mut diff = decoding_session.into_decoded_iter().map(|v| v.into_value()).collect::<Vec<_>>();
+
+diff.sort();
+
+assert_eq!(diff, [0, 1, 2, 3, 4, 11, 12, 13, 14, 15]);
+
+```
@@ -0,0 +1,21 @@
+# Hash Functions
+
+This library has a trait, `HashFunctions`, which is used to create the hashes required to place your symbol into the range of coded symbols.
+
+The following documentation provides more details on this trait in particular. How and why this is done is explained in the `overview.md` documentation.
-The following documentation provides more details on this trait in particular. How and why this is done is explained in the `overview.md` documentation.
+The following documentation provides more details on this trait in particular.
-The following documentation provides more details on this trait in particular. How and why this is done is explained in the `overview.md` documentation.
+The following documentation provides more details on this trait in particular.
+
+## Hash stability
+
+When using HRIBLT in production systems it is important to consider the stability of your hash functions.
+
+We provide a `DefaultHashFunctions` type which is a wrapper around the `DefaultHasher` type provided by the Rust standard library. Though the seed for this function is fixed, it should be noted that the hashes produces by this type are *not* guarenteed to be stable across different versions of the Rust standard library. As such, you should not use this type for any situation where clients might potentially be running on a binary built with an unspecified version of Rust.
-We provide a `DefaultHashFunctions` type which is a wrapper around the `DefaultHasher` type provided by the Rust standard library. Though the seed for this function is fixed, it should be noted that the hashes produces by this type are *not* guarenteed to be stable across different versions of the Rust standard library. As such, you should not use this type for any situation where clients might potentially be running on a binary built with an unspecified version of Rust.
+We provide a `DefaultHashFunctions` type which is a wrapper around the `DefaultHasher` type provided by the Rust standard library. Though the seed for this function is fixed, it should be noted that the hashes produced by this type are *not* guaranteed to be stable across different versions of the Rust standard library. As such, you should not use this type for any situation where clients might potentially be running on a binary built with an unspecified version of Rust.
-We provide a `DefaultHashFunctions` type which is a wrapper around the `DefaultHasher` type provided by the Rust standard library. Though the seed for this function is fixed, it should be noted that the hashes produces by this type are *not* guarenteed to be stable across different versions of the Rust standard library. As such, you should not use this type for any situation where clients might potentially be running on a binary built with an unspecified version of Rust.
+We provide a `DefaultHashFunctions` type which is a wrapper around the `DefaultHasher` type provided by the Rust standard library. Though the seed for this function is fixed, it should be noted that the hashes produced by this type are *not* guaranteed to be stable across different versions of the Rust standard library. As such, you should not use this type for any situation where clients might potentially be running on a binary built with an unspecified version of Rust.
+
+We recommend you implement your own `HashFunctions` implementation with a stable hash function.
+
+## Hash value hashing trick
+
+If the value you're inserting into the encoding session is a high entropy random value, such as a cryptographic hash digest, you can recycle the bytes in that value to produce the coded symbol indexing hashes, instead of hashing that value again. This results in a constant-factor speed up.
+
+For example if you were trying to find the difference between two sets of documents, instead of each coded symbol being the whole document it could instead just be a SHA1 hash of the document content. Since each SHA1 digest has 20 bytes of high entropy bits, instead of hashing this value five times again to produce the five coded symbol indices we can simply slice out five `u32` values from the digest itself.
+
+This is a useful trick because hash values are often used as IDs for documents during set reconciliation since they are a fixed size, making serialization easy.
@@ -0,0 +1,17 @@
+# Sizing your HRIBLT
+
+Because the HRIBLT is rateless, it is possible to append additional data in order to make decoding possible. That is, it does not need to be sized in advance like a standard invertible bloom lookup table.
+
+Regardless, there are some advantages to getting the size of your decoding session correct the first time. An example might be if you're performing set reconciliation over some RPC and you want to minimise the number of round trips it takes to perform a decode.
+
+## Coded Symbol Multiplier
+
+The number of coded symbols required to find the difference between two sets is proportional to the difference between the two sets. The following chart shows the relationship between the number of coded symbols required to decode HRIBLT and the size of the diff. Note that the size of the base set (before diffs were added) was fixed.
+
+`y = len(coded_symbols) / diff_size`
+
+![Coded symbol multiplier](./assets/coded-symbol-multiplier.png)
+
+For small diffs, the number of coded symbols required per value is larger; after a difference of approximately 100 values, the coefficient settles at around 1.3 to 1.4.
+
+You can use this chart, combined with an estimate of the diff size (perhaps from a `geo_filter`) to increase the probability that you will have a successful decode after a single round-trip while also minimising the amount of data sent.
@@ -0,0 +1,127 @@
+use crate::{Encodable, HashFunctions, index_for_seed, indices};
+
+/// Represents a coded symbol in the invertible bloom filter table.
+/// In some of the literature this is referred to as a "cell" or "bucket".
+/// It includes a checksum to verify whether the instance represents a pure value, i.e. a single entity.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
+pub struct CodedSymbol<T: Encodable> {
+    /// Values aggregated by XOR operation.
+    pub value: T,
+    /// Checksums aggregated by wrapping addition/subtraction. We repurpose the two least significant bits of the checksum:
+    /// - The least significant bit is a one bit counter which is incremented for each entity.
+    ///   This bit must be set when there is a single entity represented by this coded symbol.
+    /// - The second least significant bit indicates whether the entity is a deletion or insertion.
+    pub checksum: u64,
+}
+
+impl<T: Encodable> Default for CodedSymbol<T> {
+    fn default() -> Self { // The empty symbol: zero value and zero checksum.
+        CodedSymbol {
+            value: T::zero(),
+            checksum: 0, // Counter bit is unset as well.
+        }
+    }
+}
+
+impl<T: Encodable> From<(T, u64)> for CodedSymbol<T> {
+    fn from(tuple: (T, u64)) -> Self { // Interprets the tuple as `(value, checksum)`.
+        Self {
+            value: tuple.0,
+            checksum: tuple.1, // Taken as is; the counter bit is not forced here.
+        }
+    }
+}
+
+impl<T: Encodable> CodedSymbol<T> {
+    /// Creates a new coded symbol for the single entity `hash` (stored as the symbol's value) with the given deletion flag.
+    pub(crate) fn new<S: HashFunctions<T>>(state: &S, hash: T, deletion: bool) -> Self {
+        let mut checksum = state.check_sum(&hash);
+        checksum |= 1; // Add a single bit counter
+        if deletion {
+            checksum = checksum.wrapping_neg(); // Negating an odd number yields an odd number, so the counter bit stays set.
+        }
+        CodedSymbol {
+            value: hash,
+            checksum,
+        }
+    }
+
+    /// Merges another coded symbol into this one: values are XORed, checksums are added, or subtracted when `negate` is set (wrapping).
+    pub(crate) fn add(&mut self, other: &CodedSymbol<T>, negate: bool) {
+        self.value.xor(other.value);
+        if negate {
+            self.checksum = self.checksum.wrapping_sub(other.checksum);
+        } else {
+            self.checksum = self.checksum.wrapping_add(other.checksum);
+        }
+    }
+
+    /// Checks whether this coded symbol is pure, i.e., whether it represents a single entity.
+    /// A pure coded symbol must satisfy the following conditions:
+    /// - The 1-bit counter must be 1 or -1 (which are both represented by the bit being set)
+    /// - The checksum must match the checksum of the value, either as is or negated.
+    /// - The indices of the value must hit the index `i` of this coded symbol exactly once for a stream of length `len`.
+    pub(crate) fn is_pure<S: HashFunctions<T>>(
+        &self,
+        state: &S,
+        i: usize,
+        len: usize,
+    ) -> (bool, usize) { // (is pure, `required_bits`); `required_bits` is only computed when just the checksum comparison fails.
+        if self.checksum & 1 == 0 { // Counter bit unset: cannot be a single entity.
+            return (false, 0);
+        }
+        let multiplicity = indices_contains(state, &self.value, len, i);
+        if multiplicity != 1 {
+            return (false, 0);
+        }
+        let checksum = state.check_sum(&self.value) | 1; // Checksum that `new` would produce for an insertion of `self.value`.
+        if checksum == self.checksum || checksum.wrapping_neg() == self.checksum { // Matches as insertion or as deletion.
+            (true, 0)
+        } else { // Mismatch: report the larger leading-zero count of (stored - expected) and (stored + expected), both wrapping.
+            let required_bits = self // NOTE(review): how callers interpret this value is not visible here — confirm.
+                .checksum
+                .wrapping_sub(checksum)
+                .leading_zeros()
+                .max(self.checksum.wrapping_add(checksum).leading_zeros())
+                as usize;
+            (false, required_bits)
+        }
+    }
+
+    /// Checks whether this coded symbol is zero, i.e., whether it represents no entity (zero checksum and zero value).
+    pub(crate) fn is_zero(&self) -> bool {
+        self.checksum == 0 && self.value == T::zero()
+    }
+
+    /// Checks whether this coded symbol represents a deletion. Presumably only meaningful for pure symbols — TODO confirm with callers.
+    pub(crate) fn is_deletion<S: HashFunctions<T>>(&self, state: &S) -> bool {
+        let checksum = state.check_sum(&self.value) | 1; // Checksum of an insertion of `self.value`.
+        checksum != self.checksum // Any mismatch with the insertion checksum is reported as a deletion.
+    }
+}
+
+/// This function checks efficiently whether the given index is contained in the indices.
+///
+/// Note: we have constructed the indices such that we can determine from the last 5 bits
+/// which hash function would map to this index. Therefore, we only need to check against
+/// a single hash function and not all 5!
+/// The only exception is for very small streams (`stream_len <= 32`) or if the index is a multiple of 32.
+///
+/// The function returns the multiplicity, i.e. how many indices hit this particular index.
+/// Thereby, it takes into account whether the value is stored negated or not. (NOTE(review): no negation handling is visible in this function — confirm.)
+fn indices_contains<T: std::hash::Hash>(
+    state: &impl HashFunctions<T>,
+    value: &T,
+    stream_len: usize,
+    i: usize,
+) -> i32 {
+    if stream_len > 32 && i % 32 != 0 { // Fast path: a single candidate hash function.
+        let seed = i % 4; // NOTE(review): the doc above mentions the last 5 bits and 5 hash functions, but the seed is taken modulo 4 — confirm against `index_for_seed`.
+        let j = index_for_seed(state, value, stream_len, seed as u32);
+        if i == j { 1 } else { 0 }
+    } else { // Slow path: count how many of the value's indices equal `i`.
+        indices(state, value, stream_len)
+            .map(|j| if j == i { 1 } else { 0 })
+            .sum()
+    }
+}
@@ -0,0 +1,31 @@
+/// A value that has been found by the set reconciliation algorithm, tagged as either an addition or a deletion.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Ord)]
+pub enum DecodedValue<T> {
+    /// A value that has been added. Under the derived ordering, every `Addition` sorts before every `Deletion`.
+    Addition(T),
+    /// A value that has been removed.
+    Deletion(T),
+}
+
+impl<T> DecodedValue<T> {
+    /// Consume this `DecodedValue` to return the value, discarding whether it was an addition or a deletion.
+    pub fn into_value(self) -> T {
+        match self {
+            DecodedValue::Addition(v) => v,
+            DecodedValue::Deletion(v) => v,
+        }
+    }
+
+    /// Borrow the value within this decoded value, regardless of the variant.
+    pub fn value(&self) -> &T {
+        match self {
+            DecodedValue::Addition(v) => v,
+            DecodedValue::Deletion(v) => v,
+        }
+    }
+
+    /// Returns true if this decoded value is a deletion, and false if it is an addition.
+    pub fn is_deletion(&self) -> bool {
+        matches!(self, DecodedValue::Deletion(_))
+    }
+}