Notes: Bloom Filters - Implementation and Real World Applications

Bloom filters are probabilistic data structures used to test whether an element is possibly in a set or definitely not in a set. These data structures are a neat hack to determine if an element is in a set without actually storing the actual data.

A bloom filter is simply an array of 1s and 0s filled using a hash function. They are way more memory efficient and have faster lookups than actual data. For more theory, refer this tutorial: Bloom Filters

In this blog, we’ll cover some implementations keeping these in mind:

Memory is limited
False positives are acceptable
False negatives are not

Checking Web Cache

Let’s say we have a distributed system where we store static html page caches in redis key value. With hundreds of thousands of clients, we may have millions or even billions of keys. If we get a request to get a static page, going through the whole cache list itself would be compute heavy and with millions of requests this would either cause slowness and failures or we’ll have to invest more and more dollars to scale our system. Both of these are unwanted.

So, we’ll try to build a solution using bloom filters that quickly tells us whether the url is in cache or not without going through the whole list and route the request accordingly.

package main

import (
    "context"
    "fmt"
    "log"

    "github.com/bits-and-blooms/bloom/v3"
    "github.com/redis/go-redis/v9"
)

var ctx = context.Background()

func createRedisClient() *redis.Client {
    return redis.NewClient(&redis.Options{
        Addr: "localhost:6379",
    })
}

func main() {
    redisClient := createRedisClient()

    // creating in-memory Bloom filter (expected 10000 items, 0.1% false positive rate)
    bloomFilter := bloom.NewWithEstimates(10000, 0.001)

    cachedURLs := map[string]string{
        "https://example.com/index.html": "<html>Home</html>",
        "https://example.com/about.html": "<html>About</html>",
    }

    // adding cached urls to redis and bloom filter
    for url, html := range cachedURLs {
        redisClient.Set(ctx, url, html, 0)
        bloomFilter.AddString(url)
    }

    // test lookups
    testURLs := []string{
        "https://example.com/index.html",
        "https://example.com/contact.html",
        "https://example.com/about.html",
        "https://example.com/fake.html",
    }

    for _, url := range testURLs {
        fmt.Printf("\nChecking %s\n", url)
        if !bloomFilter.TestString(url) {
            fmt.Println("Not in Bloom filter → definitely not cached. Skipping Redis.")
            // fetch from root source
            continue
        }

        fmt.Println("Possibly cached → checking Redis...")
        val, err := redisClient.Get(ctx, url).Result()

        if err == redis.Nil {
            fmt.Println("Redis miss (false positive)")
            // fetch from root source
        } else if err != nil {
            log.Printf("Redis error: %v\n", err)
        } else {
            fmt.Printf("Redis hit: %s\n", val)
        }
    }
}

Malware Scanner

We’ll build a malware scanner that checks if a file is a malware. We’ll create bloom filter using known malware hashes and validate file hashes through it.

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "log"
    "os"
    "path/filepath"

    "github.com/bits-and-blooms/bloom/v3"
)

// simulated known malware hashes (usually you'd get these from a database or API)
var knownMalwareHashes = []string{
    "b1946ac92492d2347c6235b4d2611184", // dummy hash
    "5d41402abc4b2a76b9719d911017c592",
    "9e107d9d372bb6826bd81d3542a419d6",
}

func sha256sum(filePath string) (string, error) {
    f, err := os.Open(filePath)
    if err != nil {
        return "", err
    }
    defer f.Close()

    h := sha256.New()
    if _, err := io.Copy(h, f); err != nil {
        return "", err
    }
    return hex.EncodeToString(h.Sum(nil)), nil
}

func main() {
    // creating a Bloom filter: expect 10000 items, false positive rate 0.1%
    filter := bloom.NewWithEstimates(10000, 0.001)

    // add known malware hashes to the Bloom filter
    for _, hash := range knownMalwareHashes {
        filter.AddString(hash)
    }

    // directory to scan
    targetDir := "~/Downloads"

    // walk through files
    err := filepath.Walk(targetDir, func(path string, info os.FileInfo, err error) error {
        if err != nil {
            log.Println("Error accessing file:", err)
            return nil
        }

        if info.IsDir() {
            return nil
        }

        // compute SHA256 hash
        hash, err := sha256sum(path)
        if err != nil {
            log.Println("Failed to hash:", path, err)
            return nil
        }

        // check Bloom filter
        if filter.TestString(hash) {
            fmt.Printf("WARNING: File %s might be malware (hash match: %s)\n", path, hash)
        } else {
            fmt.Printf("Safe: %s\n", path)
        }

        return nil
    })

    if err != nil {
        log.Fatal(err)
    }
}

These examples can be extended to many other use cases. Some more practical applications are username checks, deduplication systems, CDN lookups, social media already seen items, email spam detection and many more.