Skip to content

Files

Latest commit

 

History

History
292 lines (241 loc) · 8.56 KB

705.design-hashset.md

File metadata and controls

292 lines (241 loc) · 8.56 KB

Direct Access Table

class MyHashSet() {
    private val table = IntArray(1000000 + 1) { -1 }

    fun add(key: Int) {
        table[key] = key
    }

    fun remove(key: Int) {
        table[key] = -1
    }

    fun contains(key: Int): Boolean {
        return table[key] != -1
    }
}

Chaining

We can use dynamic array or linked list to handle collision.

The first approach is to use dynamic array to store the keys:

  • We use a fixed-size array (buckets) to store the keys.
  • Each bucket will store a list of keys to handle collision.
  • We use a simple hash function to find the bucket.
class MyHashSet {
    private val size = 769 // A prime number reduces collisions
    private val buckets = Array<MutableList<Int>>(size) { mutableListOf() }


    fun add(key: Int) {
        val index = hash(key)
        if (key !in buckets[index]) {
            buckets[index].add(key)
        }
    }

    fun remove(key: Int) {
        val index = hash(key)
        buckets[index].remove(key)
    }

    fun contains(key: Int): Boolean {
        val index = hash(key)
        return key in buckets[index]
    }

    private fun hash(key: Int): Int = key % size
}

Or we can use linked list to handle collision:

  • add(key): We hash the key and append it to the end of linked list. Make sure to check if the key already exists, if yes, just skip it
  • remove(key): We hash the key and remove it from the linked list.
  • contains(key): We hash the key and search it from the linked list.
// Use built-in linked list
class MyHashSet {
    private val capacity = 769  // A prime number reduces collisions
    private val buckets = Array<LinkedList<Int>>(capacity) { LinkedList() }

    private fun hash(key: Int): Int = key % capacity

    fun add(key: Int) {
        val hashIndex = hash(key)
        val bucket = buckets[hashIndex]
        if (key !in bucket) {
            bucket.add(key)
        }
    }

    fun remove(key: Int) {
        val hashIndex = hash(key)
        buckets[hashIndex].remove(key)
    }

    fun contains(key: Int): Boolean {
        val hashIndex = hash(key)
        return key in buckets[hashIndex]
    }
}

// Or equivalent with linked list implmented by ourselves
data class SetNode(
    val key: Int,
    var next: SetNode? = null
)

class MyHashSet() {

    private val size = 769
    private val multipler = 12582917L
    private val array = Array<SetNode>(size) { SetNode(-1) }

    fun add(key: Int) {
        val hash = hash(key)
        var previous: SetNode? = array[hash]
        var current: SetNode? = array[hash]?.next

        // Pay attention: we don't use general way to find the last node.
        // because we have to check if the key already exists, so we iterate
        // every node to check if the key already exists.
        // 
        // The following code doesn't work because we don't check if key already exists.
        // var current: SetNode? = head
        // while (current?.next != null) {
        //     current = current.next
        // }
        // current?.next = SetNode(key)

        // We iterate every node to check if the key already exists.
        while (current != null) {
            if (current.key == key) return
            previous = current
            current = current.next
        }
        previous?.next = SetNode(key)
    }

    fun remove(key: Int) {
        val hash = hash(key)
        var previous: SetNode? = array[hash]
        var current: SetNode? = array[hash]?.next
        while (current != null) {
            if (current.key == key) {
                previous?.next = current.next
                return
            }
            previous = current
            current = current.next
        }
    }

    fun contains(key: Int): Boolean {
        val hash = hash(key)
        var current: SetNode? = array[hash]?.next
        while (current != null) {
            if (current.key == key) return true
            current = current.next
        }
        return false
    }

    private fun hash(key: Int): Int {
        return (key.toLong() * multipler % size).toInt()
    }
}
  • Time Complexity: O(n / k) where n is the number of all possible values and k is the number of buckets.
  • Space Complexity: O(n + k) where n is the number of all possible values and k is the number of buckets.

Open Addressing

private const val EMPTY = -1
private const val REMOVED = -2

// From ChatGPT, it's correct but I don't fully understand, and we have to use -2 to mark as deleted.
class MyHashSet {
    private val size = 10007 // A prime number for better distribution
    private val buckets = IntArray(size) { EMPTY }

    private fun hash(key: Int): Int = key % size

    fun add(key: Int) {
        var index = hash(key)
        while (buckets[index] != EMPTY && buckets[index] != key) {
            index = (index + 1) % size
        }
        buckets[index] = key
    }

    fun remove(key: Int) {
        var index = hash(key)
        while (buckets[index] != EMPTY) {
            if (buckets[index] == key) {
                buckets[index] = REMOVED
                return
            }
            index = (index + 1) % size
        }
    }

    fun contains(key: Int): Boolean {
        var index = hash(key)
        while (buckets[index] != EMPTY) {
            if (buckets[index] == key) return true
            index = (index + 1) % size
        }
        return false
    }
}
class MyHashSet() {

    private val size = 10007
    private val multipler = 12582917L
    // We initialize the table with -1, which means empty.
    private val table = IntArray(size) { EMPTY }

    fun add(key: Int) {
        var probe = 0
        var hash = hash(key, probe)
        while (table[hash] != EMPTY) {
            if (table[hash] == key) return
            probe++
            hash = hash(key, probe)
        }
        table[hash] = key
    }

    fun remove(key: Int) {
        var probe = 0
        var hash = hash(key, probe)
        while (table[hash] != EMPTY) {
            if (table[hash] == key) {
                table[hash] = REMOVED
                return
            }
            probe++
            hash = hash(key, probe)
        }
    }

    fun contains(key: Int): Boolean {
        var probe = 0
        var hash = hash(key, probe)
        while (table[hash] != EMPTY) {
            if (table[hash] == key) return true
            probe++
            hash = hash(key, probe)
        }
        return false
    }

    // We use linear probing to find the next empty slot.
    private fun hash(key: Int, probe: Int): Int {
        return (hash(key) + probe) % size
    }

    private fun hash(key: Int): Int {
        return (key.toLong() * multipler % size).toInt()
    }
}
  • Time Complexity: O(1) on average if hash function is good and low load factor, O(n) where n is the number of all possible values for worst case.
  • Space Complexity: O(n) where n is the number of all possible values.

Why Mark as Removed?

When we remove an element, we have to mark it as deleted instead of setting it to EMPTY. This is because we have to ensure that the probing sequence is not broken. If we set the slot to EMPTY, the probing sequence will be broken and we can't find the element that is inserted after the deleted element.

For example, we have the following sequence of operations:

size = 10
hash(x) = x % 10
add(5)
add(15)
remove(5)
contains(15)

If we set the slot to EMPTY when we remove the element, we see

add(5)          // index 5 = 5
add(15)         // index 5 is occupied -> linear probing -> index 6 = 15 
remove(5)       // index 5 = EMPTY
contains(15)    // index 5 = EMPTY, break the while loop, return false (incorrect)

If we set the slot to REMOVED when we remove the element, we see

add(5)          // index 5 = 5
add(15)         // index 5 is occupied -> linear probing -> index 6 = 15
remove(5)       // index 5 = REMOVED
contains(15)    // index 5 = REMOVED, continue probing -> index 6 = 15, return true (correct)

Probing should continue until we find the key or reach an empty slot. If we use EMPTY for removed elements, the search will stop too early, failing to find valid keys that were probed forward.

But we have to modify our add() to reuse the index marked as REMOVED, when inserting a new key, we should reuse REMOVED slots if they appear before any EMPTY slot. And contains() should skip REMOVED and continue probling. This ensure that the probing sequence is not broken and we can reuse the slot that is removed before. (Not implemented in the above code)