Replaced base64 with base58 in seqhash (#69)

* Replaced base64 with base58 in seqhash
Koeng101 · Mar 26, 2024 · 9acd036 · 9acd036
1 parent d0607bb
commit 9acd036
Show file tree

Hide file tree

Showing 5 changed files with 183 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -71,6 +71,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Updated seqhash2 to use base58 rather than base64 [#69](https://github.com/Koeng101/dnadesign/pull/69)
 - Updated dual barcodes primer sets to be created without csv files [#67](https://github.com/Koeng101/dnadesign/pull/67)
 - Added workers to bio as a way to process data [#62](https://github.com/Koeng101/dnadesign/pull/62)
 - Improved megamash efficiency and added []Match JSON conversion [#61](https://github.com/Koeng101/dnadesign/pull/61)

diff --git a/lib/seqhash/base58.go b/lib/seqhash/base58.go
@@ -0,0 +1,67 @@
+package seqhash
+
+import (
+	"errors"
+	"math/big"
+	"strings"
+)
+
+const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
+
+// encodeToBase58 encodes input as a Base58 string over alphabet, emitting one leading '1' per leading zero byte of input.
+func encodeToBase58(input []byte) string {
+	// Interpret input as one big-endian unsigned integer; leading zero bytes vanish here and are restored below
+	num := big.NewInt(0).SetBytes(input)
+	base := big.NewInt(int64(len(alphabet)))
+	mod := &big.Int{}
+	var encoded strings.Builder
+
+	// Peel off base-58 digits by repeated division; they are written least significant digit first
+	for num.Sign() > 0 {
+		num.DivMod(num, base, mod)
+		encoded.WriteByte(alphabet[mod.Int64()])
+	}
+
+	// Append one '1' (the zero digit of alphabet) per leading zero byte, since the integer conversion dropped them
+	for _, b := range input {
+		if b != 0 {
+			break
+		}
+		encoded.WriteByte('1')
+	}
+
+	// Everything above was written in reverse order, so flip it to get the final string
+	result := []byte(encoded.String())
+	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
+		result[i], result[j] = result[j], result[i]
+	}
+
+	return string(result)
+}
+
+// decodeFromBase58 decodes a Base58 string into bytes; it errors on any character outside alphabet, and "" decodes to an empty slice.
+func decodeFromBase58(input string) ([]byte, error) {
+	if len(input) == 0 {
+		return []byte{}, nil
+	}
+
+	num := big.NewInt(0)
+	base := big.NewInt(int64(len(alphabet)))
+	for _, c := range input {
+		charIndex := strings.IndexRune(alphabet, c)
+		if charIndex == -1 {
+			return nil, errors.New("invalid character found")
+		}
+		num.Mul(num, base)
+		num.Add(num, big.NewInt(int64(charIndex)))
+	}
+
+	decoded := num.Bytes()
+	// num.Bytes() carries no leading zeros, so prepend one zero byte per leading '1' of input
+	if input[0] == '1' {
+		leadingZeros := len(input) - len(strings.TrimLeft(input, "1"))
+		decoded = append(make([]byte, leadingZeros), decoded...)
+	}
+
+	return decoded, nil
+}
diff --git a/lib/seqhash/example_test.go b/lib/seqhash/example_test.go
@@ -17,7 +17,7 @@ func Example_basic() {
 
 	sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded))
 	fmt.Println(sequenceSeqhash)
-	// Output: C_JJgg9ahMxAQzDm2XveE7WA==
+	// Output: C_5X6Hudy3K8ht7r4mvu9Gco
 }
 
 func ExampleRotateSequence() {

diff --git a/lib/seqhash/seqhash.go b/lib/seqhash/seqhash.go
@@ -48,18 +48,18 @@ much shorter. The intended use case are for handling sequences with LLM systems
 since these systems' context window is a valuable resource, and smaller references
 allow the system to be more focused. Version 2 seqhashes are approximately 3x
 smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can
-be also encoded with base64 to get a hash that can be used as a string across
+be also encoded with base58 to get a hash that can be used as a string across
 different systems. Here is a length comparison:
 
-	version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
-	version 2: C_JPQCj5PgjFwjy7jaoYmwqQ==
+	version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
+	version 2: C_5X6Hudy3K8ht7r4mvu9Gco
 
 The metadata is now encoded in a 1 byte flag rather than in a 7 rune metadata
 string like in version 1. Rather than use 256 bits for encoding
 the hash, we use 120 bits. Since seqhashes are not meant for security, this
 is good enough (50% collision with 1.3x10^18 hashes), while making them
 conveniently only 16 bytes long. Additionally, encoded prefixes are added
-to the front of the base64 encoded hash as a heuristic device for LLMs while
+to the front of the base58 encoded hash as a heuristic device for LLMs while
 processing batches of seqhashes.
 
 In addition, seqhashes can now encode fragments. Fragments are double stranded
@@ -68,12 +68,17 @@ overhangs flanking both sides. These fragments can encode genetic parts - and
 an important part of any vector containing these parts would be the part
 seqhash, rather than the vector seqhash. This enhancement allows you to
 identify genetic parts regardless of their context.
+
+Base58 is used rather than base64 so that seqhashes can easily be added into
+URLs without a "/" in the identifier. Ironically, it also makes smaller hashes
+than base64 due to base64 chunking 3 bytes at a time - at 16 bytes, 2 padding
+characters ("==") are added because 16 is not divisible by 3. Base58 encodes
+the whole hash as a single number, and so doesn't encounter this problem.
 */
 package seqhash
 
 import (
 	"crypto/sha256"
-	"encoding/base64"
 	"errors"
 	"sort"
 	"strings"
@@ -190,7 +195,7 @@ var (
 func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte {
 	var flag byte
 
-	// Encode the version (assuming version is in the range 0-15)
+	// Encode the version (assuming version is in the range 0-15)
 	flag |= (byte(version) << hash2versionShift)
 
 	// Encode the circularity
@@ -285,9 +290,9 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
 	flag := EncodeFlag(2, sequenceType, circular, doubleStranded)
 	result[0] = flag
 
-	// Compute BLAKE3, then copy those to the remaining 15 bytes
+	// Compute SHA-256, then copy the leading digest bytes into the remaining 15 bytes
 	newhash := sha256.Sum256([]byte(deterministicSequence))
-	copy(result[1:], newhash[:15])
+	copy(result[1:], newhash[:16])
 
 	return result, nil
 }
@@ -305,7 +310,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
 // enzymes.
 //
 // In order to make sure fwdOverhangLength and revOverhangLength fit in the
-// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
+// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
 // inserted. So the bytes would be:
 //
 //	flag + fwdOverhangLength + revOverhangLength + [13]byte(hash)
@@ -387,12 +392,40 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{
 	{FRAGMENT, true, true}:   'N',
 }
 
-// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
+// EncodeHash2 encodes a Hash2 result as "<tag>_<base58>", where tag is the
 // single letter from Hash2Metadata for the flag in hash[0] (an easy heuristic
 // for an LLM). A non-nil err is returned unchanged with an empty string.
 func EncodeHash2(hash [16]byte, err error) (string, error) {
+	if err != nil {
+		return "", err
+	}
 	_, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0])
-	encoded := base64.StdEncoding.EncodeToString(hash[:])
+	encoded := encodeToBase58(hash[:])
+
+	return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, nil
+}
+
+// DecodeHash2 decodes an encoded seqhash ("<tag>_<base58>") back into its [16]byte form; the letter tag before "_" is not validated.
+func DecodeHash2(encodedString string) ([16]byte, error) {
+	// Split the string at the first "_" into the metadata tag and the
+	// Base58 encoded seqhash.
+	parts := strings.SplitN(encodedString, "_", 2)
+	if len(parts) != 2 {
+		return [16]byte{}, errors.New("invalid encoded string format")
+	}
+
+	// Decode the Base58 encoded part (everything after the first "_")
+	decodedBytes, err := decodeFromBase58(parts[1])
+	if err != nil {
+		return [16]byte{}, err
+	}
+
+	// Require exactly 16 decoded bytes so the copy below fills the array completely
+	if len(decodedBytes) != 16 {
+		return [16]byte{}, errors.New("decoded hash does not match expected length")
+	}
 
-	return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
+	var hash [16]byte
+	copy(hash[:], decodedBytes)
+	return hash, nil
 }
diff --git a/lib/seqhash/seqhash_test.go b/lib/seqhash/seqhash_test.go
@@ -2,6 +2,7 @@ package seqhash
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"os"
 	"testing"
@@ -34,37 +35,85 @@ func TestHash2(t *testing.T) {
 
 	// Test circular double stranded hashing
 	seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true))
-	if seqhash != "A_LGxts7bxq55Uiq+E94pcYg==" {
-		t.Errorf("Circular double stranded hashing failed. Expected A_LGxts7bxq55Uiq+E94pcYg==, got: " + seqhash)
+	if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" {
+		t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash)
 	}
 	// Test circular single stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false))
-	if seqhash != "B_KB3s/EXx/C9wJvVE/gzw7Q==" {
-		t.Errorf("Circular single stranded hashing failed. Expected B_KB3s/EXx/C9wJvVE/gzw7Q==, got: " + seqhash)
+	if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" {
+		t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash)
 	}
 	// Test linear double stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true))
-	if seqhash != "C_JN15Uk5YpkXcKaJt0ozLRQ==" {
-		t.Errorf("Linear double stranded hashing failed. Expected C_JN15Uk5YpkXcKaJt0ozLRQ==, got: " + seqhash)
+	if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" {
+		t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash)
 	}
 	// Test linear single stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false))
-	if seqhash != "D_IC0pLlPHC/zPQpSqU6hy0A==" {
-		t.Errorf("Linear single stranded hashing failed. Expected D_IC0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
+	if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" {
+		t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash)
 	}
 
 	// Test RNA Seqhash
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false))
-	if seqhash != "H_IS0pLlPHC/zPQpSqU6hy0A==" {
-		t.Errorf("Linear single stranded hashing failed. Expected H_IS0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
+	if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" {
+		t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash)
 	}
 	// Test Protein Seqhash
 	seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false))
-	if seqhash != "I_IiAwHj+EfYcQCf6Ty64wUg==" {
-		t.Errorf("Linear single stranded hashing failed. Expected I_IiAwHj+EfYcQCf6Ty64wUg==, got: " + seqhash)
+	if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" {
+		t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash)
 	}
 }
 
+func TestEncodeAndDecode(t *testing.T) {
+	rawBytes, err := Hash2("ATGC", "DNA", false, true)
+	if err != nil {
+		t.Errorf("Got bad hash: %s", err)
+	}
+	encoded, err := EncodeHash2(rawBytes, err)
+	if err != nil {
+		t.Errorf("Failed to encode: %s", err)
+	}
+	decoded, err := DecodeHash2(encoded)
+	if err != nil {
+		t.Errorf("Failed to decode: %s", err)
+	}
+	for i := range rawBytes {
+		if rawBytes[i] != decoded[i] {
+			t.Errorf("Failed to decode properly.")
+		}
+	}
+	_, err = EncodeHash2([16]byte{}, errors.New("test"))
+	if err == nil {
+		t.Errorf("should fail on test error")
+	}
+
+	// A string without a "_" separator cannot be split into tag and hash, so decoding must fail
+	_, err = DecodeHash2("")
+	if err == nil {
+		t.Errorf("should fail on no metadata")
+	}
+	// An empty payload after "_" decodes to zero bytes, which fails the 16-byte length check
+	_, err = DecodeHash2("A_")
+	if err == nil {
+		t.Errorf("should fail on empty data")
+	}
+	// "/" is not in the Base58 alphabet, so the decoder must reject it
+	_, err = DecodeHash2("A_/")
+	if err == nil {
+		t.Errorf("should fail on bad character")
+	}
+	// Five '1's decode to five zero bytes, which is not the required 16
+	_, err = DecodeHash2("A_11111")
+	if err == nil {
+		t.Errorf("should fail on 1s because length is wrong.")
+	}
+
+	// Exercise the leading-zero branch of encodeToBase58 for coverage; the result is intentionally discarded
+	_ = encodeToBase58([]byte{0, 0, 0, 0})
+}
+
 func TestLeastRotation(t *testing.T) {
 	file, _ := os.Open("../data/puc19.gbk")
 	defer file.Close()
@@ -110,7 +159,14 @@ func TestHash2Fragment(t *testing.T) {
 	}
 	// Test actual hash
 	sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4))
-	expectedHash := "K_IwQE3XlSTlimRdwpom3SjA=="
+	expectedHash := "K_5KnZQEnPRzJSYPkbPwLCJF"
+	if sqHash != expectedHash {
+		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
+	}
+
+	// Test another hash
+	sqHash, _ = EncodeHash2(Hash2Fragment("TTAGCCCAT", 4, 4))
+	expectedHash = "K_5KnZQEnPRzJSYPkbPwLCJF"
 	if sqHash != expectedHash {
 		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
 	}