Skip to content

Commit

Permalink
Replaced base64 with base58 in seqhash (#69)
Browse files Browse the repository at this point in the history
* Replaced base64 with base58 in seqhash
  • Loading branch information
Koeng101 authored Mar 26, 2024
1 parent d0607bb commit 9acd036
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 26 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Updated seqhash2 to use base58 rather than base64 [#69](https://github.com/Koeng101/dnadesign/pull/69)
- Updated dual barcodes primer sets to be created without csv files [#67](https://github.com/Koeng101/dnadesign/pull/67)
- Added workers to bio as a way to process data [#62](https://github.com/Koeng101/dnadesign/pull/62)
- Improved megamash efficiency and added []Match JSON conversion [#61](https://github.com/Koeng101/dnadesign/pull/61)
Expand Down
67 changes: 67 additions & 0 deletions lib/seqhash/base58.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package seqhash

import (
"errors"
"math/big"
"strings"
)

// alphabet is the 58-symbol digit set used by the encoder and decoder in this
// file. A symbol's position in the string is its digit value, so '1' is zero.
const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"

// encodeToBase58 renders input as a Base58 string.
//
// The bytes are interpreted as one big-endian unsigned integer and written in
// radix len(alphabet). Leading zero bytes would vanish in that integer, so each
// of them is emitted as a leading '1' (the zero digit) instead. An empty or nil
// input yields the empty string.
func encodeToBase58(input []byte) string {
	// Count the zero bytes at the front; they become the '1' prefix.
	zeros := 0
	for zeros < len(input) && input[zeros] == 0 {
		zeros++
	}

	radix := big.NewInt(int64(len(alphabet)))
	value := new(big.Int).SetBytes(input)
	remainder := new(big.Int)

	// Repeated division produces the digits least-significant first.
	var digits []byte
	for value.Sign() > 0 {
		value.DivMod(value, radix, remainder)
		digits = append(digits, alphabet[remainder.Int64()])
	}

	// Emit the zero prefix, then the digits most-significant first.
	var out strings.Builder
	out.Grow(zeros + len(digits))
	out.WriteString(strings.Repeat("1", zeros))
	for i := len(digits) - 1; i >= 0; i-- {
		out.WriteByte(digits[i])
	}
	return out.String()
}

// decodeFromBase58 turns a Base58 string back into the bytes it encodes.
//
// Every symbol is looked up in alphabet and accumulated into one big integer,
// whose big-endian bytes form the payload. Each leading '1' in input stands for
// a zero byte that the integer cannot represent, so that many zero bytes are
// put back in front of the payload. The empty string decodes to an empty,
// non-nil slice. A symbol outside alphabet yields a nil slice and an error.
func decodeFromBase58(input string) ([]byte, error) {
	if input == "" {
		return []byte{}, nil
	}

	radix := big.NewInt(int64(len(alphabet)))
	value := new(big.Int)
	// alphabet is pure ASCII, so a byte-wise lookup rejects exactly the same
	// inputs as a rune-wise one: any byte of a multi-byte rune is not found.
	for i := 0; i < len(input); i++ {
		digit := strings.IndexByte(alphabet, input[i])
		if digit < 0 {
			return nil, errors.New("invalid character found")
		}
		value.Mul(value, radix)
		value.Add(value, big.NewInt(int64(digit)))
	}

	payload := value.Bytes()

	// Restore one zero byte per leading '1'.
	zeros := len(input) - len(strings.TrimLeft(input, "1"))
	if zeros == 0 {
		return payload, nil
	}
	restored := make([]byte, zeros, zeros+len(payload))
	return append(restored, payload...), nil
}
2 changes: 1 addition & 1 deletion lib/seqhash/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func Example_basic() {

sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded))
fmt.Println(sequenceSeqhash)
// Output: C_JJgg9ahMxAQzDm2XveE7WA==
// Output: C_5X6Hudy3K8ht7r4mvu9Gco
}

func ExampleRotateSequence() {
Expand Down
57 changes: 45 additions & 12 deletions lib/seqhash/seqhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,18 @@ much shorter. The intended use case are for handling sequences with LLM systems
since these systems' context window is a valuable resource, and smaller references
allow the system to be more focused. Version 2 seqhashes are approximately 3x
smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can
be also encoded with base64 to get a hash that can be used as a string across
be also encoded with base58 to get a hash that can be used as a string across
different systems. Here is a length comparison:
version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
version 2: C_JPQCj5PgjFwjy7jaoYmwqQ==
version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
version 2: C_5X6Hudy3K8ht7r4mvu9Gco
The metadata is now encoded in a 1 byte flag, rather than in a 7 rune metadata
string like in version 1. Rather than use 256 bits for encoding
the hash, we use 120 bits. Since seqhashes are not meant for security, this
is good enough (50% collision with 1.3x10^18 hashes), while making them
conveniently only 16 bytes long. Additionally, encoded prefixes are added
to the front of the base64 encoded hash as a heuristic device for LLMs while
to the front of the base58 encoded hash as a heuristic device for LLMs while
processing batches of seqhashes.
In addition, seqhashes can now encode fragments. Fragments are double stranded
Expand All @@ -68,12 +68,17 @@ overhangs flanking both sides. These fragments can encode genetic parts - and
an important part of any vector containing these parts would be the part
seqhash, rather than the vector seqhash. This enhancement allows you to
identify genetic parts regardless of their context.
Base58 is used rather than base64 so that seqhashes can easily be added into
urls without a "/" in the identifier. Ironically, it also makes smaller hashes
than base64 due to base64 chunking 3 bytes at a time - at 16 bytes, 2 blank
bytes are added to make the seqhash divisible by 3. Base58 chunks differently,
and so doesn't encounter this problem.
*/
package seqhash

import (
"crypto/sha256"
"encoding/base64"
"errors"
"sort"
"strings"
Expand Down Expand Up @@ -190,7 +195,7 @@ var (
func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte {
var flag byte

// Encode the version (assuming version is in the range 0-15)
// Encode the version (assuming version is in the range 0-15)
flag |= (byte(version) << hash2versionShift)

// Encode the circularity
Expand Down Expand Up @@ -285,9 +290,9 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
flag := EncodeFlag(2, sequenceType, circular, doubleStranded)
result[0] = flag

// Compute BLAKE3, then copy those to the remaining 15 bytes
// Compute SHA-256, then copy the digest's leading bytes into the remaining 15 bytes
newhash := sha256.Sum256([]byte(deterministicSequence))
copy(result[1:], newhash[:15])
copy(result[1:], newhash[:16])

return result, nil
}
Expand All @@ -305,7 +310,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
// enzymes.
//
// In order to make sure fwdOverhangLength and revOverhangLength fit in the
// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
// inserted. So the bytes would be:
//
// flag + fwdOverhangLength + revOverhangLength + [13]byte(hash)
Expand Down Expand Up @@ -387,12 +392,40 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{
{FRAGMENT, true, true}: 'N',
}

// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
// EncodeHash2 encodes Hash2 as a base58 string. It also adds a single
// letter metadata tag that can be used as an easy heuristic for an LLM to
// identify misbehaving code.
func EncodeHash2(hash [16]byte, err error) (string, error) {
if err != nil {
return "", err
}
_, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0])
encoded := base64.StdEncoding.EncodeToString(hash[:])
encoded := encodeToBase58(hash[:])

return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, nil
}

// DecodeHash2 decodes a seqhash into a [16]byte, including the metadata tag.
func DecodeHash2(encodedString string) ([16]byte, error) {
// First, we need to decompose the string into the metadata and the
// seqhash.
parts := strings.SplitN(encodedString, "_", 2)
if len(parts) != 2 {
return [16]byte{}, errors.New("invalid encoded string format")
}

// Decode the Base58 encoded part
decodedBytes, err := decodeFromBase58(parts[1])
if err != nil {
return [16]byte{}, err
}

// Ensure decoded bytes fit into a [16]byte array
if len(decodedBytes) != 16 {
return [16]byte{}, errors.New("decoded hash does not match expected length")
}

return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
var hash [16]byte
copy(hash[:], decodedBytes)
return hash, nil
}
82 changes: 69 additions & 13 deletions lib/seqhash/seqhash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package seqhash

import (
"bytes"
"errors"
"fmt"
"os"
"testing"
Expand Down Expand Up @@ -34,37 +35,85 @@ func TestHash2(t *testing.T) {

// Test circular double stranded hashing
seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true))
if seqhash != "A_LGxts7bxq55Uiq+E94pcYg==" {
t.Errorf("Circular double stranded hashing failed. Expected A_LGxts7bxq55Uiq+E94pcYg==, got: " + seqhash)
if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" {
t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash)
}
// Test circular single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false))
if seqhash != "B_KB3s/EXx/C9wJvVE/gzw7Q==" {
t.Errorf("Circular single stranded hashing failed. Expected B_KB3s/EXx/C9wJvVE/gzw7Q==, got: " + seqhash)
if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" {
t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash)
}
// Test linear double stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true))
if seqhash != "C_JN15Uk5YpkXcKaJt0ozLRQ==" {
t.Errorf("Linear double stranded hashing failed. Expected C_JN15Uk5YpkXcKaJt0ozLRQ==, got: " + seqhash)
if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" {
t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash)
}
// Test linear single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false))
if seqhash != "D_IC0pLlPHC/zPQpSqU6hy0A==" {
t.Errorf("Linear single stranded hashing failed. Expected D_IC0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" {
t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash)
}

// Test RNA Seqhash
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false))
if seqhash != "H_IS0pLlPHC/zPQpSqU6hy0A==" {
t.Errorf("Linear single stranded hashing failed. Expected H_IS0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" {
t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash)
}
// Test Protein Seqhash
seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false))
if seqhash != "I_IiAwHj+EfYcQCf6Ty64wUg==" {
t.Errorf("Linear single stranded hashing failed. Expected I_IiAwHj+EfYcQCf6Ty64wUg==, got: " + seqhash)
if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" {
t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash)
}
}

// TestEncodeAndDecode round-trips a hash through EncodeHash2 and DecodeHash2,
// checks that EncodeHash2 passes an incoming error through, and checks that
// DecodeHash2 rejects malformed strings.
func TestEncodeAndDecode(t *testing.T) {
	hash, hashErr := Hash2("ATGC", "DNA", false, true)
	if hashErr != nil {
		t.Errorf("Got bad hash: %s", hashErr)
	}
	text, encodeErr := EncodeHash2(hash, hashErr)
	if encodeErr != nil {
		t.Errorf("Failed to encode: %s", encodeErr)
	}
	roundTripped, decodeErr := DecodeHash2(text)
	if decodeErr != nil {
		t.Errorf("Failed to decode: %s", decodeErr)
	}
	// Compare byte by byte so each mismatching position is reported.
	for i, want := range hash {
		if roundTripped[i] != want {
			t.Errorf("Failed to decode properly.")
		}
	}

	// An error handed to EncodeHash2 must come back out.
	if _, err := EncodeHash2([16]byte{}, errors.New("test")); err == nil {
		t.Errorf("should fail on test error")
	}

	// Strings DecodeHash2 must reject, paired with the failure message to
	// report when it accepts them instead.
	rejected := []struct {
		input     string
		complaint string
	}{
		{"", "should fail on no metadata"},                           // no "_" separator
		{"A_", "should fail on empty data"},                          // nothing after the tag
		{"A_/", "should fail on bad character"},                      // '/' is not in the alphabet
		{"A_11111", "should fail on 1s because length is wrong."}, // decodes to 5 bytes, not 16
	}
	for _, tc := range rejected {
		if _, err := DecodeHash2(tc.input); err == nil {
			t.Error(tc.complaint)
		}
	}

	// just to make sure gocov goes through
	_ = encodeToBase58([]byte{0, 0, 0, 0})
}

func TestLeastRotation(t *testing.T) {
file, _ := os.Open("../data/puc19.gbk")
defer file.Close()
Expand Down Expand Up @@ -110,7 +159,14 @@ func TestHash2Fragment(t *testing.T) {
}
// Test actual hash
sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4))
expectedHash := "K_IwQE3XlSTlimRdwpom3SjA=="
expectedHash := "K_5KnZQEnPRzJSYPkbPwLCJF"
if sqHash != expectedHash {
t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
}

// Test another hash
sqHash, _ = EncodeHash2(Hash2Fragment("TTAGCCCAT", 4, 4))
expectedHash = "K_5KnZQEnPRzJSYPkbPwLCJF"
if sqHash != expectedHash {
t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
}
Expand Down

0 comments on commit 9acd036

Please sign in to comment.