Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tokenizer #78

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cli/tokenizer/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
module github.com/koeng101/dnadesign/cli/tokenizer

go 1.22.0

require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
golang.org/x/sys v0.19.0 // indirect
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect
modernc.org/libc v1.52.1 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/memory v1.8.0 // indirect
modernc.org/sqlite v1.30.1 // indirect
modernc.org/strutil v1.2.0 // indirect
modernc.org/token v1.1.0 // indirect
)
29 changes: 29 additions & 0 deletions cli/tokenizer/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4=
modernc.org/libc v1.52.1 h1:uau0VoiT5hnR+SpoWekCKbLqm7v6dhRL3hI+NQhgN3M=
modernc.org/libc v1.52.1/go.mod h1:HR4nVzFDSDizP620zcMCgjb1/8xk2lg5p/8yjfGv1IQ=
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E=
modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU=
modernc.org/sqlite v1.30.1 h1:YFhPVfu2iIgUf9kuA1CR7iiHdcEEsI2i+yjRYHscyxk=
modernc.org/sqlite v1.30.1/go.mod h1:DUmsiWQDaAvU4abhc/N+djlom/L2o8f7gZ95RCvyoLU=
modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA=
modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
101 changes: 101 additions & 0 deletions cli/tokenizer/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package main

import (
"bufio"
"crypto/md5"
"database/sql"
"encoding/binary"
"flag"
"fmt"
"log"
"os"
"strings"

_ "modernc.org/sqlite"

"github.com/koeng101/dnadesign/lib/bio"
"github.com/koeng101/dnadesign/lib/tokenizer"
)

// uint16SliceToBytes serializes slice into a byte slice twice its length,
// writing every value as two bytes in little-endian order.
func uint16SliceToBytes(slice []uint16) []byte {
	out := make([]byte, 0, 2*len(slice))
	for _, value := range slice {
		out = binary.LittleEndian.AppendUint16(out, value)
	}
	return out
}

// bytesToUint16Slice is the inverse of uint16SliceToBytes: it decodes buf as
// consecutive little-endian 16-bit values. A trailing odd byte is ignored.
func bytesToUint16Slice(buf []byte) []uint16 {
	out := make([]uint16, 0, len(buf)/2)
	for off := 0; off+1 < len(buf); off += 2 {
		out = append(out, binary.LittleEndian.Uint16(buf[off:off+2]))
	}
	return out
}

// main reads FASTA records from stdin, tokenizes each sequence with the
// default amino-acid tokenizer and stores the result in ./sequences.db.
//
// Every row of the sequences table holds the hex MD5 digest of the upper-cased
// sequence (the primary key), the sequence itself, and its tokens packed as
// little-endian uint16 values. Sequences whose checksum is already stored are
// left untouched, so duplicate records and repeated runs do not abort the
// import.
func main() {
	// Parse the command line flags
	flag.Parse()

	// Connect to database
	db, err := sql.Open("sqlite", "./sequences.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Configure the connection and create the table if it doesn't exist
	_, err = db.Exec(`
		PRAGMA journal_mode = WAL;
		PRAGMA synchronous = NORMAL; -- https://news.ycombinator.com/item?id=34247738
		PRAGMA cache_size = 20000; -- https://news.ycombinator.com/item?id=34247738
		PRAGMA foreign_keys = ON;
		PRAGMA strict = ON;
		PRAGMA busy_timeout = 5000;

		CREATE TABLE IF NOT EXISTS sequences (
			checksum TEXT PRIMARY KEY,
			sequence TEXT,
			tokens BLOB
		);
	`)
	if err != nil {
		log.Fatal(err)
	}

	// Prepare the insert once instead of re-parsing the SQL for every record.
	// OR IGNORE keeps the first row for a given checksum rather than failing
	// on the primary key when the same sequence shows up again.
	insert, err := db.Prepare(`
		INSERT OR IGNORE INTO sequences (checksum, sequence, tokens)
		VALUES (?, ?, ?);
	`)
	if err != nil {
		log.Fatal(err)
	}
	defer insert.Close()

	// Get a default tokenizer. The variable is deliberately not called
	// "tokenizer" so that it does not shadow the imported package.
	tok := tokenizer.DefaultAminoAcidTokenizer()
	fmt.Println("initializing parser")
	tokenizerJSON, err := tok.ToJSON()
	if err != nil {
		fmt.Println("Err: ", err)
	}
	fmt.Println(tokenizerJSON)

	refParser := bio.NewFastaParser(bufio.NewReader(os.Stdin))
	count := 0
	for {
		if (count % 10000) == 0 {
			fmt.Printf("Processed sequence: %d\n", count)
		}
		protein, err := refParser.Next()
		if err != nil {
			// The loop ends on the first error from Next, which is presumably
			// also how end of input is signalled. Report it so that a real
			// parse failure is not mistaken for a clean finish.
			log.Printf("stopped reading after %d sequences: %v", count, err)
			break
		}
		count++

		sequence := strings.ToUpper(protein.Sequence)
		tokens, err := tok.TokenizeProtein(sequence)
		if err != nil {
			// Do not store tokens for a sequence the tokenizer rejected.
			log.Printf("skipping sequence %d: %v", count, err)
			continue
		}
		tokensBytes := uint16SliceToBytes(tokens)
		checksum := fmt.Sprintf("%x", md5.Sum([]byte(sequence)))

		// Insert into the database
		if _, err = insert.Exec(checksum, sequence, tokensBytes); err != nil {
			log.Fatal(err)
		}
	}
}
72 changes: 72 additions & 0 deletions cli/tokenizer/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import sqlite3
import numpy as np
from tqdm import tqdm

# Path of the SQLite database to read; being relative, it is resolved against
# the current working directory.
db_path = "./sequences.db"
# Module-level connection used by fetch_data() and closed at the end of the
# __main__ block. Note that it is opened as soon as this module is loaded.
conn = sqlite3.connect(db_path)

def calculate_split_index(total_rows, val_percentage):
    """Return how many of ``total_rows`` rows belong to the training split.

    ``val_percentage`` is the fraction (0.0 - 1.0) reserved for validation.
    The result is truncated toward zero.
    """
    train_fraction = 1 - val_percentage
    return int(total_rows * train_fraction)

def fetch_data(val_percentage=0.01):
    """Yield ``(tokens, is_train)`` for every row of ``sequences``.

    Rows are read in random order (``ORDER BY RANDOM()``). The first
    ``calculate_split_index(total_rows, val_percentage)`` rows yielded are
    flagged as training data (``is_train`` is True); the rest are validation
    data.

    ``tokens`` is the raw value of the ``tokens`` column. The cursor is closed
    when the generator finishes, is closed early, or raises.
    """
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT COUNT(*) FROM sequences")
        total_rows = cursor.fetchone()[0]
        split_index = calculate_split_index(total_rows, val_percentage)

        # Fetch data with randomized order
        cursor.execute("SELECT tokens FROM sequences ORDER BY RANDOM()")
        for count, row in enumerate(cursor):
            yield row[0], count < split_index
    finally:
        # Runs even if the consumer stops iterating before the last row.
        cursor.close()

def bytes_to_uint16(buf):
    """Decode a token blob into a uint16 array terminated by an EOT token.

    ``buf`` is read as consecutive little-endian 16-bit unsigned integers,
    independent of the byte order of the machine running this script. A single
    0 is appended as the end-of-text (EOT) token.

    Returns a new ``np.uint16`` array of length ``len(buf) // 2 + 1``.
    Raises ``ValueError`` (from ``np.frombuffer``) if ``len(buf)`` is odd.
    """
    tokens = np.frombuffer(buf, dtype="<u2")
    # Build the result explicitly: np.append(arr, 0) would promote the whole
    # array to the default integer dtype instead of keeping uint16.
    out = np.empty(tokens.size + 1, dtype=np.uint16)
    out[:-1] = tokens
    out[-1] = 0  # EOT token
    return out

if __name__ == '__main__':
    # Write every token sequence (each ending in the EOT token) back to back
    # into train.bin / val.bin next to this script, as native-endian uint16.
    train_filename = os.path.join(os.path.dirname(__file__), 'train.bin')
    val_filename = os.path.join(os.path.dirname(__file__), 'val.bin')
    dtype = np.uint16

    # Number of uint16 values written to each file so far.
    train_idx = 0
    val_idx = 0

    # Append sequentially instead of re-creating a memory map for every
    # sequence: the total size is not known up front, and a plain append never
    # has to grow or remap the file. An empty split yields an empty file.
    with open(train_filename, 'wb') as train_file, open(val_filename, 'wb') as val_file:
        for tokens, is_train in fetch_data():
            # Force uint16 so the on-disk format does not depend on the dtype
            # returned by bytes_to_uint16.
            tokens_uint16 = np.asarray(bytes_to_uint16(tokens), dtype=dtype)

            # Determine where to store the tokens
            if is_train:
                train_file.write(tokens_uint16.tobytes())
                train_idx += len(tokens_uint16)
            else:
                val_file.write(tokens_uint16.tobytes())
                val_idx += len(tokens_uint16)

    conn.close()

    print(f"Training data written to {train_filename}. Size: {train_idx * np.dtype(dtype).itemsize / (1024**2)} MB")
    print(f"Validation data written to {val_filename}. Size: {val_idx * np.dtype(dtype).itemsize / (1024**2)} MB")

1 change: 1 addition & 0 deletions go.work
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ go 1.22.0
use (
./external
./lib
./cli/tokenizer
)
37 changes: 21 additions & 16 deletions lib/bio/uniprot/uniprot.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ import (
"io"
"net/http"
"net/url"

"golang.org/x/net/html/charset"
)

// Decoder decodes XML elements.
Expand Down Expand Up @@ -69,31 +71,34 @@ type Parser struct {
// from which to parse fasta formatted sequences.
func NewParser(r io.Reader) *Parser {
decoder := xml.NewDecoder(r)
decoder.CharsetReader = charset.NewReaderLabel
return &Parser{decoder: decoder}
}

func (p *Parser) Next() (Entry, error) {
decoderToken, err := p.decoder.Token()
for {
decoderToken, err := p.decoder.Token()

// Check decoding
if err != nil {
// If we are the end of the file, return io.EOF
if err.Error() == "EOF" {
return Entry{}, io.EOF
}
}

// Actual parsing
startElement, ok := decoderToken.(xml.StartElement)
if ok && startElement.Name.Local == "entry" {
var e Entry
err = p.decoder.DecodeElement(&e, &startElement)
// Check decoding
if err != nil {
// If we are the end of the file, return io.EOF
if err.Error() == "EOF" {
return Entry{}, io.EOF
}
return Entry{}, err
}
return e, nil

// Actual parsing
startElement, ok := decoderToken.(xml.StartElement)
if ok && startElement.Name.Local == "entry" {
var e Entry
err = p.decoder.DecodeElement(&e, &startElement)
if err != nil {
return Entry{}, err
}
return e, nil
}
}
return p.Next()
}

// BaseURL encodes the base URL for the Uniprot REST API.
Expand Down
7 changes: 7 additions & 0 deletions lib/bio/uniprot/uniprot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,11 @@ func TestGet(t *testing.T) {
if err == nil {
t.Errorf("Expected an error for invalid URL, but got none")
}
for _, reference := range entry.DbReference {
if reference.Type == "Pfam" {
if reference.Id != "PF01353" {
t.Errorf("Expected Pfam ID PF01353")
}
}
}
}
1 change: 1 addition & 0 deletions lib/bio/uniprot/xml.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion lib/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,10 @@ go 1.22.0
require (
github.com/google/go-cmp v0.6.0
github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117
golang.org/x/sync v0.5.0
golang.org/x/sync v0.7.0
)

require (
golang.org/x/net v0.26.0 // indirect
golang.org/x/text v0.16.0 // indirect
)
5 changes: 5 additions & 0 deletions lib/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,10 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117 h1:MLWgADbigSsAmDP3yG93ESlN0Ek9QLtH5uHigmWVXwg=
github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117/go.mod h1:nb80z/jm5HMCxfNZ50cBJa5TffkXxpY9okvqnBj8RrM=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
Binary file added lib/tokenizer/data/gfp_rfp_lacZ.xml.gz
Binary file not shown.
Loading
Loading