From 08e6272aaa080dd0b9cdb2fb5efcf885e0093e76 Mon Sep 17 00:00:00 2001 From: Mark Reed Date: Thu, 16 May 2024 08:51:25 -0700 Subject: [PATCH 1/4] Initial go support --- CONTRIBUTING.md | 19 +++++ go/stringzilla/main.go | 161 +++++++++++++++++++++++++++++++++++++++++ scripts/bench.go | 55 ++++++++++++++ scripts/test.go | 52 +++++++++++++ 4 files changed, 287 insertions(+) create mode 100644 go/stringzilla/main.go create mode 100644 scripts/bench.go create mode 100644 scripts/test.go diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c9d6a950..774b84b3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -457,6 +457,25 @@ cargo package --list --allow-dirty If you want to run benchmarks against third-party implementations, check out the [`ashvardanian/memchr_vs_stringzilla`](https://github.com/ashvardanian/memchr_vs_stringzilla/) repository. +## Contributing in Go + +```bash +go run scripts/test.go +go run scripts/bench.go +``` + +To run locally import with a relative path + +```bash + "../StringZilla/go/sz" +``` + +And turn off GO111MODULE + +```bash +export GO111MODULE="off" +``` + ## General Performance Observations ### Unaligned Loads diff --git a/go/stringzilla/main.go b/go/stringzilla/main.go new file mode 100644 index 00000000..c7dcf40e --- /dev/null +++ b/go/stringzilla/main.go @@ -0,0 +1,161 @@ +package sz + +// #cgo CFLAGS: -g -mavx2 +// #include +// #include <../../include/stringzilla/stringzilla.h> +import "C" + +// -Wall -O3 + +import ( + //"fmt" + //"time" + "unsafe" + //"strings" +) + +/* +// Passing a C function pointer around in go isn't working +//type searchFunc func(*C.char, C.ulong, *C.char, C.ulong)C.sz_cptr_t +//func _search( str string, pat string, searchFunc func(*C.char, C.ulong, *C.char, C.ulong)C.sz_cptr_t) uintptr { +func _search( str string, pat string, searchFunc C.sz_find_t ) uintptr { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := len(str) + patlen 
:= len(pat) + ret := unsafe.Pointer( searchFunc(cstr, C.ulong(strlen), cpat, C.ulong(patlen)) ) + return ret +} +*/ + +// Contains reports whether pat occurs in str; an empty pat is always contained. +func Contains( str string, pat string ) bool { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := len(str) + patlen := len(pat) + if patlen == 0 { + // Decide the empty-pattern case in Go (same answer as strings.Contains) so the result does not depend on how sz_find treats a zero-length needle. + return true + } + ret := unsafe.Pointer(C.sz_find( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + //ret := _search( str, pat, C.sz_find_t(C.sz_find) ) + return ret != nil +} + +// Index returns the byte offset of the first occurrence of pat in str, or -1 if pat is not present. An empty pat yields 0. +func Index( str string, pat string ) int64 { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := len(str) + patlen := len(pat) + if patlen == 0 { + // An empty pattern matches at offset 0, mirroring strings.Index. + return 0 + } + ret := unsafe.Pointer(C.sz_find( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + // No match: report -1 so a miss cannot be confused with a match at offset 0. + return -1 + } + return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) +} + +// Find returns the byte offset of the first match reported by sz_find, or -1 when sz_find returns nil. +func Find( str string, pat string ) int64 { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := len(str) + patlen := len(pat) + ret := unsafe.Pointer(C.sz_find( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + return -1 + } + return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) +} + +func RFind( str string, pat string ) int64 { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := len(str) + patlen := len(pat) + ret := unsafe.Pointer(C.sz_rfind( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + return -1 + } + return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) +} + +func IndexAny( str string, charset string ) int64 { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(charset))) + strlen := len(str) + patlen := len(charset) + ret := unsafe.Pointer(C.sz_find_char_from( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + return
-1 + } + return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) +} +func FindCharFrom( str string, charset string ) int64 { + return IndexAny( str, charset ) +} + + + +/* +func Contains( s *C.char, s_length int, pattern *C.char, pattern_length int ) bool { + c := unsafe.Pointer(C.sz_find( s, C.ulong(s_length), pattern, C.ulong(pattern_length) )) + return c != nil +} +func Contains_sz( a C.sz_string_view_t, b C.sz_string_view_t ) bool { + c := unsafe.Pointer(C.sz_find( a.start, a.length, b.start, b.length) ) + return c != nil +} +func Index( s *C.char, s_length int, pattern *C.char, pattern_length int ) uintptr { + c := unsafe.Pointer(C.sz_find( s, C.ulong(s_length), pattern, C.ulong(pattern_length) )) + return uintptr(c)-uintptr(unsafe.Pointer(s)) +} +func main() { + + str := strings.Repeat("0123456789", 100000) + "something" + pat := "some" + t := time.Now() + for i := 0; i < 1; i++ { + strings.Contains( str, pat ) + //strings.Index( str, pat ) + } + fmt.Println( time.Since(t) ) + + //a := C.CString(str) + a := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + b := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + alen := len(str) + blen := len(pat) + sva := C.sz_string_view_t {a,(C.ulong)(alen)} + svb := C.sz_string_view_t {b,(C.ulong)(blen)} + fmt.Println(sva.length) + t = time.Now() + for i := 0; i < 1; i++ { + Contains( a, alen, b, blen ) + //index( a, alen, b, blen ) + } + fmt.Println( time.Since(t) ) + + t = time.Now() + for i := 0; i < 1; i++ { + ContainsString( str,pat ) + //Contains_sz( sva, svb ) + //index( a, alen, b, blen ) + } + fmt.Println( time.Since(t) ) + + fmt.Println(strings.Contains( str, pat )) + fmt.Println(Contains_sz( sva, svb )) + fmt.Println(strings.Index( str, pat )) + fmt.Println(Index(a,alen,b,blen)) + //fmt.Println(strings.Contains("something", "some")) + //fmt.Println( contains( a, 9, b, 4 ) ) + //fmt.Println( len(a) ) + //fmt.Println( len(b) ) + //c := unsafe.Pointer(C.sz_find( a, 9, b, 4 ) ) + //d := *C.uchar(c) + 
//fmt.Println( a ) + //fmt.Println( c ) + //fmt.Println( uintptr(c)-uintptr(unsafe.Pointer(a)) ) + //fmt.Println( C.sz_find( a, 9, b, 4 ) ) + //sz_cptr_t result = sz_find(haystack.start, haystack.length, needle.start, needle.length); + + // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 + //if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } + //else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); } +} +*/ diff --git a/scripts/bench.go b/scripts/bench.go new file mode 100644 index 00000000..e2ed5ffd --- /dev/null +++ b/scripts/bench.go @@ -0,0 +1,55 @@ +package main + +import ( + "fmt" + "time" + "strings" + sz "../go/stringzilla" +) + +func main() { + + str := strings.Repeat("0123456789", 100000) + "something" + pat := "some" + + fmt.Println("Contains") + t := time.Now() + for i := 0; i < 1; i++ { + strings.Contains( str, pat ) + } + fmt.Println( " ", time.Since(t) , "\tstrings.Contains" ) + + t = time.Now() + for i := 0; i < 1; i++ { + sz.Contains( str,pat ) + } + fmt.Println( " ", time.Since(t) , "\tsz.Contains" ) + + fmt.Println("Index") + t = time.Now() + for i := 0; i < 1; i++ { + strings.Index( str, pat ) + } + fmt.Println( " ", time.Since(t) , "\tstrings.Index" ) + + t = time.Now() + for i := 0; i < 1; i++ { + sz.Index( str,pat ) + } + fmt.Println( " ", time.Since(t) , "\tsz.Index" ) + + fmt.Println("IndexAny") + t = time.Now() + for i := 0; i < 1; i++ { + strings.IndexAny( str, pat ) + } + fmt.Println( " ", time.Since(t) , "\tstrings.IndexAny" ) + + t = time.Now() + for i := 0; i < 1; i++ { + sz.IndexAny( str,pat ) + } + fmt.Println( " ", time.Since(t) , "\tsz.IndexAny" ) + + +} diff --git a/scripts/test.go b/scripts/test.go new file mode 100644 index 00000000..f2d119c0 --- /dev/null +++ b/scripts/test.go @@ -0,0 +1,52 @@ +package main + +import ( + "fmt" + "strings" + "runtime" + sz "../go/stringzilla" +) + +func assertEqual[T comparable](act T, exp T) int { + if exp 
== act { + return 0 + } + _, _, line, _ := runtime.Caller(1) + fmt.Println("") + fmt.Println(" ERROR line ",line," expected (",exp,") is not equal to actual (",act,")") + return 1 +} + +func main() { + + str := strings.Repeat("0123456789", 100000) + "something" + pat := "some" + ret := 0 + + fmt.Print("Contains ... ") + ret |= assertEqual( sz.Contains( "", "" ), true ) + ret |= assertEqual( sz.Contains( "test", "" ), true ) + ret |= assertEqual( sz.Contains( "test", "s" ), true ) + ret |= assertEqual( sz.Contains( "test", "test" ), true ) + ret |= assertEqual( sz.Contains( "test", "zest" ), false ) + ret |= assertEqual( sz.Contains( "test", "z" ), false ) + if ( ret == 0 ) { + fmt.Println("successful") + } + + fmt.Print("Index ... ") + assertEqual( strings.Index( str, pat ), int(sz.Index( str,pat )) ) + assertEqual( sz.Index( "","" ), 0 ) + assertEqual( sz.Index( "test","" ), 0 ) + assertEqual( sz.Index( "test","t" ), 0 ) + assertEqual( sz.Index( "test","s" ), 2 ) + fmt.Println("successful") + + fmt.Print("IndexAny ... 
") + assertEqual( strings.IndexAny( str, pat ), int(sz.IndexAny( str,pat )) ) + assertEqual( sz.IndexAny( "test", "st" ), 0 ) + assertEqual( sz.IndexAny( "west east", "ta" ), 3) + fmt.Println("successful") + + +} From 7de65b730ad81d76d3da9cb3e056300ef53300ee Mon Sep 17 00:00:00 2001 From: Mark Reed Date: Thu, 16 May 2024 08:54:42 -0700 Subject: [PATCH 2/4] Clean up comments --- go/stringzilla/main.go | 71 ------------------------------------------ 1 file changed, 71 deletions(-) diff --git a/go/stringzilla/main.go b/go/stringzilla/main.go index c7dcf40e..580c6262 100644 --- a/go/stringzilla/main.go +++ b/go/stringzilla/main.go @@ -8,10 +8,7 @@ import "C" // -Wall -O3 import ( - //"fmt" - //"time" "unsafe" - //"strings" ) /* @@ -91,71 +88,3 @@ func FindCharFrom( str string, charset string ) int64 { -/* -func Contains( s *C.char, s_length int, pattern *C.char, pattern_length int ) bool { - c := unsafe.Pointer(C.sz_find( s, C.ulong(s_length), pattern, C.ulong(pattern_length) )) - return c != nil -} -func Contains_sz( a C.sz_string_view_t, b C.sz_string_view_t ) bool { - c := unsafe.Pointer(C.sz_find( a.start, a.length, b.start, b.length) ) - return c != nil -} -func Index( s *C.char, s_length int, pattern *C.char, pattern_length int ) uintptr { - c := unsafe.Pointer(C.sz_find( s, C.ulong(s_length), pattern, C.ulong(pattern_length) )) - return uintptr(c)-uintptr(unsafe.Pointer(s)) -} -func main() { - - str := strings.Repeat("0123456789", 100000) + "something" - pat := "some" - t := time.Now() - for i := 0; i < 1; i++ { - strings.Contains( str, pat ) - //strings.Index( str, pat ) - } - fmt.Println( time.Since(t) ) - - //a := C.CString(str) - a := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) - b := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) - alen := len(str) - blen := len(pat) - sva := C.sz_string_view_t {a,(C.ulong)(alen)} - svb := C.sz_string_view_t {b,(C.ulong)(blen)} - fmt.Println(sva.length) - t = time.Now() - for i := 0; i < 1; i++ { - Contains( a, 
alen, b, blen ) - //index( a, alen, b, blen ) - } - fmt.Println( time.Since(t) ) - - t = time.Now() - for i := 0; i < 1; i++ { - ContainsString( str,pat ) - //Contains_sz( sva, svb ) - //index( a, alen, b, blen ) - } - fmt.Println( time.Since(t) ) - - fmt.Println(strings.Contains( str, pat )) - fmt.Println(Contains_sz( sva, svb )) - fmt.Println(strings.Index( str, pat )) - fmt.Println(Index(a,alen,b,blen)) - //fmt.Println(strings.Contains("something", "some")) - //fmt.Println( contains( a, 9, b, 4 ) ) - //fmt.Println( len(a) ) - //fmt.Println( len(b) ) - //c := unsafe.Pointer(C.sz_find( a, 9, b, 4 ) ) - //d := *C.uchar(c) - //fmt.Println( a ) - //fmt.Println( c ) - //fmt.Println( uintptr(c)-uintptr(unsafe.Pointer(a)) ) - //fmt.Println( C.sz_find( a, 9, b, 4 ) ) - //sz_cptr_t result = sz_find(haystack.start, haystack.length, needle.start, needle.length); - - // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 - //if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } - //else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); } -} -*/ From 1882e74a83fa1a555d191589b0a883deab62a1cb Mon Sep 17 00:00:00 2001 From: Mark Reed Date: Thu, 16 May 2024 08:57:34 -0700 Subject: [PATCH 3/4] Export needed before running the test/bench --- CONTRIBUTING.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 774b84b3..fcd76774 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -460,6 +460,7 @@ If you want to run benchmarks against third-party implementations, check out the ## Contributing in Go ```bash +export GO111MODULE="off" go run scripts/test.go go run scripts/bench.go ``` @@ -467,7 +468,7 @@ go run scripts/bench.go To run locally import with a relative path ```bash - "../StringZilla/go/sz" + sz "../StringZilla/go/stringzilla" ``` And turn off GO111MODULE From 575d9d6e9261f78c7aae6494f00bee8f66f12849 Mon Sep 17 00:00:00 2001 From: Mark 
Reed Date: Fri, 17 May 2024 23:39:50 -0700 Subject: [PATCH 4/4] Add Count --- go/stringzilla/main.go | 42 +++++++++++++++++++++++++++++++++++++++++- scripts/bench.go | 18 +++++++++++++++++- scripts/test.go | 7 +++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/go/stringzilla/main.go b/go/stringzilla/main.go index 580c6262..fdaca000 100644 --- a/go/stringzilla/main.go +++ b/go/stringzilla/main.go @@ -59,7 +59,7 @@ func Find( str string, pat string ) int64 { return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) } -func RFind( str string, pat string ) int64 { +func LastIndex( str string, pat string ) int64 { cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) strlen := len(str) @@ -70,6 +70,9 @@ func RFind( str string, pat string ) int64 { } return int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) } +func RFind( str string, pat string ) int64 { + return LastIndex(str,pat) +} func IndexAny( str string, charset string ) int64 { cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) @@ -86,5 +89,42 @@ func FindCharFrom( str string, charset string ) int64 { return IndexAny( str, charset ) } +func Count( str string, pat string, overlap bool ) int64 { + cstr := (*C.char)(unsafe.Pointer(unsafe.StringData(str))) + cpat := (*C.char)(unsafe.Pointer(unsafe.StringData(pat))) + strlen := int64(len(str)) + patlen := int64(len(pat)) + + if strlen == 0 || patlen == 0 || strlen < patlen { + return 0 + } + + count := int64(0); + if overlap == true { + for strlen > 0 { + ret := unsafe.Pointer(C.sz_find( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + break + } + count += 1 + strlen -= ( 1 + int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr))) ) + cstr = (*C.char)(unsafe.Add(ret,1)) + } + } else { + for strlen > 0 { + ret := unsafe.Pointer(C.sz_find( cstr, C.ulong(strlen), cpat, C.ulong(patlen) )) + if ret == nil { + break + } + count += 1 + strlen -= 
(patlen+int64(uintptr(ret)-uintptr(unsafe.Pointer(cstr)))) + cstr = (*C.char)(unsafe.Add(ret,patlen)) + } + } + + return count + +} + diff --git a/scripts/bench.go b/scripts/bench.go index e2ed5ffd..680736b2 100644 --- a/scripts/bench.go +++ b/scripts/bench.go @@ -9,7 +9,7 @@ import ( func main() { - str := strings.Repeat("0123456789", 100000) + "something" + str := strings.Repeat("0123456789", 10000) + "something" pat := "some" fmt.Println("Contains") @@ -51,5 +51,21 @@ func main() { } fmt.Println( " ", time.Since(t) , "\tsz.IndexAny" ) + str = strings.Repeat("0123456789", 100000) + "something" + pat = "123456789" + fmt.Println("Count") + t = time.Now() + for i := 0; i < 1; i++ { + strings.Count( str, pat ) + } + fmt.Println( " ", time.Since(t) , "\tstrings.Count" ) + + t = time.Now() + for i := 0; i < 1; i++ { + sz.Count( str,pat, false ) + } + fmt.Println( " ", time.Since(t) , "\tsz.Count" ) + + } diff --git a/scripts/test.go b/scripts/test.go index f2d119c0..8c31de5f 100644 --- a/scripts/test.go +++ b/scripts/test.go @@ -48,5 +48,12 @@ func main() { assertEqual( sz.IndexAny( "west east", "ta" ), 3) fmt.Println("successful") + fmt.Print("Count ... ") + //assertEqual( strings.Count( str, pat ), int(sz.Count( str,pat,false )) ) + assertEqual( sz.Count( "aaaaa", "a", false ), 5 ) + assertEqual( sz.Count( "aaaaa", "aa", false ), 2 ) + assertEqual( sz.Count( "aaaaa", "aa", true ), 4 ) + fmt.Println("successful") + }