From 724fb67ca95c7c7a71135cc98a5c073f900e1520 Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Fri, 13 Aug 2021 11:33:01 +0200 Subject: [PATCH 1/8] WIP: Support for JWPUB format * Make abstruse generate binaries with correct branches * Fix small compile errors --- Makefile | 15 +-- genappendix.sh | 7 ++ libjw/jwpub.go | 200 ++++++++++++++++++++++++++++++++++++ libjw/main.go | 14 ++- utils/getwoljwlangs/main.go | 2 +- utils/jwpub-test/parse.go | 15 +++ 6 files changed, 242 insertions(+), 11 deletions(-) create mode 100755 genappendix.sh create mode 100644 libjw/jwpub.go create mode 100644 utils/jwpub-test/parse.go diff --git a/Makefile b/Makefile index 8a28cd1..43f20c5 100755 --- a/Makefile +++ b/Makefile @@ -1,21 +1,24 @@ VERSION=2.0.0 - +APPENDIX=$(shell ./genappendix.sh) install: cp build/bin/${BINNAME}_${GOOS}_${GOARCH} /usr/bin/jwstudy cp dist/debian/logo.png /usr/share/icons/hicolor/scalable/apps/jwstudy.png cp dist/debian/jwstudy.desktop /usr/share/applications +show_appendix: + @echo ${APPENDIX} + android: - goprod -combo="android/386;android/amd64;android/arm;android/arm64;android/all" -tags="nogui" -shouldpkg=true -binname="jwstudy" -version="${VERSION}" -appurl="http://127.0.0.1:4365/" -ldflags="-X main.dataDir=/data/data/x.x.jwstudy/ -X git.mrcyjanek.net/mrcyjanek/jwapi/webui.SPort=4365" + goprod -combo="android/386;android/amd64;android/arm;android/arm64;android/all" -tags="nogui" -shouldpkg=true -binname="jwstudy${APPENDIX}" -version="${VERSION}" -appurl="http://127.0.0.1:4365/" -ldflags="-X main.dataDir=/data/data/x.x.jwstudy/ -X git.mrcyjanek.net/mrcyjanek/jwapi/webui.SPort=4365" linux-lorca: - goprod -combo="linux/amd64;linux/arm;linux/arm64;linux/386" -binname="jwstudy-lorca" -tags="guilorca" -version="${VERSION}" + goprod -combo="linux/amd64;linux/arm;linux/arm64;linux/386" -binname="jwstudy${APPENDIX}-lorca" -tags="guilorca" -version="${VERSION}" linux-browser: - goprod 
-combo="linux/amd64;linux/arm;linux/arm64;linux/386" -binname="jwstudy-browser" -tags="guibrowser" -version="${VERSION}" + goprod -combo="linux/amd64;linux/arm;linux/arm64;linux/386" -binname="jwstudy${APPENDIX}-browser" -tags="guibrowser" -version="${VERSION}" windows-lorca: - goprod -combo="windows/amd64;windows/386" -binname="jwstudy-lorca" -tags="guilorca" -version="${VERSION}" + goprod -combo="windows/amd64;windows/386" -binname="jwstudy${APPENDIX}-lorca" -tags="guilorca" -version="${VERSION}" windows-browser: - goprod -combo="windows/amd64;windows/386" -binname="jwstudy-browser" -tags="guibrowser" -version="${VERSION}" + goprod -combo="windows/amd64;windows/386" -binname="jwstudy${APPENDIX}-browser" -tags="guibrowser" -version="${VERSION}" diff --git a/genappendix.sh b/genappendix.sh new file mode 100755 index 0000000..3a147b2 --- /dev/null +++ b/genappendix.sh @@ -0,0 +1,7 @@ +#!/bin/bash +if [[ "X$ABSTRUSE_BRANCH" == "X" || "X$ABSTRUSE_BRANCH" == "Xmaster" ]]; +then + echo -n -e "" +else + echo -n -e "-$ABSTRUSE_BRANCH" +fi; \ No newline at end of file diff --git a/libjw/jwpub.go b/libjw/jwpub.go new file mode 100644 index 0000000..8e4ae73 --- /dev/null +++ b/libjw/jwpub.go @@ -0,0 +1,200 @@ +package libjw + +// Most of this file is inspired by +// https://github.com/Miaosi001/JW-Library-macOS/blob/main/JWLibrary/Utility/JWPubExtractor.swift + +import ( + "database/sql" + "log" + "time" + + _ "github.com/mattn/go-sqlite3" // sqlite driver + + "git.mrcyjanek.net/mrcyjanek/jwapi/helpers" +) + +// THIS DOESN'T WORK +// DO NOT USE + +type JWPUBWordMap struct { + WordID int + Word string + SearchIndexDocumentID int + TextUnitCount int + WordOccurrenceCount int + TextUnitIndices []byte + PositionalList []byte + PositionalListIndex []byte +} + +// NOTE: This function have a lot of hardcoded values +// It is *not* ready for production usage +func JWPUBtoMarkdown(jwpub string) { + //var wadd = make(map[string]int) + path := helpers.GetDataDir() + "/_tmp_jwpub" + 
log.Println(jwpub) + err := helpers.Unzip(jwpub, path) + if err != nil { + log.Fatal(err) + } + err = helpers.Unzip(path+"/contents", path+"/c") + if err != nil { + log.Fatal(err) + } + db, err := sql.Open("sqlite3", path+"/c/fg_E.db") + if err != nil { + log.Fatal(err) + } + defer db.Close() + row, err := db.Query("SELECT WordId, Word FROM Word WHERE 1") + if err != nil { + log.Fatal(err) + } + var wordsmap []JWPUBWordMap + for row.Next() { + var wid int + var w string + err = row.Scan(&wid, &w) + if err != nil { + log.Fatal(err) + } + r := db.QueryRow("SELECT TextUnitIndices, PositionalListIndex, PositionalList FROM SearchIndexDocument WHERE WordId=?", wid) + var tui []byte + var pli []byte + var pl []byte + err = r.Scan(&tui, &pli, &pl) + if err != nil { + log.Fatal(err) + } + wordsmap = append(wordsmap, JWPUBWordMap{ + WordID: wid, + Word: w, + TextUnitIndices: tui, + PositionalList: pl, + PositionalListIndex: pli, + }) + } + + var loop = true + var docID = 0 + var curDocIndex = []byte{128} + var fullText = make(map[int]string) + + sIndexes := wordsmap + for loop { + var finded = false + for i := range sIndexes { + //log.Println("for i:= range sIndexes") + if sIndexes[i].TextUnitIndices[0] == 128 { + //log.Println("if sIndexes[i].TextUnitIndices[0] == 128 {") + //log.Println("byteStartsWith(sIndexes[i].PositionalList, curDocIndex): ", byteStartsWith(sIndexes[i].PositionalList, curDocIndex)) + if byteStartsWith(sIndexes[i].PositionalList, curDocIndex) { + var rem = sIndexes[i].PositionalListIndex[0] + if rem > 128 { + finded = true + wd := sIndexes[i].Word + //if wd != String(fullText[docID]?.split(separator: " ").last ?? 
"").unaccent() { + // print(curDocIndex, wd) + // fullText[docID]!.append(wd + " ") + //} + fullText[docID] += " " + wd + log.Println("fullText[docID]:", fullText[docID]) + time.Sleep(time.Second) + sIndexes[i].PositionalList = sIndexes[i].PositionalList[len(curDocIndex):] + rem = rem - 1 + sIndexes[i].PositionalListIndex[0] = rem + curDocIndexArray := curDocIndex + var repo = false + for j := range curDocIndexArray { + if j == 0 { + if (curDocIndexArray[j] == 255 && len(curDocIndexArray) == 1) || (curDocIndexArray[j] == 127 && len(curDocIndexArray) > 1) { + repo = true + curDocIndex = []byte{0} + if repo && j == len(curDocIndexArray)-1 { + curDocIndex = append(curDocIndex, 129) + repo = false + } + } else { + curDocIndex = []byte{curDocIndexArray[j] + 1} + repo = false + } + } else { + if repo { + if curDocIndexArray[j] == 255 { + repo = true + curDocIndex = append(curDocIndex, 129) + if repo && j == len(curDocIndexArray)-1 { + curDocIndex = append(curDocIndex, 129) + repo = false + } + } else { + curDocIndex = append(curDocIndex, curDocIndexArray[j]+1) + repo = false + } + } else { + curDocIndex = append(curDocIndex, curDocIndexArray[j]) + } + } + } + break + } else { + sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + } + } + } + } + if !finded { + var toRem []int + for i := range sIndexes { + //var docI = sIndexes[i].TextUnitIndices.prefix(3) + //sIndexes[i].TextUnitIndices.removeFirst(3) + if sIndexes[i].TextUnitIndices[0] == 128 { + if len(sIndexes[i].TextUnitIndices) != 1 { + sIndexes[i].TextUnitIndices[1]-- + } + } else { + sIndexes[i].TextUnitIndices[0]-- + } + if len(sIndexes[i].TextUnitIndices) == 0 { + log.Println("toRem", i) + toRem = append(toRem, i) + } + + if len(sIndexes[i].PositionalListIndex) > 0 && sIndexes[i].PositionalListIndex[0] == 128 { + sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + } + } + for i := len(toRem); i > 0; i-- { + log.Println("toRem2", sIndexes[toRem[i]]) + sIndexes = 
append(sIndexes[:toRem[i]], sIndexes[toRem[i]+1:]...) + } + docID++ + curDocIndex = []byte{128} + } + if len(sIndexes) == 0 { + loop = false + } + } + // for (id, text) in fullText where text != "" { + // let dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("w_I_202110/contents/\(id).txt") + // do { + // print(dir) + // try text.write(to: dir, atomically: true, encoding: String.Encoding.utf8) + // } catch { + // print("Error") + // } + // } + +} + +func byteStartsWith(bs []byte, with []byte) bool { + if len(bs) != len(with) { + return false + } + for i := range bs { + if bs[i] != with[i] { + return false + } + } + return true +} diff --git a/libjw/main.go b/libjw/main.go index 50cd2cc..2633581 100644 --- a/libjw/main.go +++ b/libjw/main.go @@ -724,10 +724,16 @@ func GetPublication(publication string, language string, format string, issue st } f.Write(body) f.Sync() - fmt.Println("[libjw][GetPublication] Extracting...", pub.Title) - err = helpers.Unzip(f.Name(), extractpath) - if err != nil { - return structs.PublicationV2{}, errors.New("[libjw][GetPublication] " + err.Error() + " (zipslip of something? Maybe corrupted download, failed to uzip)") + defer f.Close() + if format == "EPUB" { + fmt.Println("[libjw][GetPublication] Extracting...", pub.Title) + err = helpers.Unzip(f.Name(), extractpath) + if err != nil { + return structs.PublicationV2{}, errors.New("[libjw][GetPublication] " + err.Error() + " (zipslip of something? 
Maybe corrupted download, failed to uzip)") + } + } else if format == "JWPUB" { + fmt.Println("[libjw][GetPublication] Parsing publication...[JWPUB]", pub.Title) + JWPUBtoMarkdown(f.Name()) } struc = structs.PublicationV2{ Title: pub.Title, diff --git a/utils/getwoljwlangs/main.go b/utils/getwoljwlangs/main.go index b31a789..f2923b5 100644 --- a/utils/getwoljwlangs/main.go +++ b/utils/getwoljwlangs/main.go @@ -26,6 +26,6 @@ func main() { code := strings.Split(strings.Split(l, `data-rsconf="`)[1], `"`)[0] title := strings.Split(strings.Split(l, `data-title="`)[1], `"`)[0] - log.Println("code:", code) + log.Println("code:", code, title) } } diff --git a/utils/jwpub-test/parse.go b/utils/jwpub-test/parse.go new file mode 100644 index 0000000..bac2e5c --- /dev/null +++ b/utils/jwpub-test/parse.go @@ -0,0 +1,15 @@ +package main + +import ( + "git.mrcyjanek.net/mrcyjanek/jwapi/helpers" + "git.mrcyjanek.net/mrcyjanek/jwapi/libjw" +) + +func main() { + dataDir := helpers.GetDataDir() + helpers.SetDataDir(dataDir) + helpers.Mkdir(dataDir + "/raw") + helpers.DBInit(dataDir) + libjw.GetPublication("fg", "E", "JWPUB", "") + // libjw.JWPUBtoMarkdown("fg_E.jwpub.orig") +} From c08e38f5af712d32f0a2078877ddeb363e3dd38e Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Fri, 13 Aug 2021 11:34:58 +0200 Subject: [PATCH 2/8] Add correct path for icon on linux --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 43f20c5..c14f9ec 100755 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION=2.0.0 APPENDIX=$(shell ./genappendix.sh) install: cp build/bin/${BINNAME}_${GOOS}_${GOARCH} /usr/bin/jwstudy - cp dist/debian/logo.png /usr/share/icons/hicolor/scalable/apps/jwstudy.png + cp dist/debian/logo.png /usr/share/pixmaps/jwstudy.png cp dist/debian/jwstudy.desktop /usr/share/applications show_appendix: From ac53afea72442825de86b8bd65ad9ee5d46891e8 Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> 
Date: Fri, 13 Aug 2021 12:33:33 +0200 Subject: [PATCH 3/8] Fixes for the docID = 0 Rest is *still* not working --- libjw/jwpub.go | 61 +++++++++++++++++++++++++-------------- utils/jwpub-test/parse.go | 2 +- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/libjw/jwpub.go b/libjw/jwpub.go index 8e4ae73..0c7d5f6 100644 --- a/libjw/jwpub.go +++ b/libjw/jwpub.go @@ -5,8 +5,8 @@ package libjw import ( "database/sql" + "fmt" "log" - "time" _ "github.com/mattn/go-sqlite3" // sqlite driver @@ -41,7 +41,7 @@ func JWPUBtoMarkdown(jwpub string) { if err != nil { log.Fatal(err) } - db, err := sql.Open("sqlite3", path+"/c/fg_E.db") + db, err := sql.Open("sqlite3", path+"/c/w_E_202110.db") if err != nil { log.Fatal(err) } @@ -58,11 +58,11 @@ func JWPUBtoMarkdown(jwpub string) { if err != nil { log.Fatal(err) } - r := db.QueryRow("SELECT TextUnitIndices, PositionalListIndex, PositionalList FROM SearchIndexDocument WHERE WordId=?", wid) + r := db.QueryRow("SELECT TextUnitIndices, PositionalList, PositionalListIndex FROM SearchIndexDocument WHERE WordId=?", wid) var tui []byte - var pli []byte var pl []byte - err = r.Scan(&tui, &pli, &pl) + var pli []byte + err = r.Scan(&tui, &pl, &pli) if err != nil { log.Fatal(err) } @@ -85,8 +85,7 @@ func JWPUBtoMarkdown(jwpub string) { var finded = false for i := range sIndexes { //log.Println("for i:= range sIndexes") - if sIndexes[i].TextUnitIndices[0] == 128 { - //log.Println("if sIndexes[i].TextUnitIndices[0] == 128 {") + if len(sIndexes[i].TextUnitIndices) > 0 && sIndexes[i].TextUnitIndices[0] == 128 { //log.Println("byteStartsWith(sIndexes[i].PositionalList, curDocIndex): ", byteStartsWith(sIndexes[i].PositionalList, curDocIndex)) if byteStartsWith(sIndexes[i].PositionalList, curDocIndex) { var rem = sIndexes[i].PositionalListIndex[0] @@ -98,8 +97,6 @@ func JWPUBtoMarkdown(jwpub string) { // fullText[docID]!.append(wd + " ") //} fullText[docID] += " " + wd - log.Println("fullText[docID]:", fullText[docID]) - 
time.Sleep(time.Second) sIndexes[i].PositionalList = sIndexes[i].PositionalList[len(curDocIndex):] rem = rem - 1 sIndexes[i].PositionalListIndex[0] = rem @@ -143,28 +140,43 @@ func JWPUBtoMarkdown(jwpub string) { } } } + + if fullText[docID] != "" { + fmt.Println("fullText[docID:", docID, "]:", fullText[docID]) + } if !finded { + log.Println("finded!") var toRem []int for i := range sIndexes { //var docI = sIndexes[i].TextUnitIndices.prefix(3) //sIndexes[i].TextUnitIndices.removeFirst(3) - if sIndexes[i].TextUnitIndices[0] == 128 { - if len(sIndexes[i].TextUnitIndices) != 1 { - sIndexes[i].TextUnitIndices[1]-- - } - } else { - sIndexes[i].TextUnitIndices[0]-- + var docI byte = 0 + if len(sIndexes[i].TextUnitIndices) > 0 { + docI = sIndexes[i].TextUnitIndices[0] } if len(sIndexes[i].TextUnitIndices) == 0 { log.Println("toRem", i) toRem = append(toRem, i) - } + } else { + sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices[1:] + if docI == 128 { + log.Println("finded! 1") + if len(sIndexes[i].TextUnitIndices) != 0 { + sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI-1, 0) + } + } else { + docI-- + sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI, 0) + } - if len(sIndexes[i].PositionalListIndex) > 0 && sIndexes[i].PositionalListIndex[0] == 128 { - sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + if len(sIndexes[i].PositionalListIndex) > 0 && sIndexes[i].PositionalListIndex[0] == 128 { + sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + log.Println("finded! 3") + } } } - for i := len(toRem); i > 0; i-- { + for i := len(toRem) - 1; i > 0; i-- { + log.Println("Removing...") log.Println("toRem2", sIndexes[toRem[i]]) sIndexes = append(sIndexes[:toRem[i]], sIndexes[toRem[i]+1:]...) 
} @@ -174,6 +186,9 @@ func JWPUBtoMarkdown(jwpub string) { if len(sIndexes) == 0 { loop = false } + if docID > 10000 { + log.Fatal("docID > 10000, this should not happen.") + } } // for (id, text) in fullText where text != "" { // let dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("w_I_202110/contents/\(id).txt") @@ -187,11 +202,15 @@ func JWPUBtoMarkdown(jwpub string) { } +func insertbyte(a []byte, c byte, i int) []byte { + return append(a[:i], append([]byte{c}, a[i:]...)...) +} + func byteStartsWith(bs []byte, with []byte) bool { - if len(bs) != len(with) { + if len(bs) < len(with) { return false } - for i := range bs { + for i := range with { if bs[i] != with[i] { return false } diff --git a/utils/jwpub-test/parse.go b/utils/jwpub-test/parse.go index bac2e5c..36d26e9 100644 --- a/utils/jwpub-test/parse.go +++ b/utils/jwpub-test/parse.go @@ -10,6 +10,6 @@ func main() { helpers.SetDataDir(dataDir) helpers.Mkdir(dataDir + "/raw") helpers.DBInit(dataDir) - libjw.GetPublication("fg", "E", "JWPUB", "") + libjw.GetPublication("w", "E", "JWPUB", "202110") // libjw.JWPUBtoMarkdown("fg_E.jwpub.orig") } From 5fc63bab1e818fbec24ebb37b5185156468b4947 Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Fri, 13 Aug 2021 12:34:53 +0200 Subject: [PATCH 4/8] Don't build apk's on non-master branch --- .abstruse.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.abstruse.yml b/.abstruse.yml index 0662322..4c2ecbb 100644 --- a/.abstruse.yml +++ b/.abstruse.yml @@ -11,7 +11,7 @@ matrix: - image: mrcyjanek/goprod:core-android env: M=android script: - - make $M + - if [[ "$M" == "android" && "$ABSTRUSE_BRANCH" != "master" ]]; then true; else make $M; fi - cp build/deb/*.deb /apt/ || true - cp build/bin/* /archive || true - cp build/apk/* /archive || true \ No newline at end of file From 531ea0dfa1d5639645d32d7936d85155574a0abd Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto 
<cyjan@mrcyjanek.net> Date: Fri, 13 Aug 2021 13:13:12 +0200 Subject: [PATCH 5/8] some fixes that do not fix anything --- libjw/jwpub.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/libjw/jwpub.go b/libjw/jwpub.go index 0c7d5f6..f0762fb 100644 --- a/libjw/jwpub.go +++ b/libjw/jwpub.go @@ -101,6 +101,7 @@ func JWPUBtoMarkdown(jwpub string) { rem = rem - 1 sIndexes[i].PositionalListIndex[0] = rem curDocIndexArray := curDocIndex + fmt.Println(wd) var repo = false for j := range curDocIndexArray { if j == 0 { @@ -145,8 +146,7 @@ func JWPUBtoMarkdown(jwpub string) { fmt.Println("fullText[docID:", docID, "]:", fullText[docID]) } if !finded { - log.Println("finded!") - var toRem []int + var toRem []int = []int{} for i := range sIndexes { //var docI = sIndexes[i].TextUnitIndices.prefix(3) //sIndexes[i].TextUnitIndices.removeFirst(3) @@ -155,12 +155,11 @@ func JWPUBtoMarkdown(jwpub string) { docI = sIndexes[i].TextUnitIndices[0] } if len(sIndexes[i].TextUnitIndices) == 0 { - log.Println("toRem", i) + log.Println("toRem", i, sIndexes[i].Word) toRem = append(toRem, i) } else { sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices[1:] if docI == 128 { - log.Println("finded! 1") if len(sIndexes[i].TextUnitIndices) != 0 { sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI-1, 0) } @@ -168,16 +167,13 @@ func JWPUBtoMarkdown(jwpub string) { docI-- sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI, 0) } - if len(sIndexes[i].PositionalListIndex) > 0 && sIndexes[i].PositionalListIndex[0] == 128 { sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] - log.Println("finded! 3") } } } - for i := len(toRem) - 1; i > 0; i-- { - log.Println("Removing...") - log.Println("toRem2", sIndexes[toRem[i]]) + for i := len(toRem) - 1; i >= 0; i-- { + log.Println(i, docID, "toRem2", sIndexes[toRem[i]].Word) sIndexes = append(sIndexes[:toRem[i]], sIndexes[toRem[i]+1:]...) 
} docID++ @@ -185,6 +181,8 @@ func JWPUBtoMarkdown(jwpub string) { } if len(sIndexes) == 0 { loop = false + } else { + log.Println("len(sIndexes):", len(sIndexes)) } if docID > 10000 { log.Fatal("docID > 10000, this should not happen.") From c327b4e03b222e88e51d1073bd695142bbee017f Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Sat, 14 Aug 2021 19:14:29 +0200 Subject: [PATCH 6/8] No - it still doesn't work, I'm starting from scratch again and I want to save the code somewhere --- libjw/jwpub.go | 16 ++++++---------- utils/jwpub-test/parse.go | 10 ++++++++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/libjw/jwpub.go b/libjw/jwpub.go index f0762fb..598e078 100644 --- a/libjw/jwpub.go +++ b/libjw/jwpub.go @@ -78,30 +78,27 @@ func JWPUBtoMarkdown(jwpub string) { var loop = true var docID = 0 var curDocIndex = []byte{128} - var fullText = make(map[int]string) + var fullText = make(map[int]string, 255) sIndexes := wordsmap for loop { + var finded = false for i := range sIndexes { - //log.Println("for i:= range sIndexes") - if len(sIndexes[i].TextUnitIndices) > 0 && sIndexes[i].TextUnitIndices[0] == 128 { - //log.Println("byteStartsWith(sIndexes[i].PositionalList, curDocIndex): ", byteStartsWith(sIndexes[i].PositionalList, curDocIndex)) + if sIndexes[i].WordID == 123 { + log.Println(sIndexes[i].Word, sIndexes[i].TextUnitIndices, byteStartsWith(sIndexes[i].TextUnitIndices, []byte{128}), sIndexes[i].PositionalList, byteStartsWith(sIndexes[i].PositionalList, curDocIndex), curDocIndex) + } + if byteStartsWith(sIndexes[i].TextUnitIndices, []byte{128}) { if byteStartsWith(sIndexes[i].PositionalList, curDocIndex) { var rem = sIndexes[i].PositionalListIndex[0] if rem > 128 { finded = true wd := sIndexes[i].Word - //if wd != String(fullText[docID]?.split(separator: " ").last ?? 
"").unaccent() { - // print(curDocIndex, wd) - // fullText[docID]!.append(wd + " ") - //} fullText[docID] += " " + wd sIndexes[i].PositionalList = sIndexes[i].PositionalList[len(curDocIndex):] rem = rem - 1 sIndexes[i].PositionalListIndex[0] = rem curDocIndexArray := curDocIndex - fmt.Println(wd) var repo = false for j := range curDocIndexArray { if j == 0 { @@ -155,7 +152,6 @@ func JWPUBtoMarkdown(jwpub string) { docI = sIndexes[i].TextUnitIndices[0] } if len(sIndexes[i].TextUnitIndices) == 0 { - log.Println("toRem", i, sIndexes[i].Word) toRem = append(toRem, i) } else { sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices[1:] diff --git a/utils/jwpub-test/parse.go b/utils/jwpub-test/parse.go index 36d26e9..3661303 100644 --- a/utils/jwpub-test/parse.go +++ b/utils/jwpub-test/parse.go @@ -1,6 +1,9 @@ package main import ( + "log" + "os" + "git.mrcyjanek.net/mrcyjanek/jwapi/helpers" "git.mrcyjanek.net/mrcyjanek/jwapi/libjw" ) @@ -10,6 +13,9 @@ func main() { helpers.SetDataDir(dataDir) helpers.Mkdir(dataDir + "/raw") helpers.DBInit(dataDir) - libjw.GetPublication("w", "E", "JWPUB", "202110") - // libjw.JWPUBtoMarkdown("fg_E.jwpub.orig") + //libjw.GetPublication("w", "E", "JWPUB", "202110") + if _, err := os.Stat("pub.jwpub"); os.IsNotExist(err) { + log.Fatal("Hey! 
Please put `pub.jwpub' in this directory, you can get one from this link: https://www.jw.org/download/?issue=202107&output=html&pub=g&fileformat=JWPUB&alllangs=0&langwritten=E&txtCMSLang=E&isBible=0") + } + libjw.JWPUBtoMarkdown("pub.jwpub") } From 7e5ba58f6a92a671cfb2666a6dd24cd709d4a21f Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Sat, 14 Aug 2021 19:36:39 +0200 Subject: [PATCH 7/8] **FIXED** jwpub is now reading correctly Yay --- libjw/jwpub.go | 91 +++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/libjw/jwpub.go b/libjw/jwpub.go index 598e078..89a95c6 100644 --- a/libjw/jwpub.go +++ b/libjw/jwpub.go @@ -75,30 +75,33 @@ func JWPUBtoMarkdown(jwpub string) { }) } + sIndexes := wordsmap var loop = true var docID = 0 var curDocIndex = []byte{128} - var fullText = make(map[int]string, 255) + var fullText = make(map[int]string) - sIndexes := wordsmap for loop { - var finded = false for i := range sIndexes { - if sIndexes[i].WordID == 123 { - log.Println(sIndexes[i].Word, sIndexes[i].TextUnitIndices, byteStartsWith(sIndexes[i].TextUnitIndices, []byte{128}), sIndexes[i].PositionalList, byteStartsWith(sIndexes[i].PositionalList, curDocIndex), curDocIndex) - } if byteStartsWith(sIndexes[i].TextUnitIndices, []byte{128}) { if byteStartsWith(sIndexes[i].PositionalList, curDocIndex) { var rem = sIndexes[i].PositionalListIndex[0] if rem > 128 { finded = true - wd := sIndexes[i].Word + var wd = sIndexes[i].Word + //if wd != String(fullText[docID]?.split(separator: " ").last ?? 
"").unaccent() { + // print(curDocIndex, wd) + // fullText[docID]!.append(wd + " ") + //} fullText[docID] += " " + wd sIndexes[i].PositionalList = sIndexes[i].PositionalList[len(curDocIndex):] + //sIndexes[i].PositionalList = sIndexes[i].PositionalList.trimmingCharacters(in: .whitespacesAndNewlines) rem = rem - 1 - sIndexes[i].PositionalListIndex[0] = rem - curDocIndexArray := curDocIndex + sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + sIndexes[i].PositionalListIndex = insertbyte(sIndexes[i].PositionalListIndex, rem, 0) + //sIndexes[i].PositionalListIndex = rem + sIndexes[i].PositionalListIndex + var curDocIndexArray = curDocIndex var repo = false for j := range curDocIndexArray { if j == 0 { @@ -132,67 +135,57 @@ func JWPUBtoMarkdown(jwpub string) { } } break - } else { - sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] } } } } - - if fullText[docID] != "" { - fmt.Println("fullText[docID:", docID, "]:", fullText[docID]) - } if !finded { - var toRem []int = []int{} + var toRem []int for i := range sIndexes { - //var docI = sIndexes[i].TextUnitIndices.prefix(3) - //sIndexes[i].TextUnitIndices.removeFirst(3) - var docI byte = 0 - if len(sIndexes[i].TextUnitIndices) > 0 { - docI = sIndexes[i].TextUnitIndices[0] + var docI = sIndexes[i].TextUnitIndices[0] + sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices[1:] + if docI == 128 { + //sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices.trimmingCharacters(in: .whitespacesAndNewlines) + if len(sIndexes[i].TextUnitIndices) != 0 { + docI = sIndexes[i].TextUnitIndices[0] + sIndexes[i].TextUnitIndices = sIndexes[i].TextUnitIndices[1:] + docI = docI - 1 + sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI, 0) + } + } else { + docI = docI - 1 + sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI, 0) } if len(sIndexes[i].TextUnitIndices) == 0 { toRem = append(toRem, i) - } else { - sIndexes[i].TextUnitIndices = 
sIndexes[i].TextUnitIndices[1:] - if docI == 128 { - if len(sIndexes[i].TextUnitIndices) != 0 { - sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI-1, 0) - } - } else { - docI-- - sIndexes[i].TextUnitIndices = insertbyte(sIndexes[i].TextUnitIndices, docI, 0) - } - if len(sIndexes[i].PositionalListIndex) > 0 && sIndexes[i].PositionalListIndex[0] == 128 { - sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] - } + } + var rem = sIndexes[i].PositionalListIndex[0] + if rem == 128 { + sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex[1:] + //sIndexes[i].PositionalListIndex = sIndexes[i].PositionalListIndex.trimmingCharacters(in: .whitespacesAndNewlines) } } for i := len(toRem) - 1; i >= 0; i-- { - log.Println(i, docID, "toRem2", sIndexes[toRem[i]].Word) sIndexes = append(sIndexes[:toRem[i]], sIndexes[toRem[i]+1:]...) } - docID++ + fmt.Println(fullText[docID]) + docID += 1 curDocIndex = []byte{128} } if len(sIndexes) == 0 { loop = false - } else { - log.Println("len(sIndexes):", len(sIndexes)) - } - if docID > 10000 { - log.Fatal("docID > 10000, this should not happen.") } } - // for (id, text) in fullText where text != "" { - // let dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("w_I_202110/contents/\(id).txt") - // do { - // print(dir) - // try text.write(to: dir, atomically: true, encoding: String.Encoding.utf8) - // } catch { - // print("Error") - // } + //print(fullText) + //for (id, text) in fullText where text != "" { + // let dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("w_I_202110/contents/\(id).txt") + // do { + // print(dir) + // try text.write(to: dir, atomically: true, encoding: String.Encoding.utf8) + // } catch { + // print("Error") // } + //} } From cdb2c5b509ce3c4a3e944f6ce80d606da5eff728 Mon Sep 17 00:00:00 2001 From: Czarek Nakamoto <cyjan@mrcyjanek.net> Date: Sat, 14 Aug 2021 
20:02:27 +0200 Subject: [PATCH 8/8] FIX: highlights were not stored correctly --- webui/apiDB.go | 21 ++------------------- webui/html/static/common.js | 2 +- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/webui/apiDB.go b/webui/apiDB.go index 225b685..cd198f9 100644 --- a/webui/apiDB.go +++ b/webui/apiDB.go @@ -2,27 +2,14 @@ package webui import ( "fmt" - "log" "net/http" - "net/url" "strings" "git.mrcyjanek.net/mrcyjanek/jwapi/helpers" ) func apiDBget(w http.ResponseWriter, req *http.Request) { - url := req.URL.Path - splited := strings.Split(string(url), "/") - if len(splited) < 5 { - fmt.Fprintln(w, "/api/db/get/<key>") - return - } - key := splited[4] - if key == "" { - w.Write([]byte("0")) - return - } - w.Write(helpers.Get(key)) + w.Write(helpers.Get(req.URL.RawQuery)) } func apiDBset(w http.ResponseWriter, req *http.Request) { @@ -38,10 +25,6 @@ func apiDBset(w http.ResponseWriter, req *http.Request) { return } query := req.URL.RawQuery - value, err := url.QueryUnescape(query) - if err != nil { - log.Fatal(err) - } - helpers.Set(key, []byte(value)) + helpers.Set(key, []byte(query)) } diff --git a/webui/html/static/common.js b/webui/html/static/common.js index f60b0a4..f0c31f3 100644 --- a/webui/html/static/common.js +++ b/webui/html/static/common.js @@ -55,7 +55,7 @@ function dbGet(key) { return localStorage[key] } var xhr = new XMLHttpRequest(); - xhr.open("GET", "/api/db/get/"+encodeURIComponent(key), false); + xhr.open("GET", "/api/db/get?"+encodeURIComponent(key), false); xhr.onerror = function (e) { console.error(xhr.statusText); };