From a5137dc6b958f720bc1f27b8a62a4aa3bce88af3 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Thu, 15 Feb 2024 21:46:05 +0100 Subject: [PATCH] Store detected boxes JSON in DynamoDB as well --- cmd/cli/main.go | 59 +++++++++++++++++++++++++++++++------------- cmd/lambda/main.go | 32 ++++++++++++++---------- dynamodb/dynamodb.go | 3 ++- 3 files changed, 63 insertions(+), 31 deletions(-) diff --git a/cmd/cli/main.go b/cmd/cli/main.go index 898b819..c3afa4f 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -4,12 +4,15 @@ import ( "crypto/sha256" "encoding/json" "fmt" + "log" "os" "strings" "text/tabwriter" "github.com/vegarsti/extract" + "github.com/vegarsti/extract/box" "github.com/vegarsti/extract/dynamodb" + "github.com/vegarsti/extract/image" "github.com/vegarsti/extract/textract" ) @@ -43,31 +46,53 @@ func main() { // Check if table is stored checksum := fmt.Sprintf("%x", sha256.Sum256(imageBytes)) - fmt.Println(checksum) - storedBytes, err := dynamodb.GetTable(checksum) - if err != nil { - die(err) - } - if storedBytes != nil { - var table [][]string - json.Unmarshal(storedBytes, &table) - writeTable(table) - return - } + // fmt.Println(checksum) + // storedBytes, err := dynamodb.GetTable(checksum) + // if err != nil { + // die(err) + // } + // if storedBytes != nil { + // var table [][]string + // json.Unmarshal(storedBytes, &table) + // writeTable(table) + // return + // } file := &extract.File{ Bytes: imageBytes, ContentType: contentType, } - - output, err := textract.AnalyzeDocument(file) + // Don't use Textract's Analyze Document, use OCR and custom algorithm instead + output, err := textract.DetectDocumentText(file) if err != nil { - die(err) + die(fmt.Errorf("textract text detection failed: %w", err)) } - table, err := textract.ToTableFromDetectedTable(output) + boxes, err := textract.ToLinesFromOCR(output) if err != nil { - die(err) + die(fmt.Errorf("failed to convert to boxes: %w", err)) } + rows, table := box.ToTable(boxes) + + // Add boxes + if contentType == extract.PNG { + newEncodedImage, err := image.AddBoxes(file.Bytes, boxes) + if err != nil { + log.Printf("add boxes to image 1 failed: %v", err) + } else { + rowsFlattened := make([]box.Box, 0) + for _, row := range rows { + rowsFlattened = append(rowsFlattened, row...) + } + newEncodedImage2, err := image.AddBoxes(file.Bytes, rowsFlattened) + if err != nil { + log.Printf("add boxes to image 2 failed: %v", err) + file.BytesWithBoxes = []byte(newEncodedImage) + file.BytesWithRowBoxes = []byte(newEncodedImage2) + } + fmt.Println("hello") + } + } + writeTable(table) // store in dynamo db @@ -75,7 +100,7 @@ func main() { if err != nil { die(err) } - if err := dynamodb.PutTable(checksum[:], tableJSON); err != nil { + if err := dynamodb.PutTable(checksum[:], tableJSON, []byte{}); err != nil { die(err) } } diff --git a/cmd/lambda/main.go b/cmd/lambda/main.go index b31e150..8a2350d 100644 --- a/cmd/lambda/main.go +++ b/cmd/lambda/main.go @@ -160,25 +160,27 @@ func getTable(file *extract.File) ([][]string, error) { } rows, table := box.ToTable(boxes) - // Add boxes - if file.ContentType == extract.PNG { - newEncodedImage, err := image.AddBoxes(file.Bytes, boxes) - if err != nil { - log.Printf("add boxes to image 1 failed: %v", err) - } else { + // Create images with words and cells + go func() { + if file.ContentType == extract.PNG { + imageWithWords, err := image.AddBoxes(file.Bytes, boxes) + if err != nil { + log.Printf("add word boxes to image failed: %v", err) + return + } rowsFlattened := make([]box.Box, 0) for _, row := range rows { rowsFlattened = append(rowsFlattened, row...) } - newEncodedImage2, err := image.AddBoxes(file.Bytes, rowsFlattened) + imageWithCells, err := image.AddBoxes(file.Bytes, rowsFlattened) if err != nil { - log.Printf("add boxes to image 2 failed: %v", err) - } else { - file.BytesWithBoxes = []byte(newEncodedImage) - file.BytesWithRowBoxes = []byte(newEncodedImage2) + log.Printf("add cell boxes to image failed: %v", err) + return } + file.BytesWithBoxes = []byte(imageWithWords) + file.BytesWithRowBoxes = []byte(imageWithCells) } - } + }() log.Printf("ocr-to-table: %s", time.Since(startAlgorithm).String()) if err != nil { @@ -223,7 +225,11 @@ func getTable(file *extract.File) ([][]string, error) { }) g.Go(func() error { startPut := time.Now() - if err := dynamodb.PutTable(file.Checksum, tableBytes); err != nil { + boxesJSON, err := json.Marshal(boxes) + if err != nil { + return fmt.Errorf("failed to convert boxes to json: %w", err) + } + if err := dynamodb.PutTable(file.Checksum, tableBytes, boxesJSON); err != nil { return fmt.Errorf("dynamodb.PutTable: %w", err) } log.Printf("dynamodb put: %s", time.Since(startPut).String()) diff --git a/dynamodb/dynamodb.go b/dynamodb/dynamodb.go index cb840b9..d1e3976 100644 --- a/dynamodb/dynamodb.go +++ b/dynamodb/dynamodb.go @@ -34,7 +34,7 @@ func CreateTable(sess *session.Session) error { return nil } -func PutTable(checksum string, table []byte) error { +func PutTable(checksum string, table []byte, boxesJSON []byte) error { sess, err := session.NewSession() if err != nil { return fmt.Errorf("unable to create session: %w", err) @@ -46,6 +46,7 @@ func PutTable(checksum string, table []byte) error { // Old: Used table detection directly, new uses custom algorithm // "JSONTable": {B: table}, "JSONTableCustomDetection": {B: table}, + "JSONBoxes": {B: boxesJSON}, }, TableName: aws.String("Tables"), }