diff --git a/README.md b/README.md index ed1d3b0..185f81b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ List of formats read: | ------------- | ------------- | ------------- | ------------- | | TXT | X | text/plain; charset=utf-8 | | | RTF | X | text/rtf | | +| DOC (partial) | X | application/x-ole-storage | | | ODT | X | application/vnd.oasis.opendocument.text | X | | DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | X | | PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | X | diff --git a/go.mod b/go.mod index a6f8617..bd14428 100644 --- a/go.mod +++ b/go.mod @@ -2,12 +2,18 @@ module github.com/vytek/opencrucible go 1.21.0 -require github.com/gabriel-vasile/mimetype v1.4.2 +require ( + github.com/gabriel-vasile/mimetype v1.4.2 + github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5 +) require ( github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect + github.com/richardlehane/mscfb v1.0.4 // indirect + github.com/richardlehane/msoleps v1.0.3 // indirect github.com/sirupsen/logrus v1.7.0 // indirect golang.org/x/sys v0.8.0 // indirect + golang.org/x/text v0.13.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect ) diff --git a/go.sum b/go.sum index efe1754..31e1055 100644 --- a/go.sum +++ b/go.sum @@ -24,15 +24,24 @@ github.com/lu4p/cat v0.1.5 h1:s51Bp/ns3u6n+hjjL2F77ySY6j/GD5SJG/t6Ok4Y1S0= github.com/lu4p/cat v0.1.5/go.mod h1:G3YRyjSvBipqMBRZ2uLf1oRL3/eGGmuZf96m95Y4jRQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM= +github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= +github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= +github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM= +github.com/richardlehane/msoleps v1.0.3/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5 h1:J8lrTo8meIVBYTnRgaRR2arJZtwlAtxH2L6J76B6cLw= +github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5/go.mod h1:AZVaCq/aXiO5U68pHGyyKn7skv5xCXirj/7xKGNDt4A= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc h1:LMEBgNcZUqXaP7evD1PZcL6EcDVa2QOFuI+cqM3+AJM= diff --git a/opencrucible.go b/opencrucible.go index eed8335..f773904 100644 --- a/opencrucible.go +++ b/opencrucible.go @@ -14,10 +14,11 @@ import ( "github.com/gocaio/metagopenoffice" "github.com/h2non/filetype" "github.com/lu4p/cat" + doctotext "github.com/vytek/doc-to-text" ) // Version exposes the current package version. -const Version = "0.0.6" +const Version = "0.0.7" //Detects @@ -147,6 +148,16 @@ func PPTXFileParseToString(FileToParse string) (string, error) { return pptx, err } +func DOCFileParseToString(FileToParse string) (string, error) { + file, err := os.Open(FileToParse) + if err != nil { + return "", fmt.Errorf("error opening file: %s", err) + } + defer file.Close() + doc, err := doctotext.DocToText(file, false) + return doc, err +} + //Metadata // See: https://www.lazy-tech.net/project/pdf_metadata_parsing_golang diff --git a/opencrucible_test.go b/opencrucible_test.go index e2aee24..773d131 100644 --- a/opencrucible_test.go +++ b/opencrucible_test.go @@ -128,6 +128,18 @@ func TestDOCXMSFileParser(t *testing.T) { } } +func TestDOCFileParser(t *testing.T) { + got, err := DOCFileParseToString(filepath.Join("test_file", "test_file_doc.doc")) + if err != nil { + t.Errorf("error loading file \n %s", err) + } + want := "This is a test file to test library\r" + t.Logf("Parsed: %s", got) + if got != want { + t.Errorf("got %q, wanted %q", got, want) + } +} + // This�is�a�test�file�to�test�library // 54 68 69 73 00 69 73 00 61 00 74 65 73 74 00 66 69 6C 65 00 74 6F 00 74 65 73 74 00 6C 69 62 72 61 72 79 20 // Must be 20 NOT 00 @@ -231,3 +243,19 @@ func TestPPTXDetect(t *testing.T) { t.Errorf("got %q, wanted %q", got, want) } } + +func TestDOCDetect(t *testing.T) { + pdf, err := os.ReadFile(filepath.Join("test_file", "test_file_doc.doc")) + if err != nil { + t.Errorf("error loading file \n %s", err) + } + got, _, err := DetectFileTypeMIME(pdf) + if err != nil { + t.Errorf("unable to detect file \n %s", err) + } + want := "application/x-ole-storage" + t.Logf("Parsed: %s", got) + if got != want { + t.Errorf("got %q, wanted %q", got, want) + } +} diff --git a/test_file/test_file_html.html b/test_file/test_file_html.html new file mode 100644 index 0000000..b12a615 --- /dev/null +++ b/test_file/test_file_html.html @@ -0,0 +1 @@ +

This is a test file to test library

\ No newline at end of file