Skip to content

Commit

Permalink
Add partial DOC Parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Vytek committed Sep 5, 2023
1 parent 170f1d1 commit 3b1b645
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ List of formats read:
| ------------- | ------------- | ------------- | ------------- |
| TXT | X | text/plain; charset=utf-8 | |
| RTF | X | text/rtf | |
| DOC (partial) | X | application/x-ole-storage | |
| ODT | X | application/vnd.oasis.opendocument.text | X |
| DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | X |
| PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | X |
Expand Down
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@ module github.com/vytek/opencrucible

go 1.21.0

require github.com/gabriel-vasile/mimetype v1.4.2
require (
github.com/gabriel-vasile/mimetype v1.4.2
github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5
)

require (
github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect
github.com/richardlehane/mscfb v1.0.4 // indirect
github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/sirupsen/logrus v1.7.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/text v0.13.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
)

Expand Down
9 changes: 9 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,24 @@ github.com/lu4p/cat v0.1.5 h1:s51Bp/ns3u6n+hjjL2F77ySY6j/GD5SJG/t6Ok4Y1S0=
github.com/lu4p/cat v0.1.5/go.mod h1:G3YRyjSvBipqMBRZ2uLf1oRL3/eGGmuZf96m95Y4jRQ=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM=
github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk=
github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM=
github.com/richardlehane/msoleps v1.0.3/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM=
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5 h1:J8lrTo8meIVBYTnRgaRR2arJZtwlAtxH2L6J76B6cLw=
github.com/vytek/doc-to-text v0.0.0-20230905185522-d1fc42e42cf5/go.mod h1:AZVaCq/aXiO5U68pHGyyKn7skv5xCXirj/7xKGNDt4A=
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc h1:LMEBgNcZUqXaP7evD1PZcL6EcDVa2QOFuI+cqM3+AJM=
Expand Down
13 changes: 12 additions & 1 deletion opencrucible.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ import (
"github.com/gocaio/metagopenoffice"
"github.com/h2non/filetype"
"github.com/lu4p/cat"
doctotext "github.com/vytek/doc-to-text"
)

// Version exposes the current package version.
const Version = "0.0.6"
const Version = "0.0.7"

//Detects

Expand Down Expand Up @@ -147,6 +148,16 @@ func PPTXFileParseToString(FileToParse string) (string, error) {
return pptx, err
}

// DOCFileParseToString opens the file at path FileToParse and returns the
// text that doctotext.DocToText extracts from it.
//
// If the file cannot be opened, the returned error wraps the underlying
// os.Open error, so callers can still inspect it with errors.Is / errors.As.
// Any error reported by doctotext.DocToText is returned unchanged together
// with whatever text that call produced.
func DOCFileParseToString(FileToParse string) (string, error) {
	file, err := os.Open(FileToParse)
	if err != nil {
		// %w (rather than %s) keeps the original error in the chain while
		// producing the exact same message text.
		return "", fmt.Errorf("error opening file: %w", err)
	}
	defer file.Close()

	doc, err := doctotext.DocToText(file, false)
	return doc, err
}

//Metadata

// See: https://www.lazy-tech.net/project/pdf_metadata_parsing_golang
Expand Down
28 changes: 28 additions & 0 deletions opencrucible_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,18 @@ func TestDOCXMSFileParser(t *testing.T) {
}
}

// TestDOCFileParser checks that DOCFileParseToString extracts the expected
// text from the sample DOC fixture in test_file.
func TestDOCFileParser(t *testing.T) {
	got, err := DOCFileParseToString(filepath.Join("test_file", "test_file_doc.doc"))
	if err != nil {
		// Abort immediately: comparing the output of a failed parse would only
		// add a second, misleading failure to the report.
		t.Fatalf("error loading file \n %s", err)
	}
	// The expected text ends with a carriage return.
	want := "This is a test file to test library\r"
	t.Logf("Parsed: %s", got)
	if got != want {
		t.Errorf("got %q, wanted %q", got, want)
	}
}

// This�is�a�test�file�to�test�library
// 54 68 69 73 00 69 73 00 61 00 74 65 73 74 00 66 69 6C 65 00 74 6F 00 74 65 73 74 00 6C 69 62 72 61 72 79 20
// The word separators must be byte 0x20 (space), NOT 0x00 (NUL)
Expand Down Expand Up @@ -231,3 +243,19 @@ func TestPPTXDetect(t *testing.T) {
t.Errorf("got %q, wanted %q", got, want)
}
}

// TestDOCDetect checks that DetectFileTypeMIME reports the sample DOC fixture
// as "application/x-ole-storage".
func TestDOCDetect(t *testing.T) {
	data, err := os.ReadFile(filepath.Join("test_file", "test_file_doc.doc"))
	if err != nil {
		// Without the fixture bytes there is nothing meaningful to detect.
		t.Fatalf("error loading file \n %s", err)
	}
	got, _, err := DetectFileTypeMIME(data)
	if err != nil {
		// Stop here so a detection failure is not followed by a redundant
		// mismatch report on the result.
		t.Fatalf("unable to detect file \n %s", err)
	}
	want := "application/x-ole-storage"
	t.Logf("Parsed: %s", got)
	if got != want {
		t.Errorf("got %q, wanted %q", got, want)
	}
}
1 change: 1 addition & 0 deletions test_file/test_file_html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">ol{margin:0;padding:0}table td,table th{padding:0}.c2{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c1{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c0{background-color:#ffffff;max-width:451.4pt;padding:72pt 72pt 72pt 72pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c0 doc-content"><p class="c1"><span class="c2">This is a test file to test 
library</span></p></body></html>

0 comments on commit 3b1b645

Please sign in to comment.