forked from ernesto-jimenez/scraperboard
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkdownify.go
124 lines (111 loc) · 2.82 KB
/
markdownify.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package scraperboard
// FIXME: Refactor this, it's quite messy
import (
"bytes"
"fmt"
"io"
"regexp"
"strings"
"unicode"
"code.google.com/p/go.net/html"
"github.com/PuerkitoBio/goquery"
)
// MarkdownifyReader reads HTML from r and returns its text rendered as
// Markdown, with leading and trailing whitespace removed. If the document
// cannot be built from r, the error is returned together with an empty string.
func MarkdownifyReader(r io.Reader) (string, error) {
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		return "", err
	}

	md := markdownify(doc.Selection)
	return strings.TrimSpace(md), nil
}
// markdownify concatenates the Markdown text of every node in the selection
// and trims leading and trailing whitespace from the result.
func markdownify(s *goquery.Selection) string {
	var out bytes.Buffer

	// Walk s.Nodes directly instead of calling Each, so no per-node
	// Selection object has to be created.
	for i := range s.Nodes {
		out.WriteString(getNodeText(s.Nodes[i]))
	}

	md := out.String()
	return strings.TrimSpace(md)
}
// getNodeText renders node and everything below it as Markdown-flavoured text.
//
// Text nodes have runs of whitespace collapsed to a single space and are
// trimmed where they touch a block element or the end of their parent; a
// blank line is appended when the next sibling is a block.
// A <br> becomes a space, except that two consecutive <br> elements collapse
// into one blank line.
// An <a> with an href becomes "[text](href)"; without an href only its text
// is returned, and a link with no visible text becomes a single space.
// Headers are prefixed with "# " and block elements are followed by a blank
// line.
//
// BUG: It doesn't respect <pre> tags
func getNodeText(node *html.Node) string {
	// Clear redundant whitespace from text
	if node.Type == html.TextNode {
		text := normalizeWhitespace(node.Data)
		nextIsBlock := isBlock(node.NextSibling)
		if node.NextSibling == nil || nextIsBlock {
			text = strings.TrimRightFunc(text, unicode.IsSpace)
		}
		if nextIsBlock {
			text += "\n\n"
		}
		// Must run after the append above: whitespace-only text between two
		// blocks is meant to vanish entirely, newlines included.
		if isBlock(node.PrevSibling) {
			text = strings.TrimLeftFunc(text, unicode.IsSpace)
		}
		return text
	}

	// Change BRs to spaces unless there are two in a row, in which case the
	// pair yields one blank line. Only element nodes count as breaks: a text
	// or comment sibling whose content is literally "br" must not.
	if node.Type == html.ElementNode && node.Data == "br" {
		if next := node.NextSibling; next != nil && next.Type == html.ElementNode && next.Data == "br" {
			return "\n\n"
		}
		if prev := node.PrevSibling; prev != nil && prev.Type == html.ElementNode && prev.Data == "br" {
			return ""
		}
		return " "
	}

	if node.FirstChild == nil {
		return ""
	}

	// Render every child up front. Links need the full content too, not just
	// the first child, otherwise "<a><b>x</b> y</a>" would lose " y".
	var buf bytes.Buffer
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		buf.WriteString(getNodeText(c))
	}
	text := buf.String()

	if node.Data == "a" {
		href, exists := getAttributeValue("href", node)
		if !exists {
			return text
		}
		// A link without visible text is reduced to a separating space
		// rather than emitted as "[](href)".
		if strings.TrimSpace(text) == "" {
			return " "
		}
		return fmt.Sprintf("[%s](%s)", text, href)
	}

	if isHeader(node) {
		text = "# " + text
	}
	if isBlock(node) {
		text += "\n\n"
	}
	return text
}
// isBlock reports whether node is treated as a block-level element, which
// here means a paragraph or a header. A nil node is never a block.
func isBlock(node *html.Node) bool {
	if node == nil {
		return false
	}
	return isHeader(node) || isParagraph(node)
}
// isParagraph reports whether node is a <p> element. It returns false for a
// nil node and for non-element nodes (for example a comment whose content is
// exactly "p"), because their Data holds content rather than a tag name.
func isParagraph(node *html.Node) bool {
	return node != nil && node.Type == html.ElementNode && node.Data == "p"
}
// isHeader reports whether node is one of the heading elements <h1> to <h6>.
// It returns false for a nil node, for non-element nodes, and for any other
// two-letter tag that merely starts with 'h' (such as <hr>).
func isHeader(node *html.Node) bool {
	if node == nil || node.Type != html.ElementNode || len(node.Data) != 2 {
		return false
	}
	level := node.Data[1]
	return node.Data[0] == 'h' && level >= '1' && level <= '6'
}
// getAttributeValue looks up the first attribute of n whose key equals
// attrName. It returns that attribute's value and true, or "" and false when
// n is nil or carries no such attribute.
func getAttributeValue(attrName string, n *html.Node) (val string, exists bool) {
	if n != nil {
		for i := range n.Attr {
			if attr := n.Attr[i]; attr.Key == attrName {
				return attr.Val, true
			}
		}
	}
	return "", false
}
// whitespaceRunRE matches one or more consecutive [[:space:]] characters.
// It is compiled once at package initialisation rather than on every
// normalizeWhitespace call, which runs for each text node visited.
var whitespaceRunRE = regexp.MustCompile("[[:space:]]+")

// normalizeWhitespace replaces every run of whitespace in str with a single
// space. Leading and trailing runs are collapsed to one space, not removed.
func normalizeWhitespace(str string) string {
	return whitespaceRunRE.ReplaceAllString(str, " ")
}