Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implemented a fix for newlines on br and block elements #42

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ def __repr__(self):
'time': 'datetime',
}

# Tag names treated as block-level: _text() appends a newline after the
# text of any element whose tagName appears in this list.
block_elements = "div p h1 h2 h3 h4 h5 h6".split()

def _find_items(e):
items = []
Expand Down Expand Up @@ -232,8 +237,15 @@ def _text(e):
chunks.append(e.data)
elif hasattr(e, 'tagName') and e.tagName == 'script':
return ''
elif hasattr(e, 'tagName') and e.tagName == 'br':
chunks.append("\n")

for child in e.childNodes:
chunks.append(_text(child))

if hasattr(e, 'tagName') and e.tagName in block_elements:
chunks.append("\n")

return ''.join(chunks)


Expand Down
40 changes: 40 additions & 0 deletions test-data/example-dirty.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<!doctype html>
<html>
<!-- shamelessly stolen from http://schema.org/Person -->
<head>
<title>person example</title>
</head>

<body>
<div itemid="http://www.xyz.edu/~jane" itemscope itemtype="http://schema.org/Person">
<span itemprop="name">Jane Doe</span>
<img src="janedoe.jpg" itemprop="image" />

<span itemprop="jobTitle">Professor</span>
<div itemprop="description">A Professor that likes<br>Linebreaks</div>
<div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
<span itemprop="streetAddress">
<p>20341 Whitworth Institute</p><p>405 N. Whitworth</p>
<!-- Comment Node -->
<script>
// Unrelated text
</script>
</span>
<span itemprop="addressLocality">Seattle</span>,
<span itemprop="addressRegion">WA</span>
<span itemprop="postalCode">98052</span>
</div>
<span itemprop="telephone">(425) 123-4567</span>
<a href="mailto:[email protected]" itemprop="email">[email protected]</a>

Jane's home page:
<a href="http://www.janedoe.com" itemprop="url">janedoe.com</a>

Graduate students:
<a href="http://www.xyz.edu/students/alicejones.html" itemprop="colleagues">
Alice Jones</a>
<a href="http://www.xyz.edu/students/bobsmith.html" itemprop="colleagues">
Bob Smith</a>
</div>
</body>
</html>
17 changes: 17 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,23 @@ def test_skip_level(self):
self.assertEqual(len(items), 1)
self.assertEqual(items[0].name, "Jane Doe")

def test_newlines(self):
    """<br> and block-level elements must produce newlines in property text."""
    # Close the fixture deterministically instead of leaking the handle.
    with open("test-data/example-dirty.html") as fixture:
        items = get_items(fixture)

    # this html should have just one main item
    # (assertEqual, not assertTrue: assertTrue's second argument is only the
    # failure message, so the old check passed for any non-empty result)
    self.assertEqual(len(items), 1)

    item = items[0]

    # description contains a br tag so it should have a newline
    self.assertEqual(item.description.strip(), "A Professor that likes\nLinebreaks")

    self.assertEqual(item.address.itemtype, [URI("http://schema.org/PostalAddress")])
    # street address should contain newlines because p is a block element
    self.assertEqual(item.address.streetAddress.strip(), "20341 Whitworth Institute\n405 N. Whitworth")
    self.assertEqual(item.address.addressLocality, "Seattle")

def test_parse_multiple_props(self):
items = get_items(open("test-data/multiple-props.html"))

Expand Down