
Commit

google+: fix scraping
snarfed committed Jul 26, 2017
1 parent dad2314 commit ff219b2
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions granary/googleplus.py
@@ -21,6 +21,9 @@
 from oauth_dropins.webutil import util
 
 SEARCH_MAX_RESULTS = 20
+HTML_ACTIVITIES_RE = re.compile(
+  r"AF_initDataCallback\({key: *'ds:[^']*', *isError: *false *, *hash: *'[^']*', *data:function\(\){return *(\[\[[^[].+?)}}\);</script>",
+  re.DOTALL)
 
 
 class GooglePlus(source.Source):
@@ -314,13 +317,13 @@ def html_to_activities(self, html):
       list of ActivityStreams activity dicts
     """
     # extract JSON data blob
-    script_start = "AF_initDataCallback({key: 'ds:5', isError: false , hash: '10', data:function(){return"
-    start = html.find(script_start)
-    end = html.find('}});</script>', start)
-    if start == -1 or end == -1:
+    match = HTML_ACTIVITIES_RE.search(html)
+    if not match:
       return []
+
+    html = match.group(1)
+    if not html:
+      return []
-    start += len(script_start)
-    html = html[start:end]
 
     # insert placeholder nulls for omitted values, e.g. [,,,"x",,,] so that we
     # can decode it as JSON. run twice to handle overlaps.
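
For illustration, here is a minimal sketch of how the new regex-based extraction behaves on a toy AF_initDataCallback blob. The sample HTML below is made up (key and hash values vary per page, and real blobs are far larger), so treat it as a hypothetical input rather than actual Google+ output:

import json
import re

# Same pattern as the HTML_ACTIVITIES_RE added in this commit.
HTML_ACTIVITIES_RE = re.compile(
  r"AF_initDataCallback\({key: *'ds:[^']*', *isError: *false *, *hash: *'[^']*', *"
  r"data:function\(\){return *(\[\[[^[].+?)}}\);</script>",
  re.DOTALL)

# Hypothetical, heavily simplified inline <script> blob.
html = ("<script>AF_initDataCallback({key: 'ds:7', isError: false , hash: '14', "
        'data:function(){return [["z12abc","post text"]]}});</script>')

match = HTML_ACTIVITIES_RE.search(html)
if match:
  data = json.loads(match.group(1))  # [['z12abc', 'post text']]

Because the key is now matched as 'ds:[^']*' and the hash as '[^']*', the scraper accepts whatever values Google+ serves, which presumably is what broke the old hard-coded "ds:5" / hash "10" string search.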
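
The trailing context comment refers to a cleanup step that sits below this hunk and isn't shown in the diff. As a rough, hypothetical sketch of that idea (not the repository's actual implementation), inserting placeholder nulls so the JavaScript-style array literal parses as JSON could look like this:

import json

def add_placeholder_nulls(blob):
  # Hypothetical sketch, not granary's code: fill elided array slots like
  # [,,,"x",,] with explicit nulls. Each pass only fills every other gap in a
  # run of commas because the replacements consume characters, hence two passes.
  for _ in range(2):
    blob = (blob.replace('[,', '[null,')
                .replace(',,', ',null,')
                .replace(',]', ',null]'))
  return blob

json.loads(add_placeholder_nulls('[,,,"x",,,]'))
# -> [None, None, None, 'x', None, None, None]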

0 comments on commit ff219b2
