
Commit

google+: fix scraping
snarfed committed Jul 26, 2017
1 parent dad2314 commit ff219b2
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions granary/googleplus.py
@@ -21,6 +21,9 @@
 from oauth_dropins.webutil import util
 
 SEARCH_MAX_RESULTS = 20
+HTML_ACTIVITIES_RE = re.compile(
+  r"AF_initDataCallback\({key: *'ds:[^']*', *isError: *false *, *hash: *'[^']*', *data:function\(\){return *(\[\[[^[].+?)}}\);</script>",
+  re.DOTALL)
 
 
 class GooglePlus(source.Source):
@@ -314,13 +317,13 @@ def html_to_activities(self, html):
       list of ActivityStreams activity dicts
     """
     # extract JSON data blob
-    script_start = "AF_initDataCallback({key: 'ds:5', isError: false , hash: '10', data:function(){return"
-    start = html.find(script_start)
-    end = html.find('}});</script>', start)
-    if start == -1 or end == -1:
+    match = HTML_ACTIVITIES_RE.search(html)
+    if not match:
       return []
+
+    html = match.group(1)
+    if not html:
+      return []
-    start += len(script_start)
-    html = html[start:end]
 
     # insert placeholder nulls for omitted values, e.g. [,,,"x",,,] so that we
     # can decode it as JSON. run twice to handle overlaps.
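
For illustration, here is a minimal sketch of how the new regex-based extraction behaves on a toy AF_initDataCallback blob. The sample HTML below is made up (key and hash values vary per page, and real blobs are far larger), so treat it as a hypothetical input rather than actual Google+ output:

import json
import re

# Same pattern as the HTML_ACTIVITIES_RE added in this commit.
HTML_ACTIVITIES_RE = re.compile(
  r"AF_initDataCallback\({key: *'ds:[^']*', *isError: *false *, *hash: *'[^']*', *"
  r"data:function\(\){return *(\[\[[^[].+?)}}\);</script>",
  re.DOTALL)

# Hypothetical, heavily simplified inline <script> blob.
html = ("<script>AF_initDataCallback({key: 'ds:7', isError: false , hash: '14', "
        'data:function(){return [["z12abc","post text"]]}});</script>')

match = HTML_ACTIVITIES_RE.search(html)
if match:
  data = json.loads(match.group(1))  # [['z12abc', 'post text']]

Because the key is now matched as 'ds:[^']*' and the hash as '[^']*', the scraper accepts whatever values Google+ serves, which presumably is what broke the old hard-coded "ds:5" / hash "10" string search.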
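
The trailing context comment refers to a cleanup step that sits below this hunk and isn't shown in the diff. As a rough, hypothetical sketch of that idea (not the repository's actual implementation), inserting placeholder nulls so the JavaScript-style array literal parses as JSON could look like this:

import json

def add_placeholder_nulls(blob):
  # Hypothetical sketch, not granary's code: fill elided array slots like
  # [,,,"x",,] with explicit nulls. Each pass only fills every other gap in a
  # run of commas because the replacements consume characters, hence two passes.
  for _ in range(2):
    blob = (blob.replace('[,', '[null,')
                .replace(',,', ',null,')
                .replace(',]', ',null]'))
  return blob

json.loads(add_placeholder_nulls('[,,,"x",,,]'))
# -> [None, None, None, 'x', None, None, None]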

0 comments on commit ff219b2
