add scraping and standalone entry helpers (#90)
facundoolano authored Jan 8, 2024
1 parent 105c002 commit e7982c4
Showing 5 changed files with 68 additions and 35 deletions.
11 changes: 11 additions & 0 deletions feedi/models.py
@@ -521,6 +521,17 @@ class Entry(db.Model):
    __table_args__ = (sa.UniqueConstraint("feed_id", "remote_id"),
                      sa.Index("entry_sort_ts", sort_date.desc()))

    @classmethod
    def from_url(cls, user_id, url):
        "Load an entry for the given article url if it exists, otherwise create a new one."
        entry = db.session.scalar(db.select(cls)
                                  .filter_by(content_url=url, user_id=user_id))

        if not entry:
            values = parsers.html.fetch(url)
            entry = cls(user_id=user_id, **values)
        return entry

    def __repr__(self):
        return f'<Entry {self.feed_id}/{self.remote_id}>'

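As a usage sketch (not part of the commit): Entry.from_url returns the existing entry for a content URL or builds an unpersisted one from scraped metadata, so the caller is responsible for committing it and for handling the ValueError/HTTPError that parsers.html.fetch can raise. The user id and URL below are hypothetical and assume an active application context.

from feedi import models
from feedi.models import db

# hypothetical user id and url, for illustration only
entry = models.Entry.from_url(1, "https://example.com/blog/post")
db.session.add(entry)
db.session.commit()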
40 changes: 24 additions & 16 deletions feedi/parsers/html.py
@@ -1,43 +1,51 @@
import datetime
import json

import dateparser
from bs4 import BeautifulSoup
from feedi import scraping
from feedi.requests import requests


def fetch(url, full_content=False):
    "Return the entry values for an article at the given url."
def fetch(url):
    """
    Return the entry values for an article at the given url.
    Raises ValueError if the url doesn't seem to point to an article (it doesn't have a title).
    Raises HTTPError if the request is not successfull.
    """

    response = requests.get(url)
    response.raise_for_status()

    if not response.ok:
        raise Exception()

    soup = BeautifulSoup(response.content, 'lxml')
    metadata = scraping.all_meta(soup)

    title = metadata.get('og:title', metadata.get('twitter:title'))

    published = scraping.extract_meta(soup, 'og:article:published_time')
    if published:
        display_date = dateparser.parse(published)
    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
        raise ValueError(f"{url} is missing article metadata")

    if 'og:article:published_time' in metadata:
        display_date = dateparser.parse(metadata['og:article:published_time'])
    else:
        display_date = datetime.datetime.utcnow()

    title = scraping.extract_meta(soup, 'og:title', 'twitter:title')
    if not title and soup.title:
        title = soup.title.text

    username = scraping.extract_meta(soup, 'author') or ''
    username = username.split(',')[0]
    username = metadata.get('author', '').split(',')[0]

    entry = {
        'remote_id': url,
        'title': title,
        'username': username,
        'display_date': display_date,
        'sort_date': datetime.datetime.utcnow(),
        'content_short': scraping.extract_meta(soup, 'og:description', 'description'),
        'media_url': scraping.extract_meta(soup, 'og:image', 'twitter:image'),
        'content_short': metadata.get('og:description', metadata.get('description')),
        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
        'target_url': url,
        'content_url': url,
        'raw_data': json.dumps(metadata)
    }

    if full_content:
        entry['content_full'] = scraping.extract(html=response.content)['content']

    return entry
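Taken on its own, the reworked fetch could be exercised roughly like this (a sketch only: the URL is a placeholder, and it assumes the wrapped session raises the standard requests HTTPError from raise_for_status):

import requests

from feedi.parsers import html

try:
    values = html.fetch("https://example.com/some-article")  # placeholder url
except ValueError:
    values = None  # page didn't expose article metadata (no title, or og:type isn't article)
except requests.exceptions.HTTPError:
    values = None  # non-2xx response surfaced by raise_for_status()

if values:
    print(values['title'], values['username'], values['media_url'])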
12 changes: 2 additions & 10 deletions feedi/parsers/rss.py
@@ -301,14 +301,14 @@ def discover_feed(url):
    for type in link_types:
        link = soup.find('link', type=type, href=True)
        if link:
            feed_url = make_absolute(url, link['href'])
            feed_url = scraping.make_absolute(url, link['href'])
            return feed_url, title

    # if none found in the html, try with common urls, provided that they exist
    # and are xml content
    common_paths = ['/feed', '/rss', '/feed.xml', '/rss.xml']
    for path in common_paths:
        rss_url = make_absolute(url, path)
        rss_url = scraping.make_absolute(url, path)
        res = requests.get(rss_url)
        mime = res.headers.get('Content-Type', '').split(';')[0]
        if res.ok and mime.endswith('xml'):
@@ -317,14 +317,6 @@ def discover_feed(url):
    return None, title


def make_absolute(url, path):
    "If `path` is a relative url, join it with the given absolute url."
    if not urllib.parse.urlparse(path).netloc:

        path = urllib.parse.urljoin(url, path)
    return path


def pretty_print(url):
    feed = feedparser.parse(url)
    pp = pprint.PrettyPrinter(depth=10)
17 changes: 8 additions & 9 deletions feedi/routes.py
@@ -11,7 +11,7 @@
import feedi.tasks as tasks
from feedi import scraping
from feedi.models import db
from feedi.parsers import html, mastodon, rss
from feedi.parsers import mastodon, rss


@app.route("/users/<username>")
@@ -424,17 +424,16 @@ def entry_add():
    Redirects to the content reader for the article at the given URL, creating a new entry for it
    if there isn't already one.
    """

    # TODO sanitize?
    url = flask.request.args['url']
    entry = db.session.scalar(db.select(models.Entry)
                              .filter_by(content_url=url, user_id=current_user.id))

    if not entry:
        values = html.fetch(url, full_content=True)
        entry = models.Entry(user_id=current_user.id, **values)
        db.session.add(entry)
        db.session.commit()
    try:
        entry = models.Entry.from_url(current_user.id, url)
    except Exception:
        return redirect_response(url)

    db.session.add(entry)
    db.session.commit()
    return redirect_response(flask.url_for('entry_view', id=entry.id))


23 changes: 23 additions & 0 deletions feedi/scraping.py
@@ -73,6 +73,29 @@ def extract_meta(soup, *tags):
                return meta_tag['content']


def all_meta(soup):
    result = {}
    for attr in ['property', 'name', 'itemprop']:
        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
            result[meta_tag[attr]] = meta_tag['content']
    return result


def extract_links(url, html):
    soup = BeautifulSoup(html, 'lxml')
    # checks tag.text so it skips image links
    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text)
    return [(make_absolute(url, a['href']), a.text) for a in links]


def make_absolute(url, path):
    "If `path` is a relative url, join it with the given absolute url."
    if not urllib.parse.urlparse(path).netloc:
        path = urllib.parse.urljoin(url, path)
    return path


# TODO this should be renamed, and maybe other things in this modules, using extract too much
def extract(url=None, html=None):
    # The mozilla/readability npm package shows better results at extracting the
    # article content than all the python libraries I've tried... even than the readabilipy
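Finally, a small self-contained sketch of the new scraping helpers on an invented HTML snippet (not from the commit; the expected results in comments are approximate):

from bs4 import BeautifulSoup

from feedi import scraping

page = """
<html><head>
  <meta property="og:title" content="Example title">
  <meta name="author" content="Jane Doe">
</head>
<body><a href="/about">About</a></body></html>
"""
soup = BeautifulSoup(page, 'lxml')

scraping.all_meta(soup)
# -> {'og:title': 'Example title', 'author': 'Jane Doe'}

scraping.make_absolute('https://example.com/post', '/about')
# -> 'https://example.com/about'

scraping.extract_links('https://example.com/post', page)
# -> [('https://example.com/about', 'About')]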
