entry explode #89

Closed · wants to merge 17 commits
11 changes: 11 additions & 0 deletions feedi/models.py
@@ -521,6 +521,17 @@ class Entry(db.Model):
    __table_args__ = (sa.UniqueConstraint("feed_id", "remote_id"),
                      sa.Index("entry_sort_ts", sort_date.desc()))

    @classmethod
    def from_url(cls, user_id, url):
        "Return the user's existing entry for the given url, or create a new one from the article's scraped metadata."
        entry = db.session.scalar(db.select(cls)
                                  .filter_by(content_url=url, user_id=user_id))

        if not entry:
            values = parsers.html.fetch(url)
            entry = cls(user_id=user_id, **values)
        return entry

    def __repr__(self):
        return f'<Entry {self.feed_id}/{self.remote_id}>'

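For context, a minimal sketch of how this get-or-create helper can be used; the `get_or_create_entry` wrapper below is illustrative only (not part of the PR) and assumes a Flask application context:

```python
# Hypothetical usage of Entry.from_url; assumes an app context and a valid
# user id. from_url either loads the user's existing entry for the url or
# builds a new, unsaved one, so the caller decides when to persist it.
from feedi.models import Entry, db

def get_or_create_entry(user_id, url):
    entry = Entry.from_url(user_id, url)  # may raise if the url isn't an article
    db.session.add(entry)  # harmless when the entry was already persisted
    db.session.commit()
    return entry
```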
40 changes: 24 additions & 16 deletions feedi/parsers/html.py
@@ -1,43 +1,51 @@
import datetime
import json

import dateparser
from bs4 import BeautifulSoup
from feedi import scraping
from feedi.requests import requests


def fetch(url, full_content=False):
    "Return the entry values for an article at the given url."
def fetch(url):
    """
    Return the entry values for an article at the given url.
    Raises ValueError if the url doesn't seem to point to an article (it doesn't have a title).
    Raises HTTPError if the request is not successful.
    """

    response = requests.get(url)
    response.raise_for_status()

    if not response.ok:
        raise Exception()

    soup = BeautifulSoup(response.content, 'lxml')
    metadata = scraping.all_meta(soup)

    title = metadata.get('og:title', metadata.get('twitter:title'))

    published = scraping.extract_meta(soup, 'og:article:published_time')
    if published:
        display_date = dateparser.parse(published)
    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
        raise ValueError(f"{url} is missing article metadata")

    if 'og:article:published_time' in metadata:
        display_date = dateparser.parse(metadata['og:article:published_time'])
    else:
        display_date = datetime.datetime.utcnow()

    title = scraping.extract_meta(soup, 'og:title', 'twitter:title')
    if not title and soup.title:
        title = soup.title.text

    username = scraping.extract_meta(soup, 'author') or ''
    username = username.split(',')[0]
    username = metadata.get('author', '').split(',')[0]

    entry = {
        'remote_id': url,
        'title': title,
        'username': username,
        'display_date': display_date,
        'sort_date': datetime.datetime.utcnow(),
        'content_short': scraping.extract_meta(soup, 'og:description', 'description'),
        'media_url': scraping.extract_meta(soup, 'og:image', 'twitter:image'),
        'content_short': metadata.get('og:description', metadata.get('description')),
        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
        'target_url': url,
        'content_url': url,
        'raw_data': json.dumps(metadata)
    }

    if full_content:
        entry['content_full'] = scraping.extract(html=response.content)['content']

    return entry
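For reference, a hedged sketch of what a caller of the reworked `fetch` sees; the URL is made up, and the exception types follow the docstring above (assuming `feedi.requests` wraps the standard requests library, so `raise_for_status` raises `requests.HTTPError`):

```python
# Illustrative caller of feedi.parsers.html.fetch; the URL is invented.
import requests  # assumption: raise_for_status raises requests.HTTPError

from feedi.parsers import html

try:
    values = html.fetch("https://example.com/some-article")
except ValueError:
    print("url doesn't look like an article (no title, or og:type isn't article)")
except requests.HTTPError as error:
    print(f"request failed: {error}")
else:
    # values maps one-to-one to Entry columns: remote_id, title, username,
    # display_date, sort_date, content_short, media_url, target_url,
    # content_url and raw_data (the scraped meta tags as JSON)
    print(values['title'], values['display_date'])
```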
54 changes: 45 additions & 9 deletions feedi/routes.py
@@ -11,7 +11,7 @@
import feedi.tasks as tasks
from feedi import scraping
from feedi.models import db
from feedi.parsers import html, mastodon, rss
from feedi.parsers import mastodon, rss


@app.route("/users/<username>")
@@ -216,6 +216,43 @@ def entry_favorite(id):
    return '', 204


@app.delete("/entries/<int:id>")
@login_required
def entry_explode(id):
"Mark the given entry as viewed, scan the links in its contents, create entries and render them."
# may be overloading the delete method a bit here

entry = db.get_or_404(models.Entry, id)
if entry.user_id != current_user.id:
flask.abort(404)

entry.fetch_content()
entry.viewed = entry.viewed or datetime.datetime.utcnow()
db.session.commit()

# extract a unique set of links considering both short and full content
urls = []
for content in [entry.content_short, entry.content_full]:
if content:
urls += scraping.extract_links(content)
urls = set(urls)

# create (or load old) entries for those that are valid articles
entries = []
for url in urls:
try:
subentry = models.Entry.from_url(current_user.id, url)
entries.append(subentry)
except Exception:
continue

db.session.add_all(entries)
db.session.commit()

return flask.render_template('entry_list_page.html',
entries=entries)


@app.put("/mastodon/favorites/<int:id>")
@login_required
def mastodon_favorite(id):
@@ -424,17 +461,16 @@ def entry_add():
    Redirects to the content reader for the article at the given URL, creating a new entry for it
    if there isn't already one.
    """

    # TODO sanitize?
    url = flask.request.args['url']
    entry = db.session.scalar(db.select(models.Entry)
                              .filter_by(content_url=url, user_id=current_user.id))

    if not entry:
        values = html.fetch(url, full_content=True)
        entry = models.Entry(user_id=current_user.id, **values)
        db.session.add(entry)
        db.session.commit()
    try:
        entry = models.Entry.from_url(current_user.id, url)
    except Exception:
        return redirect_response(url)

    db.session.add(entry)
    db.session.commit()
    return redirect_response(flask.url_for('entry_view', id=entry.id))


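A hedged sketch of the new endpoint's contract as a test; the `client` and `entry` fixtures are assumed for illustration and are not part of the PR:

```python
# Hypothetical pytest sketch for DELETE /entries/<id> ("explode").
# Assumes a Flask test client logged in as the entry's owner.
def test_entry_explode_renders_subentries(client, entry):
    response = client.delete(f"/entries/{entry.id}")
    assert response.status_code == 200
    # the body is entry_list_page.html rendered with one entry (new or
    # reused) per unique external link in the exploded entry's content
    assert b'feed-entry' in response.data
```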
18 changes: 18 additions & 0 deletions feedi/scraping.py
@@ -73,6 +73,24 @@ def extract_meta(soup, *tags):
            return meta_tag['content']


def all_meta(soup):
    result = {}
    for attr in ['property', 'name', 'itemprop']:
        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
            result[meta_tag[attr]] = meta_tag['content']
    return result


def extract_links(html):
    soup = BeautifulSoup(html, 'lxml')
    # checks tag.text so it skips image links
    # checks startswith http to exclude local links (not sure if it's the best assumption?)
    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text and
                          tag.get('href', '').startswith('http'))
    return [a['href'] for a in links]


# TODO this should be renamed, and maybe other things in this module, using extract too much
def extract(url=None, html=None):
    # The mozilla/readability npm package shows better results at extracting the
    # article content than all the python libraries I've tried... even than the readabilipy
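To see the two new helpers together, a small self-contained example; the HTML snippet is made up for illustration:

```python
# Demo of scraping.all_meta and scraping.extract_links on inline HTML.
from bs4 import BeautifulSoup

from feedi import scraping

doc = """
<html><head>
  <meta property="og:title" content="Hello">
  <meta name="author" content="Jane Doe">
</head><body>
  <a href="https://example.com/a">external link</a>
  <a href="/local">relative link, excluded</a>
</body></html>
"""

soup = BeautifulSoup(doc, 'lxml')
print(scraping.all_meta(soup))      # {'og:title': 'Hello', 'author': 'Jane Doe'}
print(scraping.extract_links(doc))  # ['https://example.com/a']
```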
6 changes: 5 additions & 1 deletion feedi/templates/entry_commands.html
@@ -13,6 +13,10 @@
{% if entry.id and request.path != url_for('entry_view', id=entry.id) %}
<a hx-boost="true" href="{{ url_for('entry_view', id=entry.id) }}" class="dropdown-item"><span class="icon"><i class="fas fa-book-reader"></i></span> View in reader</a>
{% endif %}
<a class="dropdown-item" hx-delete="{{ url_for('entry_explode', id=entry.id )}}" hx-swap="none"
_="on htmx:beforeRequest or htmx:afterRequest toggle .fa-spin on <i/> in me
then on htmx:afterRequest go to url {{ url_for('entry_list') }}">
<span class="icon"><i class="fas fa-bomb fa-lg"></i></span> Explode</a>
<a class="dropdown-item" _="on click writeText('{{ entry.content_url }}') into the navigator's clipboard"><span class="icon"><i class="fas fa-link"></i></span> Copy URL</a>

{% if entry.is_external_link %}
@@ -35,6 +39,6 @@
<a href="{{ url_for('feed_edit', feed_name=entry.feed.name ) }}" class="dropdown-item"><span class="icon"><i class="far fa-edit"></i></span> Edit {{ entry.feed.name }}</a>
<a hx-delete="{{ url_for('feed_delete', feed_name=entry.feed.name ) }}" _="on htmx:afterRequest go to url {{ url_for('entry_list') }}" class="dropdown-item"><span class="icon"><i class="far fa-trash-alt"></i></span> Delete {{ entry.feed.name }}</a>
<hr class="dropdown-divider is-hidden-mobile">
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_entry', id=entry.id ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw entry data</a>
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_feed', feed_name=entry.feed.name ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw feed data</a>
{% endif %}
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_entry', id=entry.id ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw entry data</a>
7 changes: 7 additions & 0 deletions feedi/templates/entry_header.html
@@ -83,6 +83,13 @@
href="{{ entry.comments_url}}" target="_blank"
><i class="fas fa-comment-alt"></i></a>
{% endif %}
<a tabindex="-1" class="level-item icon hover-icon is-white is-rounded" title="Explode"
hx-delete="{{ url_for('entry_explode', id=entry.id )}}"
hx-target="closest .feed-entry"
hx-swap="outerHTML"
_ = "on click set x to the closest .feed-entry then set x.style.opacity to 0.5"
><i class="fas fa-bomb fa-lg"></i></a>

<div class="dropdown is-right"
_="on intersection(intersecting) having margin '0px 0px -50% 0px'
if intersecting remove .is-up else add .is-up -- show dropup up or dropdown depending position relative to middle of screen">
7 changes: 7 additions & 0 deletions feedi/templates/entry_list_page.html
@@ -83,6 +83,13 @@
<i class="fas fa-comment-alt"></i>
</a>
{% endif %}
<a tabindex="-1" class="icon is-white is-rounded level-item" title="Explode"
hx-delete="{{ url_for('entry_explode', id=entry.id )}}"
hx-target="closest .feed-entry"
hx-swap="outerHTML"
_ = "on click set x to the closest .feed-entry then set x.style.opacity to 0.5"
><i class="fas fa-bomb fa-lg"></i></a>

<a class="icon is-white is-rounded level-item"
tabindex="-1"
_="on click toggle .is-active on the next .dropdown then