@classmethod
def from_url(cls, user_id, url):
    """Return the entry of the given user with the given content_url,
    creating a new one by scraping the url if none exists yet.

    A newly created entry is NOT added to the session; callers are
    expected to add and commit it themselves.

    Raises ValueError if the url doesn't seem to point to an article and
    requests.HTTPError if it can't be fetched (see parsers.html.fetch).
    """
    entry = db.session.scalar(db.select(cls)
                              .filter_by(content_url=url, user_id=user_id))

    if not entry:
        # scrape the article metadata and build a fresh (transient) entry
        values = parsers.html.fetch(url)
        entry = cls(user_id=user_id, **values)
    return entry
def fetch(url):
    """
    Return the entry values for an article at the given url.

    Raises ValueError if the url doesn't seem to point to an article
    (it has no title, or its og:type is present and isn't 'article').
    Raises requests.HTTPError if the request is not successful.
    """
    response = requests.get(url)
    # raises HTTPError on 4xx/5xx responses; the previous
    # `if not response.ok: raise Exception()` check was unreachable after
    # this call (and raised a bare Exception), so it has been dropped
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    metadata = scraping.all_meta(soup)

    title = metadata.get('og:title', metadata.get('twitter:title'))
    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
        raise ValueError(f"{url} is missing article metadata")

    if 'og:article:published_time' in metadata:
        display_date = dateparser.parse(metadata['og:article:published_time'])
    else:
        display_date = datetime.datetime.utcnow()

    # the author meta can hold several comma-separated names; keep the first
    username = metadata.get('author', '').split(',')[0]

    # NOTE(review): the diff hunk boundary (@@ -31,13 @@) hides a few dict
    # entries between 'remote_id' and 'username'; 'title' is inferred from
    # the surrounding logic — confirm against feedi/parsers/html.py
    return {
        'remote_id': url,
        'title': title,
        'username': username,
        'display_date': display_date,
        'sort_date': datetime.datetime.utcnow(),
        'content_short': metadata.get('og:description', metadata.get('description')),
        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
        'target_url': url,
        'content_url': url,
        'raw_data': json.dumps(metadata)
    }
# NOTE(review): the route converter was lost in extraction; "<int:id>" is
# inferred from the view's signature — confirm against feedi/routes.py
@app.delete("/entries/<int:id>")
@login_required
def entry_explode(id):
    """Mark the given entry as viewed, scan the links in its contents,
    create (or load previously created) entries for them and render the
    resulting entry list.

    We may be overloading the DELETE method a bit here.
    """
    entry = db.get_or_404(models.Entry, id)
    # hide other users' entries as if they didn't exist
    if entry.user_id != current_user.id:
        flask.abort(404)

    entry.fetch_content()
    entry.viewed = entry.viewed or datetime.datetime.utcnow()
    db.session.commit()

    # extract a unique set of links considering both short and full content
    urls = set()
    for content in (entry.content_short, entry.content_full):
        if content:
            urls.update(scraping.extract_links(content))

    # create (or load old) entries for those that are valid articles;
    # urls that fail to fetch or don't look like articles are skipped,
    # but logged so scraping regressions stay visible
    entries = []
    for url in urls:
        try:
            entries.append(models.Entry.from_url(current_user.id, url))
        except Exception:
            app.logger.debug("skipping url %s", url, exc_info=True)

    db.session.add_all(entries)
    db.session.commit()

    return flask.render_template('entry_list_page.html',
                                 entries=entries)
def all_meta(soup):
    """Return a dict with the content of every <meta> tag in the page,
    keyed by the tag's property/name/itemprop attribute.

    If a key appears more than once (or a tag carries several of those
    attributes), later matches overwrite earlier ones.
    """
    result = {}
    for attr in ['property', 'name', 'itemprop']:
        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
            result[meta_tag[attr]] = meta_tag['content']
    return result


def extract_links(html):
    """Return the href of all external article-like links in the given html.

    Checks tag.text so it skips image-only links, and requires the href to
    start with http to exclude local/relative links (not sure if that's the
    best assumption).
    """
    soup = BeautifulSoup(html, 'lxml')
    # tag.get('href', '') instead of tag['href']: an <a> without an href
    # attribute would otherwise raise KeyError inside the find_all predicate
    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text and
                          tag.get('href', '').startswith('http'))
    return [a['href'] for a in links]
even than the readabilipy diff --git a/feedi/templates/entry_commands.html b/feedi/templates/entry_commands.html index 11519f3..89e4153 100644 --- a/feedi/templates/entry_commands.html +++ b/feedi/templates/entry_commands.html @@ -13,6 +13,10 @@ {% if entry.id and request.path != url_for('entry_view', id=entry.id) %} View in reader {% endif %} + + Explode Copy URL {% if entry.is_external_link %} @@ -35,6 +39,6 @@ Edit {{ entry.feed.name }} Delete {{ entry.feed.name }} - View raw entry data View raw feed data {% endif %} + View raw entry data diff --git a/feedi/templates/entry_header.html b/feedi/templates/entry_header.html index 5c54e7b..4cf3216 100644 --- a/feedi/templates/entry_header.html +++ b/feedi/templates/entry_header.html @@ -83,6 +83,13 @@ href="{{ entry.comments_url}}" target="_blank" > {% endif %} + +