diff --git a/feedi/models.py b/feedi/models.py
index 826477d..9eea908 100644
--- a/feedi/models.py
+++ b/feedi/models.py
@@ -521,6 +521,17 @@ class Entry(db.Model):
     __table_args__ = (sa.UniqueConstraint("feed_id", "remote_id"),
                       sa.Index("entry_sort_ts", sort_date.desc()))
 
+    @classmethod
+    def from_url(cls, user_id, url):
+        "Load an entry for the given article url if it exists, otherwise create a new one."
+        entry = db.session.scalar(db.select(cls)
+                                  .filter_by(content_url=url, user_id=user_id))
+
+        if not entry:
+            values = parsers.html.fetch(url)
+            entry = cls(user_id=user_id, **values)
+        return entry
+
     def __repr__(self):
         return f''
 
diff --git a/feedi/parsers/html.py b/feedi/parsers/html.py
index 36e304a..f4d1cb0 100644
--- a/feedi/parsers/html.py
+++ b/feedi/parsers/html.py
@@ -1,4 +1,5 @@
 import datetime
+import json
 
 import dateparser
 from bs4 import BeautifulSoup
@@ -6,24 +7,33 @@
 from feedi.requests import requests
 
 
-def fetch(url, full_content=False):
-    "Return the entry values for an article at the given url."
+def fetch(url):
+    """
+    Return the entry values for an article at the given url.
+
+    Raises ValueError if the url doesn't seem to point to an article (it doesn't have a title).
+    Raises HTTPError if the request is not successful.
+    """
     response = requests.get(url)
+    response.raise_for_status()
+
     soup = BeautifulSoup(response.content, 'lxml')
+    metadata = scraping.all_meta(soup)
+
+    title = metadata.get('og:title', metadata.get('twitter:title'))
 
-    published = scraping.extract_meta(soup, 'og:article:published_time')
-    if published:
-        display_date = dateparser.parse(published)
+    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
+        raise ValueError(f"{url} is missing article metadata")
+
+    if 'og:article:published_time' in metadata:
+        display_date = dateparser.parse(metadata['og:article:published_time'])
     else:
         display_date = datetime.datetime.utcnow()
 
-    title = scraping.extract_meta(soup, 'og:title', 'twitter:title')
-    if not title and soup.title:
-        title = soup.title.text
-
-    username = scraping.extract_meta(soup, 'author') or ''
-    username = username.split(',')[0]
+    username = metadata.get('author', '').split(',')[0]
 
     entry = {
         'remote_id': url,
@@ -31,13 +41,11 @@ def fetch(url, full_content=False):
         'username': username,
         'display_date': display_date,
         'sort_date': datetime.datetime.utcnow(),
-        'content_short': scraping.extract_meta(soup, 'og:description', 'description'),
-        'media_url': scraping.extract_meta(soup, 'og:image', 'twitter:image'),
+        'content_short': metadata.get('og:description', metadata.get('description')),
+        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
         'target_url': url,
         'content_url': url,
+        'raw_data': json.dumps(metadata)
     }
 
-    if full_content:
-        entry['content_full'] = scraping.extract(html=response.content)['content']
-
     return entry
diff --git a/feedi/parsers/rss.py b/feedi/parsers/rss.py
index 929a69b..35a7ca8 100644
--- a/feedi/parsers/rss.py
+++ b/feedi/parsers/rss.py
@@ -301,14 +301,14 @@ def discover_feed(url):
     for type in link_types:
         link = soup.find('link', type=type, href=True)
         if link:
-            feed_url = make_absolute(url, link['href'])
+            feed_url = scraping.make_absolute(url, link['href'])
             return feed_url, title
 
     # if none found in the html, try with common urls, provided that they exist
    # and are xml content
     common_paths = ['/feed', '/rss', '/feed.xml', '/rss.xml']
     for path in common_paths:
-        rss_url = make_absolute(url, path)
+        rss_url = scraping.make_absolute(url, path)
         res = requests.get(rss_url)
         mime = res.headers.get('Content-Type', '').split(';')[0]
         if res.ok and mime.endswith('xml'):
@@ -317,14 +317,6 @@
     return None, title
 
 
-def make_absolute(url, path):
-    "If `path` is a relative url, join it with the given absolute url."
-    if not urllib.parse.urlparse(path).netloc:
-
-        path = urllib.parse.urljoin(url, path)
-    return path
-
-
 def pretty_print(url):
     feed = feedparser.parse(url)
     pp = pprint.PrettyPrinter(depth=10)
diff --git a/feedi/routes.py b/feedi/routes.py
index db40e67..f6bba6a 100644
--- a/feedi/routes.py
+++ b/feedi/routes.py
@@ -11,7 +11,7 @@
 import feedi.tasks as tasks
 from feedi import scraping
 from feedi.models import db
-from feedi.parsers import html, mastodon, rss
+from feedi.parsers import mastodon, rss
 
 
 @app.route("/users/")
@@ -424,17 +424,16 @@
 def entry_add():
     """
     Redirects to the content reader for the article at the given URL,
    creating a new entry for it if there isn't already one.
     """
-    # TODO sanitize?
     url = flask.request.args['url']
 
-    entry = db.session.scalar(db.select(models.Entry)
-                              .filter_by(content_url=url, user_id=current_user.id))
-    if not entry:
-        values = html.fetch(url, full_content=True)
-        entry = models.Entry(user_id=current_user.id, **values)
-        db.session.add(entry)
-        db.session.commit()
+    try:
+        entry = models.Entry.from_url(current_user.id, url)
+    except Exception:
+        return redirect_response(url)
+
+    db.session.add(entry)
+    db.session.commit()
 
     return redirect_response(flask.url_for('entry_view', id=entry.id))
diff --git a/feedi/scraping.py b/feedi/scraping.py
index 37aa102..12ae96a 100644
--- a/feedi/scraping.py
+++ b/feedi/scraping.py
@@ -73,6 +73,29 @@ def extract_meta(soup, *tags):
                 return meta_tag['content']
 
 
+def all_meta(soup):
+    result = {}
+    for attr in ['property', 'name', 'itemprop']:
+        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
+            result[meta_tag[attr]] = meta_tag['content']
+    return result
+
+
+def extract_links(url, html):
+    soup = BeautifulSoup(html, 'lxml')
+    # checks tag.text so it skips image links
+    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text)
+    return [(make_absolute(url, a['href']), a.text) for a in links]
+
+
+def make_absolute(url, path):
+    "If `path` is a relative url, join it with the given absolute url."
+    if not urllib.parse.urlparse(path).netloc:
+        path = urllib.parse.urljoin(url, path)
+    return path
+
+
+# TODO this should be renamed, and maybe other things in this module; we use "extract" too much
 def extract(url=None, html=None):
     # The mozilla/readability npm package shows better results at extracting the
     # article content than all the python libraries I've tried... even than the readabilipy
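
To illustrate the behaviour of the helpers moved into and added to feedi/scraping.py, a minimal sketch; the sample HTML and urls below are made up, and the expected outputs follow directly from the code in the diff:

    from bs4 import BeautifulSoup

    from feedi import scraping

    html = """
    <head>
      <meta property="og:title" content="An article">
      <meta property="og:type" content="article">
      <meta name="author" content="Jane Doe, Staff Writer">
    </head>
    """
    soup = BeautifulSoup(html, 'lxml')

    # all_meta flattens every <meta property=... / name=... / itemprop=...> tag
    # that has a content attribute into one dict keyed by that attribute's value.
    scraping.all_meta(soup)
    # -> {'og:title': 'An article', 'og:type': 'article', 'author': 'Jane Doe, Staff Writer'}

    # make_absolute joins a relative path with the page url and leaves absolute urls alone.
    scraping.make_absolute('https://example.com/blog/post', '/feed.xml')
    # -> 'https://example.com/feed.xml'
    scraping.make_absolute('https://example.com/blog/post', 'https://other.org/rss')
    # -> 'https://other.org/rss'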
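And a rough sketch of the contract the reworked parsers.html.fetch and the new Entry.from_url expose, as the entry_add route uses them; the url and user id are placeholders and the commented field list is approximate:

    from feedi import models
    from feedi.parsers import html

    # fetch raises requests.HTTPError when the request fails, and ValueError when
    # the page doesn't look like an article (no og:title/twitter:title, or an
    # og:type other than 'article'). Otherwise it returns a dict of Entry column
    # values, now including the whole metadata dict serialized into 'raw_data'.
    values = html.fetch('https://example.com/some-article')  # placeholder url

    # Entry.from_url wraps this: it returns the user's existing entry for the url
    # if there is one, otherwise a new, not-yet-committed Entry built from the
    # fetched values; committing stays in the caller, as entry_add does.
    entry = models.Entry.from_url(1, 'https://example.com/some-article')  # 1 = placeholder user id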