add scraping and standalone entry helpers (#90)
facundoolano authored Jan 8, 2024
1 parent 105c002 commit e7982c4
Showing 5 changed files with 68 additions and 35 deletions.
11 changes: 11 additions & 0 deletions feedi/models.py
@@ -521,6 +521,17 @@ class Entry(db.Model):
    __table_args__ = (sa.UniqueConstraint("feed_id", "remote_id"),
                      sa.Index("entry_sort_ts", sort_date.desc()))

    @classmethod
    def from_url(cls, user_id, url):
        "Load an entry for the given article url if it exists, otherwise create a new one."
        entry = db.session.scalar(db.select(cls)
                                  .filter_by(content_url=url, user_id=user_id))

        if not entry:
            values = parsers.html.fetch(url)
            entry = cls(user_id=user_id, **values)
        return entry

    def __repr__(self):
        return f'<Entry {self.feed_id}/{self.remote_id}>'

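As a usage sketch (not part of the commit): Entry.from_url returns the existing entry for a content URL or builds an unpersisted one from scraped metadata, so the caller is responsible for committing it and for handling the ValueError/HTTPError that parsers.html.fetch can raise. The user id and URL below are hypothetical and assume an active application context.

from feedi import models
from feedi.models import db

# hypothetical user id and url, for illustration only
entry = models.Entry.from_url(1, "https://example.com/blog/post")
db.session.add(entry)
db.session.commit()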
40 changes: 24 additions & 16 deletions feedi/parsers/html.py
@@ -1,43 +1,51 @@
import datetime
import json

import dateparser
from bs4 import BeautifulSoup
from feedi import scraping
from feedi.requests import requests


def fetch(url, full_content=False):
    "Return the entry values for an article at the given url."
def fetch(url):
    """
    Return the entry values for an article at the given url.
    Raises ValueError if the url doesn't seem to point to an article (it doesn't have a title).
    Raises HTTPError if the request is not successfull.
    """

    response = requests.get(url)
    response.raise_for_status()

    if not response.ok:
        raise Exception()

    soup = BeautifulSoup(response.content, 'lxml')
    metadata = scraping.all_meta(soup)

    title = metadata.get('og:title', metadata.get('twitter:title'))

    published = scraping.extract_meta(soup, 'og:article:published_time')
    if published:
        display_date = dateparser.parse(published)
    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
        raise ValueError(f"{url} is missing article metadata")

    if 'og:article:published_time' in metadata:
        display_date = dateparser.parse(metadata['og:article:published_time'])
    else:
        display_date = datetime.datetime.utcnow()

    title = scraping.extract_meta(soup, 'og:title', 'twitter:title')
    if not title and soup.title:
        title = soup.title.text

    username = scraping.extract_meta(soup, 'author') or ''
    username = username.split(',')[0]
    username = metadata.get('author', '').split(',')[0]

    entry = {
        'remote_id': url,
        'title': title,
        'username': username,
        'display_date': display_date,
        'sort_date': datetime.datetime.utcnow(),
        'content_short': scraping.extract_meta(soup, 'og:description', 'description'),
        'media_url': scraping.extract_meta(soup, 'og:image', 'twitter:image'),
        'content_short': metadata.get('og:description', metadata.get('description')),
        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
        'target_url': url,
        'content_url': url,
        'raw_data': json.dumps(metadata)
    }

    if full_content:
        entry['content_full'] = scraping.extract(html=response.content)['content']

    return entry
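Taken on its own, the reworked fetch could be exercised roughly like this (a sketch only: the URL is a placeholder, and it assumes the wrapped session raises the standard requests HTTPError from raise_for_status):

import requests

from feedi.parsers import html

try:
    values = html.fetch("https://example.com/some-article")  # placeholder url
except ValueError:
    values = None  # page didn't expose article metadata (no title, or og:type isn't article)
except requests.exceptions.HTTPError:
    values = None  # non-2xx response surfaced by raise_for_status()

if values:
    print(values['title'], values['username'], values['media_url'])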
12 changes: 2 additions & 10 deletions feedi/parsers/rss.py
@@ -301,14 +301,14 @@ def discover_feed(url):
    for type in link_types:
        link = soup.find('link', type=type, href=True)
        if link:
            feed_url = make_absolute(url, link['href'])
            feed_url = scraping.make_absolute(url, link['href'])
            return feed_url, title

    # if none found in the html, try with common urls, provided that they exist
    # and are xml content
    common_paths = ['/feed', '/rss', '/feed.xml', '/rss.xml']
    for path in common_paths:
        rss_url = make_absolute(url, path)
        rss_url = scraping.make_absolute(url, path)
        res = requests.get(rss_url)
        mime = res.headers.get('Content-Type', '').split(';')[0]
        if res.ok and mime.endswith('xml'):
@@ -317,14 +317,6 @@ def discover_feed(url):
    return None, title


def make_absolute(url, path):
    "If `path` is a relative url, join it with the given absolute url."
    if not urllib.parse.urlparse(path).netloc:

        path = urllib.parse.urljoin(url, path)
    return path


def pretty_print(url):
    feed = feedparser.parse(url)
    pp = pprint.PrettyPrinter(depth=10)
17 changes: 8 additions & 9 deletions feedi/routes.py
@@ -11,7 +11,7 @@
import feedi.tasks as tasks
from feedi import scraping
from feedi.models import db
from feedi.parsers import html, mastodon, rss
from feedi.parsers import mastodon, rss


@app.route("/users/<username>")
@@ -424,17 +424,16 @@ def entry_add():
    Redirects to the content reader for the article at the given URL, creating a new entry for it
    if there isn't already one.
    """

    # TODO sanitize?
    url = flask.request.args['url']
    entry = db.session.scalar(db.select(models.Entry)
                              .filter_by(content_url=url, user_id=current_user.id))

    if not entry:
        values = html.fetch(url, full_content=True)
        entry = models.Entry(user_id=current_user.id, **values)
        db.session.add(entry)
        db.session.commit()
    try:
        entry = models.Entry.from_url(current_user.id, url)
    except Exception:
        return redirect_response(url)

    db.session.add(entry)
    db.session.commit()
    return redirect_response(flask.url_for('entry_view', id=entry.id))


23 changes: 23 additions & 0 deletions feedi/scraping.py
@@ -73,6 +73,29 @@ def extract_meta(soup, *tags):
                return meta_tag['content']


def all_meta(soup):
    result = {}
    for attr in ['property', 'name', 'itemprop']:
        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
            result[meta_tag[attr]] = meta_tag['content']
    return result


def extract_links(url, html):
    soup = BeautifulSoup(html, 'lxml')
    # checks tag.text so it skips image links
    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text)
    return [(make_absolute(url, a['href']), a.text) for a in links]


def make_absolute(url, path):
    "If `path` is a relative url, join it with the given absolute url."
    if not urllib.parse.urlparse(path).netloc:
        path = urllib.parse.urljoin(url, path)
    return path


# TODO this should be renamed, and maybe other things in this modules, using extract too much
def extract(url=None, html=None):
    # The mozilla/readability npm package shows better results at extracting the
    # article content than all the python libraries I've tried... even than the readabilipy
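Finally, a small self-contained sketch of the new scraping helpers on an invented HTML snippet (not from the commit; the expected results in comments are approximate):

from bs4 import BeautifulSoup

from feedi import scraping

page = """
<html><head>
  <meta property="og:title" content="Example title">
  <meta name="author" content="Jane Doe">
</head>
<body><a href="/about">About</a></body></html>
"""
soup = BeautifulSoup(page, 'lxml')

scraping.all_meta(soup)
# -> {'og:title': 'Example title', 'author': 'Jane Doe'}

scraping.make_absolute('https://example.com/post', '/about')
# -> 'https://example.com/about'

scraping.extract_links('https://example.com/post', page)
# -> [('https://example.com/about', 'About')]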
