entry explode #89

Closed · wants to merge 17 commits
11 changes: 11 additions & 0 deletions feedi/models.py
@@ -521,6 +521,17 @@ class Entry(db.Model):
    __table_args__ = (sa.UniqueConstraint("feed_id", "remote_id"),
                      sa.Index("entry_sort_ts", sort_date.desc()))

    @classmethod
    def from_url(cls, user_id, url):
        "Return the user's existing entry for the given url, or create a new one from the article's scraped metadata."
        entry = db.session.scalar(db.select(cls)
                                  .filter_by(content_url=url, user_id=user_id))

        if not entry:
            values = parsers.html.fetch(url)
            entry = cls(user_id=user_id, **values)
        return entry

    def __repr__(self):
        return f'<Entry {self.feed_id}/{self.remote_id}>'

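For context, a minimal sketch of how this get-or-create helper can be used; the `get_or_create_entry` wrapper below is illustrative only (not part of the PR) and assumes a Flask application context:

```python
# Hypothetical usage of Entry.from_url; assumes an app context and a valid
# user id. from_url either loads the user's existing entry for the url or
# builds a new, unsaved one, so the caller decides when to persist it.
from feedi.models import Entry, db

def get_or_create_entry(user_id, url):
    entry = Entry.from_url(user_id, url)  # may raise if the url isn't an article
    db.session.add(entry)  # harmless when the entry was already persisted
    db.session.commit()
    return entry
```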
40 changes: 24 additions & 16 deletions feedi/parsers/html.py
@@ -1,43 +1,51 @@
import datetime
import json

import dateparser
from bs4 import BeautifulSoup
from feedi import scraping
from feedi.requests import requests


def fetch(url, full_content=False):
    "Return the entry values for an article at the given url."
def fetch(url):
    """
    Return the entry values for an article at the given url.
    Raises ValueError if the url doesn't seem to point to an article (it doesn't have a title).
    Raises HTTPError if the request is not successful.
    """

    response = requests.get(url)
    response.raise_for_status()

    if not response.ok:
        raise Exception()

    soup = BeautifulSoup(response.content, 'lxml')
    metadata = scraping.all_meta(soup)

    title = metadata.get('og:title', metadata.get('twitter:title'))

    published = scraping.extract_meta(soup, 'og:article:published_time')
    if published:
        display_date = dateparser.parse(published)
    if not title or (metadata.get('og:type') and metadata['og:type'] != 'article'):
        raise ValueError(f"{url} is missing article metadata")

    if 'og:article:published_time' in metadata:
        display_date = dateparser.parse(metadata['og:article:published_time'])
    else:
        display_date = datetime.datetime.utcnow()

    title = scraping.extract_meta(soup, 'og:title', 'twitter:title')
    if not title and soup.title:
        title = soup.title.text

    username = scraping.extract_meta(soup, 'author') or ''
    username = username.split(',')[0]
    username = metadata.get('author', '').split(',')[0]

    entry = {
        'remote_id': url,
        'title': title,
        'username': username,
        'display_date': display_date,
        'sort_date': datetime.datetime.utcnow(),
        'content_short': scraping.extract_meta(soup, 'og:description', 'description'),
        'media_url': scraping.extract_meta(soup, 'og:image', 'twitter:image'),
        'content_short': metadata.get('og:description', metadata.get('description')),
        'media_url': metadata.get('og:image', metadata.get('twitter:image')),
        'target_url': url,
        'content_url': url,
        'raw_data': json.dumps(metadata)
    }

    if full_content:
        entry['content_full'] = scraping.extract(html=response.content)['content']

    return entry
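For reference, a hedged sketch of what a caller of the reworked `fetch` sees; the URL is made up, and the exception types follow the docstring above (assuming `feedi.requests` wraps the standard requests library, so `raise_for_status` raises `requests.HTTPError`):

```python
# Illustrative caller of feedi.parsers.html.fetch; the URL is invented.
import requests  # assumption: raise_for_status raises requests.HTTPError

from feedi.parsers import html

try:
    values = html.fetch("https://example.com/some-article")
except ValueError:
    print("url doesn't look like an article (no title, or og:type isn't article)")
except requests.HTTPError as error:
    print(f"request failed: {error}")
else:
    # values maps one-to-one to Entry columns: remote_id, title, username,
    # display_date, sort_date, content_short, media_url, target_url,
    # content_url and raw_data (the scraped meta tags as JSON)
    print(values['title'], values['display_date'])
```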
54 changes: 45 additions & 9 deletions feedi/routes.py
@@ -11,7 +11,7 @@
import feedi.tasks as tasks
from feedi import scraping
from feedi.models import db
from feedi.parsers import html, mastodon, rss
from feedi.parsers import mastodon, rss


@app.route("/users/<username>")
@@ -216,6 +216,43 @@ def entry_favorite(id):
    return '', 204


@app.delete("/entries/<int:id>")
@login_required
def entry_explode(id):
"Mark the given entry as viewed, scan the links in its contents, create entries and render them."
# may be overloading the delete method a bit here

entry = db.get_or_404(models.Entry, id)
if entry.user_id != current_user.id:
flask.abort(404)

entry.fetch_content()
entry.viewed = entry.viewed or datetime.datetime.utcnow()
db.session.commit()

# extract a unique set of links considering both short and full content
urls = []
for content in [entry.content_short, entry.content_full]:
if content:
urls += scraping.extract_links(content)
urls = set(urls)

# create (or load old) entries for those that are valid articles
entries = []
for url in urls:
try:
subentry = models.Entry.from_url(current_user.id, url)
entries.append(subentry)
except Exception:
continue

db.session.add_all(entries)
db.session.commit()

return flask.render_template('entry_list_page.html',
entries=entries)


@app.put("/mastodon/favorites/<int:id>")
@login_required
def mastodon_favorite(id):
@@ -424,17 +461,16 @@ def entry_add():
    Redirects to the content reader for the article at the given URL, creating a new entry for it
    if there isn't already one.
    """

    # TODO sanitize?
    url = flask.request.args['url']
    entry = db.session.scalar(db.select(models.Entry)
                              .filter_by(content_url=url, user_id=current_user.id))

    if not entry:
        values = html.fetch(url, full_content=True)
        entry = models.Entry(user_id=current_user.id, **values)
        db.session.add(entry)
        db.session.commit()
    try:
        entry = models.Entry.from_url(current_user.id, url)
    except Exception:
        return redirect_response(url)

    db.session.add(entry)
    db.session.commit()
    return redirect_response(flask.url_for('entry_view', id=entry.id))


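A hedged sketch of the new endpoint's contract as a test; the `client` and `entry` fixtures are assumed for illustration and are not part of the PR:

```python
# Hypothetical pytest sketch for DELETE /entries/<id> ("explode").
# Assumes a Flask test client logged in as the entry's owner.
def test_entry_explode_renders_subentries(client, entry):
    response = client.delete(f"/entries/{entry.id}")
    assert response.status_code == 200
    # the body is entry_list_page.html rendered with one entry (new or
    # reused) per unique external link in the exploded entry's content
    assert b'feed-entry' in response.data
```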
18 changes: 18 additions & 0 deletions feedi/scraping.py
@@ -73,6 +73,24 @@ def extract_meta(soup, *tags):
            return meta_tag['content']


def all_meta(soup):
    result = {}
    for attr in ['property', 'name', 'itemprop']:
        for meta_tag in soup.find_all("meta", {attr: True}, content=True):
            result[meta_tag[attr]] = meta_tag['content']
    return result


def extract_links(html):
    soup = BeautifulSoup(html, 'lxml')
    # checks tag.text so it skips image links
    # checks startswith http to exclude local links (not sure if it's the best assumption?)
    links = soup.find_all(lambda tag: tag.name == 'a' and tag.text and
                          tag.get('href', '').startswith('http'))
    return [a['href'] for a in links]


# TODO this should be renamed, and maybe other things in this module, using extract too much
def extract(url=None, html=None):
    # The mozilla/readability npm package shows better results at extracting the
    # article content than all the python libraries I've tried... even than the readabilipy
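To see the two new helpers together, a small self-contained example; the HTML snippet is made up for illustration:

```python
# Demo of scraping.all_meta and scraping.extract_links on inline HTML.
from bs4 import BeautifulSoup

from feedi import scraping

doc = """
<html><head>
  <meta property="og:title" content="Hello">
  <meta name="author" content="Jane Doe">
</head><body>
  <a href="https://example.com/a">external link</a>
  <a href="/local">relative link, excluded</a>
</body></html>
"""

soup = BeautifulSoup(doc, 'lxml')
print(scraping.all_meta(soup))      # {'og:title': 'Hello', 'author': 'Jane Doe'}
print(scraping.extract_links(doc))  # ['https://example.com/a']
```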
6 changes: 5 additions & 1 deletion feedi/templates/entry_commands.html
@@ -13,6 +13,10 @@
{% if entry.id and request.path != url_for('entry_view', id=entry.id) %}
<a hx-boost="true" href="{{ url_for('entry_view', id=entry.id) }}" class="dropdown-item"><span class="icon"><i class="fas fa-book-reader"></i></span> View in reader</a>
{% endif %}
<a class="dropdown-item" hx-delete="{{ url_for('entry_explode', id=entry.id )}}" hx-swap="none"
_="on htmx:beforeRequest or htmx:afterRequest toggle .fa-spin on <i/> in me
then on htmx:afterRequest go to url {{ url_for('entry_list') }}">
<span class="icon"><i class="fas fa-bomb fa-lg"></i></span> Explode</a>
<a class="dropdown-item" _="on click writeText('{{ entry.content_url }}') into the navigator's clipboard"><span class="icon"><i class="fas fa-link"></i></span> Copy URL</a>

{% if entry.is_external_link %}
@@ -35,6 +39,6 @@
<a href="{{ url_for('feed_edit', feed_name=entry.feed.name ) }}" class="dropdown-item"><span class="icon"><i class="far fa-edit"></i></span> Edit {{ entry.feed.name }}</a>
<a hx-delete="{{ url_for('feed_delete', feed_name=entry.feed.name ) }}" _="on htmx:afterRequest go to url {{ url_for('entry_list') }}" class="dropdown-item"><span class="icon"><i class="far fa-trash-alt"></i></span> Delete {{ entry.feed.name }}</a>
<hr class="dropdown-divider is-hidden-mobile">
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_entry', id=entry.id ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw entry data</a>
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_feed', feed_name=entry.feed.name ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw feed data</a>
{% endif %}
<a class="dropdown-item is-hidden-mobile" href="{{ url_for('raw_entry', id=entry.id ) }}" target="_blank"><span class="icon"><i class="fas fa-file-code"></i></span> View raw entry data</a>
7 changes: 7 additions & 0 deletions feedi/templates/entry_header.html
@@ -83,6 +83,13 @@
href="{{ entry.comments_url}}" target="_blank"
><i class="fas fa-comment-alt"></i></a>
{% endif %}
<a tabindex="-1" class="level-item icon hover-icon is-white is-rounded" title="Explode"
hx-delete="{{ url_for('entry_explode', id=entry.id )}}"
hx-target="closest .feed-entry"
hx-swap="outerHTML"
_ = "on click set x to the closest .feed-entry then set x.style.opacity to 0.5"
><i class="fas fa-bomb fa-lg"></i></a>

<div class="dropdown is-right"
_="on intersection(intersecting) having margin '0px 0px -50% 0px'
if intersecting remove .is-up else add .is-up -- show dropup up or dropdown depending position relative to middle of screen">
7 changes: 7 additions & 0 deletions feedi/templates/entry_list_page.html
@@ -83,6 +83,13 @@
<i class="fas fa-comment-alt"></i>
</a>
{% endif %}
<a tabindex="-1" class="icon is-white is-rounded level-item" title="Explode"
hx-delete="{{ url_for('entry_explode', id=entry.id )}}"
hx-target="closest .feed-entry"
hx-swap="outerHTML"
_ = "on click set x to the closest .feed-entry then set x.style.opacity to 0.5"
><i class="fas fa-bomb fa-lg"></i></a>

<a class="icon is-white is-rounded level-item"
tabindex="-1"
_="on click toggle .is-active on the next .dropdown then