unfurl_links.py
import re

import requests
from bs4 import BeautifulSoup
from nltk.metrics import edit_distance
from dotenv import load_dotenv

from data import tweet_id_to_text
from utils import truncate_text, call_gpt

load_dotenv()


def extract_links_from_text(text):
    # Use a regular expression to find all URLs in the text
    text = str(text)
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    return urls
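
# Illustrative example (not in the original module; the URL is made up):
#   extract_links_from_text("see https://example.com/page for details")
#   returns ["https://example.com/page"].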


def is_twitter_url(url):
    return "twitter.com" in url and "/status/" in url


def extract_status_id(url: str) -> str:
    # Find the Tweet ID in a tweet URL
    start_index = url.find("/status/")
    start_index += len("/status/")
    end_index = url.find("/", start_index + 1)
    return url[start_index:end_index] if end_index != -1 else url[start_index:]
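
# Illustrative example (hypothetical URL, not in the original module): for
#   "https://twitter.com/someuser/status/1234567890123456789"
# extract_status_id returns "1234567890123456789".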


def extract_text_from_simple_link(url, timeout):
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes
    except requests.RequestException:
        return "ERROR: The link text could not be retrieved."
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the visible text from the HTML
    text_content = soup.get_text()
    return text_content


def clean_text(text):
    # Remove unnecessary whitespace and newline characters
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    return cleaned_text


def count_words(input_string):
    words = input_string.split()
    return len(words)


def unfurl_links(text):
    """
    Given text which may contain links, fetch the content of those links
    and incorporate the content into the body of the original text.
    """
    # Use regex to extract all links from the text
    links = extract_links_from_text(text)
    for url in links:
        media_flag = False
        # Sometimes a link redirects to another link, so follow the redirect
        redirected_url = requests.get(url, timeout=5).url
        # Links to other Tweets require authentication via the Twitter API
        if is_twitter_url(redirected_url):  # TODO: Check this works
            tweet_id = extract_status_id(redirected_url)
            tweet_text = tweet_id_to_text(tweet_id)
            # Links to photos and videos will return the same text as the original tweet
            if edit_distance(tweet_text, text) < 10:
                media_flag = True
            else:
                content = tweet_text
        # Links to all other sources are fetched with the requests library
        else:
            link_text = extract_text_from_simple_link(redirected_url, timeout=10)
            content = call_gpt(
                "Please provide a one-paragraph summary of the following content:\n\n" + link_text,
                "gpt-3.5-turbo-1106",
            )
            # # Alternatively, shorten the page text to its first 1000 words
            # link_text = clean_text(link_text)
            # link_text = truncate_text(link_text, 1000)
            # content = link_text
        # Replace each link with its content
        if not media_flag:
            replacement_text = f"\n\nLink to: {redirected_url}\n\nContent from Link: {content}\n\n"
        else:
            replacement_text = "\n\nWARNING: This tweet contains an image or video that cannot be displayed.\n\n"
        text = text.replace(url, replacement_text)
    return text
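

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The text and URL
    # below are made up, and a real run needs network access plus whatever
    # credentials tweet_id_to_text and call_gpt expect (loaded via load_dotenv),
    # so treat this as an illustration rather than a test.
    sample = "Interesting article here: https://example.com/some-article"
    print(unfurl_links(sample))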