-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathscraping.py
39 lines (32 loc) · 1.22 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import requests
# Main functions used for scraping post text from Reddit.
def scrape(reddit_url, timeout=10):
    """Fetch a single Reddit post and return its title and body text.

    Requests ``reddit_url`` with ``/.json`` appended and reads the first
    child of the first listing in the decoded JSON response.

    Args:
        reddit_url: URL of the post. Trailing slashes are tolerated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``'title'`` and ``'desc'``; ``'desc'`` holds
        the post's ``selftext`` field.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, an HTTP error status, or a body that is not JSON.
        KeyError, IndexError, TypeError: If the decoded JSON does not have
            the layout described above.
    """
    # Send a browser-like User-agent header with the request.
    headers = {'User-agent': 'Mozilla/5.0'}
    # Drop trailing slashes so the request never targets ".../post//.json".
    url = reddit_url.rstrip("/") + "/.json"
    # Without an explicit timeout, requests may wait forever on a stalled
    # connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface 4xx/5xx responses here instead of indexing an error payload.
    response.raise_for_status()

    post = response.json()[0]['data']['children'][0]['data']
    # Named "result" rather than "map" to avoid shadowing the builtin.
    result = {'title': post['title'], 'desc': post['selftext']}
    print("Scraped! Currently saving ...")
    return result
def scrape_llm(reddit_url, timeout=10):
    """Fetch a Reddit listing and return the title and body of every entry.

    Requests ``reddit_url`` with ``/.json`` appended and reads the
    ``['data']['children']`` list of the decoded JSON response.

    Args:
        reddit_url: URL of the listing. Trailing slashes are tolerated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``[title, description]`` pairs in listing order, each
        string stripped of leading and trailing whitespace. The description
        is the entry's ``selftext`` field.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, an HTTP error status, or a body that is not JSON.
        KeyError, TypeError: If the decoded JSON does not have the layout
            described above.
    """
    # Send a browser-like User-agent header with the request.
    headers = {'User-agent': 'Mozilla/5.0'}
    # Drop trailing slashes so the request never targets ".../sub//.json".
    url = reddit_url.rstrip("/") + "/.json"
    # Without an explicit timeout, requests may wait forever on a stalled
    # connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface 4xx/5xx responses here instead of indexing an error payload.
    response.raise_for_status()

    children = response.json()['data']['children']
    # Walk the entries that are actually present rather than indexing up to
    # the separate 'dist' counter, which fails if that field is null or
    # disagrees with the length of the list.
    return [
        [child['data']['title'].strip(), child['data']['selftext'].strip()]
        for child in children
    ]
def save_map_to_txt(map, file_path):
    """Write a scraped post to a UTF-8 text file and print a confirmation.

    Args:
        map: Mapping that provides the entries ``'title'`` and ``'desc'``.
        file_path: Destination path; the file is created or overwritten.

    The file receives a ``Title: ...`` line followed by a
    ``Description: ...`` line. Once the file has been closed, a success
    message is printed to standard output.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        # Title first, description second, each terminated by a newline.
        out.write(f"Title: {map['title']}\n")
        out.write(f"Description: {map['desc']}\n")

    print("SCRAPING DONE! SUCCESSFULLY SAVED")