songexploder_transcript_scrape.py
"""
This is another "run once" script.
Given the lack of research on set lists,
see if there is complementary data to be had.
In this case,
1. scrape `songexploder.net/` (with `BeautifulSoup`) to ...
2. get `.PDF` transcripts of artist interviews, and then ...
3. parse those PDFs (with `PyPDF2`) in order to ...
4 identify relevant cases (matching a search term) and return those values.
The module also supports storing the relevant URLs locally,
so the process can pick up directly from there.
Note:
- replace the `USER_AGENT` value with your equivalent. See `utils.py`.
- We have written to songexploder, raising this use case and offering to provide structured data of this kind to their site.
"""
__author__ = ["Mark Gotham", "Shujin Gan"]
import io
import re
from typing import Optional
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

# Constants
from utils import HEADERS, PARSER, THIS_DIR

TARGET_URL = "https://songexploder.net/episodes"
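

# The constants imported above come from this project's `utils.py` (not shown
# here). As a rough sketch of what that module is assumed to provide, with
# illustrative placeholder values rather than the project's actual ones:
#
#     from pathlib import Path
#     USER_AGENT = "<your user agent string>"  # supply your own; see the note above
#     HEADERS = {"User-Agent": USER_AGENT}
#     PARSER = "html.parser"  # any parser name BeautifulSoup accepts
#     THIS_DIR = Path(__file__).parent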


def get_podcast_pages(
        target_url: str = TARGET_URL,
        headers: Optional[dict] = None
) -> list:
    """
    Extract podcast pages from the target URL.
    Args:
    - target_url (str): The target URL to extract pages from.
    - headers (dict): The headers to include in the HTTP request.
    Returns:
    - list: A list of page URLs.
    """
    if headers is None:
        headers = HEADERS
    req = Request(url=target_url, headers=headers)
    resp = urlopen(req)
    soup = BeautifulSoup(resp, PARSER, from_encoding=resp.info().get_param("charset"))
    # Keep only links sharing the site prefix,
    # i.e., the first 25 characters: "https://songexploder.net/".
    site_prefix = target_url[:25]
    podcast_pages = [
        link["href"]
        for link in soup.find_all("a", href=True)
        if link["href"].startswith(site_prefix)
    ]
    return podcast_pages
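
# Example (assuming network access and valid headers; the output shown is
# illustrative, not an actual episode URL):
#     pages = get_podcast_pages()
#     # e.g. ["https://songexploder.net/some-episode", ...]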


def get_transcript_urls(
        podcast_pages: list,
        headers: Optional[dict] = None
) -> list:
    """
    Extract transcript URLs from the podcast pages.
    Args:
    - podcast_pages (list): A list of podcast page URLs.
    - headers (dict): The headers to include in the HTTP request.
    Returns:
    - list: A list of transcript URLs.
    """
    if headers is None:
        headers = HEADERS
    transcript_urls = []
    for page in podcast_pages:
        try:
            req = Request(url=page, headers=headers)
            resp = urlopen(req)
            html = resp.read().decode("utf-8")
            # Episode pages link to the transcript PDF via a "click here"
            # anchor; group 1 captures that anchor's href directly.
            pattern = r'<a\s+href="([^"]+)"[^>]*>click here\.?</a>'
            match = re.search(pattern, html, re.IGNORECASE)
            if match:
                href_value = match.group(1)
                print(page, "... href found, adding.")
                transcript_urls.append(href_value)
            else:
                print(page, "... none found on this page.")
        except Exception as e:
            print(f"Error occurred: {e}")
    # De-duplicate while preserving order.
    return list(dict.fromkeys(transcript_urls))
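
# Note: `dict.fromkeys` keeps first occurrences in insertion order, e.g.
#     list(dict.fromkeys(["a", "b", "a"]))  # -> ["a", "b"]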


def extract_transcript_text(
        transcript_url: str,
        headers: Optional[dict] = None
) -> str:
    """
    Extract text from a transcript URL.
    Args:
    - transcript_url (str): The URL of the transcript to extract text from.
    - headers (dict): The headers to include in the HTTP request.
    Returns:
    - str: The extracted text.
    """
    if headers is None:
        headers = HEADERS
    try:
        req = Request(url=transcript_url, headers=headers)
        remote_file = urlopen(req).read()
        memory_file = io.BytesIO(remote_file)
        pdf_file = PdfReader(memory_file)
        text = ""
        for page in pdf_file.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error occurred: {e}")
        return ""


def extract_sentences_with_keywords(text: str, pattern: str) -> list:
    """
    Extract sentences containing specific keywords from the text.
    Args:
    - text (str): The text to extract sentences from.
    - pattern (str): The pattern to match in the sentences.
    Returns:
    - list: A list of sentences containing the keywords.
    """
    # Split on whitespace following "." or "?", with lookbehinds to avoid
    # splitting after abbreviations like "e.g." or "Mr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?])\s', text)
    return [sentence for sentence in sentences if re.search(pattern, sentence, re.IGNORECASE)]
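
# Example:
#     extract_sentences_with_keywords("We rehearsed the set. It rained.", r"\bset\b")
#     # -> ["We rehearsed the set."]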


def main(
        search_pattern: str = r"\b(?:set|setlist|set list)\b",
        use_local_url_list: bool = True,
        print_when_empty: bool = False
):
    if use_local_url_list:
        from song_exploder import se_urls as transcript_urls
    else:
        podcast_pages = get_podcast_pages(TARGET_URL, HEADERS)
        transcript_urls = get_transcript_urls(podcast_pages, HEADERS)
    print("Processing these URLs ...")
    for count, transcript_url in enumerate(transcript_urls, start=1):
        print("...", transcript_url)
        text = extract_transcript_text(transcript_url, HEADERS)
        sentences = extract_sentences_with_keywords(text, search_pattern)
        output_file_path = THIS_DIR / "song_exploder" / f"results_{count}.txt"
        if sentences:  # i.e., any results found
            with open(output_file_path, "a", encoding="utf-8") as output_file:
                output_file.write(transcript_url + "\n")
                for sentence in sentences:
                    output_file.write(sentence + "\n")
                output_file.write("======== END ============" + "\n")
        else:
            print(f"No sentences found in {transcript_url}")
            if print_when_empty:
                with open(output_file_path, "a", encoding="utf-8") as output_file:
                    output_file.write(transcript_url + "\n")
                    output_file.write("No sentences found." + "\n")
                    output_file.write("======== END ============" + "\n")


if __name__ == "__main__":
    main()
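
# Usage sketch (assumptions: a `utils.py` providing HEADERS/PARSER/THIS_DIR,
# and, for the default local path, a `song_exploder` package exposing
# `se_urls`, a list of previously scraped transcript URLs):
#     python songexploder_transcript_scrape.py
# or, to re-scrape the URL list from the live site:
#     main(use_local_url_list=False)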