-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_stackoverflow_snippets_dataset.py
90 lines (76 loc) · 2.86 KB
/
get_stackoverflow_snippets_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import argparse
import random
import requests
import os
from bs4 import BeautifulSoup
# Command-line interface: the only option is the directory that receives
# the generated snippet files, and it is mandatory.
parser = argparse.ArgumentParser()
parser.add_argument("--dest_dir", required=True, help="Destination directory")
def get_hrefs(soup):
    """Return the ``href`` value of every ``<a class="s-link">`` under *soup*.

    Only anchors that actually carry an ``href`` attribute are considered.
    The values are returned in the order ``find_all`` yields them.
    """
    anchors = soup.find_all("a", class_="s-link", href=True)
    return [anchor['href'] for anchor in anchors]
def add_prefix(herfs_list):
    """Turn site-relative links into absolute Stack Overflow URLs.

    Each item of *herfs_list* is appended to ``https://stackoverflow.com``
    and the results are returned as a new list in the same order.
    """
    base_url = 'https://stackoverflow.com'
    return [base_url + relative_link for relative_link in herfs_list]
def get_popular_python_questions(start_page, end_page, page_size, timeout=30):
    """Collect absolute URLs of the top-voted questions tagged ``python``.

    Requests the Stack Overflow listing pages ``start_page`` through
    ``end_page`` (inclusive, sorted by votes) and gathers the question links
    found inside each page's ``<div id="questions">`` container.

    Args:
        start_page: First listing page to fetch.
        end_page: Last listing page to fetch (inclusive).
        page_size: Value sent as the ``pagesize`` query parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        A list of absolute question URLs, in page order.
    """
    hrefs = []
    for page in range(start_page, end_page + 1):
        response = requests.get(
            url=f'https://stackoverflow.com/questions/tagged/python?tab=votes&page={page}&pagesize={page_size}',
            timeout=timeout,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        questions = soup.find("div", id="questions")
        if questions is None:
            # find() returns None when the page has no such container (the
            # response was not a normal listing page); skip it rather than
            # crash on None.find_all and lose the pages already collected.
            continue
        hrefs.extend(get_hrefs(questions))
    return add_prefix(hrefs)
def get_random_answer(question_url, timeout=30):
    """Fetch a question page and return one of its answers, chosen at random.

    Args:
        question_url: URL of the question page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        One of the ``<div class="answercell post-layout--right">`` elements
        found on the page.

    Raises:
        ValueError: If the page contains no such answer element. Callers in
            this file rely on this exception type to skip the question.
    """
    response = requests.get(url=question_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    answers = soup.find_all("div", class_="answercell post-layout--right")
    if not answers:
        # Raise explicitly instead of depending on randint's empty-range
        # error, so the contract is visible and carries a useful message.
        raise ValueError(f"no answers found at {question_url}")
    return answers[random.randint(0, len(answers) - 1)]
def get_python_code(answer):
    """Extract the text of an answer's code blocks as a single string.

    Visits every ``<code>`` element inside every ``<pre>`` element of
    *answer*, splits its text into lines and cleans them up:

    * lines starting with ``...``, ``*``, ``/``, ``<`` or ``-->`` are dropped;
    * a leading ``>>>`` prompt, with or without a following space, is removed;
    * a leading ``$`` prompt, with or without a following space, is removed.

    Args:
        answer: Parsed answer element exposing ``find_all``; the elements it
            returns must expose ``find_all`` and ``get_text`` in turn.

    Returns:
        The kept lines, each followed by a newline, concatenated in document
        order. An empty string when there is no ``<pre><code>`` content.
    """
    skipped_prefixes = ("...", "*", "/", "<", "-->")
    # Longest variant first, so ">>> " wins over ">>>" and "$ " over "$".
    # A bare "$" strips one character only; slicing two would eat the first
    # character of the command when no space follows the prompt.
    prompts = (">>> ", ">>>", "$ ", "$")

    kept_lines = []
    for pre in answer.find_all("pre"):
        for snippet in pre.find_all("code"):
            for line in snippet.get_text().split('\n'):
                if line.startswith(skipped_prefixes):
                    continue
                for prompt in prompts:
                    if line.startswith(prompt):
                        line = line[len(prompt):]
                        break
                kept_lines.append(line)
    return "".join(f"{line}\n" for line in kept_lines)
if __name__ == "__main__":
    # Script entry point: download the most-voted Python questions and write
    # one snippet file per question into --dest_dir, named snippet_<n>.py.
    args = parser.parse_args()
    # Create the destination up front so the first write cannot fail on a
    # missing directory after all the listing pages were already fetched.
    os.makedirs(args.dest_dir, exist_ok=True)
    popular_python_questions = get_popular_python_questions(1, 20, 50)

    # Upper bound on random draws per question. Without it, a question whose
    # answers never yield code would be re-downloaded in an endless loop.
    max_attempts = 10

    next_id = 1
    for question in popular_python_questions:
        code = ""
        for _ in range(max_attempts):
            try:
                random_answer = get_random_answer(question)
            except ValueError:
                # The page has no answers at all: nothing to extract.
                break
            code = get_python_code(random_answer)
            if code:
                break
        if not code:
            # No usable snippet for this question; move on to the next one.
            continue

        outfile = os.path.join(args.dest_dir, f"snippet_{next_id}.py")
        info = f"# Extracted from {question}"
        # Explicit UTF-8: scraped code may contain characters that the
        # platform default encoding cannot represent.
        with open(outfile, "w", encoding="utf-8") as f:
            f.write(info+"\n")
            f.write(code)
        next_id += 1