-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
56 lines (44 loc) · 1.5 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
import urllib2
from bs4 import BeautifulSoup
import logging
class StackOverflow:
@classmethod
def search(cls, text, page = 0, pages = 10):
doc = cls._fetch_result_url(text);
return cls._parse_results(doc, page, pages)
#Fetches the the page of results for a given query string
@classmethod
def _fetch_result_url(cls, text):
text = text.replace(" ", "+")
SEARCH_BASE_URL = "http://stackoverflow.com/search?q={0}"
doc = urllib2.urlopen(SEARCH_BASE_URL.format(text)).read()
return doc
#Parse the StackOverflow results page
@classmethod
def _parse_results(cls, doc, page = 0, pages = 10):
QUESTION_BASE_URL = "http://stackoverflow.com{0}"
soup = BeautifulSoup(doc)
urls = []
anchors = soup.select('.result-link > span > a')
for anchor in anchors:
if anchor:
href = anchor.attrs['href']
if href:
urls.append(QUESTION_BASE_URL.format(href))
return urls[page : page + pages + 1]
@classmethod
def answer(cls, url):
import lxml.html
# doc = urllib2.urlopen(url).read()
content = urllib2.urlopen(url).read()
doc = lxml.html.fromstring(content)
title = doc.find_class("question-hyperlink")
answers = doc.find_class("answer")
for answer in answers:
accepted = answer.find_class("vote-accepted-on")
if accepted:
text = answer.find_class("post-text")[0]
if hasattr(text, "text"):
return { "title": title[0].text_content(), "content": lxml.html.tostring(text) }
return { "title": title[0].text_content(), "content": None }