-
Notifications
You must be signed in to change notification settings - Fork 0
/
ds_scrape.rb
executable file
·49 lines (36 loc) · 1.55 KB
/
ds_scrape.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
require 'rest-client'
require 'nokogiri'
require 'json'
# Scrapes one question from the Stack Exchange API (site "stats"), together with
# its comments, its answers and the comments on those answers, strips the HTML
# markup and writes the resulting plain text to qa_output.txt in the current
# working directory.
#
# Usage: ds_scrape.rb QUESTION_ID

# Root URL of the API; request paths are appended to it.
API_BASE = 'https://api.stackexchange.com/2.2/'.freeze
# Query string sent with every request. The code below reads a 'body' field
# from every returned item, which is why the 'withbody' filter is requested.
API_QUERY = '?site=stats&filter=withbody'.freeze

QUESTION_ID = ARGV[0]
raise 'No question ID specified!' if QUESTION_ID.nil?
# The ID is interpolated straight into the request URLs, so anything other than
# a plain number is rejected up front instead of producing an altered request.
raise "Invalid question ID: #{QUESTION_ID.inspect}" unless QUESTION_ID =~ /\A\d+\z/

# Performs a GET request against the API and returns the parsed 'items' list.
#
# @param path [String] request path relative to API_BASE, without query string
# @return [Array<Hash>] the 'items' of the JSON response; empty if the key is
#   missing or nil
# @raise [JSON::ParserError] if the response body is not valid JSON
# NOTE(review): no paging parameters are sent, so presumably only the first page
# of results is returned for long comment/answer lists -- confirm against the
# API paging documentation.
def fetch_items(path)
  response = RestClient.get("#{API_BASE}#{path}#{API_QUERY}")
  Array(JSON.parse(response)['items'])
end

# Parses an HTML fragment and returns its text content.
#
# @param html [String] HTML markup as found in an item's 'body' field
# @return [String] the text of the parsed document
def html_to_text(html)
  Nokogiri::HTML(html).text
end

# Retrieve the question itself; an unknown ID yields an empty item list.
question = fetch_items("questions/#{QUESTION_ID}").first
raise "No question found for ID #{QUESTION_ID}" if question.nil?

question_comments = fetch_items("questions/#{QUESTION_ID}/comments")
answers = fetch_items("questions/#{QUESTION_ID}/answers")

# Glue all the text together into a corpus, beginning with the question text.
# String#+ returns a fresh string, which is then appended to in place.
corpus = html_to_text(question['body']) + "\n"

# Every further entry is written as a leading space, the text and a newline.
question_comments.each do |comment|
  corpus << " #{html_to_text(comment['body'])}\n"
end

# Now process the answers, each followed directly by its own comments
answers.each do |answer|
  corpus << " #{html_to_text(answer['body'])}\n"

  fetch_items("answers/#{answer['answer_id']}/comments").each do |comment|
    corpus << " #{html_to_text(comment['body'])}\n"
  end
end

# Overwrites any existing qa_output.txt.
File.write('qa_output.txt', corpus)