-
Notifications
You must be signed in to change notification settings - Fork 0
/
googlebooks.py
71 lines (59 loc) · 2.39 KB
/
googlebooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""All things specifically related to the Google Books website."""
import re
from urllib.parse import parse_qs
from urllib.parse import urlparse
import requests
# import bibtex [1]
import ris
import commons
class Response(commons.BaseResponse):

    """Citation response built from a Google Books URL."""

    def __init__(self, googlebook_url, date_format='%Y-%m-%d'):
        """Fetch the book's RIS record, parse it and run self.generate().

        googlebook_url -- a Google Books URL; its query string must carry
            an ``id`` parameter (a missing one raises ``KeyError``).
        date_format -- stored on the instance as ``self.date_format``.
        """
        self.date_format = date_format
        self.url = googlebook_url
        # The record is downloaded as RIS (the BibTeX route, get_bibtex,
        # is not used here). The attribute is nevertheless called
        # `bibtex`, so it holds RIS text.
        self.bibtex = get_ris(googlebook_url)
        self.dictionary = ris.parse(self.bibtex)

        parsed_url = urlparse(googlebook_url)
        query = parse_qs(parsed_url.query)

        # Rebuild the link from the host and the book id only, dropping
        # every other query parameter of the incoming URL.
        book_url = ''.join(
            ('http://', parsed_url.netloc, '/books?id=', query['id'][0])
        )
        self.dictionary['url'] = book_url

        # Add the page number by hand when the URL points at a page.
        if 'pg' in query:
            page_token = query['pg'][0]
            # The first two characters of `pg` are skipped -- presumably
            # a two-letter prefix in front of the number; TODO confirm.
            self.dictionary['pages'] = page_token[2:]
            self.dictionary['url'] = book_url + '&pg=' + page_token

        # The parsed record may lack a language field; in that case the
        # title is handed to detect_language (presumably inherited from
        # commons.BaseResponse -- not visible in this file).
        if 'language' not in self.dictionary:
            self.detect_language(self.dictionary['title'])
        self.generate()
def get_bibtex(googlebook_url):
    """Return the BibTeX export of the book behind a Google Books URL.

    The book id is read from the ``id`` query parameter of
    *googlebook_url* and used to request the ``output=bibtex`` download
    from books.google.com. The response body is returned as text.

    Raises ``KeyError`` when the URL has no ``id`` query parameter.
    """
    # getting id:
    book_id = parse_qs(urlparse(googlebook_url).query)['id'][0]
    url = ('http://books.google.com/books/download/?id=' +
           book_id + '&output=bibtex')
    # Agent spoofing is needed, otherwise: HTTP Error 401: Unauthorized
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0)'
                      ' Gecko/20100101 Firefox/24.0',
    }
    # NOTE(review): no timeout is passed, so this call can block for as
    # long as the server keeps the connection open.
    return requests.get(url, headers=headers).text
def get_ris(googlebook_url):
    """Return the RIS export of the book behind a Google Books URL.

    The book id is read from the ``id`` query parameter of
    *googlebook_url* and used to request the ``output=ris`` download
    from books.google.com. The response body is returned as text.

    Raises ``KeyError`` when the URL has no ``id`` query parameter.
    """
    # getting id:
    book_id = parse_qs(urlparse(googlebook_url).query)['id'][0]
    url = ('http://books.google.com/books/download/?id=' +
           book_id + '&output=ris')
    # Agent spoofing is needed, otherwise: HTTP Error 401: Unauthorized
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0)'
                      ' Gecko/20100101 Firefox/24.0',
    }
    # The body is returned directly rather than bound to a local called
    # `ris`, which would shadow the imported `ris` module in this scope.
    # NOTE(review): no timeout is passed, so this call can block for as
    # long as the server keeps the connection open.
    return requests.get(url, headers=headers).text