-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbookSearch.py
102 lines (89 loc) · 4.46 KB
/
bookSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from urllib.parse import quote_plus

import requests as req
from bs4 import BeautifulSoup as Bs
# Maximum number of results requested per page; only 25, 50 or 100 may be used.
LIMIT = 25
class BookSearch:
    """Build a libgen.rs search URL and scrape one page of detailed results.

    The constructor assembles ``searchUrl`` from its arguments (falling back
    to the class-level defaults for any argument that is ``None``);
    ``parse()`` downloads that URL and returns the result dictionary built
    by ``extract()``.
    """

    # Defaults used when the matching constructor argument is None.
    baseUrl = "http://libgen.rs/search.php"
    query = 'abc'
    column = 'def'
    sort = 'def'
    orderBy = 'ASC'
    page = '1'
    # Seconds to wait for the search request before requests gives up.
    # Without a timeout a stalled server would block parse() indefinitely.
    timeout = 30

    def __init__(self, query, column, sort, sortOrder, page):
        """Store the search parameters and build ``self.searchUrl``.

        Args:
            query: Search text, or None to use the class default. Spaces
                become ``+`` and other reserved characters are
                percent-encoded; an existing ``+`` is left untouched so
                callers that already join words with ``+`` keep working.
            column: Value of the ``column`` URL parameter, or None.
            sort: Value of the ``sort`` URL parameter, or None.
            sortOrder: Value of the ``sortmode`` URL parameter, or None.
            page: Value of the ``page`` URL parameter (str or int), or None.
        """
        # Encoding the query keeps characters such as "&" or "#" from
        # terminating the "req" parameter early.
        self.query = self.query if query is None else quote_plus(query, safe="+")
        self.column = self.column if column is None else column
        self.sort = self.sort if sort is None else sort
        self.orderBy = self.orderBy if sortOrder is None else sortOrder
        self.page = self.page if page is None else page
        # str.format stringifies every field, so an integer page is accepted.
        self.searchUrl = (
            "{base}?&req={query}&res={limit}&phrase=1&view=detailed"
            "&column={column}&sort={sort}&page={page}&sortmode={order}"
        ).format(
            base=self.baseUrl,
            query=self.query,
            limit=LIMIT,
            column=self.column,
            sort=self.sort,
            page=self.page,
            order=self.orderBy,
        )

    def parse(self):
        """Download ``self.searchUrl`` and return the extracted result dict.

        Raises whatever ``requests.get`` raises, including a timeout error
        once ``self.timeout`` seconds pass without a response.
        """
        response = req.get(self.searchUrl, timeout=self.timeout)
        soup = Bs(response.content, 'html.parser', from_encoding="utf-8")
        return self.extract(soup)

    def extract(self, obj):
        """Collect the book fields from a parsed results page.

        Args:
            obj: Parsed document exposing ``select()`` (a BeautifulSoup
                object when called from ``parse()``).

        Returns:
            dict with keys ``status`` (200), ``result`` ("success"),
            ``totalFiles``, ``totalPages``, ``limit`` and ``books`` (a list
            of dicts with keys title, author, year, language, type, image,
            id, pages, size and publisher).
        """
        def textsOf(selector):
            # Text content of every element matched by the CSS selector.
            return [tag.text for tag in obj.select(selector)]

        titles = textsOf("td:nth-child(3) b a")
        authors = textsOf("tr:nth-child(3) td+ td")
        years = textsOf("tr:nth-child(6) td:nth-child(2)")
        langs = textsOf("tr:nth-child(7) td:nth-child(2)")
        fileTypes = textsOf("tr:nth-child(10) td:nth-child(4)")
        pagesCount = textsOf("tr:nth-child(7) td:nth-child(4)")
        publishers = textsOf("tr:nth-child(5) td:nth-child(2)")
        sizes = [self.sizeSplit(cell) for cell in obj.select("tr:nth-child(10) td:nth-child(2)")]

        # NOTE(review): the dict is select()'s second positional argument,
        # which bs4 documents as `namespaces`, not an attribute filter, so
        # this appears to match every <img>. If only alt="Download" images
        # are wanted the selector would be 'img[alt="Download"]' - confirm
        # against the live page before changing it.
        images = ["http://libgen.rs" + img['src'] for img in obj.select("img", {"alt": "Download"})]
        # The book id is whatever follows the first "=" in the link target.
        prelinks = [link['href'].split("=")[1]
                    for link in obj.select("tbody tr:nth-child(2) td:nth-child(1) a")]

        pagination = obj.select("#paginator_example_top~ table tr:nth-child(1) td:nth-child(1) font")
        totalFileCount, totalPageCount = self.splitTotal(pagination)
        # When the reported total fits on one page (or is missing and came
        # back as 0), report the number of titles actually found instead.
        if totalFileCount <= LIMIT:
            totalFileCount = len(titles)

        resultDict = {"status": 200, "result": "success", "totalFiles": totalFileCount,
                      "totalPages": totalPageCount, "limit": LIMIT}

        fieldNames = ("title", "author", "year", "language", "type",
                      "image", "id", "pages", "size", "publisher")
        fieldColumns = (titles, authors, years, langs, fileTypes,
                        images, prelinks, pagesCount, sizes, publishers)
        # zip() stops at the shortest column, so a selector that matched
        # fewer elements than there are titles yields fewer books instead
        # of raising IndexError for the whole page.
        resultDict["books"] = [dict(zip(fieldNames, row)) for row in zip(*fieldColumns)]
        return resultDict

    def splitTotal(self, given):
        """Return ``(totalFiles, totalPages)`` from the pagination elements.

        Args:
            given: Sequence of elements whose first item's ``text`` starts
                with the total count followed by a space.

        Returns:
            ``(0, 0)`` when ``given`` is empty, otherwise the result of
            ``totalCompute`` for the leading integer of the first element.

        Raises:
            ValueError: If the leading token is not an integer.
        """
        if not given:
            return 0, 0
        # Only the first matched element is consulted.
        leadingToken = given[0].text.split(' ')[0]
        return self.totalCompute(int(leadingToken))

    def totalCompute(self, total):
        """Return ``(total, pageCount)`` for ``total`` results.

        The count of full pages is capped at ``LIMIT``, then one more page
        is added when ``total`` is not an exact multiple of ``LIMIT``.

        NOTE(review): because the extra page is added after the cap, the
        largest value returned is ``LIMIT + 1`` rather than ``LIMIT`` -
        confirm which maximum is intended.
        """
        totMax = min(total // LIMIT, LIMIT)
        if total % LIMIT != 0:
            totMax += 1
        return total, totMax

    def sizeSplit(self, string):
        """Return the first two space-separated tokens of ``string.text``.

        Args:
            string: Element exposing a ``text`` attribute.

        Returns:
            The first two tokens joined by one space, or the single token
            (possibly empty) when the text contains no space.
        """
        return " ".join(string.text.split(' ')[:2])
def defaultBookResult():
    """Return the empty payload used when an API request is invalid.

    The payload carries no books, zero totals, the page limit as a string
    and status 400. No 'result' key is set here; the original note suggests
    the caller fills it in depending on the use - confirm with the callers.

    NOTE(review): 'limit' is a string here, while BookSearch.extract()
    reports it as an integer - confirm whether clients rely on that.
    """
    return {
        'books': [],
        'totalPages': 0,
        'totalFiles': 0,
        'limit': str(LIMIT),
        "status": 400,
    }