-
Notifications
You must be signed in to change notification settings - Fork 7
/
common.py
91 lines (65 loc) · 1.93 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python3
# -*- coding:utf8 -*-
import base64
from bs4 import BeautifulSoup
import requests
import pickle
import pycld2 as cld2
import string
# Runtime settings that are left unset (None) in this module. Judging by the
# names they are a delay and per-stage thread counts, presumably assigned by
# the importing script before use -- TODO confirm against callers.
SLEEP_TIME = None
SNIPPET_THREAD_NUM = None
DETAIL_THREAD_NUM = None
REVIEW_THREAD_NUM = None
USER_THREAD_NUM = None
# Fixed sizes. NOTE(review): the *_PER_PAGE values look like the number of
# items shown on one site page and the *_CHUNK_SIZE values like batch sizes;
# neither is used in this file, so verify against the code that reads them.
HOTEL_PER_PAGE = 30
REVIEW_PER_PAGE = 5
REVIEW_CHUNK_SIZE = 300
EXTRACT_CHUNK_SIZE = 500
# Base URL string of the site (note the trailing slash).
TA_ROOT = 'https://www.tripadvisor.com.au/'
# File name 'ta.db'; presumably a local database file -- TODO confirm.
TA_DB = 'ta.db'
def load_soup_string(soup_str):
    """Parse a markup string into a BeautifulSoup tree.

    Args:
        soup_str: Markup text handed to ``BeautifulSoup``.

    Returns:
        The ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    parser_name = 'lxml'
    soup = BeautifulSoup(soup_str, parser_name)
    return soup
def load_soup_local(fn):
    """Read a local file and parse its contents with BeautifulSoup.

    Args:
        fn: Path of the file; it is read through ``read_file``.

    Returns:
        The ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    markup = read_file(fn)
    soup = BeautifulSoup(markup, 'lxml')
    return soup
def load_soup_online(url, timeout=None):
    """Fetch *url* with ``requests.get`` and parse the body with BeautifulSoup.

    The HTTP status is not checked, so an error page is parsed like any
    other response.

    Args:
        url: Address to request.
        timeout: Optional timeout forwarded to ``requests.get``. The default
            ``None`` keeps the previous behaviour of passing no time limit.

    Returns:
        The ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.
    """
    response = requests.get(url, timeout=timeout)
    try:
        # Reading .text can raise; the finally clause guarantees the
        # response is still closed in that case.
        markup = response.text
    finally:
        response.close()
    return BeautifulSoup(markup, 'lxml')
def read_binary(fn):
    """Unpickle the object stored in *fn* and return its items as a list.

    Args:
        fn: Path of a file containing one pickled iterable.

    Returns:
        A new list holding the items produced by iterating the unpickled
        object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fn, 'rb') as fp:
        # The with-block closes the file, so no explicit close() is needed.
        return list(pickle.load(fp))
def write_binary(fn, data):
    """Pickle *data* into the file *fn*, replacing any existing content.

    Args:
        fn: Path of the file to write; it is opened in ``'wb'`` mode.
        data: Object to serialise with ``pickle.dump``.
    """
    with open(fn, 'wb') as fp:
        # The with-block closes the file, so no explicit close() is needed.
        pickle.dump(data, fp)
def write_file(fn, data):
    """Write the string *data* to *fn* as UTF-8, replacing existing content.

    Args:
        fn: Path of the file to write; it is opened in text mode ``'w'``.
        data: Text to write.
    """
    with open(fn, 'w', encoding='utf8') as f:
        # The with-block closes the file, so no explicit close() is needed.
        f.write(data)
def read_file(fn):
    """Return the whole content of *fn* decoded as UTF-8 text.

    Args:
        fn: Path of the file to read; it is opened in text mode ``'r'``.

    Returns:
        The file content as a single string.
    """
    with open(fn, 'r', encoding='utf8') as f:
        # The with-block closes the file, so no explicit close() is needed.
        return f.read()
def str_to_b64(s):
    """Encode the string *s* as UTF-8 and return its Base64 form.

    Args:
        s: Text to encode.

    Returns:
        The Base64 encoding as a ``bytes`` object.
    """
    utf8_bytes = s.encode('utf8')
    encoded = base64.b64encode(utf8_bytes)
    return encoded
def b64_to_str(b):
    """Decode Base64 data *b* and return the result as UTF-8 text.

    Args:
        b: Base64 data accepted by ``base64.b64decode``.

    Returns:
        The decoded bytes interpreted as a UTF-8 string.
    """
    raw_bytes = base64.b64decode(b)
    text = raw_bytes.decode('utf8')
    return text
def remove_script_tag(soup):
    """Remove every ``<script>`` element from *soup* in place.

    Args:
        soup: Parsed document offering ``find_all``; each matched element
            is destroyed with its ``decompose`` method.

    Returns:
        The same *soup* object, now without the script elements.
    """
    # A plain loop replaces the side-effect-only list comprehension, and
    # find_all is the current name of the legacy findAll alias.
    for script in soup.find_all('script'):
        script.decompose()
    return soup
def detect_lang(text):
    """Detect the language of *text* with ``cld2.detect``.

    If the first detection attempt raises, every character that is not in
    ``string.printable`` is dropped and detection is tried once more; an
    error from that second attempt propagates to the caller.

    Args:
        text: Text to analyse.

    Returns:
        ``details[0][1]``: the second field of the first entry in the
        details returned by ``cld2.detect`` (pycld2 documents this field as
        the language code of the top guess).
    """
    try:
        _, _, details = cld2.detect(text)
    except Exception:
        # Best-effort retry on the printable-ASCII subset. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit pass through.
        printable_only = ''.join(ch for ch in text if ch in string.printable)
        _, _, details = cld2.detect(printable_only)
    return details[0][1]