# cookiepediacrawler.py - look up cookie purposes on cookiepedia.co.uk and record them in a CSV.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import argparse
def _build_arg_parser():
    """Build the command-line parser for the Cookiepedia crawler."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--chrome_path', nargs='?', default='/usr/bin/chromedriver')
    parser.add_argument('--csv_path', '-csv', nargs='?', default='cookie_data.csv')
    # The CSV contents are copied to the destination file after at least one
    # run with the original CSV, so use the destination CSV as --csv_path on
    # later runs to avoid the "purpose" column being overwritten.
    parser.add_argument('--csv_dest_path', '-dest', nargs='?')
    # Inclusive; row numbers assume the first data row is row 2.
    parser.add_argument('--range_start', '-start', nargs='?', default=0, type=int)
    # Exclusive.
    parser.add_argument('--range_end', '-end', nargs='?', type=int)
    parser.add_argument('--update_interval', '-i', nargs='?', default=10, type=int)
    # NOTE: despite its name, passing this flag sets hide_errors to False,
    # i.e. it turns per-cookie error printing ON. Errors are hidden by default.
    parser.add_argument('--hide_errors', '-e', default=True, action='store_false')
    return parser


def main(argv=None):
    """Fill the "purpose" column of a cookie CSV from Cookiepedia.

    Reads the CSV given by ``--csv_path``, visits
    ``https://cookiepedia.co.uk/cookies/<cookie_name>`` for each selected row
    whose cookie name has not been seen earlier in this run, and stores the
    text of the first ``<strong>`` inside ``div#content-left > p`` in that
    row's "purpose" cell. Rows whose name was already seen are skipped and
    keep their existing value. The result is written to ``--csv_dest_path``
    (or back to ``--csv_path`` when no destination is given).

    Row numbers are spreadsheet-style: the header is row 1 and the first data
    row is row 2. ``--range_start`` is inclusive, ``--range_end`` exclusive;
    both are clamped to the rows that exist.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    args = _build_arg_parser().parse_args(argv)

    # Load the CSV before starting the browser so that a bad path fails
    # without leaving a Chrome process behind.
    cookie_df = pd.read_csv(args.csv_path)
    cookie_df.index += 2  # make index labels match the row numbering above

    # Set up result output csv
    save_path = args.csv_dest_path if args.csv_dest_path else args.csv_path

    # Set up purpose column
    if "purpose" not in cookie_df.columns:
        cookie_df["purpose"] = ""
    else:
        # An all-empty column is read back as float NaN; hold it as object so
        # text can be stored in it without a dtype conflict.
        cookie_df["purpose"] = cookie_df["purpose"].astype(object)

    # Set up range, clamped to the rows actually present.
    row_limit = len(cookie_df) + 2
    range_start = max(args.range_start or 0, 2)
    range_end = args.range_end
    if not range_end or range_end > row_limit:
        range_end = row_limit

    # Nothing selected: write the output file without launching a browser.
    if range_start >= range_end:
        cookie_df.to_csv(save_path, index=False)
        print("done")
        return

    # Create new instance of Chrome in incognito, headless mode.
    # NOTE(review): `executable_path` / `chrome_options` are the older
    # Selenium keyword arguments - confirm they match the installed version.
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")
    option.add_argument('headless')
    driver = webdriver.Chrome(executable_path=args.chrome_path, chrome_options=option)

    cookies_seen = set()
    try:
        for i in range(range_start, range_end):
            # Best-effort per row: any failure (missing element, bad name,
            # navigation error) skips this row and the crawl continues.
            try:
                cookie_name = cookie_df.loc[i, "cookie_name"]
                if cookie_name in cookies_seen:
                    continue
                cookies_seen.add(cookie_name)

                # Query url
                driver.get("https://cookiepedia.co.uk/cookies/" + cookie_name)
                que = driver.find_element("xpath", '//div[@id="content-left"]/p/strong')
                # Single .loc write instead of chained assignment, so the
                # value lands in the frame itself rather than a temporary.
                cookie_df.loc[i, "purpose"] = que.text

                # Periodic backup; an interval of 0 (or none) disables it.
                if args.update_interval and i % args.update_interval == 0:
                    cookie_df.to_csv(save_path, index=False)
            except Exception as e:
                if not args.hide_errors:
                    print("Error on cookie " + str(i) + ": " + str(e))
    finally:
        # Always shut the browser down, even if the crawl is interrupted.
        driver.quit()

    # Save results
    cookie_df.to_csv(save_path, index=False)
    print("done")
# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()