-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_db.py
114 lines (96 loc) · 3.07 KB
/
create_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from twitter_scraper import *
import os
import numpy as np
from datetime import datetime, timedelta, date
def next_date(y: int, m: int, d: int, add_days: int = 1):
    """Placeholder with no implementation yet; always returns ``None``.

    NOTE(review): the name and parameters suggest this was meant to compute
    the date ``add_days`` after ``y``/``m``/``d`` -- confirm the intent
    before implementing it.
    """
    return None
def _parse_dashed_date(text: str) -> date:
    """Convert a ``"YYYY--MM--DD"`` string into a ``datetime.date``."""
    parts = text.split("--")
    # Indexing (rather than unpacking) keeps the historical behaviour:
    # IndexError when a component is missing, extra components ignored.
    return date(int(parts[0]), int(parts[1]), int(parts[2]))


def _interval_windows(first: date, last: date, interval: int):
    """Yield ``(window_start, window_end)`` date pairs of ``interval`` days.

    A window is produced for every start strictly before ``last``; the end of
    the final window is clamped so it never goes past ``last``.
    """
    step = timedelta(days=interval)
    one_day = timedelta(days=1)
    window_start = first
    while window_start < last:
        # A window spans `interval` calendar days, both bounds included,
        # hence the "- one_day".
        yield window_start, min(window_start + step - one_day, last)
        window_start += step


def scrap_by_interval(
        start_date: str = "2009--01--01",
        end_date: str = "2023--04--01",
        interval: int = 7,  # days
        limit: int = 100,  # tweets to scrap each interval
        # sleep_time:int=0, # delay between 2 requests
        out: str = "out.bin",  # path to save tweets
        topic: str = ""  # topic to scrap
):
    """Scrape tweets about ``topic`` window by window and write them to ``out``.

    The period from ``start_date`` to ``end_date`` is cut into consecutive
    windows of ``interval`` days. For each window one search limited to
    ``limit`` tweets is issued, and every returned tweet is written through
    ``Tweet.write(out)``.

    Args:
        start_date: First day to scrape, formatted ``"YYYY--MM--DD"``.
        end_date: Upper bound of the period, formatted ``"YYYY--MM--DD"``.
            No window starts on or after this date and no window ends
            after it.
        interval: Length of each window in days; must be at least 1.
        limit: Limit passed to the search query for each window.
        out: Path handed to ``Tweet.write`` for every tweet.
        topic: Subject passed to the search query.

    Raises:
        ValueError: If a date component is not an integer, the date does not
            exist, or ``interval`` is smaller than 1.
        IndexError: If a date string has fewer than three ``--``-separated
            components.
    """
    first_day = _parse_dashed_date(start_date)
    last_day = _parse_dashed_date(end_date)
    if interval < 1:
        # A non-positive interval would never advance the window start and
        # the loop below would spin forever.
        raise ValueError(f"interval must be at least 1 day, got {interval!r}")

    print(f"scraping topic \"{topic}\" from {first_day.day}/{first_day.month}/{first_day.year} "
          f"to {last_day.day}/{last_day.month}/{last_day.year} "
          f"with {interval} days interval\n"
          f"output in {out}")

    scraper: TwitterScraper = TwitterScraper()
    query: SearchQuery = SearchQuery().with_subject(topic).with_limit(limit)

    for window_start, window_end in _interval_windows(first_day, last_day, interval):
        print(f" from {window_start} to {window_end}")

        # request
        query.set_start_date(window_start.year, window_start.month, window_start.day)
        query.set_end_date(window_end.year, window_end.month, window_end.day)
        tweets: list[Tweet] = scraper.search(query)

        # TODO: skip tweets already seen in a previous window (duplicates).

        # write in database
        for tweet in tweets:
            tweet.write(out)

    print("done\n")
if __name__ == '__main__':
    # Make sure the output directory is there; an existing one is fine.
    try:
        os.mkdir('data')
    except FileExistsError:
        pass

    # Switch to True to scrape the complete topic list over the whole period
    # instead of the short run used to check that everything works.
    run_full_campaign = False

    if not run_full_campaign:
        # quick run to check that everything works
        subjects = [
            "crédit mutuel",
            "banque",
        ]
        first_day = "2020--01--01"
        last_day = "2020--03--20"
    else:
        subjects = [
            "Crédit mutuel",
            "banque",
            "finance",
            "cinéma",
            "critique",
            "jeu vidéo",
            "minecraft",
            "science",
            "informatique",
            "intelligence artificielle",
            "service client",
            "trump",
            "biden",
            "politique",
            "france",
            "états-unis",
            # ...
        ]
        first_day = "2009--01--01"
        last_day = "2023--04--01"

    for subject in subjects:
        # Build the database file name from the ASCII letters and digits of
        # the topic only, dropping every character that is not
        # filename-friendly (spaces, accents, punctuation).
        safe_name = ''.join(
            ch for ch in subject if ch.isalnum() and ch.isascii()
        )
        db_path = os.path.join('data', f"{safe_name}.bin")
        scrap_by_interval(
            start_date=first_day,
            end_date=last_day,
            interval=7,
            out=db_path,
            topic=subject,
        )