-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
99 lines (77 loc) · 2.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import csv
import itertools
import logging
import os

import boto3
import mysql.connector
from dotenv import load_dotenv, find_dotenv
def file_list(work_folder):
    """Return the set of file names (not subdirectories) directly inside *work_folder*.

    Fix: the original assigned an empty set that was immediately overwritten;
    the comprehension alone is sufficient.
    """
    return {f for f in os.listdir(work_folder)
            if os.path.isfile(os.path.join(work_folder, f))}
def csv_list(local_file):
    """Return the set of first-column values from the CSV file *local_file*.

    Fix: blank rows (e.g. a trailing newline) previously raised IndexError
    on ``row[0]``; they are now skipped.
    """
    csv_set = set()
    with open(local_file, newline='') as f:
        for row in csv.reader(f):
            if row:  # skip empty rows instead of crashing on row[0]
                csv_set.add(row[0])
    print("CSV file list records returned: ", len(csv_set))
    return csv_set
def db_file_list(results):
    """Return the set of file names listed in the database.

    Executes the query from the ``SQL_QUERY`` env var with ``LIMIT results``
    and collects the first column of each row. Connection parameters come
    from the ``MYSQL_*`` env vars. Relies on the module-global ``logger``
    set up in main().

    Fix: the connection and cursor were previously never closed (leaked on
    every call); both are now released in ``finally`` blocks.
    """
    fileset = set()
    logger.info('Connecting to the DB...')
    mydb = mysql.connector.connect(
        host=os.getenv("MYSQL_HOST"),
        port=os.getenv("MYSQL_PORT"),
        user=os.getenv("MYSQL_USER"),
        password=os.getenv("MYSQL_PASS"),
        database=os.getenv("MYSQL_DB")
    )
    try:
        mycursor = mydb.cursor()
        try:
            # NOTE(review): query text is interpolated from the environment.
            # This is safe only while SQL_QUERY/LIMIT come from trusted
            # operator config — never expose these to external input.
            mycursor.execute(f"{os.getenv('SQL_QUERY')} LIMIT {results}")
            fileset.update(row[0] for row in mycursor.fetchall())
        finally:
            mycursor.close()
    finally:
        mydb.close()
    print("Database records returned: ", len(fileset))
    return fileset
def s3_file_list(results):
    """Return up to *results* object keys from the configured S3 bucket.

    Relies on the module-globals ``client`` (boto3 S3 resource) and
    ``logger`` set up in main(). Bucket name comes from ``SPACES_BUCKET``.

    Fix: the *results* parameter was previously ignored and every object in
    the bucket was returned; the limit is now enforced with islice without
    materializing the full listing first.
    """
    logger.info('Connecting to S3...')
    bucket = client.Bucket(os.getenv('SPACES_BUCKET'))
    keys = (obj.key for obj in bucket.objects.all())
    fileset = set(itertools.islice(keys, results))
    print("S3 records returned: ", len(fileset))
    return fileset
def write_csv(filename, filelist):
    """Write each entry of *filelist* as a one-column row of CSV *filename*."""
    with open(filename, 'w', newline='') as out:
        csv.writer(out).writerows([entry] for entry in filelist)
def main():
    """Entry point: compare file inventories across local disk, MySQL, and S3.

    Loads config from a .env file, then writes three CSV reports:
      errors.csv        - names present in the DB but not in S3
      not_in_db.csv     - names present in S3 but not in the DB
      csv_not_in_s3.csv - names in filelist.csv but not in S3

    Side effects: sets the module-globals ``logger`` and ``client`` used by
    db_file_list() and s3_file_list().
    """
    global logger, client
    load_dotenv(find_dotenv())
    logging.basicConfig(
        level=logging.INFO,
        # fix: the format string had a pointless f-prefix (no placeholders)
        format='%(asctime)s %(levelname)s %(message)s'
    )
    logger = logging.getLogger()
    logger.debug('Starting...')
    folder = os.getenv("LOCAL_FOLDER")
    session = boto3.session.Session()
    client = session.resource('s3',
                              region_name='nyc3',
                              endpoint_url='https://nyc3.digitaloceanspaces.com',
                              aws_access_key_id=os.getenv('SPACES_KEY'),
                              aws_secret_access_key=os.getenv('SPACES_SECRET'))
    max_results = 99999
    # NOTE(review): folder_local is computed but never used below — diff_db
    # compares the DB against S3, yet the summary message talks about "the
    # file system". One of the two looks wrong; confirm intent with the author.
    folder_local = file_list(folder)
    folder_db = db_file_list(max_results)
    folder_s3 = s3_file_list(max_results)
    diff_db = folder_db.difference(folder_s3)    # in DB, missing from S3
    diff_s3 = folder_s3.difference(folder_db)    # in S3, missing from DB
    diff_csv = csv_list("filelist.csv").difference(folder_s3)
    write_csv("errors.csv", diff_db)
    write_csv("not_in_db.csv", diff_s3)
    write_csv("csv_not_in_s3.csv", diff_csv)
    print(f"{len(diff_db)} files are enabled in the database, but not found in the file system.")
    for f in diff_db:
        print(f)
main()