-
Notifications
You must be signed in to change notification settings - Fork 0
/
fix-mapping.py
77 lines (64 loc) · 5.26 KB
/
fix-mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import polars as pl
import os as os
from datetime import datetime
import re
def readFile(path):
if path.endswith('.csv'):
df = pl.read_csv(path, truncate_ragged_lines=True) #truncate_ragged_lines=True is used to avoid errors when reading csv files with different number of columns, it will fill the missing columns with null values
print(f"Read {path} succesfully")
elif path.endswith('.xlsx'):
df = pl.read_excel(path)
print(f"Read {path} succesfully")
else:
raise ValueError('File type not supported')
return df
def generateCSV(df, path):
base_name, extension = os.path.splitext(path)
current_datetime = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
path = f"{base_name}_{current_datetime}{extension}"
df.write_csv(path, separator=';')
print(f"CSV generated at {path}")
def processMapping(df):
#pdf_directory = r"C:\Users\Amani.devos\OneDrive - Circet Benelux NV\Bureaublad\Dataloader\Survey Migration\Final Migration Grimbergen_GavereKruisemOudenaarde\SurveyReports"
pdf_directory = input("Enter the path to the PDF directory: ").replace('"', '')
df = df.drop(['Id', 'OwnerName', 'Type', 'OwnerId', 'LastModifiedDate', 'CreatedDate', 'ParentId', 'ContentDocumentLinkId'])
df = df.sort('ParentName')
df = df.with_columns(
(pl.lit(pdf_directory) + '/' + df['FilePath'].apply(replace_non_ascii_with_regex) + '.pdf').alias('FilePath'))
print(df)
df_new = pl.DataFrame({
'FirstPublishLocation:sitetracker__Site__c-Name': df['ParentName'],
'Title': df['Name'].apply(replace_non_ascii_with_regex),
'PathOnClient': df['FilePath'],
'VersionData': df['FilePath']
})
print(df_new)
return df_new
replacement_dict = {
"é": "e", "è": "e", "ô": "o", "ö": "o", "à": "a", "î": "i", "ë": "e", "ç": "c",
"ê": "e", "ü": "u", "â": "a", "ù": "u", "û": "u", "ï": "i", "ä": "a", "É": "E",
"È": "E", "Ô": "O", "Ö": "O", "À": "A", "Î": "I", "Ë": "E", "Ç": "C", "Ê": "E",
"Ü": "U", "Â": "A", "Ù": "U", "Û": "U", "Ï": "I", "Ä": "A", "À": "A", "Â": "A",
"Ã": "A", 'Ä': 'A', "Å": "A", "Æ": "AE", "Ç": "C", "È": "E", "É": "E",
"Ê": "E", "Ë": "E", "Ì": "I", "Î": "I", "Ñ": "N", "Ò": "O", 'Ó': "O",
'Ô': 'O', "Õ": "O", "Ö": "O", "×": "x", "Ø": "O", "Ù": "U", "Ú": "U",
"Û": "U", "Ü": "U", "Þ": "Th", "ß": "ss", "á": "a", "â": "a", "ã": "a",
"ä": "a", "å": "a", "æ": "ae", "ç": "c", "è": "e", "é": "e", "ê": "e",
"ë": "e", "ì": "i", "Ã": "i", "î": "i", "ï": "i", "ð": "d", "ñ": "n",
"ò": "o", "ó": "o", "õ": "o", "ö": "o", "÷": "÷", "ø": "o", "ù": "u",
"ú": "u", "û": "u", "ü": "u", "ý": "y", "þ": "th", "ÿ": "y"
}
# Create a regular expression from the dictionary keys
regex = re.compile("(%s)" % "|".join(map(re.escape, replacement_dict.keys())))
def replace_non_ascii_with_regex(cell):
return regex.sub(lambda mo: replacement_dict[mo.string[mo.start():mo.end()]], cell)
# generally slower than the regex approach, but less complex
#def replaceNonAscii(cell):
#
# cell = cell.replace("é", "e").replace("è", "e").replace("ô", "o").replace("ö", "o").replace("à", "a").replace("î", "i").replace("ë", "e").replace("ç", "c").replace("ê", "e").replace("ü", "u").replace("â", "a").replace("ù", "u").replace("û", "u").replace("ï", "i").replace("ä", "a").replace("É", "E").replace("È", "E").replace("Ô", "O").replace("Ö", "O").replace("À", "A").replace("Î", "I").replace("Ë", "E").replace("Ç", "C").replace("Ê", "E").replace("Ü", "U").replace("Â", "A").replace("Ù", "U").replace("Û", "U").replace("Ï", "I").replace("Ä", "A").replace("À", "A").replace("Â", "A").replace("Ã", "A").replace('Ä', 'A').replace("Ã…", "A").replace("Æ", "AE").replace("Ç", "C").replace("È", "E").replace("É", "E").replace("Ê", "E").replace("Ë", "E").replace("ÃŒ", "I").replace("ÃŽ", "I").replace("Ñ", "N").replace("Ã’", "O").replace('Ó', "O").replace('Ô', 'O').replace("Õ", "O").replace("Ö", "O").replace("×", "x").replace("Ø", "O").replace("Ù", "U").replace("Ú", "U").replace("Û", "U").replace("Ãœ", "U").replace("Þ", "Th").replace("ß", "ss").replace("á", "a").replace("â", "a").replace("ã", "a").replace("ä", "a").replace("Ã¥", "a").replace("æ", "ae").replace("ç", "c").replace("è", "e").replace("é", "e").replace("ê", "e").replace("ë", "e").replace("ì", "i").replace("Ã", "i").replace("î", "i").replace("ï", "i").replace("ð", "d").replace("ñ", "n").replace("ò", "o").replace("ó", "o").replace("õ", "o").replace("ö", "o").replace("÷", "÷").replace("ø", "o").replace("ù", "u").replace("ú", "u").replace("û", "u").replace("ü", "u").replace("ý", "y").replace("þ", "th").replace("ÿ", "y").replace('Ã', 'a')
#
# return cell
mapping_file_path = input("Enter the path to the mapping file: ").replace('"', '')
df_mapping = readFile(mapping_file_path)
df_mapping = processMapping(df_mapping)
generateCSV(df_mapping, os.path.dirname(mapping_file_path + "/MAPPING-TO-UPLOAD.csv"))