-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
28 lines (22 loc) · 977 Bytes
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import polars as pl
def isolate_duplicates(df, column):
    """Flag repeated values in ``column`` and return one row per distinct value.

    An ``is_duplicated`` column, computed by calling ``is_duplicated()`` on the
    selected column, is attached to the frame. The flagged frame is then
    reduced with ``unique(subset=[column])``. Both the flagged frame and the
    reduced frame are printed.

    Args:
        df: Frame to inspect (presumably a polars DataFrame; confirm with
            callers).
        column: Name of the column whose values are checked for repeats.

    Returns:
        The flagged frame reduced to one row per distinct ``column`` value.
        Which of several matching rows is kept is left to ``unique``'s
        defaults.
    """
    duplicate_flags = df[column].is_duplicated().alias("is_duplicated")
    flagged = df.with_columns(duplicate_flags)
    one_per_value = flagged.unique(subset=[column])
    # Show the full flagged frame first, then the reduced one.
    print(flagged)
    print(one_per_value)
    return one_per_value
def toDelete(df, column):
    """Return one row for each ``column`` value that occurs more than once.

    The repeated key values are printed before the result is built.

    Args:
        df: Frame to search (presumably a polars DataFrame; confirm with
            callers).
        column: Name of the column whose repeated values are collected.

    Returns:
        A frame with a single row per repeated ``column`` value. Which of the
        matching rows is kept is left to ``unique``'s defaults.
    """
    # Count rows per key and keep only the keys seen more than once. These are
    # key values, not row positions.
    group_sizes = df.group_by(column).len()
    repeated_keys = group_sizes.filter(pl.col("len") > 1).select(column)
    print(repeated_keys)

    # Joining on the repeated keys restricts df to rows sharing such a key;
    # unique() then collapses each key to a single row.
    rows_with_repeats = df.join(repeated_keys, on=column)
    return rows_with_repeats.unique(subset=[column])
def main(
    input_path="sitetracker__Job__c_2024-07-10T09_50_08.csv",
    output_path="toDelete.csv",
    column="sitetracker__Site__c",
    separator=";",
):
    """Read a CSV export, collect one row per repeated key, and write them out.

    The defaults reproduce the original hard-coded run, so calling ``main()``
    with no arguments behaves as before. The parameters let the same routine
    be reused for another export file, key column, or delimiter.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file that receives the selected rows.
        column: Name of the column whose repeated values are looked up.
        separator: Field delimiter used for both reading and writing.
    """
    jobs = pl.read_csv(input_path, separator=separator)
    df_duplicates = toDelete(jobs, column)
    # Print sorted by key for easier reading; the written file keeps the
    # unsorted frame, as before.
    print(df_duplicates.sort(column))
    df_duplicates.write_csv(output_path, separator=separator)
# Run the export only when this file is executed as a script, so importing
# the module does not read or write any files.
if __name__ == "__main__":
    main()