Skip to content

Commit

Permalink
unique print added
Browse files Browse the repository at this point in the history
  • Loading branch information
pamrein committed Aug 7, 2024
1 parent 0d19f83 commit 4777611
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions dataset_extractor_lotus/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,15 +360,22 @@ def read_arg(argv):
smiles_column: "smiles"
}
).unique()
#).unique(subset=["smiles"])

# Find duplicates in the 'id' column
duplicates = df.group_by("id").count().filter(pl.col("count") > 1)
# Find duplicates in the columns
duplicates_id = df.group_by("id").count().filter(pl.col("count") > 1)
duplicates_smiles = df.group_by("smiles").count().filter(pl.col("count") > 1)

# Summarise uniqueness of the dataframe as row counts: rows that are unique
# across all columns, rows unique by "id" alone, and rows unique by "smiles"
# alone. shape[0] is the number of rows; shape[1] would be the number of
# columns, which is constant and says nothing about uniqueness.
print(f"""--- Uniqueness of dataframe ---\nall columns: {df.unique().shape[0]}\nid: {df.unique(subset="id").shape[0]}\nsmiles: {df.unique(subset="smiles").shape[0]}""")

# Print the duplicate IDs
if not duplicates.is_empty():
print(f'Duplicate IDs found:\n{duplicates}')
else:
print("No duplicate IDs found.")
if not duplicates_id.is_empty():
print(f'Duplicate IDs found:\n{duplicates_id}')

# Print the duplicate SMILES
if not duplicates_smiles.is_empty():
print(f'Duplicate SMILES found:\n{duplicates_smiles}')

# Write the transformed DataFrame to a new CSV file
df.write_csv(output_path_file)
Expand Down

0 comments on commit 4777611

Please sign in to comment.