forked from josenimo/PyProteomics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
contaminants.py
31 lines (24 loc) · 1.08 KB
/
contaminants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import numpy as np
from tabulate import tabulate
def filter_out_contaminants(adata, qc_export_path=None):
    """Drop contaminant proteins from ``adata`` and report what was removed.

    A protein (one column of ``adata``) is flagged as a contaminant when the
    literal marker ``"Cont_"`` occurs in its ``adata.var["Protein.Ids"]``
    entry or in its name in ``adata.var_names``. The flagged proteins are
    printed as a table sorted by a ``Species`` label, which is taken as the
    text after the last ``"_"`` of ``Protein.Names``.

    Parameters
    ----------
    adata
        Object exposing ``.var`` (a DataFrame with the columns
        ``"Protein.Ids"``, ``"Protein.Names"`` and ``"Genes"``),
        ``.var_names``, ``.shape`` and boolean column indexing via
        ``adata[:, mask]`` -- presumably an ``anndata.AnnData``.
    qc_export_path
        Optional path. When truthy, the table of removed proteins is also
        written there as CSV.

    Returns
    -------
    ``adata[:, ~mask]``, i.e. the input restricted to non-contaminant
    proteins. The input object itself is not modified.
    """
    print("----- Filter out contaminants -----")

    # regex=False: "Cont_" is a literal marker, not a pattern.
    # na=False: a missing identifier is treated as "not a contaminant" so the
    # mask stays strictly boolean and can be safely OR-ed and inverted below.
    flagged_by_id = adata.var["Protein.Ids"].str.contains(
        "Cont_", regex=False, na=False
    )
    flagged_by_name = adata.var_names.str.contains(
        "Cont_", regex=False, na=False
    )
    # Plain positional boolean array: avoids any index alignment between the
    # Series and the Index-derived result.
    contaminant_mask = (
        np.asarray(flagged_by_id, dtype=bool)
        | np.asarray(flagged_by_name, dtype=bool)
    )

    # Build the report from an explicit copy of the annotation rows instead of
    # assigning a column on a subset of ``adata`` (which would write into a
    # view of the caller's object).
    report_columns = ["Genes", "Protein.Names", "Species"]
    removed = adata.var.loc[contaminant_mask, ["Genes", "Protein.Names"]].copy()
    removed["Species"] = removed["Protein.Names"].str.split("_").str[-1]
    removed = removed.sort_values(by="Species")[report_columns]

    print("the following proteins were filtered out:")
    print(tabulate(
        removed.values,
        headers=report_columns,
        tablefmt='psql',
        showindex="always",
        # showindex prepends an index column, so the first width entry belongs
        # to it; None leaves the index unwrapped and keeps the three 20-char
        # limits aligned with Genes / Protein.Names / Species.
        maxcolwidths=[None, 20, 20, 20]))

    if qc_export_path:
        removed.to_csv(qc_export_path)

    adata = adata[:, ~contaminant_mask]
    print(f"The output object has {adata.shape[1]} proteins in it")
    print("\n")
    return adata