Skip to content

Commit

Permalink
Add benchmark dataset creation
Browse files Browse the repository at this point in the history
  • Loading branch information
adjavon committed Sep 4, 2024
1 parent 285efe3 commit 0daed58
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions notebooks/nontargeting_experiments/make_benchmark_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

# %% Make an intermediate dataset
# Builds a two-class benchmark table from the full dataset split CSV: every
# CCT2 row plus an equally sized random sample of nontargeting rows.
import pandas as pd

location = "/mnt/efs/dlmbl/G-et/csv/dataset_split_1168.csv"
benchmark_location = "/mnt/efs/dlmbl/G-et/csv/dataset_split_benchmark.csv"

# The two values of the `gene` column that make up the benchmark.
control_gene = "nontargeting"
target_gene = "CCT2"

metadata = pd.read_csv(location)

# %% Check that both genes are present before building anything from them
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a bad input through silently.
genes_present = set(metadata["gene"].unique())
missing_genes = [g for g in (control_gene, target_gene) if g not in genes_present]
if missing_genes:
    raise ValueError(f"Genes missing from {location}: {missing_genes}")

# %% Keep only the nontargeting and CCT2 genes
sample = metadata[metadata["gene"].isin([control_gene, target_gene])]

# %%
# Per-gene row counts in the train split. This bare expression is only
# displayed when the cell is run interactively; as a plain script it is a no-op.
sample[sample["split"] == "train"]["gene"].value_counts()

# %%
# Sub-sample the non-targeting ones to have the same number of cells as CCT2
# NOTE(review): the balancing is done over all splits pooled together, so the
# two genes are not guaranteed to be balanced within each split -- confirm
# that this is intended.
sampled_cct2 = sample[sample["gene"] == target_gene]
nontargeting = sample[sample["gene"] == control_gene]
if len(nontargeting) < len(sampled_cct2):
    # Sampling without replacement cannot return more rows than exist; fail
    # with the actual counts rather than pandas' generic message.
    raise ValueError(
        f"Cannot balance classes: only {len(nontargeting)} {control_gene} rows "
        f"for {len(sampled_cct2)} {target_gene} rows"
    )
# Fixed seed so the benchmark is reproducible across runs.
sampled_nontargeting = nontargeting.sample(n=len(sampled_cct2), random_state=42)

# %%
sampled = pd.concat([sampled_nontargeting, sampled_cct2])

# %%
sampled.to_csv(benchmark_location, index=False)

# %%

0 comments on commit 0daed58

Please sign in to comment.