Skip to content

Commit

Permalink
Refactor symlink_files function and update DesignIP class
Browse files Browse the repository at this point in the history
  • Loading branch information
alsmith151 committed Jan 12, 2024
1 parent 238dd31 commit 2895c8d
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 35 deletions.
49 changes: 41 additions & 8 deletions seqnado/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,9 +448,34 @@ class DesignIP(BaseModel):
)

@computed_field
@property
def sample_names_ip(self) -> List[str]:
    """Return the distinct IP sample names found in the design.

    Every experiment in ``self.assays`` contributes the ``name`` of its
    ``ip_files`` entry. Duplicates are collapsed through a set, so the
    order of the returned list is not defined.
    """
    unique_names = {assay.ip_files.name for assay in self.assays.values()}
    return list(unique_names)

@property
def sample_names_control(self) -> List[str]:
    """Return the distinct control sample names found in the design.

    Experiments whose ``control`` is ``None`` are skipped; for the rest the
    ``name`` of ``control_files`` is collected. Duplicates are collapsed
    through a set, so the order of the returned list is not defined.
    """
    unique_names = {
        assay.control_files.name
        for assay in self.assays.values()
        if assay.control is not None
    }
    return list(unique_names)

@property
def sample_names(self) -> List[str]:
    """Return every sample name in the design: IP samples, then controls.

    The result is ``sample_names_ip`` concatenated with
    ``sample_names_control``.
    """
    # The superseded ``return list(self.assays.keys())`` used to sit in front
    # of this line and made it unreachable; it returned experiment keys
    # rather than the per-sample IP/control names.
    return self.sample_names_ip + self.sample_names_control

@property
def ip_names(self) -> List[str]:
    """Return the distinct ``ip`` values across all experiments.

    Duplicates are collapsed through a set, so the order of the returned
    list is not defined.
    """
    distinct = {assay.ip for assay in self.assays.values()}
    return list(distinct)

@property
def control_names(self) -> List[str]:
    """Return the distinct ``control`` values across all experiments.

    Experiments without a control (``control is None``) are skipped so the
    result only ever contains strings, matching the declared return type.
    Names are de-duplicated in first-seen order.
    """
    controls = (assay.control for assay in self.assays.values())
    # dict.fromkeys de-duplicates while keeping insertion order, which makes
    # the result deterministic from run to run (a plain set is not).
    return list(dict.fromkeys(name for name in controls if name is not None))

@computed_field
@property
Expand Down Expand Up @@ -599,10 +624,16 @@ def symlink_files(
r2_path_new = pathlib.Path(f"{output_dir}/{assay_name}_2.fastq.gz")

if not r1_path_new.exists():
r1_path_new.symlink_to(assay.r1.path)
try:
r1_path_new.symlink_to(assay.r1.path.resolve())
except FileExistsError:
logger.warning(f"Symlink for {r1_path_new} already exists.")

if assay.r2 and not r2_path_new.exists():
r2_path_new.symlink_to(assay.r2.path)
try:
r2_path_new.symlink_to(assay.r2.path.resolve())
except FileExistsError:
logger.warning(f"Symlink for {r2_path_new} already exists.")


def symlink_fastq_files(
Expand All @@ -620,12 +651,14 @@ def symlink_fastq_files(

elif isinstance(design, DesignIP):
for experiment_name, experiment in design.assays.items():
for assay_name, assay in experiment.ip_files.items():
symlink_files(output_dir, assay, assay_name)
assay = experiment.ip_files
assay_name = assay.name
symlink_files(output_dir, assay, assay_name)

if experiment.control_files:
for assay_name, assay in experiment.control_files.items():
symlink_files(output_dir, assay, assay_name)
assay = experiment.control_files
assay_name = assay.name
symlink_files(output_dir, assay, assay_name)


def define_output_files(
Expand Down Expand Up @@ -676,7 +709,7 @@ def define_output_files(
assay_output.extend(
expand(
"seqnado_output/peaks/{method}/{ip}.bed",
ip=kwargs["sample_names_ip"],
ip=kwargs["ip"],
method=peak_calling_method,
)
)
Expand Down
2 changes: 1 addition & 1 deletion seqnado/workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_fastqc_files(*args, **kwargs):
fq_files = pathlib.Path("seqnado_output/fastqs").glob("*.fastq.gz")
for fq_file in fq_files:
fastqc_file = fastqc_dir / (fq_file.stem.replace(".fastq", "") + ".html")
fastqc_files.append(str(fq_file))
fastqc_files.append(str(fastqc_file))

return fastqc_files

Expand Down
66 changes: 40 additions & 26 deletions seqnado/workflow/snakefile_chip
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,46 @@ import shutil
from datetime import datetime
import glob
from snakemake.utils import min_version
import seqnado.utils
from seqnado.utils import DesignIP
import seqnado.utils as utils
import pandas as pd

####################
# Hardcoded config #
####################
ASSAY = "ChIP"
configfile: "config_chip.yml"
container: "library://asmith151/seqnado/seqnado_pipeline:latest"

seqnado.utils.format_config_dict(config)
####################
# Experiment config #
####################

# Get experiment design
# Load config
utils.format_config_dict(config)

# Generate design
if os.path.exists(config["design"]):
# Expect columns - sample fq1 fq2 antibody control
FASTQ_SAMPLES = seqnado.utils.ChipseqFastqSamples(
pd.read_csv(config["design"], sep="[\s+,\t]", engine="python")
)
assert FASTQ_SAMPLES.design.shape[0] > 0, "No samples found in design file"
df = pd.read_csv(config["design"], sep="[\s+,\t]", engine="python")
DESIGN = DesignIP.from_dataframe(df)
else:
DESIGN = DesignIP.from_directory(".")

for col in ["sample", "fq1", "fq2", "antibody", "control"]:
assert col in FASTQ_SAMPLES.design.columns, f"Design file must contain columns sample, fq1, fq2, antibody, control. Columns found: {FASTQ_SAMPLES.design.columns}"
# Attempt to symlink the fastq files
assert len(DESIGN.fastq_paths) > 0, "No fastq files found in the working directory or no design file provided."
utils.symlink_fastq_files(DESIGN, output_dir="seqnado_output/fastqs")

else:
# Use pattern matching to get samples
fq_files = list(seqnado.utils.get_fastq_files("."))
if fq_files:
FASTQ_SAMPLES = seqnado.utils.ChipseqFastqSamples.from_files(fq_files)
else:
raise ValueError("No FASTQ files found in the working directory")

DESIGN = FASTQ_SAMPLES.design
ANTIBODIES = FASTQ_SAMPLES.antibodies
SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all
SAMPLE_NAMES_IP = FASTQ_SAMPLES.sample_names_ip
SAMPLE_NAMES_CONTROL = FASTQ_SAMPLES.sample_names_control
SAMPLE_NAMES_PAIRED = FASTQ_SAMPLES.paired_ip_and_control



# Define global variables
SAMPLE_NAMES = DESIGN.sample_names
SAMPLE_NAMES_IP = DESIGN.sample_names_ip
SAMPLE_NAMES_CONTROL = DESIGN.sample_names_control
IP = DESIGN.ip_names
CONTROL = DESIGN.control_names

# Load required rules
include: "rules/qc.smk"
include: "rules/fastq_trim.smk"
include: "rules/align.smk"
Expand All @@ -49,13 +54,22 @@ include: "rules/heatmap.smk"
include: "rules/hub.smk"


# Define output files
# Call through the `utils` alias (bound by `import seqnado.utils as utils`),
# as the other calls in this Snakefile do; the bare `seqnado` package name is
# not guaranteed to be bound by the alias import alone.
# IP sample names are passed as `ip`, the key define_output_files reads for
# the peak-file targets.
ANALYSIS_OUTPUT = utils.define_output_files(
    sample_names=SAMPLE_NAMES,
    assay=ASSAY,
    ip=SAMPLE_NAMES_IP,
    **config
)


# Define wildcard constraints
wildcard_constraints:
read = r"[12]",
sample = "|".join(SAMPLE_NAMES),
treatment= "|".join(SAMPLE_NAMES_IP),
control = "|".join(SAMPLE_NAMES_CONTROL),


rule all:
input:
Expand Down

0 comments on commit 2895c8d

Please sign in to comment.