diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 8ca5110f..fc4cce2f 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # Scilifelab_epps Version Log +## 20241009.1 + +Improve AVITI run manifest generation with sample-level settings. No longer produce submanifests. + ## 20241006.2 Improve aviti run parameter parser diff --git a/scripts/generate_aviti_run_manifest.py b/scripts/generate_aviti_run_manifest.py index cf9fe732..37e2909f 100644 --- a/scripts/generate_aviti_run_manifest.py +++ b/scripts/generate_aviti_run_manifest.py @@ -133,6 +133,18 @@ def get_flowcell_id(process: Process) -> str: return flowcell_id +def dict_to_manifest_col(d: dict) -> str: + """Turn a dict of key-value pairs into a string fitting into a manifest column.""" + for k, v in d.items(): + for char in [",", ":", " "]: + assert char not in k, f"Character '{char}' not allowed in manifest columns." + assert char not in v, f"Character '{char}' not allowed in manifest columns." + + s = " ".join([f"{k}:{v}" for k, v in d.items()]) + + return s + + def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, str]]: """Generate multiple manifests, grouping samples by index multiplicity and length, adding PhiX controls of appropriate lengths as needed. @@ -146,6 +158,7 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, # Assert lanes lanes = [art_out.location[1].split(":")[0] for art_out in arts_out] + lanes.sort() assert set(lanes) == {"1"} or set(lanes) == { "1", "2", @@ -199,6 +212,16 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, row["Recipe"] = seq_setup row["phix_loaded"] = phix_loaded row["phix_set_name"] = phix_set_name + row["lims_label"] = lims_label + + # Add special case settings + row_settings = {} + if TENX_SINGLE_PAT.findall(lims_label): + # For 10X 8-mer single indexes (e.g. SI-NA-A1) it is usually required that + # index 1 sequences be written as a separate FastQ file (I1). 
+ # In this case we need the additional option I1Fastq,TRUE. + row_settings["I1Fastq"] = "True" + row["settings"] = dict_to_manifest_col(row_settings) sample_rows.append(row) @@ -234,24 +257,51 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, rows_to_check = group.to_dict(orient="records") check_distances(rows_to_check) - manifests = [] - for manifest_type in ["untrimmed", "trimmed", "empty", "partitioned"]: - manifests += make_manifests_by_type( - df_samples_and_controls, process, manifest_root_name, manifest_type + # Start building manifests + manifests: list[tuple[str, str]] = [] + for manifest_type in ["untrimmed", "trimmed", "empty"]: + manifest_name, manifest_contents = make_manifest( + df_samples_and_controls, + process, + manifest_root_name, + manifest_type, ) + manifests.append((manifest_name, manifest_contents)) return manifests -def make_manifests_by_type( +def make_manifest( df_samples_and_controls: pd.DataFrame, process: Process, manifest_root_name: str, manifest_type: str, -) -> list[tuple[str, str]]: - df = df_samples_and_controls.copy() +) -> tuple[str, str]: + # Make copy of input df and subset columns to include in manifest + df = df_samples_and_controls[ + [ + "SampleName", + "Index1", + "Index2", + "Lane", + "Project", + "Recipe", + "lims_label", + "settings", + ] + ].copy() + + file_name = f"{manifest_root_name}_{manifest_type}.csv" + runValues_section = "\n".join( + [ + "[RUNVALUES]", + "KeyName, Value", + f"lims_step_name, {process.type.name}", + f"lims_step_id, {process.id}", + f"manifest_file, {file_name}", + ] + ) - # Settings section is the same across all manifest types settings_section = "\n".join( [ "[SETTINGS]", @@ -259,139 +309,28 @@ def make_manifests_by_type( ] ) - manifests = [] if manifest_type == "untrimmed": - file_name = f"{manifest_root_name}_untrimmed.csv" - - runValues_section = "\n".join( - [ - "[RUNVALUES]", - "KeyName, Value", - f'lims_step_name, "{process.type.name}"', - f'lims_step_id, 
"{process.id}"', - f'manifest_file, "{file_name}"', - ] - ) - - samples_section = ( - f"[SAMPLES]\n{df.iloc[:, 0:6].to_csv(index=None, header=True)}" - ) - - manifest_contents = "\n\n".join( - [runValues_section, settings_section, samples_section] - ) - - manifests.append((file_name, manifest_contents)) + samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}" elif manifest_type == "trimmed": - file_name = f"{manifest_root_name}_trimmed.csv" - - runValues_section = "\n".join( - [ - "[RUNVALUES]", - "KeyName, Value", - f'lims_step_name, "{process.type.name}"', - f'lims_step_id, "{process.id}"', - f'manifest_file, "{file_name}"', - ] - ) - - # Trim down min_idx1_len = df["Index1"].apply(len).min() min_idx2_len = df["Index2"].apply(len).min() df["Index1"] = df["Index1"].apply(lambda x: x[:min_idx1_len]) df["Index2"] = df["Index2"].apply(lambda x: x[:min_idx2_len]) - samples_section = ( - f"[SAMPLES]\n{df.iloc[:, 0:6].to_csv(index=None, header=True)}" - ) - - manifest_contents = "\n\n".join( - [runValues_section, settings_section, samples_section] - ) - manifests.append((file_name, manifest_contents)) + samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}" elif manifest_type == "empty": - file_name = f"{manifest_root_name}_empty.csv" - - runValues_section = "\n".join( - [ - "[RUNVALUES]", - "KeyName, Value", - f'lims_step_name, "{process.type.name}"', - f'lims_step_id, "{process.id}"', - f'manifest_file, "{file_name}"', - ] - ) - - manifest_contents = "\n\n".join([runValues_section, settings_section]) - manifests.append((file_name, manifest_contents)) - - elif manifest_type == "partitioned": - # Drop PhiX controls, to be re-added by length - df = df[df["Project"] != "Control"] - - # Get idx lengths for calculations - df.loc[:, "len_idx1"] = df["Index1"].apply(len) - df.loc[:, "len_idx2"] = df["Index2"].apply(len) - - # Break down by index lengths and lane, creating composite manifests - n = 0 - for (len_idx1, len_idx2, lane), group in 
df.groupby( - ["len_idx1", "len_idx2", "Lane"] - ): - file_name = f"{manifest_root_name}_{n}.csv" - runValues_section = "\n".join( - [ - "[RUNVALUES]", - "KeyName, Value", - f'lims_step_name, "{process.type.name}"', - f'lims_step_id, "{process.id}"', - f'manifest_file, "{file_name}"', - f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}", - f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}", - ] - ) - - # Add PhiX stratified by index length - if group["phix_loaded"].any(): - phix_set_name = group["phix_set_name"].iloc[0] - phix_set = PHIX_SETS[phix_set_name] - - # Add row for each PhiX index pair - for phix_idx_pair in phix_set["indices"]: - row = {} - row["SampleName"] = phix_set["nickname"] - row["Index1"] = fit_seq(phix_idx_pair[0], len_idx1) - row["Index2"] = fit_seq(phix_idx_pair[1], len_idx2) - row["Lane"] = group["Lane"].iloc[0] - row["Project"] = "Control" - row["Recipe"] = "0-0" - row["len_idx1"] = len_idx1 - row["len_idx2"] = len_idx2 - - group = pd.concat( - [group, pd.DataFrame([row])], - ignore_index=True, - ) - - samples_section = ( - f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}" - ) - - manifest_contents = "\n\n".join( - [runValues_section, settings_section, samples_section] - ) - - manifests.append((file_name, manifest_contents)) - n += 1 + samples_section = "" else: raise AssertionError("Invalid manifest type.") - manifests.sort(key=lambda x: x[0]) + manifest_contents = "\n\n".join( + [runValues_section, settings_section, samples_section] + ) - return manifests + return (file_name, manifest_contents) def fit_seq(seq: str, length: int, seq_extension: str | None = None) -> str: @@ -513,7 +452,7 @@ def main(args: Namespace): manifest_root_name = f"AVITI_run_manifest_{flowcell_id}_{process.id}_{TIMESTAMP}_{process.technician.name.replace(' ','')}" # Create manifest(s) - manifests = get_manifests(process, manifest_root_name) + manifests: list[tuple[str, str]] = get_manifests(process, 
manifest_root_name) # Write manifest(s) for file, content in manifests: @@ -522,9 +461,9 @@ def main(args: Namespace): # Zip manifest(s) zip_file = f"{manifest_root_name}.zip" files = [file for file, _ in manifests] - with ZipFile(zip_file, "w") as zipf: + with ZipFile(zip_file, "w") as zip_stream: for file in files: - zipf.write(file) + zip_stream.write(file) os.remove(file) # Upload manifest(s)