Skip to content

Commit

Permalink
Merge pull request #372 from kedhammar/aviti-manifest-dev
Browse files Browse the repository at this point in the history
Improve manifest generation
  • Loading branch information
kedhammar authored Oct 9, 2024
2 parents 0ef0e9c + 173a111 commit 44e5817
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 129 deletions.
4 changes: 4 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Scilifelab_epps Version Log

## 20241009.1

Improve AVITI run manifest generation with sample-level settings. No longer produce submanifests.

## 20241006.2

Improve aviti run parameter parser
Expand Down
197 changes: 68 additions & 129 deletions scripts/generate_aviti_run_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,18 @@ def get_flowcell_id(process: Process) -> str:
return flowcell_id


def dict_to_manifest_col(d: dict) -> str:
"""Turn a list of key-value pairs into a string fitting into a manifest column."""
for k, v in d.items():
for char in [",", ":", " "]:
assert char not in k, f"Character '{char}' not allowed in manifest columns."
assert char not in v, f"Character '{char}' not allowed in manifest columns."

s = " ".join([f"{k}:{v}" for k, v in d.items()])

return s


def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str, str]]:
"""Generate multiple manifests, grouping samples by index multiplicity and length,
adding PhiX controls of appropriate lengths as needed.
Expand All @@ -146,6 +158,7 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,

# Assert lanes
lanes = [art_out.location[1].split(":")[0] for art_out in arts_out]
lanes.sort()
assert set(lanes) == {"1"} or set(lanes) == {
"1",
"2",
Expand Down Expand Up @@ -199,6 +212,16 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
row["Recipe"] = seq_setup
row["phix_loaded"] = phix_loaded
row["phix_set_name"] = phix_set_name
row["lims_label"] = lims_label

# Add special case settings
row_settings = {}
if TENX_SINGLE_PAT.findall(lims_label):
# For 10X 8-mer single indexes (e.g. SI-NA-A1) it is usually required that
# index 1 sequences shall be written as a separate FastQ file (I1).
# In this case we need the additional option I1Fastq,TRUE.
row_settings["I1Fastq"] = "True"
row["settings"] = dict_to_manifest_col(row_settings)

sample_rows.append(row)

Expand Down Expand Up @@ -234,164 +257,80 @@ def get_manifests(process: Process, manifest_root_name: str) -> list[tuple[str,
rows_to_check = group.to_dict(orient="records")
check_distances(rows_to_check)

manifests = []
for manifest_type in ["untrimmed", "trimmed", "empty", "partitioned"]:
manifests += make_manifests_by_type(
df_samples_and_controls, process, manifest_root_name, manifest_type
# Start building manifests
manifests: list[tuple[str, str]] = []
for manifest_type in ["untrimmed", "trimmed", "empty"]:
manifest_name, manifest_contents = make_manifest(
df_samples_and_controls,
process,
manifest_root_name,
manifest_type,
)
manifests.append((manifest_name, manifest_contents))

return manifests


def make_manifests_by_type(
def make_manifest(
df_samples_and_controls: pd.DataFrame,
process: Process,
manifest_root_name: str,
manifest_type: str,
) -> list[tuple[str, str]]:
df = df_samples_and_controls.copy()
) -> tuple[str, str]:
# Make copy of input df and subset columns to include in manifest
df = df_samples_and_controls[
[
"SampleName",
"Index1",
"Index2",
"Lane",
"Project",
"Recipe",
"lims_label",
"settings",
]
].copy()

file_name = f"{manifest_root_name}_{manifest_type}.csv"
runValues_section = "\n".join(
[
"[RUNVALUES]",
"KeyName, Value",
f"lims_step_name, {process.type.name}",
f"lims_step_id, {process.id}",
f"manifest_file, {file_name}",
]
)

# Settings section is the same across all manifest types
settings_section = "\n".join(
[
"[SETTINGS]",
"SettingName, Value",
]
)

manifests = []
if manifest_type == "untrimmed":
file_name = f"{manifest_root_name}_untrimmed.csv"

runValues_section = "\n".join(
[
"[RUNVALUES]",
"KeyName, Value",
f'lims_step_name, "{process.type.name}"',
f'lims_step_id, "{process.id}"',
f'manifest_file, "{file_name}"',
]
)

samples_section = (
f"[SAMPLES]\n{df.iloc[:, 0:6].to_csv(index=None, header=True)}"
)

manifest_contents = "\n\n".join(
[runValues_section, settings_section, samples_section]
)

manifests.append((file_name, manifest_contents))
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"

elif manifest_type == "trimmed":
file_name = f"{manifest_root_name}_trimmed.csv"

runValues_section = "\n".join(
[
"[RUNVALUES]",
"KeyName, Value",
f'lims_step_name, "{process.type.name}"',
f'lims_step_id, "{process.id}"',
f'manifest_file, "{file_name}"',
]
)

# Trim down
min_idx1_len = df["Index1"].apply(len).min()
min_idx2_len = df["Index2"].apply(len).min()
df["Index1"] = df["Index1"].apply(lambda x: x[:min_idx1_len])
df["Index2"] = df["Index2"].apply(lambda x: x[:min_idx2_len])

samples_section = (
f"[SAMPLES]\n{df.iloc[:, 0:6].to_csv(index=None, header=True)}"
)

manifest_contents = "\n\n".join(
[runValues_section, settings_section, samples_section]
)
manifests.append((file_name, manifest_contents))
samples_section = f"[SAMPLES]\n{df.to_csv(index=None, header=True)}"

elif manifest_type == "empty":
file_name = f"{manifest_root_name}_empty.csv"

runValues_section = "\n".join(
[
"[RUNVALUES]",
"KeyName, Value",
f'lims_step_name, "{process.type.name}"',
f'lims_step_id, "{process.id}"',
f'manifest_file, "{file_name}"',
]
)

manifest_contents = "\n\n".join([runValues_section, settings_section])
manifests.append((file_name, manifest_contents))

elif manifest_type == "partitioned":
# Drop PhiX controls, to be re-added by length
df = df[df["Project"] != "Control"]

# Get idx lengths for calculations
df.loc[:, "len_idx1"] = df["Index1"].apply(len)
df.loc[:, "len_idx2"] = df["Index2"].apply(len)

# Break down by index lengths and lane, creating composite manifests
n = 0
for (len_idx1, len_idx2, lane), group in df.groupby(
["len_idx1", "len_idx2", "Lane"]
):
file_name = f"{manifest_root_name}_{n}.csv"
runValues_section = "\n".join(
[
"[RUNVALUES]",
"KeyName, Value",
f'lims_step_name, "{process.type.name}"',
f'lims_step_id, "{process.id}"',
f'manifest_file, "{file_name}"',
f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}",
f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}",
]
)

# Add PhiX stratified by index length
if group["phix_loaded"].any():
phix_set_name = group["phix_set_name"].iloc[0]
phix_set = PHIX_SETS[phix_set_name]

# Add row for each PhiX index pair
for phix_idx_pair in phix_set["indices"]:
row = {}
row["SampleName"] = phix_set["nickname"]
row["Index1"] = fit_seq(phix_idx_pair[0], len_idx1)
row["Index2"] = fit_seq(phix_idx_pair[1], len_idx2)
row["Lane"] = group["Lane"].iloc[0]
row["Project"] = "Control"
row["Recipe"] = "0-0"
row["len_idx1"] = len_idx1
row["len_idx2"] = len_idx2

group = pd.concat(
[group, pd.DataFrame([row])],
ignore_index=True,
)

samples_section = (
f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}"
)

manifest_contents = "\n\n".join(
[runValues_section, settings_section, samples_section]
)

manifests.append((file_name, manifest_contents))
n += 1
samples_section = ""

else:
raise AssertionError("Invalid manifest type.")

manifests.sort(key=lambda x: x[0])
manifest_contents = "\n\n".join(
[runValues_section, settings_section, samples_section]
)

return manifests
return (file_name, manifest_contents)


def fit_seq(seq: str, length: int, seq_extension: str | None = None) -> str:
Expand Down Expand Up @@ -513,7 +452,7 @@ def main(args: Namespace):
manifest_root_name = f"AVITI_run_manifest_{flowcell_id}_{process.id}_{TIMESTAMP}_{process.technician.name.replace(' ','')}"

# Create manifest(s)
manifests = get_manifests(process, manifest_root_name)
manifests: list[tuple[str, str]] = get_manifests(process, manifest_root_name)

# Write manifest(s)
for file, content in manifests:
Expand All @@ -522,9 +461,9 @@ def main(args: Namespace):
# Zip manifest(s)
zip_file = f"{manifest_root_name}.zip"
files = [file for file, _ in manifests]
with ZipFile(zip_file, "w") as zipf:
with ZipFile(zip_file, "w") as zip_stream:
for file in files:
zipf.write(file)
zip_stream.write(file)
os.remove(file)

# Upload manifest(s)
Expand Down

0 comments on commit 44e5817

Please sign in to comment.