Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aviti run manifest dev #359

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a8d93fc
remove unused code
kedhammar Sep 11, 2024
e80b6c7
remove unused code
kedhammar Sep 11, 2024
ee7aa17
differentiate phix sets
kedhammar Sep 11, 2024
6be8ca2
handle special idxs
kedhammar Sep 11, 2024
1f81f7a
fix syntax error
kedhammar Sep 11, 2024
b7d4504
try tackling mypy issues
kedhammar Sep 11, 2024
12c43db
improve readability for mypy
kedhammar Sep 11, 2024
1aba9b1
revcomp all i5 for dual idxs
kedhammar Sep 11, 2024
db5dac9
refactor and fix ss3
kedhammar Sep 11, 2024
f8de81c
add todods
kedhammar Sep 11, 2024
221803a
bugfix
kedhammar Sep 11, 2024
fc9f6d1
big wip
kedhammar Sep 12, 2024
dfabee5
starting to look good :)
kedhammar Sep 12, 2024
071936e
mypy fix
kedhammar Sep 12, 2024
8082fd0
bump vlog
kedhammar Sep 12, 2024
70938ee
Merge branch 'master' into aviti-manifest-fix
kedhammar Sep 12, 2024
e18ded7
remove todos
kedhammar Sep 12, 2024
ac5028e
Merge branch 'aviti-manifest-fix' of github.com:kedhammar/scilifelab_…
kedhammar Sep 12, 2024
44ed19b
group by lane
kedhammar Sep 12, 2024
4b8eeab
bugfix
kedhammar Sep 12, 2024
3e9467f
Merge branch 'master' into aviti-manifest-fix
kedhammar Sep 12, 2024
036d79d
typo
kedhammar Sep 12, 2024
6b6138f
Merge branch 'aviti-manifest-fix' of github.com:kedhammar/scilifelab_…
kedhammar Sep 12, 2024
d5f4eae
make untrimmed, trimmed and partitioned manifests
kedhammar Sep 13, 2024
ebc02b1
Merge branch 'master' into aviti-manifest-fix
kedhammar Sep 13, 2024
f8464ec
Update VERSIONLOG.md
kedhammar Sep 13, 2024
176d6ec
simplify code and fix mypy
kedhammar Sep 13, 2024
0f41c00
Merge branch 'aviti-manifest-fix' of github.com:kedhammar/scilifelab_…
kedhammar Sep 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 117 additions & 36 deletions scripts/generate_aviti_run_manifest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python

import json
import logging
import os
import re
Expand All @@ -13,12 +14,56 @@
from genologics.lims import Lims
from Levenshtein import hamming as distance

from data.Chromium_10X_indexes import Chromium_10X_indexes
from scilifelab_epps.epp import upload_file
from scilifelab_epps.wrapper import epp_decorator
from scripts.generate_minknow_samplesheet import get_pool_sample_label_mapping

# Timestamp captured once at import time, formatted with "%y%m%d_%H%M%S".
TIMESTAMP = dt.now().strftime("%y%m%d_%H%M%S")

# Matches an index sequence (4+ bases), optionally followed by "-" and a second one.
LABEL_SEQ_SUBSTRING = re.compile(r"[ACGT]{4,}(-[ACGT]{4,})?")

# Pre-compile regexes in global scope:
# IDX_PAT has two capture groups: (first index incl. trailing Ns, optional second index).
IDX_PAT = re.compile(r"([ATCG]{4,}N*)-?([ATCG]*)")
# The 10X and SmartSeq patterns have no capture groups; they match the kit name itself.
TENX_SINGLE_PAT = re.compile(r"SI-(?:GA|NA)-[A-H][1-9][0-2]?")
TENX_DUAL_PAT = re.compile(r"SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
SMARTSEQ_PAT = re.compile(r"SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")

# Set up Element PhiX control sets, keys are options in LIMS dropdown UDF.
# Each set carries a short "nickname" (used as sample/project name in the
# manifest rows) and a list of (Index1, Index2) pairs.
PHIX_SETS = {
    "PhiX Control Library, Adept": {
        "nickname": "PhiX_Adept",
        "indices": [
            ("ATGTCGCTAG", "CTAGCTCGTA"),
            ("CACAGATCGT", "ACGAGAGTCT"),
            ("GCACATAGTC", "GACTACTAGC"),
            ("TGTGTCGACA", "TGTCTGACAG"),
        ],
    },
    "Cloudbreak PhiX Control Library, Elevate": {
        "nickname": "PhiX_Elevate",
        "indices": [
            ("ACGTGTAGC", "GCTAGTGCA"),
            ("CACATGCTG", "AGACACTGT"),
            ("GTACACGAT", "CTCGTACAG"),
            ("TGTGCATCA", "TAGTCGATC"),
        ],
    },
    # NOTE(review): these index pairs are identical to the Adept set above --
    # confirm that this is intended.
    "Cloudbreak Freestyle PhiX Control, Third Party": {
        "nickname": "PhiX_Third",
        "indices": [
            ("ATGTCGCTAG", "CTAGCTCGTA"),
            ("CACAGATCGT", "ACGAGAGTCT"),
            ("GCACATAGTC", "GACTACTAGC"),
            ("TGTGTCGACA", "TGTCTGACAG"),
        ],
    },
}

# Load SS3 indexes (read from a fixed path on the LIMS server at import time)
SMARTSEQ3_indexes_json = (
    "/opt/gls/clarity/users/glsai/repos/scilifelab_epps/data/SMARTSEQ3_indexes.json"
)
with open(SMARTSEQ3_indexes_json) as file:
    SMARTSEQ3_indexes = json.load(file)


def get_flowcell_id(process: Process) -> str:
Expand Down Expand Up @@ -74,6 +119,47 @@ def get_settings_section() -> str:
return settings_section


def idxs_from_label(label: str) -> list[str | tuple[str, str]]:
    """From a LIMS reagent label, return list whose elements are
    single indices or tuples of dual index pairs.

    Args:
        label: Reagent label text to parse.

    Returns:
        One element per index (pair) the label expands to. A single index is
        returned as a plain string, a dual index as an ``(index1, index2)``
        tuple.

    Raises:
        AssertionError: If the label is a NoIndex/empty label, or if no index
            can be parsed from it.
    """

    # Initialize result
    idxs: list[str | tuple[str, str]] = []

    # Expand 10X single indexes: the kit name maps to a collection of indexes
    if tenx_single := TENX_SINGLE_PAT.search(label):
        idxs.extend(Chromium_10X_indexes[tenx_single.group(0)])

    # Case of 10X dual indexes: look the kit name up first, THEN pick the
    # i7 (element 0) and i5 (element 1) entries from the table. Indexing the
    # matched string itself would only yield single characters of the name.
    elif tenx_dual := TENX_DUAL_PAT.search(label):
        tenx_pair = Chromium_10X_indexes[tenx_dual.group(0)]
        i7_idx, i5_idx = tenx_pair[0], tenx_pair[1]
        # i5 is passed through revcomp(), which is defined elsewhere in this file
        idxs.append((i7_idx, revcomp(i5_idx)))

    # Case of SS3 indexes: every i7 is combined with every i5
    elif SMARTSEQ_PAT.search(label):
        ss3_i7_idxs = SMARTSEQ3_indexes[label][0]
        ss3_i5_idxs = SMARTSEQ3_indexes[label][1]
        for i7_idx in ss3_i7_idxs:
            for i5_idx in ss3_i5_idxs:
                idxs.append((i7_idx, revcomp(i5_idx)))

    # NoIndex cases
    elif label.replace(",", "").upper() in ("NOINDEX", ""):
        raise AssertionError("NoIndex cases not allowed.")

    # Ordinary indexes. IDX_PAT has two capture groups, so read them from the
    # match object instead of testing the match for a "-" separator.
    elif idx_match := IDX_PAT.search(label):
        idx1, idx2 = idx_match.groups()
        if idx2:
            idxs.append((idx1, idx2))
        else:
            idxs.append(idx1)

    else:
        raise AssertionError(f"Could not parse index from '{label}'.")

    return idxs


def get_samples_section(process: Process) -> str:
"""Generate the [SAMPLES] section of the AVITI run manifest and return it as a string."""

Expand Down Expand Up @@ -103,28 +189,13 @@ def get_samples_section(process: Process) -> str:
), "Unequal number of samples and reagent labels."

sample2label: dict[str, str] = get_pool_sample_label_mapping(art_out)
samples = art_out.samples
labels = art_out.reagent_labels

assert len(set(labels)) == len(labels), "Detected non-unique reagent labels."
assert len(set(art_out.reagent_labels)) == len(
art_out.reagent_labels
), "Detected non-unique reagent labels."

samples = art_out.samples
# Iterate over samples
for sample in samples:
lims_label = sample2label[sample.name]

# Parse sample index
label_seq_match = re.search(LABEL_SEQ_SUBSTRING, lims_label)
assert (
label_seq_match is not None
), f"Could not parse label sequence from {lims_label}"
label_seq = label_seq_match.group(0)

if "-" in label_seq:
index1, index2 = label_seq.split("-")
else:
index1 = label_seq
index2 = ""

# Project name and sequencing setup
if sample.project:
project = sample.project.name.replace(".", "__").replace(",", "")
Expand All @@ -133,34 +204,44 @@ def get_samples_section(process: Process) -> str:
project = "Control"
seq_setup = "0-0"

row = {}
row["SampleName"] = sample.name
row["Index1"] = index1
row["Index2"] = index2
row["Lane"] = lane
row["Project"] = project
row["Recipe"] = seq_setup
# Add row(s), depending on index type
lims_label = sample2label[sample.name]
for idx in idxs_from_label(lims_label):
row = {}
row["SampleName"] = sample.name
if isinstance(idx, tuple):
row["Index1"], row["Index2"] = idx
else:
row["Index1"] = idx
row["Index2"] = ""
row["Lane"] = lane
row["Project"] = project
row["Recipe"] = seq_setup

lane_rows.append(row)
lane_rows.append(row)

# Add PhiX controls if added:
phix_loaded: bool = art_out.udf["% phiX"] != 0
phix_set_name = art_out.udf.get("Element PhiX Set", None)

if phix_loaded:
for phix_idx_pair in [
("ACGTGTAGC", "GCTAGTGCA"),
("CACATGCTG", "AGACACTGT"),
("GTACACGAT", "CTCGTACAG"),
("TGTGCATCA", "TAGTCGATC"),
]:
assert (
phix_set_name is not None
), "PhiX controls loaded but no kit specified."

phix_set = PHIX_SETS[phix_set_name]

for phix_idx_pair in phix_set["indices"]:
row = {}
row["SampleName"] = "PhiX"
row["SampleName"] = phix_set["nickname"]
row["Index1"] = phix_idx_pair[0]
row["Index2"] = phix_idx_pair[1]
row["Lane"] = lane
row["Project"] = "PhiX"
row["Project"] = phix_set["nickname"]
row["Recipe"] = "0-0"
lane_rows.append(row)
else:
assert phix_set is None, "PhiX controls specified but not loaded."

# Check for index collision within lane, across samples and PhiX
check_distances(lane_rows)
Expand Down
105 changes: 0 additions & 105 deletions scripts/samplesheet_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,84 +621,6 @@ def gen_Nextseq_lane_data(pro):
return (content, data)


def gen_MinION_QC_data(pro):
    """Build the MinION samplesheet content for a process and return it as a string.

    For every output of `pro` whose name matches NGISAMPLE_PAT, one line
    "<sample name>,<nanopore barcode>,<index type>,<index>" is produced.
    A 10X single-index label expands to one line per index in the set, with
    "_<position>" appended to the sample name. Lines are sorted by sample
    name. The two index columns are left empty unless the process type name
    is "MinION QC".
    """
    include_idx_cols = pro.type.name == "MinION QC"
    records = []

    for out in pro.all_outputs():
        # Only outputs named like NGI samples are reported
        if not NGISAMPLE_PAT.findall(out.name):
            continue

        # The barcode sequence is the part after the first "_" of the UDF value
        barcode_udf = out.udf["Nanopore Barcode"]
        barcode_seq = barcode_udf.split("_")[1] if barcode_udf != "None" else ""
        label = out.reagent_labels[0]

        record = {"sn": out.name, "npbs": barcode_seq}

        # Case of 10X indexes: one record per index in the set
        tenx_single = TENX_SINGLE_PAT.findall(label)
        if tenx_single:
            tenx_set = Chromium_10X_indexes[tenx_single[0]]
            for tenx_idx in tenx_set:
                position = tenx_set.index(tenx_idx) + 1
                records.append(
                    {
                        "sn": record["sn"] + "_" + str(position),
                        "npbs": record["npbs"],
                        "idxt": "truseq",
                        "idx": tenx_idx.replace(",", ""),
                    }
                )
            continue

        tenx_dual = TENX_DUAL_PAT.findall(label)
        if tenx_dual:
            # Case of 10X dual indexes
            tenx_pair = Chromium_10X_indexes[tenx_dual[0]]
            record["idxt"] = "truseq_dual"
            record["idx"] = tenx_pair[0] + "-" + tenx_pair[1]
        elif label == "NoIndex" or not label:
            # Case of NoIndex
            record["idxt"] = "truseq"
            record["idx"] = ""
        else:
            # Prefer an index sequence given between brackets, otherwise use
            # the label as-is; a "-" marks a dual index in both cases.
            bracketed = re.findall(r"\((.*?)\)", label)
            idx_seq = bracketed[0] if bracketed else label
            record["idxt"] = "truseq" if "-" not in idx_seq else "truseq_dual"
            record["idx"] = idx_seq

        records.append(record)

    csv_lines = []
    for rec in sorted(records, key=lambda x: x["sn"]):
        idx_cols = [rec["idxt"], rec["idx"]] if include_idx_cols else ["", ""]
        csv_lines.append(",".join([rec["sn"], rec["npbs"], *idx_cols]) + "\n")

    return "".join(csv_lines)


def find_barcode(sample_idxs, sample, process):
# print "trying to find {} barcode in {}".format(sample.name, process.name)
for art in process.all_inputs():
Expand Down Expand Up @@ -809,33 +731,6 @@ def main(lims, args):
except Exception as e:
log.append(str(e))

elif process.type.name in [
"MinION QC",
"Load Sample and Sequencing (MinION) 1.0",
]:
content = gen_MinION_QC_data(process)
run_type = "QC" if process.type.name == "MinION QC" else "DELIVERY"
fc_name = (
run_type
+ "_"
+ process.udf["Nanopore Kit"]
+ "_"
+ process.udf["Flowcell ID"].upper()
+ "_"
+ "Samplesheet"
+ "_"
+ process.id
)
if os.path.exists(f"/srv/ngi-nas-ns/samplesheets/nanopore/{thisyear}"):
try:
with open(
f"/srv/ngi-nas-ns/samplesheets/nanopore/{thisyear}/{fc_name}.csv",
"w",
) as sf:
sf.write(content)
except Exception as e:
log.append(str(e))

if not args.test:
for out in process.all_outputs():
if out.name == "Scilifelab SampleSheet":
Expand Down
Loading