Skip to content

Commit

Permalink
fix: CQDG-00 Use study code to normalize variants
Browse files Browse the repository at this point in the history
  • Loading branch information
jecos committed Oct 27, 2023
1 parent 913cd31 commit 9569f43
Show file tree
Hide file tree
Showing 9 changed files with 18 additions and 16 deletions.
2 changes: 1 addition & 1 deletion config/output/config/dev-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
2 changes: 1 addition & 1 deletion config/output/config/prod-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
2 changes: 1 addition & 1 deletion config/output/config/qa-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ object ConfigurationGenerator extends App {
DatasetConf(
id = "raw_vcf",
storageid = storage_vcf,
path = "/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz",
path = "/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz",
format = VCF,
loadtype = OverWrite,
partitionby = List("chromosome"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ import org.apache.spark.sql.functions.{array_contains, col, lit}

import java.time.LocalDateTime

case class Consequences(rc: RuntimeETLContext, studyId: String, owner: String, dataset: String, referenceGenomePath: Option[String]) extends BaseConsequences(rc: RuntimeETLContext, annotationsColumn = csq, groupByLocus = true) {
case class Consequences(rc: RuntimeETLContext, studyId: String, studyCode: String, owner: String, dataset: String, referenceGenomePath: Option[String]) extends BaseConsequences(rc: RuntimeETLContext, annotationsColumn = csq, groupByLocus = true) {
private val raw_variant_calling: DatasetConf = conf.getDataset("raw_vcf")
override val mainDestination: DatasetConf = conf.getDataset("normalized_consequences")

override def extract(lastRunDateTime: LocalDateTime = minDateTime,
currentRunDateTime: LocalDateTime = LocalDateTime.now()): Map[String, DataFrame] = {
Map(
raw_vcf -> vcf(raw_variant_calling.location
.replace("{{STUDY_ID}}", s"$studyId")
.replace("{{STUDY_CODE}}", s"$studyCode")
.replace("{{DATASET}}", s"$dataset")
.replace("{{OWNER}}", s"$owner"), referenceGenomePath = None)
.where(col("contigName").isin(validContigNames: _*))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@ object RunNormalizedGenomic {
@main
def snv(rc: RuntimeETLContext,
@arg(name = "study-id", short = 's', doc = "Study Id") studyId: String,
@arg(name = "owner", short = 's', doc = "Owner") owner: String,
@arg(name = "dataset", short = 's', doc = "Dataset") dataset: String,
@arg(name = "study-code", short = 'c', doc = "Study Code") studyCode: String,
@arg(name = "owner", short = 'o', doc = "Owner") owner: String,
@arg(name = "dataset", short = 'd', doc = "Dataset") dataset: String,
@arg(name = "release-id", short = 'r', doc = "Release Id") releaseId: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = SNV(rc, studyId, owner, dataset, releaseId, referenceGenomePath).run()
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = SNV(rc, studyId, studyCode,owner, dataset, releaseId, referenceGenomePath).run()


@main
def consequences(rc: RuntimeETLContext,
@arg(name = "study-id", short = 's', doc = "Study Id") studyId: String,
@arg(name = "owner", short = 's', doc = "Owner") owner: String,
@arg(name = "dataset", short = 's', doc = "Dataset") dataset: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = Consequences(rc, studyId, owner, dataset, referenceGenomePath).run()
@arg(name = "study-code", short = 'c', doc = "Study Code") studyCode: String,
@arg(name = "owner", short = 'o', doc = "Owner") owner: String,
@arg(name = "dataset", short = 'd', doc = "Dataset") dataset: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = Consequences(rc, studyId, studyCode, owner, dataset, referenceGenomePath).run()

def main(args: Array[String]): Unit = ParserForMethods(this).runOrThrow(args, allowPositional = true)
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}

import java.time.LocalDateTime

case class SNV(rc: RuntimeETLContext, studyId: String, owner: String, dataset: String, releaseId: String, referenceGenomePath: Option[String]) extends SimpleSingleETL(rc) {
case class SNV(rc: RuntimeETLContext, studyId: String, studyCode: String, owner: String, dataset: String, releaseId: String, referenceGenomePath: Option[String]) extends SimpleSingleETL(rc) {
private val enriched_specimen: DatasetConf = conf.getDataset("enriched_specimen")
private val raw_variant_calling: DatasetConf = conf.getDataset("raw_vcf")
private val normalized_task: DatasetConf = conf.getDataset("normalized_task")
Expand All @@ -23,7 +23,7 @@ case class SNV(rc: RuntimeETLContext, studyId: String, owner: String, dataset: S

Map(
"raw_vcf" -> vcf(raw_variant_calling.location
.replace("{{STUDY_ID}}", s"$studyId")
.replace("{{STUDY_CODE}}", s"$studyCode")
.replace("{{DATASET}}", s"$dataset")
.replace("{{OWNER}}", s"$owner"), referenceGenomePath = None)
.where(col("contigName").isin(validContigNames: _*)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ConsequencesSpec extends SparkSpec with WithTestConfig {


it should "generate normalized consequences from input VCF" in {
val results = Consequences(TestETLContext(), "STU0000001", "owner", "dataset_default", None).transform(data)
val results = Consequences(TestETLContext(), "STU0000001", "STU0000001", "owner", "dataset_default", None).transform(data)

val result = results("normalized_consequences").as[NormalizedConsequences].collect()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class SNVSpec extends AnyFlatSpec with Matchers with WithSparkSession with WithT
).toDF()
)

val results = SNV(TestETLContext(), "STU0000001", "owner", "dataset_default", releaseId = "1", None).transform(dataFomVCFFile)
val results = SNV(TestETLContext(), "STU0000001", "STU0000001", "owner", "dataset_default", releaseId = "1", None).transform(dataFomVCFFile)

val result = results("normalized_snv").as[NORMALIZED_SNV].collect()

Expand Down

0 comments on commit 9569f43

Please sign in to comment.