Skip to content

Commit

Permalink
fix: CQDG-00 Use study code to normalize variants
Browse files Browse the repository at this point in the history
  • Loading branch information
jecos committed Oct 27, 2023
1 parent 913cd31 commit 9569f43
Show file tree
Hide file tree
Showing 9 changed files with 18 additions and 16 deletions.
2 changes: 1 addition & 1 deletion config/output/config/dev-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
2 changes: 1 addition & 1 deletion config/output/config/prod-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
2 changes: 1 addition & 1 deletion config/output/config/qa-cqdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ datalake {
partitionby=[
chromosome
]
path="/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz"
path="/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz"
readoptions {}
repartition {
column-names=[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ object ConfigurationGenerator extends App {
DatasetConf(
id = "raw_vcf",
storageid = storage_vcf,
path = "/{{OWNER}}/{{STUDY_ID}}/{{DATASET}}/variants.*.vep.vcf.gz",
path = "/{{OWNER}}/{{STUDY_CODE}}/{{DATASET}}/variants.*.vep.vcf.gz",
format = VCF,
loadtype = OverWrite,
partitionby = List("chromosome"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ import org.apache.spark.sql.functions.{array_contains, col, lit}

import java.time.LocalDateTime

case class Consequences(rc: RuntimeETLContext, studyId: String, owner: String, dataset: String, referenceGenomePath: Option[String]) extends BaseConsequences(rc: RuntimeETLContext, annotationsColumn = csq, groupByLocus = true) {
case class Consequences(rc: RuntimeETLContext, studyId: String, studyCode: String, owner: String, dataset: String, referenceGenomePath: Option[String]) extends BaseConsequences(rc: RuntimeETLContext, annotationsColumn = csq, groupByLocus = true) {
private val raw_variant_calling: DatasetConf = conf.getDataset("raw_vcf")
override val mainDestination: DatasetConf = conf.getDataset("normalized_consequences")

override def extract(lastRunDateTime: LocalDateTime = minDateTime,
currentRunDateTime: LocalDateTime = LocalDateTime.now()): Map[String, DataFrame] = {
Map(
raw_vcf -> vcf(raw_variant_calling.location
.replace("{{STUDY_ID}}", s"$studyId")
.replace("{{STUDY_CODE}}", s"$studyCode")
.replace("{{DATASET}}", s"$dataset")
.replace("{{OWNER}}", s"$owner"), referenceGenomePath = None)
.where(col("contigName").isin(validContigNames: _*))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@ object RunNormalizedGenomic {
@main
def snv(rc: RuntimeETLContext,
@arg(name = "study-id", short = 's', doc = "Study Id") studyId: String,
@arg(name = "owner", short = 's', doc = "Owner") owner: String,
@arg(name = "dataset", short = 's', doc = "Dataset") dataset: String,
@arg(name = "study-code", short = 'c', doc = "Study Code") studyCode: String,
@arg(name = "owner", short = 'o', doc = "Owner") owner: String,
@arg(name = "dataset", short = 'd', doc = "Dataset") dataset: String,
@arg(name = "release-id", short = 'r', doc = "Release Id") releaseId: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = SNV(rc, studyId, owner, dataset, releaseId, referenceGenomePath).run()
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = SNV(rc, studyId, studyCode,owner, dataset, releaseId, referenceGenomePath).run()


@main
def consequences(rc: RuntimeETLContext,
@arg(name = "study-id", short = 's', doc = "Study Id") studyId: String,
@arg(name = "owner", short = 's', doc = "Owner") owner: String,
@arg(name = "dataset", short = 's', doc = "Dataset") dataset: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = Consequences(rc, studyId, owner, dataset, referenceGenomePath).run()
@arg(name = "study-code", short = 'c', doc = "Study Code") studyCode: String,
@arg(name = "owner", short = 'o', doc = "Owner") owner: String,
@arg(name = "dataset", short = 'd', doc = "Dataset") dataset: String,
@arg(name = "reference-genome-path", short = 'g', doc = "Reference Genome Path") referenceGenomePath: Option[String]): Unit = Consequences(rc, studyId, studyCode, owner, dataset, referenceGenomePath).run()

def main(args: Array[String]): Unit = ParserForMethods(this).runOrThrow(args, allowPositional = true)
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}

import java.time.LocalDateTime

case class SNV(rc: RuntimeETLContext, studyId: String, owner: String, dataset: String, releaseId: String, referenceGenomePath: Option[String]) extends SimpleSingleETL(rc) {
case class SNV(rc: RuntimeETLContext, studyId: String, studyCode: String, owner: String, dataset: String, releaseId: String, referenceGenomePath: Option[String]) extends SimpleSingleETL(rc) {
private val enriched_specimen: DatasetConf = conf.getDataset("enriched_specimen")
private val raw_variant_calling: DatasetConf = conf.getDataset("raw_vcf")
private val normalized_task: DatasetConf = conf.getDataset("normalized_task")
Expand All @@ -23,7 +23,7 @@ case class SNV(rc: RuntimeETLContext, studyId: String, owner: String, dataset: S

Map(
"raw_vcf" -> vcf(raw_variant_calling.location
.replace("{{STUDY_ID}}", s"$studyId")
.replace("{{STUDY_CODE}}", s"$studyCode")
.replace("{{DATASET}}", s"$dataset")
.replace("{{OWNER}}", s"$owner"), referenceGenomePath = None)
.where(col("contigName").isin(validContigNames: _*)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ConsequencesSpec extends SparkSpec with WithTestConfig {


it should "generate normalized consequences from input VCF" in {
val results = Consequences(TestETLContext(), "STU0000001", "owner", "dataset_default", None).transform(data)
val results = Consequences(TestETLContext(), "STU0000001", "STU0000001", "owner", "dataset_default", None).transform(data)

val result = results("normalized_consequences").as[NormalizedConsequences].collect()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class SNVSpec extends AnyFlatSpec with Matchers with WithSparkSession with WithT
).toDF()
)

val results = SNV(TestETLContext(), "STU0000001", "owner", "dataset_default", releaseId = "1", None).transform(dataFomVCFFile)
val results = SNV(TestETLContext(), "STU0000001", "STU0000001", "owner", "dataset_default", releaseId = "1", None).transform(dataFomVCFFile)

val result = results("normalized_snv").as[NORMALIZED_SNV].collect()

Expand Down

0 comments on commit 9569f43

Please sign in to comment.