From 6048c858ed5d132be5115c250ef444867324a140 Mon Sep 17 00:00:00 2001
From: Adrian
Date: Fri, 24 Nov 2023 16:26:02 -0500
Subject: [PATCH] fix: CQDG-475 remove release_id for partition
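
Normalized and es_index datasets are now partitioned by study_id alone;
only the raw Avro datasets keep release_id, which ConfigurationGenerator
appends explicitly (source.partitionBy :+ "release_id"). The prepare-index
job classes lose their releaseId constructor argument, PrepareIndex parses
one argument fewer, and extract() no longer filters on release_id. As a
sketch of the resulting call sites (hypothetical study id, assuming an
implicit ETL Configuration and Spark session are in scope):

    // After this patch: study ids only, no releaseId parameter.
    new StudyCentric(List("STU0000001")).run()
    new ParticipantCentric(List("STU0000001")).run()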
---
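With loadtype=OverWritePartition the normalized tables are rewritten per
study partition, and the partition directories lose their release_id level.
A minimal, self-contained Spark sketch of the new layout, under assumed
local paths and with spark-sql on the classpath (this is not the project's
writer, just an illustration):

    import org.apache.spark.sql.SparkSession

    object PartitionLayoutSketch extends App {
      val spark = SparkSession.builder().master("local[1]").getOrCreate()
      import spark.implicits._

      val patients = Seq(("STU0000001", "PRT0000001")).toDF("study_id", "participant_id")

      // Output directories now look like .../study_id=STU0000001/ with no
      // release_id=... level underneath.
      patients.write
        .mode("overwrite")
        .partitionBy("study_id")
        .parquet("/tmp/normalized/patient")

      spark.stop()
    }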
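
The extract() methods keep only the study membership filter. Reduced to its
essence as a standalone helper (hypothetical name, for illustration only):

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.col

    // Before: .where(col("release_id") === releaseId)
    //           .where(col("study_id").isin(studyIds: _*))
    // After: a single study_id filter per dataset read.
    def filterByStudies(df: DataFrame, studyIds: List[String]): DataFrame =
      df.where(col("study_id").isin(studyIds: _*))
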
"study_id", - "release_id" + "study_id" ] path="/es_index/fhir/study_centric" readoptions {} @@ -1980,8 +1965,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/participant_centric" readoptions {} @@ -2004,8 +1988,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/file_centric" readoptions {} @@ -2028,8 +2011,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/biospecimen_centric" readoptions {} diff --git a/config/output/config/qa-cqdg.conf b/config/output/config/qa-cqdg.conf index c7ac797e..b00dceb4 100644 --- a/config/output/config/qa-cqdg.conf +++ b/config/output/config/qa-cqdg.conf @@ -1308,8 +1308,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/patient" readoptions {} @@ -1353,8 +1352,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/diagnosis" readoptions {} @@ -1398,8 +1396,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/cause_of_death" readoptions {} @@ -1443,8 +1440,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/disease_status" readoptions {} @@ -1488,8 +1484,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/phenotype" readoptions {} @@ -1533,8 +1528,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/biospecimen" readoptions {} @@ -1578,8 +1572,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/sample_registration" readoptions {} @@ -1623,8 +1616,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/research_study" readoptions {} @@ -1668,8 +1660,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/document_reference" readoptions {} @@ -1713,8 +1704,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/family_relationship" readoptions {} @@ -1758,8 +1748,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/tumor_normal_designation" readoptions {} @@ -1803,8 +1792,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/group" readoptions {} @@ -1848,8 +1836,7 @@ datalake { keys=[] loadtype=OverWritePartition partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/normalized/task" readoptions {} @@ -1936,8 +1923,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/simple_participant" readoptions {} @@ -1956,8 +1942,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/study_centric" readoptions {} @@ -1980,8 +1965,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] 
path="/es_index/fhir/participant_centric" readoptions {} @@ -2004,8 +1988,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/file_centric" readoptions {} @@ -2028,8 +2011,7 @@ datalake { keys=[] loadtype=OverWrite partitionby=[ - "study_id", - "release_id" + "study_id" ] path="/es_index/fhir/biospecimen_centric" readoptions {} diff --git a/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala b/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala index d7ac7f19..92f25cd3 100644 --- a/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala +++ b/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala @@ -17,7 +17,7 @@ object ConfigurationGenerator extends App { sources.map(ds => ds.copy(table = ds.table.map(t => TableConf(tableName, t.name)))) } - private val partitionByStudyIdAndReleaseId = List("study_id", "release_id") + private val partitionByStudyIdAndReleaseId = List("study_id") val sourceNames: Seq[SourceConfig] = Seq( SourceConfig("patient", None, partitionByStudyIdAndReleaseId), SourceConfig("condition", Some("diagnosis"), partitionByStudyIdAndReleaseId), @@ -47,7 +47,7 @@ object ConfigurationGenerator extends App { path = s"/fhir/$rawPath", format = AVRO, loadtype = OverWrite, - partitionby = source.partitionBy + partitionby = source.partitionBy :+ "release_id" ), DatasetConf( id = s"normalized_$tableName", diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala index e4c0fd41..a717c6ff 100644 --- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala +++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala @@ -6,7 +6,7 @@ import bio.ferlab.fhir.etl.centricTypes.{BiospecimenCentric, FileCentric, Partic object PrepareIndex extends SparkApp { println(s"ARGS: " + args.mkString("[", ", ", "]")) - val Array(_, _, jobName, releaseId, studyIds) = args + val Array(_, _, jobName, studyIds) = args implicit val (conf, _, spark) = init() @@ -16,25 +16,25 @@ object PrepareIndex extends SparkApp { jobName match { case "study_centric" => - new SimpleParticipant(releaseId, studyList).run() - new StudyCentric(releaseId, studyList).run() + new SimpleParticipant(studyList).run() + new StudyCentric(studyList).run() case "participant_centric" => - new StudyCentric(releaseId, studyList).run() - new SimpleParticipant(releaseId, studyList).run() - new ParticipantCentric(releaseId, studyList).run() + new StudyCentric(studyList).run() + new SimpleParticipant(studyList).run() + new ParticipantCentric(studyList).run() case "file_centric" => - new StudyCentric(releaseId, studyList).run() - new SimpleParticipant(releaseId, studyList).run() - new FileCentric(releaseId, studyList).run() + new StudyCentric(studyList).run() + new SimpleParticipant(studyList).run() + new FileCentric(studyList).run() case "biospecimen_centric" => - new StudyCentric(releaseId, studyList).run() - new SimpleParticipant(releaseId, studyList).run() - new BiospecimenCentric(releaseId, studyList).run() + new StudyCentric(studyList).run() + new SimpleParticipant(studyList).run() + new BiospecimenCentric(studyList).run() case "all" => - new StudyCentric(releaseId, studyList).run() - new SimpleParticipant(releaseId, studyList).run() - new ParticipantCentric(releaseId, studyList).run() - new FileCentric(releaseId, studyList).run() - new 
diff --git a/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala b/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala
index d7ac7f19..92f25cd3 100644
--- a/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala
+++ b/config/src/main/scala/bio/ferlab/fhir/etl/config/ConfigurationGenerator.scala
@@ -17,7 +17,7 @@ object ConfigurationGenerator extends App {
     sources.map(ds => ds.copy(table = ds.table.map(t => TableConf(tableName, t.name))))
   }
 
-  private val partitionByStudyIdAndReleaseId = List("study_id", "release_id")
+  private val partitionByStudyIdAndReleaseId = List("study_id")
   val sourceNames: Seq[SourceConfig] = Seq(
     SourceConfig("patient", None, partitionByStudyIdAndReleaseId),
     SourceConfig("condition", Some("diagnosis"), partitionByStudyIdAndReleaseId),
@@ -47,7 +47,7 @@ object ConfigurationGenerator extends App {
         path = s"/fhir/$rawPath",
         format = AVRO,
         loadtype = OverWrite,
-        partitionby = source.partitionBy
+        partitionby = source.partitionBy :+ "release_id"
       ),
       DatasetConf(
         id = s"normalized_$tableName",
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala
index e4c0fd41..a717c6ff 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/PrepareIndex.scala
@@ -6,7 +6,7 @@ import bio.ferlab.fhir.etl.centricTypes.{BiospecimenCentric, FileCentric, Partic
 object PrepareIndex extends SparkApp {
 
   println(s"ARGS: " + args.mkString("[", ", ", "]"))
-  val Array(_, _, jobName, releaseId, studyIds) = args
+  val Array(_, _, jobName, studyIds) = args
 
   implicit val (conf, _, spark) = init()
 
@@ -16,25 +16,25 @@ object PrepareIndex extends SparkApp {
 
   jobName match {
     case "study_centric" =>
-      new SimpleParticipant(releaseId, studyList).run()
-      new StudyCentric(releaseId, studyList).run()
+      new SimpleParticipant(studyList).run()
+      new StudyCentric(studyList).run()
     case "participant_centric" =>
-      new StudyCentric(releaseId, studyList).run()
-      new SimpleParticipant(releaseId, studyList).run()
-      new ParticipantCentric(releaseId, studyList).run()
+      new StudyCentric(studyList).run()
+      new SimpleParticipant(studyList).run()
+      new ParticipantCentric(studyList).run()
     case "file_centric" =>
-      new StudyCentric(releaseId, studyList).run()
-      new SimpleParticipant(releaseId, studyList).run()
-      new FileCentric(releaseId, studyList).run()
+      new StudyCentric(studyList).run()
+      new SimpleParticipant(studyList).run()
+      new FileCentric(studyList).run()
     case "biospecimen_centric" =>
-      new StudyCentric(releaseId, studyList).run()
-      new SimpleParticipant(releaseId, studyList).run()
-      new BiospecimenCentric(releaseId, studyList).run()
+      new StudyCentric(studyList).run()
+      new SimpleParticipant(studyList).run()
+      new BiospecimenCentric(studyList).run()
     case "all" =>
-      new StudyCentric(releaseId, studyList).run()
-      new SimpleParticipant(releaseId, studyList).run()
-      new ParticipantCentric(releaseId, studyList).run()
-      new FileCentric(releaseId, studyList).run()
-      new BiospecimenCentric(releaseId, studyList).run()
+      new StudyCentric(studyList).run()
+      new SimpleParticipant(studyList).run()
+      new ParticipantCentric(studyList).run()
+      new FileCentric(studyList).run()
+      new BiospecimenCentric(studyList).run()
   }
 }
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/BiospecimenCentric.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/BiospecimenCentric.scala
index fa24ec92..2b11fff5 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/BiospecimenCentric.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/BiospecimenCentric.scala
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
 
 import java.time.LocalDateTime
 
-class BiospecimenCentric(releaseId: String, studyIds: List[String])(implicit configuration: Configuration) extends ETL {
+class BiospecimenCentric(studyIds: List[String])(implicit configuration: Configuration) extends ETL {
 
   override val mainDestination: DatasetConf = conf.getDataset("es_index_biospecimen_centric")
   val normalized_biospecimen: DatasetConf = conf.getDataset("normalized_biospecimen")
@@ -24,9 +24,7 @@ class BiospecimenCentric(releaseId: String, studyIds: List[String])(implicit con
                        currentRunDateTime: LocalDateTime = LocalDateTime.now())(implicit spark: SparkSession): Map[String, DataFrame] = {
     Seq(normalized_biospecimen, normalized_drs_document_reference, simple_participant, es_index_study_centric,
       normalized_sequencing_experiment, normalized_sample_registration, es_index_file_centric)
-      .map(ds => ds.id -> ds.read.where(col("release_id") === releaseId)
-        .where(col("study_id").isin(studyIds: _*))
-      ).toMap
+      .map(ds => ds.id -> ds.read.where(col("study_id").isin(studyIds: _*))).toMap
   }
 
   override def transform(data: Map[String, DataFrame],
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/FileCentric.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/FileCentric.scala
index ea16faf6..30639e6a 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/FileCentric.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/FileCentric.scala
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession, functions}
 
 import java.time.LocalDateTime
 
-class FileCentric(releaseId: String, studyIds: List[String])(implicit configuration: Configuration) extends ETL {
+class FileCentric(studyIds: List[String])(implicit configuration: Configuration) extends ETL {
 
   override val mainDestination: DatasetConf = conf.getDataset("es_index_file_centric")
   val normalized_drs_document_reference: DatasetConf = conf.getDataset("normalized_document_reference")
@@ -22,9 +22,7 @@ class FileCentric(releaseId: String, studyIds: List[String])(implicit configurat
   override def extract(lastRunDateTime: LocalDateTime = minDateTime,
                        currentRunDateTime: LocalDateTime = LocalDateTime.now())(implicit spark: SparkSession): Map[String, DataFrame] = {
     Seq(normalized_drs_document_reference, normalized_biospecimen, simple_participant, es_index_study_centric, normalized_sequencing_experiment, normalized_sample_registration)
-      .map(ds => ds.id -> ds.read.where(col("release_id") === releaseId)
-        .where(col("study_id").isin(studyIds: _*))
-      ).toMap
+      .map(ds => ds.id -> ds.read.where(col("study_id").isin(studyIds: _*))).toMap
   }
 
   override def transform(data: Map[String, DataFrame],
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/ParticipantCentric.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/ParticipantCentric.scala
index b7e6c871..7c6c3f9e 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/ParticipantCentric.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/ParticipantCentric.scala
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
 
 import java.time.LocalDateTime
 
-class ParticipantCentric(releaseId: String, studyIds: List[String])(implicit configuration: Configuration) extends ETL {
+class ParticipantCentric(studyIds: List[String])(implicit configuration: Configuration) extends ETL {
 
   override val mainDestination: DatasetConf = conf.getDataset("es_index_participant_centric")
   val simple_participant: DatasetConf = conf.getDataset("simple_participant")
@@ -22,9 +22,7 @@ class ParticipantCentric(releaseId: String, studyIds: List[String])(implicit con
   override def extract(lastRunDateTime: LocalDateTime = minDateTime,
                        currentRunDateTime: LocalDateTime = LocalDateTime.now())(implicit spark: SparkSession): Map[String, DataFrame] = {
     Seq(simple_participant, normalized_drs_document_reference, normalized_biospecimen, normalized_sequencing_experiment, normalized_sample_registration, es_index_study_centric)
-      .map(ds => ds.id -> ds.read.where(col("release_id") === releaseId)
-        .where(col("study_id").isin(studyIds: _*))
-      ).toMap
+      .map(ds => ds.id -> ds.read.where(col("study_id").isin(studyIds: _*))).toMap
   }
 
   override def transform(data: Map[String, DataFrame],
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/SimpleParticipant.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/SimpleParticipant.scala
index f1dbbe68..3573a4a6 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/SimpleParticipant.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/SimpleParticipant.scala
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
 
 import java.time.LocalDateTime
 
-class SimpleParticipant(releaseId: String, studyIds: List[String])(implicit configuration: Configuration) extends ETL {
+class SimpleParticipant(studyIds: List[String])(implicit configuration: Configuration) extends ETL {
 
   override val mainDestination: DatasetConf = conf.getDataset("simple_participant")
   val normalized_patient: DatasetConf = conf.getDataset("normalized_patient")
@@ -31,8 +31,7 @@ class SimpleParticipant(releaseId: String, studyIds: List[String])(implicit conf
     (Seq(
       normalized_patient, normalized_phenotype, normalized_disease, normalized_disease_status,
       normalized_cause_of_death, normalized_group, normalized_family_relationship, normalized_researchstudy)
-      .map(ds => ds.id -> ds.read.where(col("release_id") === releaseId)
-        .where(col("study_id").isin(studyIds: _*))
+      .map(ds => ds.id -> ds.read.where(col("study_id").isin(studyIds: _*))
       ) ++ Seq(
       hpo_terms.id -> hpo_terms.read,
       mondo_terms.id -> mondo_terms.read,
diff --git a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/StudyCentric.scala b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/StudyCentric.scala
index 5c1a857f..874e11cc 100644
--- a/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/StudyCentric.scala
+++ b/prepare-index/src/main/scala/bio/ferlab/fhir/etl/centricTypes/StudyCentric.scala
@@ -9,7 +9,7 @@ import org.apache.spark.sql.functions.{col, transform => sparkTransform, _}
 
 import java.time.LocalDateTime
 
-class StudyCentric(releaseId: String, studyIds: List[String])(implicit configuration: Configuration) extends ETL {
+class StudyCentric(studyIds: List[String])(implicit configuration: Configuration) extends ETL {
 
   override val mainDestination: DatasetConf = conf.getDataset("es_index_study_centric")
   val normalized_researchstudy: DatasetConf = conf.getDataset("normalized_research_study")
@@ -29,9 +29,7 @@ class StudyCentric(releaseId: String, studyIds: List[String])(implicit configura
       normalized_researchstudy, normalized_drs_document_reference, normalized_patient, normalized_group,
       normalized_diagnosis, normalized_task, normalized_phenotype, normalized_sequencing_experiment,
       normalized_biospecimen, normalized_sample_registration)
-      .map(ds => ds.id -> ds.read.where(col("release_id") === releaseId)
-        .where(col("study_id").isin(studyIds: _*))
-      ).toMap
+      .map(ds => ds.id -> ds.read.where(col("study_id").isin(studyIds: _*))).toMap
   }
 
 }
diff --git a/prepare-index/src/test/scala/BiospecimenCentricSpec.scala b/prepare-index/src/test/scala/BiospecimenCentricSpec.scala
index 689f5048..b9de48ae 100644
--- a/prepare-index/src/test/scala/BiospecimenCentricSpec.scala
+++ b/prepare-index/src/test/scala/BiospecimenCentricSpec.scala
@@ -73,7 +73,7 @@ class BiospecimenCentricSpec extends AnyFlatSpec with Matchers with WithSparkSes
       ).toDF(),
     )
 
-    val output = new BiospecimenCentric("5", List("STU0000001"))(conf).transform(data)
+    val output = new BiospecimenCentric(List("STU0000001"))(conf).transform(data)
 
     output.keys should contain("es_index_biospecimen_centric")
 
diff --git a/prepare-index/src/test/scala/FileCentricSpec.scala b/prepare-index/src/test/scala/FileCentricSpec.scala
index 67507f6a..871b4d04 100644
--- a/prepare-index/src/test/scala/FileCentricSpec.scala
+++ b/prepare-index/src/test/scala/FileCentricSpec.scala
@@ -42,7 +42,7 @@ class FileCentricSpec extends AnyFlatSpec with Matchers with WithSparkSession {
       ).toDF(),
     )
 
-    val output = new FileCentric("5", List("STU0000001"))(conf).transform(data)
+    val output = new FileCentric(List("STU0000001"))(conf).transform(data)
 
     output.keys should contain("es_index_file_centric")
     val file_centric = output("es_index_file_centric").as[FILE_CENTRIC].collect()
diff --git a/prepare-index/src/test/scala/ParticipantCentricSpec.scala b/prepare-index/src/test/scala/ParticipantCentricSpec.scala
index db4babba..f365ccb6 100644
--- a/prepare-index/src/test/scala/ParticipantCentricSpec.scala
+++ b/prepare-index/src/test/scala/ParticipantCentricSpec.scala
@@ -37,7 +37,7 @@ class ParticipantCentricSpec extends AnyFlatSpec with Matchers with WithSparkSes
     "es_index_study_centric" -> Seq(STUDY_CENTRIC()).toDF(),
   )
 
-    val output = new ParticipantCentric("re_000001", List("SD_Z6MWD3H0"))(conf).transform(data)
+    val output = new ParticipantCentric(List("SD_Z6MWD3H0"))(conf).transform(data)
 
     output.keys should contain("es_index_participant_centric")
 
diff --git a/prepare-index/src/test/scala/SimpleParticipantSpec.scala b/prepare-index/src/test/scala/SimpleParticipantSpec.scala
index 08d4cfd5..21b01fe9 100644
--- a/prepare-index/src/test/scala/SimpleParticipantSpec.scala
+++ b/prepare-index/src/test/scala/SimpleParticipantSpec.scala
@@ -42,7 +42,7 @@ class SimpleParticipantSpec extends AnyFlatSpec with Matchers with WithSparkSess
     "icd_terms" -> read(getClass.getResource("/icd_terms.json").toString, "Json", Map(), None, None)
   )
 
-    val output = new SimpleParticipant("re_000001", List("SD_Z6MWD3H0"))(conf).transform(data)
+    val output = new SimpleParticipant(List("SD_Z6MWD3H0"))(conf).transform(data)
 
     output.keys should contain("simple_participant")
 
diff --git a/prepare-index/src/test/scala/StudyCentricSpec.scala b/prepare-index/src/test/scala/StudyCentricSpec.scala
index c35aece2..6d356797 100644
--- a/prepare-index/src/test/scala/StudyCentricSpec.scala
+++ b/prepare-index/src/test/scala/StudyCentricSpec.scala
@@ -65,7 +65,7 @@ class StudyCentricSpec extends AnyFlatSpec with Matchers with WithSparkSession {
     "duo_terms" -> read(getClass.getResource("/duo_terms.csv").toString, "csv", Map("header" -> "true"), None, None),
   )
 
-    val output = new StudyCentric("5", List("STU0000001"))(conf).transform(data)
+    val output = new StudyCentric(List("STU0000001"))(conf).transform(data)
 
     output.keys should contain("es_index_study_centric")