Skip to content

Commit

Permalink
Enable NanoporeParser to validate and parse pod5 based nanopore structures (#126)
Browse files Browse the repository at this point in the history

* Enable NanoporeParser to check for pod5 and dorado basecaller generated files

* Add Full data structure example

* Add JD
  • Loading branch information
Steffengreiner authored Nov 3, 2023
1 parent 56f82b5 commit 8fa9117
Show file tree
Hide file tree
Showing 110 changed files with 317 additions and 22 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
<dependency>
<artifactId>data-model-lib</artifactId>
<groupId>life.qbic</groupId>
<version>2.25.0</version>
<version>2.27.0</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
Expand Down
42 changes: 25 additions & 17 deletions src/main/groovy/life/qbic/utils/NanoporeParser.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package life.qbic.utils
import com.fasterxml.jackson.databind.ObjectMapper
import groovy.json.JsonSlurper
import groovy.util.logging.Log4j2
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputDoradoMinimal
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputMinimal
import net.jimblackler.jsonschemafriend.Schema
import net.jimblackler.jsonschemafriend.SchemaStore
Expand All @@ -14,7 +15,6 @@ import java.nio.file.Path
import java.nio.file.Paths
import java.text.ParseException
import life.qbic.datamodel.datasets.OxfordNanoporeExperiment

import java.util.stream.Collectors

@Log4j2
Expand Down Expand Up @@ -95,8 +95,8 @@ class NanoporeParser {
jsonStarted = true
}
if (jsonStarted) {
def split = line.replaceAll("\\s+","").split(":")
if(split.size() == 2 && split[1].replaceAll('"',"").size() <= 1){
def split = line.replaceAll("\\s+", "").split(":")
if (split.size() == 2 && split[1].replaceAll('"', "").size() <= 1) {
log.info("Metadata value ${split[0]} missing in ${reportFile["path"]}")
}
buffer.append(line)
Expand All @@ -110,12 +110,11 @@ class NanoporeParser {
new File(Paths.get(root.toString(), summaryFile["path"].toString()) as String)
.readLines().each { line ->
def split = line.split("=")
if(split.size() > 1){
if (split.size() > 1) {
finalMetaData[split[0]] = split[1]
}
else {
} else {
log.info("Metadata value ${split[0]} missing in ${summaryFile["path"]}, defaulting to empty value")
finalMetaData[split[0]] = ""
finalMetaData[split[0]] = ""
}
}
return finalMetaData
Expand Down Expand Up @@ -178,18 +177,27 @@ class NanoporeParser {
* @throws net.jimblackler.jsonschemafriend.ValidationException
*/
private static void validateJson(String json) throws ValidationException {
    // Purpose: check the json representation of a directory structure against the
    // known Oxford Nanopore schemas. The fast5 based schema is tried first; only if
    // it rejects the input is the pod5/dorado based schema tried. The method returns
    // normally as soon as one schema accepts the input.

    // Step 1: load json into a generic object tree the validator can walk
    ObjectMapper objectMapper = new ObjectMapper()
    Object jsonObject = objectMapper.readValue(json, Object)

    SchemaStore schemaStore = new SchemaStore()
    Validator validator = new Validator()
    try {
        // Step 2: validate against Fast5 Based Oxford Measurement
        Schema fast5Schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
        validator.validate(fast5Schema, jsonObject)
    } catch (ValidationException fast5Failure) {
        // Step 3: validate against Pod5 Based Oxford Measurement
        Schema doradoSchema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputDoradoMinimal.getSchemaAsStream())
        try {
            validator.validate(doradoSchema, jsonObject)
        } catch (ValidationException doradoFailure) {
            // Both schemas rejected the structure. Keep the fast5 errors attached to the
            // exception that is thrown, so a broken fast5 data set is not reported solely
            // in terms of the dorado schema.
            doradoFailure.addSuppressed(fast5Failure)
            throw doradoFailure
        }
    }
}

/*
* Converts a file tree into a json object.
*/

private static class DirectoryConverter {
private static final PREDEFINED_EXTENSIONS = ["fastq.gz"]
private static final IGNORED_FOLDERNAMES = ["qc"]
Expand Down Expand Up @@ -239,11 +247,11 @@ class NanoporeParser {
List<File> children = currentDirectory.listFiles()

List<File> visibleChildren = children.stream()
.filter(file -> !file.isHidden()).collect(Collectors.toList());
.filter(file -> !file.isHidden()).collect(Collectors.toList())

for (File file : children) {
if (!visibleChildren.contains(file)) {
hiddenFiles.add(file);
hiddenFiles.add(file)
}
}

Expand All @@ -252,11 +260,11 @@ class NanoporeParser {
return !IGNORED_FOLDERNAMES.contains(currentFolderName)
}.collect {
file ->
if (file.isFile()) {
convertFile(file.toPath())
} else if (file.isDirectory()) {
convertDirectory(file.toPath())
}
if (file.isFile()) {
convertFile(file.toPath())
} else if (file.isDirectory()) {
convertDirectory(file.toPath())
}
}

def convertedDirectory = [
Expand Down
35 changes: 35 additions & 0 deletions src/test/groovy/life/qbic/utils/NanoporeParserSpec.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,41 @@ class NanoporeParserSpec extends Specification {
thrown(ValidationException)
}

def "parsing a valid minimal file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
    given:
    // Fixture directory from the "validates" examples for the minimal dorado layout
    def datasetRoot = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_minimal")

    when:
    def parsedExperiment = NanoporeParser.parseFileStructure(datasetRoot)

    then:
    // The parser must hand back an experiment object
    parsedExperiment instanceof OxfordNanoporeExperiment
    // Report file metadata: the machine host of the first measurement
    parsedExperiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
    // Summary file metadata: the library preparation kit of the first measurement
    parsedExperiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

def "parsing a valid file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
    given:
    // Fixture directory from the "validates" examples for the full dorado layout
    def datasetRoot = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_example")

    when:
    def parsedExperiment = NanoporeParser.parseFileStructure(datasetRoot)

    then:
    // The parser must hand back an experiment object
    parsedExperiment instanceof OxfordNanoporeExperiment
    // Report file metadata: the machine host of the first measurement
    parsedExperiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
    // Summary file metadata: the library preparation kit of the first measurement
    parsedExperiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

def "parsing an invalid minimal file structure for dorado based basecalling leads to a ValidationException"() {
    given:
    // Fixture directory taken from the "fails" examples
    def brokenDatasetRoot = Paths.get(exampleDirectoriesRoot, "fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder")

    when:
    // The result is bound but never inspected; only the thrown exception matters here
    def parsedExperiment = NanoporeParser.parseFileStructure(brokenDatasetRoot)

    then:
    thrown(ValidationException)
}

def "parsing the alternative valid file structure with metadata missing returns an OxfordNanoporeExperiment Object"() {
given:
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_new_minimal")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Loading

0 comments on commit 8fa9117

Please sign in to comment.