Skip to content

Commit

Permalink
Add masseq to run_deepvariant
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 702021003
  • Loading branch information
kishwarshafin authored and pichuan committed Dec 3, 2024
1 parent 27075bd commit d6b7b43
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 53 deletions.
9 changes: 9 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,15 @@ ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/sav
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.ont.savedmodel/variables/variables.index .
RUN chmod -R +r /opt/models/ont_r104/*

WORKDIR /opt/models/masseq
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/fingerprint.pb .
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/saved_model.pb .
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/example_info.json .
WORKDIR /opt/models/masseq/variables
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.data-00000-of-00001 .
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.index .
RUN chmod -R +r /opt/models/masseq/*

# Copy small models
WORKDIR /opt/smallmodels/wgs
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/smallmodels/deepvariant.wgs.smallmodel/fingerprint.pb .
Expand Down
65 changes: 26 additions & 39 deletions docs/deepvariant-masseq-case-study.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ steps.
Let's first create directories to organize the files.

```bash
mkdir -p data benchmark reference output happy
mkdir -p input benchmark reference output happy
```

### Download the GRCh38 Reference
Expand Down Expand Up @@ -56,8 +56,8 @@ For this case study, we download the chr20 of a HG004 MAS-Seq BAM.
```bash
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study

curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
```


Expand All @@ -69,58 +69,42 @@ include regions where the BAM file has 10x or more coverage.
```bash
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study

curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
```




### Download the MAS-Seq model

Finally, let's download the MAS-Seq model that we will use to call variants.

```bash
gsutil cp -R gs://deepvariant/models/DeepVariant/1.8.0/savedmodels/deepvariant.masseq.savedmodel .
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
```

### Running DeepVariant MAS-Seq on a CPU-only machine

The command below will run the DeepVariant MAS-Seq model and produce an output
VCF (`output/out.vcf.gz`).
VCF.

```bash
BIN_VERSION="head687331500"
BIN_VERSION="1.8.0"

sudo docker run \
-v "$(pwd):$(pwd)" \
-w $(pwd) \
-v "${PWD}/input":"/input" \
-v "${PWD}/output":"/output" \
-v "${PWD}/reference":"/reference" \
google/deepvariant:"${BIN_VERSION}" \
run_deepvariant \
--model_type=PACBIO \
--customized_model=deepvariant.masseq.savedmodel \
--ref=reference/GRCh38_no_alt_analysis_set.fasta \
--reads=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
--output_vcf=output/HG004.output.vcf.gz \
--model_type=MASSEQ \
--ref=/reference/GRCh38_no_alt_analysis_set.fasta \
--reads=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
--output_vcf=/output/HG004.output.vcf.gz \
--num_shards=$(nproc) \
--regions=chr20 \
--make_examples_extra_args="phase_reads=true,sort_by_haplotypes=true,parse_sam_aux_fields=true,realign_reads=false,vsc_min_fraction_indels=0.12,alt_aligned_pileup=diff_channels,trim_reads_for_pileup=true,pileup_image_width=199,min_mapping_quality=1,track_ref_reads=true,partition_size=25000,max_reads_per_partition=0,max_reads_for_dynamic_bases_per_region=1500" \
--disable_small_model=true \
--intermediate_results_dir=output/intermediate_results_dir
--intermediate_results_dir=/output/intermediate_results_dir
```

**Flag summary**

* `--model_type` - Sets the model and options, but we will override the model
with `--customized_model`.
* `--model_type` - Sets the model and options for MAS-Seq data.
* `--customized_model` - Points to a model trained using MAS-Seq data.
* `--ref` - Specifies the reference sequence.
* `--reads` - Specifies the input bam file.
* `--output_vcf` - Specifies the output variant file.
* `--num_shards` - Sets the number of shards to the number of available
processors (`$(nproc)`). This is used to perform parallelization.
* `--regions` - Restricts to chr20 to make this case study faster.
* `--make_examples_extra_args=` - Passes additional arguments to
make_examples.
* `--intermediate_results_dir` - Outputs results to an intermediate directory.
This is optional. If you don't need the intermediate files, no need to
specify this flag.
Expand All @@ -132,18 +116,21 @@ For running on GPU machines, or using Singularity instead of Docker, see

```bash
sudo docker run \
-v $(pwd):$(pwd) \
-w $(pwd) \
-v "${PWD}/benchmark":"/benchmark" \
-v "${PWD}/input":"/input" \
-v "${PWD}/output":"/output" \
-v "${PWD}/reference":"/reference" \
-v "${PWD}/happy:/happy" \
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
output/HG004.output.vcf.gz \
-f benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
-r reference/GRCh38_no_alt_analysis_set.fasta \
-o happy/happy.output \
/benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
/output/HG004.output.vcf.gz \
-f /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
-r /reference/GRCh38_no_alt_analysis_set.fasta \
-o /happy/happy.output \
--engine=vcfeval \
--pass-only \
-l chr20 \
--target-regions=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
--target-regions=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
--threads=$(nproc)
```

Expand Down
16 changes: 16 additions & 0 deletions scripts/run_deepvariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class ModelType(enum.Enum):
PACBIO = 'PACBIO'
ONT_R104 = 'ONT_R104'
HYBRID_PACBIO_ILLUMINA = 'HYBRID_PACBIO_ILLUMINA'
MASSEQ = 'MASSEQ'


# Required flags.
Expand Down Expand Up @@ -277,6 +278,7 @@ class ModelType(enum.Enum):
ModelType.PACBIO: '/opt/models/pacbio',
ModelType.ONT_R104: '/opt/models/ont_r104',
ModelType.HYBRID_PACBIO_ILLUMINA: '/opt/models/hybrid_pacbio_illumina',
ModelType.MASSEQ: '/opt/models/masseq',
}


Expand Down Expand Up @@ -495,6 +497,20 @@ def make_examples_command(
special_args['trim_reads_for_pileup'] = True
elif model_type == ModelType.HYBRID_PACBIO_ILLUMINA:
special_args['trim_reads_for_pileup'] = True
elif model_type == ModelType.MASSEQ:
special_args['alt_aligned_pileup'] = 'diff_channels'
special_args['max_reads_per_partition'] = 0
special_args['min_mapping_quality'] = 1
special_args['parse_sam_aux_fields'] = True
special_args['partition_size'] = 25000
special_args['phase_reads'] = True
special_args['pileup_image_width'] = 199
special_args['realign_reads'] = False
special_args['sort_by_haplotypes'] = True
special_args['track_ref_reads'] = True
special_args['vsc_min_fraction_indels'] = 0.12
special_args['trim_reads_for_pileup'] = True
special_args['max_reads_for_dynamic_bases_per_region'] = 1500

_set_small_model_config(
special_args, model_type, _CUSTOMIZED_SMALL_MODEL.value
Expand Down
11 changes: 5 additions & 6 deletions third_party/nucleus/io/gfile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ namespace nucleus {

bool Exists(const std::string& filename) {
// FileExists sets s to tensorflow::error::NOT_FOUND if it doesn't exist.
tensorflow::Status s = tensorflow::Env::Default()->FileExists(filename);
absl::Status s = tensorflow::Env::Default()->FileExists(filename);
return s.ok();
}

std::vector<std::string> Glob(const std::string& pattern) {
std::vector<std::string> results;
::tensorflow::Status s =
absl::Status s =
tensorflow::Env::Default()->GetMatchingPaths(pattern, &results);
return results;
}
Expand All @@ -56,7 +56,7 @@ ReadableFile::ReadableFile() {}

std::unique_ptr<ReadableFile> ReadableFile::New(const std::string& filename) {
std::unique_ptr<tensorflow::RandomAccessFile> file;
tensorflow::Status status =
absl::Status status =
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
if (!status.ok()) {
return nullptr;
Expand Down Expand Up @@ -91,8 +91,7 @@ WritableFile::WritableFile() {}
std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
std::unique_ptr<tensorflow::WritableFile> file;

tensorflow::Status s =
tensorflow::Env::Default()->NewWritableFile(filename, &file);
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);

if (!s.ok()) {
return nullptr;
Expand All @@ -105,7 +104,7 @@ std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
}

bool WritableFile::Write(const std::string& s) {
tensorflow::Status status = file_->Append(s);
absl::Status status = file_->Append(s);
return status.ok();
}

Expand Down
4 changes: 2 additions & 2 deletions third_party/nucleus/io/tfrecord_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ TFRecordReader::TFRecordReader() {}
std::unique_ptr<TFRecordReader> TFRecordReader::New(
const std::string& filename, const std::string& compression_type) {
std::unique_ptr<tensorflow::RandomAccessFile> file;
tensorflow::Status s =
absl::Status s =
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
if (!s.ok()) {
LOG(ERROR) << s;
Expand Down Expand Up @@ -74,7 +74,7 @@ bool TFRecordReader::GetNext() {
return false;
}

tensorflow::Status s = reader_->ReadRecord(&offset_, &record_);
absl::Status s = reader_->ReadRecord(&offset_, &record_);

return s.ok();
}
Expand Down
11 changes: 5 additions & 6 deletions third_party/nucleus/io/tfrecord_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ TFRecordWriter::TFRecordWriter() {}
std::unique_ptr<TFRecordWriter> TFRecordWriter::New(
const std::string& filename, const std::string& compression_type) {
std::unique_ptr<tensorflow::WritableFile> file;
tensorflow::Status s =
tensorflow::Env::Default()->NewWritableFile(filename, &file);
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);
if (!s.ok()) {
LOG(ERROR) << s;
return nullptr;
Expand All @@ -69,29 +68,29 @@ bool TFRecordWriter::WriteRecord(const std::string& record) {
if (writer_ == nullptr) {
return false;
}
tensorflow::Status s = writer_->WriteRecord(record);
absl::Status s = writer_->WriteRecord(record);
return s.ok();
}

bool TFRecordWriter::Flush() {
if (writer_ == nullptr) {
return false;
}
tensorflow:: Status s = writer_->Flush();
absl::Status s = writer_->Flush();
return s.ok();
}

bool TFRecordWriter::Close() {
if (writer_ != nullptr) {
tensorflow::Status s = writer_->Close();
absl::Status s = writer_->Close();
if (!s.ok()) {
return false;
}
writer_ = nullptr;
}

if (file_ != nullptr) {
tensorflow:: Status s = file_->Close();
absl::Status s = file_->Close();
if (!s.ok()) {
return false;
}
Expand Down

0 comments on commit d6b7b43

Please sign in to comment.