From f06821583d16a71e569c3bd55c8f89b226e1861e Mon Sep 17 00:00:00 2001 From: pichuan Date: Mon, 20 Mar 2023 11:05:17 -0700 Subject: [PATCH] Format Python files. PiperOrigin-RevId: 518020013 --- deeptrio/dt_constants.py | 4 +- deeptrio/make_examples.py | 271 +++++++++++++------ deeptrio/make_examples_test.py | 476 ++++++++++++++++++++------------- deeptrio/testdata.py | 66 +++-- 4 files changed, 523 insertions(+), 294 deletions(-) diff --git a/deeptrio/dt_constants.py b/deeptrio/dt_constants.py index 26a3cda6..1d9fb729 100644 --- a/deeptrio/dt_constants.py +++ b/deeptrio/dt_constants.py @@ -50,7 +50,9 @@ # The dimensions of a pileup image tensor as height x width x rank. PILEUP_DEFAULT_DIMS = [ - PILEUP_DEFAULT_HEIGHT, PILEUP_DEFAULT_WIDTH, PILEUP_NUM_CHANNELS + PILEUP_DEFAULT_HEIGHT, + PILEUP_DEFAULT_WIDTH, + PILEUP_NUM_CHANNELS, ] # Number of classes represented in the data set. The three classes are diff --git a/deeptrio/make_examples.py b/deeptrio/make_examples.py index fa735b07..d06a8145 100644 --- a/deeptrio/make_examples.py +++ b/deeptrio/make_examples.py @@ -54,77 +54,129 @@ # Flags related to samples in DeepTrio: SAMPLE_NAME_TO_TRAIN_ = flags.DEFINE_string( - 'sample_name_to_train', None, - 'Optional - if not set, default to the value in ' - '--sample_name, i.e. the child. The default is set to be backward ' - 'compatible. If set, it has to match one of --sample_name, ' - '--sample_name_parent1, or --sample_name_parent2. ' - 'Only used for training. When run in calling mode, this is unused because ' - 'examples are generated for all 3 samples together.') + 'sample_name_to_train', + None, + ( + 'Optional - if not set, default to the value in --sample_name, i.e. the' + ' child. The default is set to be backward compatible. If set, it has' + ' to match one of --sample_name, --sample_name_parent1, or' + ' --sample_name_parent2. Only used for training. When run in calling' + ' mode, this is unused because examples are generated for all 3 samples' + ' together.' + ), +) READS_ = flags.DEFINE_string( - 'reads', None, - 'Required. Aligned, sorted, indexed BAM file containing reads from the ' - 'child of the trio. ' - 'Should be aligned to a reference genome compatible with --ref. ' - 'Can provide multiple BAMs (comma-separated).') + 'reads', + None, + ( + 'Required. Aligned, sorted, indexed BAM file containing reads from the ' + 'child of the trio. ' + 'Should be aligned to a reference genome compatible with --ref. ' + 'Can provide multiple BAMs (comma-separated).' + ), +) READS_PARENT1_ = flags.DEFINE_string( - 'reads_parent1', None, - 'Required. Aligned, sorted, indexed BAM file containing reads from parent ' - '1 of the trio. Should be aligned to a reference genome compatible with ' - '--ref. Can provide multiple BAMs (comma-separated).') + 'reads_parent1', + None, + ( + 'Required. Aligned, sorted, indexed BAM file containing reads from' + ' parent 1 of the trio. Should be aligned to a reference genome' + ' compatible with --ref. Can provide multiple BAMs (comma-separated).' + ), +) READS_PARENT2_ = flags.DEFINE_string( - 'reads_parent2', None, - 'Aligned, sorted, indexed BAM file containing reads from parent 2 of the ' - 'trio. Should be aligned to a reference genome compatible with --ref. ' - 'Can provide multiple BAMs (comma-separated).') + 'reads_parent2', + None, + ( + 'Aligned, sorted, indexed BAM file containing reads from parent 2 of' + ' the trio. Should be aligned to a reference genome compatible with' + ' --ref. Can provide multiple BAMs (comma-separated).' + ), +) DOWNSAMPLE_FRACTION_CHILD_ = flags.DEFINE_float( - 'downsample_fraction_child', NO_DOWNSAMPLING, - 'If not ' + str(NO_DOWNSAMPLING) + ' must be a value between 0.0 and 1.0. ' + 'downsample_fraction_child', + NO_DOWNSAMPLING, + 'If not ' + + str(NO_DOWNSAMPLING) + + ' must be a value between 0.0 and 1.0. ' 'Reads will be kept (randomly) with a probability of downsample_fraction ' 'from the input child BAM. This argument makes it easy to create examples ' - 'as though the input BAM had less coverage.') + 'as though the input BAM had less coverage.', +) DOWNSAMPLE_FRACTION_PARENTS_ = flags.DEFINE_float( - 'downsample_fraction_parents', NO_DOWNSAMPLING, - 'If not ' + str(NO_DOWNSAMPLING) + ' must be a value between 0.0 and 1.0. ' + 'downsample_fraction_parents', + NO_DOWNSAMPLING, + 'If not ' + + str(NO_DOWNSAMPLING) + + ' must be a value between 0.0 and 1.0. ' 'Reads will be kept (randomly) with a probability of downsample_fraction ' 'from the input parent BAMs. This argument makes it easy to create examples' - ' as though the input BAMs had less coverage.') + ' as though the input BAMs had less coverage.', +) SAMPLE_NAME_ = flags.DEFINE_string( - 'sample_name', '', - 'Child sample name to use for our sample_name in the output ' - 'Variant/DeepVariantCall protos. If not specified, will be inferred from ' - 'the header information from --reads.') + 'sample_name', + '', + ( + 'Child sample name to use for our sample_name in the output' + ' Variant/DeepVariantCall protos. If not specified, will be inferred' + ' from the header information from --reads.' + ), +) SAMPLE_NAME_PARENT1_ = flags.DEFINE_string( - 'sample_name_parent1', '', - 'Parent1 Sample name to use for our sample_name in the output ' - 'Variant/DeepVariantCall protos. If not specified, will be inferred from ' - 'the header information from --reads_parent1.') + 'sample_name_parent1', + '', + ( + 'Parent1 Sample name to use for our sample_name in the output' + ' Variant/DeepVariantCall protos. If not specified, will be inferred' + ' from the header information from --reads_parent1.' + ), +) SAMPLE_NAME_PARENT2_ = flags.DEFINE_string( - 'sample_name_parent2', '', - 'Parent2 Sample name to use for our sample_name in the output ' - 'Variant/DeepVariantCall protos. If not specified, will be inferred from ' - 'the header information from --reads_parent2.') + 'sample_name_parent2', + '', + ( + 'Parent2 Sample name to use for our sample_name in the output' + ' Variant/DeepVariantCall protos. If not specified, will be inferred' + ' from the header information from --reads_parent2.' + ), +) PILEUP_IMAGE_HEIGHT_PARENT_ = flags.DEFINE_integer( - 'pileup_image_height_parent', 0, - 'Height for the parent pileup image. If 0, uses the default height') + 'pileup_image_height_parent', + 0, + 'Height for the parent pileup image. If 0, uses the default height', +) PILEUP_IMAGE_HEIGHT_CHILD_ = flags.DEFINE_integer( - 'pileup_image_height_child', 0, - 'Height for the child pileup image. If 0, uses the default height') + 'pileup_image_height_child', + 0, + 'Height for the child pileup image. If 0, uses the default height', +) PROPOSED_VARIANTS_CHILD_ = flags.DEFINE_string( - 'proposed_variants_child', None, - '(Only used when --variant_caller=vcf_candidate_importer.) ' - 'Tabix-indexed VCF file containing the proposed positions and alts for ' - '`vcf_candidate_importer` for the child. The GTs will be ignored.') + 'proposed_variants_child', + None, + ( + '(Only used when --variant_caller=vcf_candidate_importer.) ' + 'Tabix-indexed VCF file containing the proposed positions and alts for ' + '`vcf_candidate_importer` for the child. The GTs will be ignored.' + ), +) PROPOSED_VARIANTS_PARENT1_ = flags.DEFINE_string( - 'proposed_variants_parent1', None, - '(Only used when --variant_caller=vcf_candidate_importer.) ' - 'Tabix-indexed VCF file containing the proposed positions and alts for ' - '`vcf_candidate_importer` for the parent 1. The GTs will be ignored.') + 'proposed_variants_parent1', + None, + ( + '(Only used when --variant_caller=vcf_candidate_importer.) ' + 'Tabix-indexed VCF file containing the proposed positions and alts for ' + '`vcf_candidate_importer` for the parent 1. The GTs will be ignored.' + ), +) PROPOSED_VARIANTS_PARENT2_ = flags.DEFINE_string( - 'proposed_variants_parent2', None, - '(Only used when --variant_caller=vcf_candidate_importer.) ' - 'Tabix-indexed VCF file containing the proposed positions and alts for ' - '`vcf_candidate_importer` for the parent 2. The GTs will be ignored.') + 'proposed_variants_parent2', + None, + ( + '(Only used when --variant_caller=vcf_candidate_importer.) ' + 'Tabix-indexed VCF file containing the proposed positions and alts for ' + '`vcf_candidate_importer` for the parent 2. The GTs will be ignored.' + ), +) # We are using this flag for determining intervals for both child and parent # models. In the future, we can consider extending into 3 samples. CANDIDATE_POSITIONS_ = flags.DEFINE_string( @@ -145,38 +197,47 @@ def trio_samples_from_flags(add_flags=True, flags_obj=None): """Collects sample-related options into a list of samples.""" # Sample-specific options. child_sample_name = make_examples_core.assign_sample_name( - sample_name_flag=SAMPLE_NAME_.value, reads_filenames=READS_.value) + sample_name_flag=SAMPLE_NAME_.value, reads_filenames=READS_.value + ) parent1_sample_name = make_examples_core.assign_sample_name( sample_name_flag=SAMPLE_NAME_PARENT1_.value, - reads_filenames=READS_PARENT1_.value) + reads_filenames=READS_PARENT1_.value, + ) parent2_sample_name = make_examples_core.assign_sample_name( sample_name_flag=SAMPLE_NAME_PARENT2_.value, - reads_filenames=READS_PARENT2_.value) + reads_filenames=READS_PARENT2_.value, + ) parent1_options = deepvariant_pb2.SampleOptions( role='parent1', name=parent1_sample_name, variant_caller_options=make_examples_core.make_vc_options( - sample_name=parent1_sample_name, flags_obj=flags_obj), + sample_name=parent1_sample_name, flags_obj=flags_obj + ), order=[0, 1, 2], - pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_PARENT) + pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_PARENT, + ) child_options = deepvariant_pb2.SampleOptions( role='child', name=child_sample_name, variant_caller_options=make_examples_core.make_vc_options( - sample_name=child_sample_name, flags_obj=flags_obj), + sample_name=child_sample_name, flags_obj=flags_obj + ), order=[0, 1, 2], - pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_CHILD) + pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_CHILD, + ) parent2_options = deepvariant_pb2.SampleOptions( role='parent2', name=parent2_sample_name, variant_caller_options=make_examples_core.make_vc_options( - sample_name=parent2_sample_name, flags_obj=flags_obj), + sample_name=parent2_sample_name, flags_obj=flags_obj + ), # Swap the two parents when calling on parent2. order=[2, 1, 0], - pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_PARENT) + pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_PARENT, + ) # If --sample_name_to_train is not set, train on the child. # This is for backward compatibility. @@ -196,9 +257,13 @@ def trio_samples_from_flags(add_flags=True, flags_obj=None): if PROPOSED_VARIANTS_CHILD_.value: child_options.proposed_variants_filename = PROPOSED_VARIANTS_CHILD_.value if PROPOSED_VARIANTS_PARENT1_.value: - parent1_options.proposed_variants_filename = PROPOSED_VARIANTS_PARENT1_.value + parent1_options.proposed_variants_filename = ( + PROPOSED_VARIANTS_PARENT1_.value + ) if PROPOSED_VARIANTS_PARENT2_.value: - parent2_options.proposed_variants_filename = PROPOSED_VARIANTS_PARENT2_.value + parent2_options.proposed_variants_filename = ( + PROPOSED_VARIANTS_PARENT2_.value + ) if DOWNSAMPLE_FRACTION_CHILD_.value != NO_DOWNSAMPLING: child_options.downsample_fraction = DOWNSAMPLE_FRACTION_CHILD_.value @@ -209,7 +274,9 @@ def trio_samples_from_flags(add_flags=True, flags_obj=None): if PILEUP_IMAGE_HEIGHT_CHILD_.value: child_options.pileup_height = PILEUP_IMAGE_HEIGHT_CHILD_.value if PILEUP_IMAGE_HEIGHT_PARENT_.value: - parent1_options.pileup_height = parent2_options.pileup_height = PILEUP_IMAGE_HEIGHT_PARENT_.value + parent1_options.pileup_height = ( + parent2_options.pileup_height + ) = PILEUP_IMAGE_HEIGHT_PARENT_.value if SAMPLE_NAME_TO_TRAIN_.value: if SAMPLE_NAME_TO_TRAIN_.value == SAMPLE_NAME_.value: @@ -218,9 +285,13 @@ def trio_samples_from_flags(add_flags=True, flags_obj=None): sample_role_to_train = parent1_options.role else: errors.log_and_raise( - '--sample_name_to_train must match either --sample_name or ' - '--sample_name_parent1, or it can be unset to default to ' - '--sample_name.', errors.CommandLineError) + ( + '--sample_name_to_train must match either --sample_name or ' + '--sample_name_parent1, or it can be unset to default to ' + '--sample_name.' + ), + errors.CommandLineError, + ) # Ordering here determines the default order of samples, and when a sample # above has a custom .order, then this is the list those indices refer to. @@ -248,22 +319,36 @@ def default_options(add_flags=True, flags_obj=None): flags_obj = FLAGS samples_in_order, sample_role_to_train = trio_samples_from_flags( - add_flags=add_flags, flags_obj=flags_obj) + add_flags=add_flags, flags_obj=flags_obj + ) options = make_examples_options.shared_flags_to_options( add_flags=add_flags, flags_obj=flags_obj, samples_in_order=samples_in_order, sample_role_to_train=sample_role_to_train, - main_sample_index=MAIN_SAMPLE_INDEX) + main_sample_index=MAIN_SAMPLE_INDEX, + ) if add_flags: - options.bam_fname = os.path.basename( - READS_.value) + '|' + (os.path.basename(READS_PARENT1_.value) - if READS_PARENT1_.value else 'None') + '|' + ( - os.path.basename(READS_PARENT2_.value) - if READS_PARENT2_.value else 'None') - options.pic_options.sequencing_type = deepvariant_pb2.PileupImageOptions.TRIO + options.bam_fname = ( + os.path.basename(READS_.value) + + '|' + + ( + os.path.basename(READS_PARENT1_.value) + if READS_PARENT1_.value + else 'None' + ) + + '|' + + ( + os.path.basename(READS_PARENT2_.value) + if READS_PARENT2_.value + else 'None' + ) + ) + options.pic_options.sequencing_type = ( + deepvariant_pb2.PileupImageOptions.TRIO + ) if not options.pic_options.height: options.pic_options.height = dt_constants.PILEUP_DEFAULT_HEIGHT if not options.pic_options.width: @@ -277,24 +362,32 @@ def check_options_are_valid(options): # Check for general flags (shared for DeepVariant and DeepTrio). make_examples_options.check_options_are_valid( - options, main_sample_index=MAIN_SAMPLE_INDEX) + options, main_sample_index=MAIN_SAMPLE_INDEX + ) child = options.sample_options[MAIN_SAMPLE_INDEX] # Sanity check the sample_names (specific to trio). - if (child.variant_caller_options.sample_name == FLAGS.sample_name_parent1 or - child.variant_caller_options.sample_name == FLAGS.sample_name_parent2): + if ( + child.variant_caller_options.sample_name == FLAGS.sample_name_parent1 + or child.variant_caller_options.sample_name == FLAGS.sample_name_parent2 + ): errors.log_and_raise( - 'The sample_name of the child is the same as one of ' - 'the parents.', errors.CommandLineError) + 'The sample_name of the child is the same as one of the parents.', + errors.CommandLineError, + ) if options.pic_options.alt_aligned_pileup == 'rows': - errors.log_and_raise('--alt_aligned_pileup="rows" cannot be used with ' - 'DeepTrio because the pileup images would become ' - 'too tall for InceptionV3.') + errors.log_and_raise( + '--alt_aligned_pileup="rows" cannot be used with ' + 'DeepTrio because the pileup images would become ' + 'too tall for InceptionV3.' + ) - if (options.mode == deepvariant_pb2.MakeExamplesOptions.CANDIDATE_SWEEP and - child.candidate_positions is None): + if ( + options.mode == deepvariant_pb2.MakeExamplesOptions.CANDIDATE_SWEEP + and child.candidate_positions is None + ): errors.log_and_raise( '--candidate_positions is required when --positions_sweep_mode is set.' ) @@ -306,7 +399,9 @@ def main(argv=()): errors.log_and_raise( 'Command line parsing failure: make_examples does not accept ' 'positional arguments but some are present on the command line: ' - '"{}".'.format(str(argv)), errors.CommandLineError) + '"{}".'.format(str(argv)), + errors.CommandLineError, + ) del argv # Unused. proto_utils.uses_fast_cpp_protos_or_die() diff --git a/deeptrio/make_examples_test.py b/deeptrio/make_examples_test.py index 14553d6d..154a8cca 100644 --- a/deeptrio/make_examples_test.py +++ b/deeptrio/make_examples_test.py @@ -158,22 +158,22 @@ class MakeExamplesEnd2EndTest(parameterized.TestCase): dict(mode='candidate_sweep', num_shards=0), dict(mode='candidate_sweep', num_shards=3), dict( - mode='training', num_shards=0, labeler_algorithm='haplotype_labeler'), + mode='training', num_shards=0, labeler_algorithm='haplotype_labeler' + ), dict( - mode='training', num_shards=3, labeler_algorithm='haplotype_labeler'), + mode='training', num_shards=3, labeler_algorithm='haplotype_labeler' + ), dict( - mode='training', num_shards=0, - labeler_algorithm='positional_labeler'), + mode='training', num_shards=0, labeler_algorithm='positional_labeler' + ), dict( - mode='training', num_shards=3, - labeler_algorithm='positional_labeler'), + mode='training', num_shards=3, labeler_algorithm='positional_labeler' + ), ) @flagsaver.flagsaver - def test_make_examples_end2end(self, - mode, - num_shards, - labeler_algorithm=None, - use_fast_pass_aligner=True): + def test_make_examples_end2end( + self, mode, num_shards, labeler_algorithm=None, use_fast_pass_aligner=True + ): self.assertIn(mode, {'calling', 'training', 'candidate_sweep'}) region = ranges.parse_literal('20:10,000,000-10,010,000') FLAGS.write_run_info = True @@ -186,11 +186,14 @@ def test_make_examples_end2end(self, FLAGS.sample_name_parent1 = 'parent1' FLAGS.sample_name_parent2 = 'parent2' FLAGS.candidates = test_utils.test_tmpfile( - _sharded('vsc.tfrecord', num_shards)) + _sharded('vsc.tfrecord', num_shards) + ) FLAGS.examples = test_utils.test_tmpfile( - _sharded('examples.tfrecord', num_shards)) + _sharded('examples.tfrecord', num_shards) + ) child_examples = test_utils.test_tmpfile( - _sharded('examples_child.tfrecord', num_shards)) + _sharded('examples_child.tfrecord', num_shards) + ) if mode == 'candidate_sweep': FLAGS.candidate_positions = test_utils.test_tmpfile( _sharded('candidate_positions', num_shards) @@ -208,20 +211,25 @@ def test_make_examples_end2end(self, if mode == 'calling': FLAGS.gvcf = test_utils.test_tmpfile( - _sharded('gvcf.tfrecord', num_shards)) + _sharded('gvcf.tfrecord', num_shards) + ) child_gvcf = test_utils.test_tmpfile( - _sharded('gvcf_child.tfrecord', num_shards)) + _sharded('gvcf_child.tfrecord', num_shards) + ) child_candidates = test_utils.test_tmpfile( - _sharded('vsc_child.tfrecord', num_shards)) + _sharded('vsc_child.tfrecord', num_shards) + ) else: FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED child_candidates = test_utils.test_tmpfile( - _sharded('vsc.tfrecord', num_shards)) + _sharded('vsc.tfrecord', num_shards) + ) if mode == 'candidate_sweep': - golden_candidate_positions = _sharded(testdata.GOLDEN_CANDIDATE_POSITIONS, - num_shards) + golden_candidate_positions = _sharded( + testdata.GOLDEN_CANDIDATE_POSITIONS, num_shards + ) for task_id in range(max(num_shards, 1)): FLAGS.task = task_id options = make_examples.default_options(add_flags=True) @@ -230,7 +238,8 @@ def test_make_examples_end2end(self, # Check that our run_info proto contains the basic fields we'd expect: # (a) our options are written to the run_info.options field. run_info = make_examples_core.read_make_examples_run_info( - options.run_info_filename) + options.run_info_filename + ) self.assertEqual(run_info.options, options) # (b) run_info.resource_metrics is present and contains our hostname. self.assertTrue(run_info.HasField('resource_metrics')) @@ -243,7 +252,8 @@ def test_make_examples_end2end(self, task_id, candidate_positions ) _, gold_candidates_path = sharded_file_utils.resolve_filespecs( - task_id, golden_candidate_positions) + task_id, golden_candidate_positions + ) self.verify_candidate_positions(candidates_path, gold_candidates_path) # In candidate_sweep mode the test stops here. @@ -254,13 +264,14 @@ def test_make_examples_end2end(self, # to check lots of properties of the output. candidates = sorted( tfrecord.read_tfrecords( - child_candidates, proto=deepvariant_pb2.DeepVariantCall), - key=lambda c: variant_utils.variant_range_tuple(c.variant)) + child_candidates, proto=deepvariant_pb2.DeepVariantCall + ), + key=lambda c: variant_utils.variant_range_tuple(c.variant), + ) self.verify_deepvariant_calls(candidates, options) - self.verify_variants([call.variant for call in candidates], - region, - options, - is_gvcf=False) + self.verify_variants( + [call.variant for call in candidates], region, options, is_gvcf=False + ) # Verify that the variants in the examples are all good. if mode == 'calling': @@ -269,10 +280,12 @@ def test_make_examples_end2end(self, region, options, verify_labels=False, - examples_filename=FLAGS.examples) + examples_filename=FLAGS.examples, + ) if mode == 'training': examples = self.verify_examples( - FLAGS.examples, region, options, verify_labels=True) + FLAGS.examples, region, options, verify_labels=True + ) example_variants = [dv_utils.example_variant(ex) for ex in examples] self.verify_variants(example_variants, region, options, is_gvcf=False) @@ -285,7 +298,8 @@ def test_make_examples_end2end(self, else: golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards) self.assertDeepVariantExamplesEqual( - examples, list(tfrecord.read_tfrecords(golden_file))) + examples, list(tfrecord.read_tfrecords(golden_file)) + ) if mode == 'calling': nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) @@ -294,24 +308,32 @@ def test_make_examples_end2end(self, # Check the quality of our generated gvcf file. gvcfs = variant_utils.sorted_variants( - tfrecord.read_tfrecords(child_gvcf, proto=variants_pb2.Variant)) + tfrecord.read_tfrecords(child_gvcf, proto=variants_pb2.Variant) + ) self.verify_variants(gvcfs, region, options, is_gvcf=True) self.verify_contiguity(gvcfs, region) - gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT, - num_shards) + gvcf_golden_file = _sharded( + testdata.GOLDEN_POSTPROCESS_GVCF_INPUT, num_shards + ) expected_gvcfs = list( - tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant)) + tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant) + ) # Despite its name, assertCountEqual checks that all items are equal. self.assertCountEqual(gvcfs, expected_gvcfs) - if (mode == 'training' and num_shards == 0 and - labeler_algorithm != 'positional_labeler'): + if ( + mode == 'training' + and num_shards == 0 + and labeler_algorithm != 'positional_labeler' + ): # The positional labeler doesn't track metrics, so don't try to read them # in when that's the mode. self.assertEqual( make_examples_core.read_make_examples_run_info( - testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics, - run_info.labeling_metrics) + testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO + ).labeling_metrics, + run_info.labeling_metrics, + ) # Golden sets are created with learning/genomics/internal/create_golden.sh @flagsaver.flagsaver @@ -341,9 +363,11 @@ def test_make_examples_training_end2end_with_customized_classes_labeler(self): golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES) # Verify that the variants in the examples are all good. examples = self.verify_examples( - FLAGS.examples, region, options, verify_labels=True) + FLAGS.examples, region, options, verify_labels=True + ) self.assertDeepVariantExamplesEqual( - examples, list(tfrecord.read_tfrecords(golden_file))) + examples, list(tfrecord.read_tfrecords(golden_file)) + ) # Golden sets are created with learning/genomics/internal/create_golden.sh @flagsaver.flagsaver @@ -377,16 +401,17 @@ def test_make_examples_training_end2end_with_alt_aligned_pileup(self): golden_file = _sharded(testdata.ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES) # Verify that the variants in the examples are all good. examples = self.verify_examples( - FLAGS.examples, region, options, verify_labels=True) + FLAGS.examples, region, options, verify_labels=True + ) self.assertDeepVariantExamplesEqual( - examples, list(tfrecord.read_tfrecords(golden_file))) + examples, list(tfrecord.read_tfrecords(golden_file)) + ) # Pileup image should now have 8 channels. # Height should be 60 + 40 * 2 = 140. self.assertEqual(decode_example(examples[0])['image/shape'], [140, 199, 8]) @flagsaver.flagsaver def test_make_examples_compare_realignment_modes(self): - def _run_with_realignment_mode(enable_joint_realignment, name): FLAGS.enable_joint_realignment = enable_joint_realignment region = ranges.parse_literal('20:10,000,000-10,010,000') @@ -401,7 +426,8 @@ def _run_with_realignment_mode(enable_joint_realignment, name): FLAGS.candidates = test_utils.test_tmpfile(f'{name}.vsc.tfrecord') FLAGS.examples = test_utils.test_tmpfile(f'{name}.examples.tfrecord') child_examples = test_utils.test_tmpfile( - f'{name}_child.examples.tfrecord') + f'{name}_child.examples.tfrecord' + ) FLAGS.regions = [ranges.to_literal(region)] FLAGS.partition_size = 1000 FLAGS.mode = 'calling' @@ -416,7 +442,8 @@ def _run_with_realignment_mode(enable_joint_realignment, name): region, options, verify_labels=False, - examples_filename=FLAGS.examples) + examples_filename=FLAGS.examples, + ) return examples examples1 = _run_with_realignment_mode(False, 'ex1') @@ -443,17 +470,18 @@ def _run_with_realignment_mode(enable_joint_realignment, name): dict( select_types='snps indels', keep_legacy_behavior=True, - expected_count=75), + expected_count=75, + ), dict( select_types='multi-allelics', keep_legacy_behavior=True, - expected_count=4), + expected_count=4, + ), ) @flagsaver.flagsaver - def test_make_examples_with_variant_selection(self, - select_types, - expected_count, - keep_legacy_behavior=False): + def test_make_examples_with_variant_selection( + self, select_types, expected_count, keep_legacy_behavior=False + ): if select_types is not None: FLAGS.select_variant_types = select_types region = ranges.parse_literal('20:10,000,000-10,010,000') @@ -481,29 +509,32 @@ def test_make_examples_with_variant_selection(self, @parameterized.parameters( dict( - mode='calling', which_parent='parent1', sample_name_to_train='child'), + mode='calling', which_parent='parent1', sample_name_to_train='child' + ), dict( - mode='calling', which_parent='parent2', sample_name_to_train='child'), + mode='calling', which_parent='parent2', sample_name_to_train='child' + ), dict( - mode='training', which_parent='parent1', - sample_name_to_train='child'), + mode='training', which_parent='parent1', sample_name_to_train='child' + ), dict( - mode='training', which_parent='parent2', - sample_name_to_train='child'), + mode='training', which_parent='parent2', sample_name_to_train='child' + ), dict( - mode='calling', - which_parent='parent1', - sample_name_to_train='parent1'), + mode='calling', which_parent='parent1', sample_name_to_train='parent1' + ), dict( mode='training', which_parent='parent1', - sample_name_to_train='parent1'), + sample_name_to_train='parent1', + ), # Training on parent2 in a duo is not supported (with a clear error # message). ) @flagsaver.flagsaver - def test_make_examples_training_end2end_duos(self, mode, which_parent, - sample_name_to_train): + def test_make_examples_training_end2end_duos( + self, mode, which_parent, sample_name_to_train + ): region = ranges.parse_literal('20:10,000,000-10,010,000') FLAGS.regions = [ranges.to_literal(region)] FLAGS.ref = testdata.CHR20_FASTA @@ -548,26 +579,33 @@ def test_make_examples_end2end_vcf_candidate_importer(self, mode): FLAGS.pileup_image_height_parent = 40 FLAGS.pileup_image_height_child = 60 FLAGS.candidates = test_utils.test_tmpfile( - _sharded('vcf_candidate_importer.candidates.{}.tfrecord'.format(mode))) + _sharded('vcf_candidate_importer.candidates.{}.tfrecord'.format(mode)) + ) FLAGS.examples = test_utils.test_tmpfile( - _sharded('vcf_candidate_importer.examples.{}.tfrecord'.format(mode))) + _sharded('vcf_candidate_importer.examples.{}.tfrecord'.format(mode)) + ) FLAGS.mode = mode FLAGS.regions = '20:10,000,000-10,010,000' if mode == 'calling': golden_file = _sharded( - testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD) + testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD + ) path_to_output_examples = test_utils.test_tmpfile( _sharded( - 'vcf_candidate_importer_child.examples.{}.tfrecord'.format(mode))) + 'vcf_candidate_importer_child.examples.{}.tfrecord'.format(mode) + ) + ) FLAGS.proposed_variants_child = testdata.TRUTH_VARIANTS_VCF FLAGS.proposed_variants_parent1 = testdata.TRUTH_VARIANTS_VCF FLAGS.proposed_variants_parent2 = testdata.TRUTH_VARIANTS_VCF else: golden_file = _sharded( - testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES) + testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES + ) path_to_output_examples = test_utils.test_tmpfile( - _sharded('vcf_candidate_importer.examples.{}.tfrecord'.format(mode))) + _sharded('vcf_candidate_importer.examples.{}.tfrecord'.format(mode)) + ) FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED @@ -579,9 +617,11 @@ def test_make_examples_end2end_vcf_candidate_importer(self, mode): None, options, verify_labels=mode == 'training', - examples_filename=FLAGS.examples) + examples_filename=FLAGS.examples, + ) self.assertDeepVariantExamplesEqual( - output_examples_to_compare, list(tfrecord.read_tfrecords(golden_file))) + output_examples_to_compare, list(tfrecord.read_tfrecords(golden_file)) + ) def verify_nist_concordance(self, candidates, nist_variants): # Tests that we call almost all of the real variants (according to NIST's @@ -596,9 +636,12 @@ def verify_nist_concordance(self, candidates, nist_variants): tp_count = tp_count + 1 self.assertGreater( - tp_count / len(nist_variants), 0.9705, + tp_count / len(nist_variants), + 0.9705, 'Recall must be greater than 0.9705. TP={}, Truth variants={}'.format( - tp_count, len(nist_variants))) + tp_count, len(nist_variants) + ), + ) def assertDeepVariantExamplesEqual(self, actual, expected): """Asserts that actual and expected tf.Examples from DeepVariant are equal. @@ -623,31 +666,44 @@ def assertDeepVariantExamplesNotEqual(self, actual, expected): pass_not_equal_check = False if len(actual) != len(expected): logging.warning( - 'In assertDeepVariantExamplesNotEqual: ' - 'actual(%d) and expected(%d) has different lengths', len(actual), - len(expected)) + ( + 'In assertDeepVariantExamplesNotEqual: ' + 'actual(%d) and expected(%d) has different lengths' + ), + len(actual), + len(expected), + ) pass_not_equal_check = True min_size = min(len(actual), len(expected)) for i in range(min_size): if decode_example(actual[i]) != decode_example(expected[i]): logging.warning( - 'assertDeepVariantExamplesNotEqual: ' - 'actual example[%d] and expected example[%d] ' - 'are different', i, i) + ( + 'assertDeepVariantExamplesNotEqual: ' + 'actual example[%d] and expected example[%d] ' + 'are different' + ), + i, + i, + ) pass_not_equal_check = True self.assertTrue( - pass_not_equal_check, 'assertDeepVariantExamplesNotEqual failed - ' - 'actual and expected examples are identical.') + pass_not_equal_check, + ( + 'assertDeepVariantExamplesNotEqual failed - ' + 'actual and expected examples are identical.' + ), + ) def assertVariantIsPresent(self, to_find, variants): - def variant_key(v): return (v.reference_bases, v.start, v.end) # Finds a call in our actual call set for each NIST variant, asserting # that we found exactly one. matches = [ - variant for variant in variants + variant + for variant in variants if variant_key(to_find) == variant_key(variant) ] if not matches: @@ -661,14 +717,18 @@ def variant_key(v): return True - def verify_candidate_positions(self, candidate_positions_paths, - candidate_positions_golden_set): + def verify_candidate_positions( + self, candidate_positions_paths, candidate_positions_golden_set + ): with epath.Path(candidate_positions_golden_set).open('rb') as my_file: positions_golden = np.frombuffer(my_file.read(), dtype=np.int32) with epath.Path(candidate_positions_paths).open('rb') as my_file: positions = np.frombuffer(my_file.read(), dtype=np.int32) - logging.warning('%d positions created, %d positions in golden file', - len(positions), len(positions_golden)) + logging.warning( + '%d positions created, %d positions in golden file', + len(positions), + len(positions_golden), + ) self.assertCountEqual(positions, positions_golden) def verify_variants(self, variants, region, options, is_gvcf): @@ -688,7 +748,8 @@ def verify_variants(self, variants, region, options, is_gvcf): call = variant_utils.only_call(variant) self.assertEqual( call.call_set_name, - options.sample_options[1].variant_caller_options.sample_name) + options.sample_options[1].variant_caller_options.sample_name, + ) if is_gvcf: # GVCF records should have 0/0 or ./. (un-called) genotypes as they are # reference sites, have genotype likelihoods and a GQ value. @@ -736,29 +797,36 @@ def verify_deepvariant_calls(self, dv_calls, options): self.assertIn(alt_allele, list(call.allele_support)) self.assertGreaterEqual( len(call.allele_support[alt_allele].read_names), - options.sample_options[1].variant_caller_options.min_count_snps) + options.sample_options[1].variant_caller_options.min_count_snps, + ) def sanity_check_example_info_json(self, example, examples_filename, task_id): """Checks `example_info.json` w/ examples_filename has the right info.""" example_info_json = dv_utils.get_example_info_json_filename( - examples_filename, task_id) + examples_filename, task_id + ) example_info = json.load(gfile.GFile(example_info_json, 'r')) self.assertIn('shape', example_info) - self.assertEqual(example_info['shape'], - dv_utils.example_image_shape(example)) + self.assertEqual( + example_info['shape'], dv_utils.example_image_shape(example) + ) self.assertIn('channels', example_info) self.assertLen(example_info['channels'], example_info['shape'][2]) - def verify_examples(self, - path_to_output_examples, - region, - options, - verify_labels, - examples_filename=None): + def verify_examples( + self, + path_to_output_examples, + region, + options, + verify_labels, + examples_filename=None, + ): # Do some simple structural checks on the tf.Examples in the file. expected_features = [ - 'variant/encoded', 'locus', 'image/encoded', - 'alt_allele_indices/encoded' + 'variant/encoded', + 'locus', + 'image/encoded', + 'alt_allele_indices/encoded', ] if verify_labels: expected_features += ['label'] @@ -781,21 +849,22 @@ def verify_examples(self, if examples: if examples_filename is None: examples_filename = path_to_output_examples - self.sanity_check_example_info_json(examples[0], examples_filename, - options.task_id) + self.sanity_check_example_info_json( + examples[0], examples_filename, options.task_id + ) return examples class MakeExamplesUnitTest(parameterized.TestCase): def test_read_write_run_info(self): - def _read_lines(path): with open(path) as fin: return list(fin.readlines()) golden_actual = make_examples_core.read_make_examples_run_info( - testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO) + testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO + ) # We don't really want to inject too much knowledge about the golden right # here, so we only use a minimal test that (a) the run_info_filename is # a non-empty string and (b) the number of candidates sites in the labeling @@ -803,15 +872,18 @@ def _read_lines(path): # least one candidate variant, and the reader should have filled in the # value. self.assertNotEmpty(golden_actual.options.run_info_filename) - self.assertEqual(golden_actual.labeling_metrics.n_candidate_variant_sites, - testdata.N_GOLDEN_TRAINING_EXAMPLES) + self.assertEqual( + golden_actual.labeling_metrics.n_candidate_variant_sites, + testdata.N_GOLDEN_TRAINING_EXAMPLES, + ) # Check that reading + writing the data produces the same lines: tmp_output = test_utils.test_tmpfile('written_run_info.pbtxt') make_examples_core.write_make_examples_run_info(golden_actual, tmp_output) self.assertEqual( _read_lines(testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO), - _read_lines(tmp_output)) + _read_lines(tmp_output), + ) @flagsaver.flagsaver def test_keep_duplicates(self): @@ -829,8 +901,9 @@ def test_keep_duplicates(self): FLAGS.mode = 'training' FLAGS.examples = '' options = make_examples.default_options(add_flags=True) - self.assertEqual(options.pic_options.read_requirements.keep_duplicates, - True) + self.assertEqual( + options.pic_options.read_requirements.keep_duplicates, True + ) @flagsaver.flagsaver def test_keep_supplementary_alignments(self): @@ -850,7 +923,8 @@ def test_keep_supplementary_alignments(self): options = make_examples.default_options(add_flags=True) self.assertEqual( options.pic_options.read_requirements.keep_supplementary_alignments, - True) + True, + ) @flagsaver.flagsaver def test_keep_secondary_alignments(self): @@ -869,7 +943,8 @@ def test_keep_secondary_alignments(self): FLAGS.examples = '' options = make_examples.default_options(add_flags=True) self.assertEqual( - options.pic_options.read_requirements.keep_secondary_alignments, True) + options.pic_options.read_requirements.keep_secondary_alignments, True + ) @flagsaver.flagsaver def test_min_base_quality(self): @@ -905,8 +980,9 @@ def test_min_mapping_quality(self): FLAGS.mode = 'training' FLAGS.examples = '' options = make_examples.default_options(add_flags=True) - self.assertEqual(options.pic_options.read_requirements.min_mapping_quality, - 15) + self.assertEqual( + options.pic_options.read_requirements.min_mapping_quality, 15 + ) @flagsaver.flagsaver def test_default_options_with_training_random_emit_ref_sites(self): @@ -926,8 +1002,11 @@ def test_default_options_with_training_random_emit_ref_sites(self): FLAGS.training_random_emit_ref_sites = 0.3 options = make_examples.default_options(add_flags=True) self.assertAlmostEqual( - options.sample_options[1].variant_caller_options - .fraction_reference_sites_to_emit, 0.3) + options.sample_options[ + 1 + ].variant_caller_options.fraction_reference_sites_to_emit, + 0.3, + ) @flagsaver.flagsaver def test_default_options_without_training_random_emit_ref_sites(self): @@ -949,8 +1028,11 @@ def test_default_options_without_training_random_emit_ref_sites(self): # redacted # As an approximation, we directly check that the value should be exactly 0. self.assertEqual( - options.sample_options[1].variant_caller_options - .fraction_reference_sites_to_emit, 0.0) + options.sample_options[ + 1 + ].variant_caller_options.fraction_reference_sites_to_emit, + 0.0, + ) @flagsaver.flagsaver def test_confident_regions(self): @@ -972,44 +1054,42 @@ def test_confident_regions(self): # Our expected intervals, inlined from CONFIDENT_REGIONS_BED. expected = _from_literals_list([ - '20:10000847-10002407', '20:10002521-10004171', '20:10004274-10004964', - '20:10004995-10006386', '20:10006410-10007800', '20:10007825-10008018', - '20:10008044-10008079', '20:10008101-10008707', '20:10008809-10008897', - '20:10009003-10009791', '20:10009934-10010531' + '20:10000847-10002407', + '20:10002521-10004171', + '20:10004274-10004964', + '20:10004995-10006386', + '20:10006410-10007800', + '20:10007825-10008018', + '20:10008044-10008079', + '20:10008101-10008707', + '20:10008809-10008897', + '20:10009003-10009791', + '20:10009934-10010531', ]) # Our confident regions should be exactly those found in the BED file. self.assertCountEqual(expected, list(confident_regions)) @parameterized.parameters( - ({ - 'examples': ('foo', 'foo') - },), - ({ - 'examples': ('foo', 'foo'), - 'gvcf': ('bar', 'bar') - },), - ({ - 'examples': ('foo@10', 'foo-00000-of-00010') - },), - ({ - 'task': (0, 0), - 'examples': ('foo@10', 'foo-00000-of-00010') - },), - ({ - 'task': (1, 1), - 'examples': ('foo@10', 'foo-00001-of-00010') - },), - ({ - 'task': (1, 1), - 'examples': ('foo@10', 'foo-00001-of-00010'), - 'gvcf': ('bar@10', 'bar-00001-of-00010') - },), - ({ - 'task': (1, 1), - 'examples': ('foo@10', 'foo-00001-of-00010'), - 'gvcf': ('bar@10', 'bar-00001-of-00010'), - 'candidates': ('baz@10', 'baz-00001-of-00010') - },), + ({'examples': ('foo', 'foo')},), + ({'examples': ('foo', 'foo'), 'gvcf': ('bar', 'bar')},), + ({'examples': ('foo@10', 'foo-00000-of-00010')},), + ({'task': (0, 0), 'examples': ('foo@10', 'foo-00000-of-00010')},), + ({'task': (1, 1), 'examples': ('foo@10', 'foo-00001-of-00010')},), + ( + { + 'task': (1, 1), + 'examples': ('foo@10', 'foo-00001-of-00010'), + 'gvcf': ('bar@10', 'bar-00001-of-00010'), + }, + ), + ( + { + 'task': (1, 1), + 'examples': ('foo@10', 'foo-00001-of-00010'), + 'gvcf': ('bar@10', 'bar-00001-of-00010'), + 'candidates': ('baz@10', 'baz-00001-of-00010'), + }, + ), ) @flagsaver.flagsaver def test_sharded_outputs1(self, settings): @@ -1023,20 +1103,25 @@ def test_sharded_outputs1(self, settings): options = make_examples.default_options(add_flags=True) # Check all of the flags. - for name, option_val in [('examples', options.examples_filename), - ('candidates', options.candidates_filename), - ('gvcf', options.gvcf_filename)]: + for name, option_val in [ + ('examples', options.examples_filename), + ('candidates', options.candidates_filename), + ('gvcf', options.gvcf_filename), + ]: expected = settings[name][1] if name in settings else '' self.assertEqual(expected, option_val) def test_catches_bad_argv(self): - with mock.patch.object(logging, 'error') as mock_logging,\ - mock.patch.object(sys, 'exit') as mock_exit: + with ( + mock.patch.object(logging, 'error') as mock_logging, + mock.patch.object(sys, 'exit') as mock_exit, + ): make_examples.main(['make_examples.py', 'extra_arg']) mock_logging.assert_called_once_with( 'Command line parsing failure: make_examples does not accept ' 'positional arguments but some are present on the command line: ' - '"[\'make_examples.py\', \'extra_arg\']".') + "\"['make_examples.py', 'extra_arg']\"." + ) mock_exit.assert_called_once_with(errno.ENOENT) @flagsaver.flagsaver @@ -1060,11 +1145,14 @@ def test_catches_bad_flags(self): # This is the bad flag. FLAGS.confident_regions = '' - with mock.patch.object(logging, 'error') as mock_logging,\ - mock.patch.object(sys, 'exit') as mock_exit: + with ( + mock.patch.object(logging, 'error') as mock_logging, + mock.patch.object(sys, 'exit') as mock_exit, + ): make_examples.main(['make_examples.py']) mock_logging.assert_called_once_with( - 'confident_regions is required when in training mode.') + 'confident_regions is required when in training mode.' + ) mock_exit.assert_called_once_with(errno.ENOENT) @flagsaver.flagsaver @@ -1083,12 +1171,15 @@ def test_regions_and_exclude_regions_flags_with_trio_options(self): FLAGS.exclude_regions = '20:10,010,000-10,100,000' options = make_examples.default_options(add_flags=True) - _, regions_from_options = make_examples_core.processing_regions_from_options( - options) + _, regions_from_options = ( + make_examples_core.processing_regions_from_options(options) + ) self.assertCountEqual( list(ranges.RangeSet(regions_from_options)), _from_literals_list( - ['20:10,000,000-10,009,999', '20:10,100,001-11,000,000'])) + ['20:10,000,000-10,009,999', '20:10,100,001-11,000,000'] + ), + ) @flagsaver.flagsaver def test_incorrect_empty_regions_with_trio_options(self): @@ -1141,7 +1232,7 @@ def add_mock(self, name, retval='dontadd', side_effect='dontadd'): @parameterized.parameters([ deepvariant_pb2.MakeExamplesOptions.TRAINING, - deepvariant_pb2.MakeExamplesOptions.CALLING + deepvariant_pb2.MakeExamplesOptions.CALLING, ]) def test_process_keeps_ordering_of_candidates_and_examples(self, mode): self.processor.options.mode = mode @@ -1162,8 +1253,9 @@ def test_create_pileup_examples_handles_none(self): self.processor.pic = mock.Mock() dv_call = mock.Mock() self.processor.pic.create_pileup_images.return_value = None - self.assertEqual([], - self.processor.create_pileup_examples(dv_call, 'child')) + self.assertEqual( + [], self.processor.create_pileup_examples(dv_call, 'child') + ) self.processor.pic.create_pileup_images.assert_called_once() def test_create_pileup_examples(self): @@ -1171,14 +1263,19 @@ def test_create_pileup_examples(self): self.processor.pic.get_channels.return_value = None self.add_mock( '_encode_tensor', - side_effect=[(b'tensor1', self.default_shape), - (b'tensor2', self.default_shape)]) + side_effect=[ + (b'tensor1', self.default_shape), + (b'tensor2', self.default_shape), + ], + ) dv_call = mock.Mock() dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G']) ex = mock.Mock() alt1, alt2 = ['C'], ['G'] - self.processor.pic.create_pileup_images.return_value = [(alt1, b'tensor1'), - (alt2, b'tensor2')] + self.processor.pic.create_pileup_images.return_value = [ + (alt1, b'tensor1'), + (alt2, b'tensor2'), + ] actual = self.processor.create_pileup_examples(dv_call, 'child') @@ -1197,7 +1294,8 @@ def test_create_pileup_examples(self): label=variant_labeler.VariantLabel( is_confident=True, variant=test_utils.make_variant(start=10, alleles=['A', 'C']), - genotype=(0, 1)), + genotype=(0, 1), + ), expected_label_value=1, ), # Test that a reference variant gets a label value of 0 in the example. @@ -1205,7 +1303,8 @@ def test_create_pileup_examples(self): label=variant_labeler.VariantLabel( is_confident=True, variant=test_utils.make_variant(start=10, alleles=['A', '.']), - genotype=(0, 0)), + genotype=(0, 0), + ), expected_label_value=0, ), ) @@ -1240,15 +1339,18 @@ def test_label_variant_raises_for_non_confident_variant(self): label = variant_labeler.VariantLabel( is_confident=False, variant=test_utils.make_variant(start=10, alleles=['A', 'C']), - genotype=(0, 1)) + genotype=(0, 1), + ) example = self._example_for_variant(label.variant) with self.assertRaisesRegex( - ValueError, 'Cannot add a non-confident label to an example'): + ValueError, 'Cannot add a non-confident label to an example' + ): self.processor.add_label_to_example(example, label) def _example_for_variant(self, variant): - return dv_utils.make_example(variant, list(variant.alternate_bases), b'foo', - self.default_shape) + return dv_utils.make_example( + variant, list(variant.alternate_bases), b'foo', self.default_shape + ) def test_use_original_quality_scores_without_parse_sam_aux_fields(self): FLAGS.mode = 'calling' @@ -1265,8 +1367,12 @@ def test_use_original_quality_scores_without_parse_sam_aux_fields(self): FLAGS.parse_sam_aux_fields = False with self.assertRaisesRegex( - Exception, 'If --use_original_quality_scores is set then ' - '--parse_sam_aux_fields must be set too.'): + Exception, + ( + 'If --use_original_quality_scores is set then ' + '--parse_sam_aux_fields must be set too.' + ), + ): make_examples.default_options(add_flags=True) @parameterized.parameters( @@ -1292,14 +1398,16 @@ def test_image_heights(self, height_parent, height_child): options = make_examples.default_options(add_flags=True) with self.assertRaisesRegex( - Exception, 'Total pileup image heights must be between 75-362.'): + Exception, 'Total pileup image heights must be between 75-362.' + ): make_examples.check_options_are_valid(options) @parameterized.parameters( [ dict(window_width=221), dict(window_width=1001), - ],) + ], + ) def test_align_to_all_haplotypes(self, window_width): # align_to_all_haplotypes() will pull from the reference, so choose a # real variant. @@ -1325,7 +1433,8 @@ def test_align_to_all_haplotypes(self, window_width): self.processor.realigner.ref_reader = self.ref_reader read = test_utils.make_read( - 'A' * 101, start=10046100, cigar='101M', quals=[30] * 101) + 'A' * 101, start=10046100, cigar='101M', quals=[30] * 101 + ) self.processor.realigner.align_to_haplotype = mock.Mock() alt_info = self.processor.align_to_all_haplotypes(variant, [read]) @@ -1344,8 +1453,9 @@ def test_align_to_all_haplotypes(self, window_width): # If variant reference_bases are wrong, it should raise a ValueError. variant.reference_bases = 'G' - with self.assertRaisesRegex(ValueError, - 'does not match the bases in the reference'): + with self.assertRaisesRegex( + ValueError, 'does not match the bases in the reference' + ): self.processor.align_to_all_haplotypes(variant, [read]) diff --git a/deeptrio/testdata.py b/deeptrio/testdata.py index 768da4c4..0ca479fc 100644 --- a/deeptrio/testdata.py +++ b/deeptrio/testdata.py @@ -53,7 +53,8 @@ def deeptrio_testdata(filename): The absolute path to a testdata file. """ return nucleus_test_utils.genomics_testdata( - os.path.join('deeptrio/testdata', filename), GENOMICS_DIR) + os.path.join('deeptrio/testdata', filename), GENOMICS_DIR + ) CHR20_FASTA = None @@ -116,55 +117,76 @@ def init(): CHR20_FASTA = deeptrio_testdata('input/hs37d5.chr20.fa.gz') HG001_CHR20_BAM = deeptrio_testdata('input/HG001.chr20.10_10p1mb_sorted.bam') NA12891_CHR20_BAM = deeptrio_testdata( - 'input/NA12891.chr20.10_10p1mb_sorted.bam') + 'input/NA12891.chr20.10_10p1mb_sorted.bam' + ) NA12892_CHR20_BAM = deeptrio_testdata( - 'input/NA12892.chr20.10_10p1mb_sorted.bam') + 'input/NA12892.chr20.10_10p1mb_sorted.bam' + ) GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata( - 'golden.training_examples.tfrecord.gz') + 'golden.training_examples.tfrecord.gz' + ) GOLDEN_CANDIDATE_POSITIONS = deeptrio_testdata( - 'golden_child.candidate_positions') + 'golden_child.candidate_positions' + ) GOLDEN_CALLING_CANDIDATES = deeptrio_testdata( - 'golden_child.calling_examples.tfrecord.gz') + 'golden_child.calling_examples.tfrecord.gz' + ) GOLDEN_CALLING_EXAMPLES = deeptrio_testdata( - 'golden_child.calling_examples.tfrecord.gz') + 'golden_child.calling_examples.tfrecord.gz' + ) CONFIDENT_REGIONS_BED = deeptrio_testdata( - 'input/test_giab.b37_chr20_100kbp_at_10mb.bed') + 'input/test_giab.b37_chr20_100kbp_at_10mb.bed' + ) TRUTH_VARIANTS_VCF = deeptrio_testdata( 'input/HG001_chr20_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz' ) TRUTH_VARIANTS_VCF_WITH_TYPES = deeptrio_testdata( - 'input/with_types.test_nist.b37_chr20_4kbp_at_10mb.vcf.gz') + 'input/with_types.test_nist.b37_chr20_4kbp_at_10mb.vcf.gz' + ) GOLDEN_POSTPROCESS_INPUT = deeptrio_testdata( - 'golden.postprocess_single_site_input.tfrecord.gz') + 'golden.postprocess_single_site_input.tfrecord.gz' + ) GOLDEN_POSTPROCESS_OUTPUT = deeptrio_testdata( - 'golden.postprocess_single_site_output.vcf') + 'golden.postprocess_single_site_output.vcf' + ) GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED = deeptrio_testdata( - 'golden.postprocess_single_site_output.vcf.gz') + 'golden.postprocess_single_site_output.vcf.gz' + ) GOLDEN_POSTPROCESS_GVCF_INPUT = deeptrio_testdata( - 'golden_child.postprocess_gvcf_input.tfrecord.gz') + 'golden_child.postprocess_gvcf_input.tfrecord.gz' + ) GOLDEN_POSTPROCESS_GVCF_OUTPUT = deeptrio_testdata( - 'golden.postprocess_gvcf_output.g.vcf') + 'golden.postprocess_gvcf_output.g.vcf' + ) GOLDEN_MAKE_EXAMPLES_RUN_INFO = deeptrio_testdata( - 'golden.training_examples.tfrecord.gz.run_info.pbtxt') + 'golden.training_examples.tfrecord.gz.run_info.pbtxt' + ) WS_ALLELE_COUNT_LINEAR_MODEL = deeptrio_testdata( - 'window_selector_allele_count_linear.pbtxt') + 'window_selector_allele_count_linear.pbtxt' + ) WS_ALLELE_COUNT_LINEAR_MODEL_PCKL = deeptrio_testdata( - 'window_selector_allele_count_linear.pckl') + 'window_selector_allele_count_linear.pckl' + ) WS_VARIANT_READS_THRESHOLD_MODEL = deeptrio_testdata( - 'window_selector_variant_read_threshold.pbtxt') + 'window_selector_variant_read_threshold.pbtxt' + ) # For CustomizedClassesVariantLabeler. global CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata( - 'customized_classes.golden.training_examples.tfrecord.gz') + 'customized_classes.golden.training_examples.tfrecord.gz' + ) # For alt-aligned pileups global ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata( - 'alt_aligned_pileup.golden.training_examples.tfrecord.gz') + 'alt_aligned_pileup.golden.training_examples.tfrecord.gz' + ) GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES = deeptrio_testdata( - 'golden.vcf_candidate_importer.training_examples.tfrecord.gz') + 'golden.vcf_candidate_importer.training_examples.tfrecord.gz' + ) GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD = deeptrio_testdata( - 'golden_child.vcf_candidate_importer.calling_examples.tfrecord.gz') + 'golden_child.vcf_candidate_importer.calling_examples.tfrecord.gz' + )