From 5003c655d3caaed70a961070d9512c3b7bea17ef Mon Sep 17 00:00:00 2001
From: Tom White
Date: Mon, 8 Jul 2024 15:04:31 +0100
Subject: [PATCH] Ensure INFO and FORMAT keys are unique ignoring case

Generated INFO and FORMAT field keys are now de-duplicated on their
lower-cased form, so two keys that differ only in case are never drawn
together. This avoids the macOS filesystem case-sensitivity issue for
VCF Zarr.

The literal "id" entry is removed from RESERVED_INFO_KEYS. Instead,
is_reserved_key rejects any INFO key equal to "id" ignoring case, since
it conflicts with the 'variant_id' variable in VCF Zarr. The check is
explicitly parenthesised so that it applies to INFO keys only, as the
removed list entry did.
---
 hypothesis_vcf/strategies.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/hypothesis_vcf/strategies.py b/hypothesis_vcf/strategies.py
index eac89fe..b8e4671 100644
--- a/hypothesis_vcf/strategies.py
+++ b/hypothesis_vcf/strategies.py
@@ -67,8 +67,6 @@ def get_header(self):
     "SOMATIC",
     "VALIDATED",
     "1000G",
-    # conflicts with 'variant_id' variable; see RESERVED_VARIABLE_NAMES in sgkit
-    "id",
 ]
 
 # [Table 2: Reserved genotype keys]
@@ -101,9 +99,10 @@ def vcf_field_keys(category):
     field_key_regex = r"[A-Za-z_][0-9A-Za-z_.]"
 
     def is_reserved_key(key):
-        return (category == "INFO" and key in RESERVED_INFO_KEYS) or (
-            category == "FORMAT" and key in RESERVED_FORMAT_KEYS
-        )
+        # 'id' is reserved since it conflicts with 'variant_id' variable in VCF Zarr
+        return (
+            category == "INFO" and (key in RESERVED_INFO_KEYS or key.lower() == "id")
+        ) or (category == "FORMAT" and key in RESERVED_FORMAT_KEYS)
 
     return from_regex(field_key_regex, fullmatch=True).filter(
         lambda key: not is_reserved_key(key)
@@ -275,18 +274,20 @@ def vcf(
     -------
     A Hypothesis strategy to generate a VCF file, including header, as a string.
     """
+    # ensure INFO and FORMAT keys are unique ignoring case to avoid macOS filesystem
+    # case-sensitivity issue for VCF Zarr
     info_fields = draw(
         lists(
             vcf_fields("INFO", max_number=max_number),
             max_size=max_info_fields,
-            unique_by=lambda f: f.vcf_key,
+            unique_by=lambda f: f.vcf_key.lower(),
         )
     )
     format_fields = draw(
         lists(
             vcf_fields("FORMAT", max_number=max_number),
             max_size=max_format_fields,
-            unique_by=lambda f: f.vcf_key,
+            unique_by=lambda f: f.vcf_key.lower(),
         )
     )
     sample_ids = draw(