diff --git a/sam_tags/standard_tags.py b/sam_tags/standard_tags.py index 788f6d5..c386db1 100644 --- a/sam_tags/standard_tags.py +++ b/sam_tags/standard_tags.py @@ -2,7 +2,8 @@ Standard tags. """ -from enum import StrEnum, unique +from enum import StrEnum +from enum import unique @unique @@ -261,7 +262,7 @@ class StandardTag(StrEnum): """ MD = "MD" - """ + r""" String encoding mismatched and deleted reference bases. `MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*`: String encoding mismatched and @@ -313,10 +314,125 @@ class StandardTag(StrEnum): """ ML = "ML" - """Base modification probabilities.""" + """ + Base modification probabilities. + + `ML:B:C,scaled-probabilities`: The optional `ML` tag lists the probability of each modification + listed in the `MM` tag being correct, in the order that they occur. The continuous probability + range 0.0 to 1.0 is remapped in equal sized portions to the discrete integers 0 to 255 + inclusively. Thus the probability range corresponding to integer value _N_ is _N_/256 to (_N_ + + 1)/256. + + The SAM encoding therefore uses a byte array of type '`C`' with the number of elements matching + the summation of the number of modifications listed as being present in the `MM` tag accounting + for multi-modifications each having their own probability. + + For example '`MM:Z:C+m,5,12;C+h,5,12;`' may have an associated tag of '`ML:B:C,204,89,26,130`'. + + If the above is rewritten in the multiple-modification form, the probabilities are interleaved + in the order presented, giving '`MM:Z:C+mh,5,12; ML:B:C,204,26,89,130`'. Note where several + possible modifications are presented at the same site, the `ML` values represent the absolute + probabilities of the modification call being correct and not the relative likelihood between the + alternatives. These probabilities should not sum to above 1.0 (≈ 256 in integer encoding, + allowing for some minor rounding errors), but may sum to a lower total with the remainder + representing the probability that none of the listed modification types are present. In the + example used above, the 6th `C` has 80% chance of being `5mC`, 10% chance of being `5hmC` and + 10% chance of being an unmodified `C`. + + `ML` values for ambiguity codes give the probability that the modification is one of the + possible codes compatible with that ambiguity code. For example `MM:Z:C+C,10; ML:B:C,229` + indicates a `C` call with a probability of 90% of having some form of unspecified modification. + """ MM = "MM" - """Base modifications / methylation.""" + """ + Base modifications / methylation. + + `MM:Z:([ACGTUN][-+]([a-z]+|[0-9]+)[.?]?(,[0-9]+)*;)*`: The first character is the unmodified + "fundamental" base as reported by the sequencing instrument for the top strand. It must be one + of '`A`', '`C`', '`G`', '`T`', '`U`' (if RNA) or '`N`' for anything else, including any IUPAC + ambiguity codes in the reported SEQ field. Note '`N`' may be used to match any base rather than + specifically an '`N`' call by the sequencing instrument. This may be used in situations where + the base modification is not a derivation of a standard base type. This is followed by either + plus or minus indicating the strand the modification was observed on (relative to the original + sequenced strand of SEQ with plus meaning same orientation), and one or more base modification + codes. + + Following the base modification codes is a recommended but optional '`.`' or '`?`' describing + how skipped seq bases of the stated base type should be interpreted by downstream tools. When + this flag is '`?`' there is no information about the modification status of the skipped bases + provided. When this flag is not present, or it is '`.`', these bases should be assumed to have + low probability of modification. + + This is then followed by a comma separated list of how many seq bases of the stated base type to + skip, stored as a delta to the last and starting with 0 as the first (or next) base, starting + from the uncomplemented 5' end of the SEQ field. This number series is comparable to the numbers + in an `MD` tag, albeit counting specific base types only and potentially reverse-complemented. + + For example '`C+m,5,12,0;`' tells us there are three potential 5-Methylcytosine bases on the top + strand of SEQ. The first 5 '`C`' bases are unmodified and the 6th, 19th and 20th have + modification status indicated by the corresponding probabilities in the `ML` tag. The 12 + cytosines between the 6th and 19th cytosine are unmodified. Modification probabilities for the + 17 skipped cytosines are not provided. + + When the '`?`' flag is present the tag '`C+m?,5,12,0;`' tells us the modification status of the + first five cytosine bases is unknown, the sixth cytosine is called (as either modified or + unmodified), followed by 12 more unknown cytosines, and the 19th and 20th are called. + + Similarly '`G-m,14;`' indicates the 15th '`G`' there might be a 5-Methylcytosine on the opposite + strand (still counting using the top strand base calls from the 5' end). When the alignment + record is reverse complemented (SAM flag 0x10) these two examples do not change since the tag + always refers to the as-sequenced orientation. See the `test/SAMtags/MM-orient.sam` file for + examples. + + This permits modifications to be listed on either strand with the rare potential for both + strands to have a modification at the same site. If SAM FLAG 0x10 is set, indicating that SEQ + has been reverse complemented from the sequence observed by the sequencing machine, note that + these base modification field values will be in the opposite orientation to SEQ and other + derived SAM fields. + + Note it is permitted for the coordinate list to be empty (for example '`MM:Z:C+m;`'), which may + be used as an explicit indicator that this base modification is not present. It is not permitted + for coordinates to be beyond the length of the sequence. + + When multiple modifications are listed, for example '`C+mh,5,12,0;`', it indicates the + modification may be any of the stated bases. The associated confidence values in the `ML` tag + may be used to determine the relative likelihoods between the options. The example above is + equivalent to '`C+m,5,12,0;C+h,5,12,0;`', although this will have a different ordering of + confidence values in `ML`. Note ChEBI codes cannot be used in the multi-modification form (such + as the '`C+mh`' example above). + + If the modification is not one of the standard common types (listed below) it can be specified + as a numeric ChEBI code. For example '`C+76792,57;`' is the same as '`C+h,57;`'. + + An unmodified base of '`N`' means count any base in SEQ, not only those of '`N`'. Thus + '`N+n,100;`' means the 101st base is Xanthosine (n), irrespective of the sequence composition. + + The standard code types and their associated ChEBI values are listed below, taken from [Viner et + al.](https://www.biorxiv.org/content/10.1101/043794v1). Additionally ambiguity codes '`A`', + '`C`', '`G`', '`T`' and '`U`' exist to represent unspecified modifications bases of their + respective canonical base types, plus code '`N`' to represent an unspecified modification of any + base type. + + |**Unmodified base**|**Code**|**Abbreviation**|**Name** |**ChEBI**| + |:------------------|:-------|:---------------|:-------------------------|:--------| + |C |m |5mC |5-Methylcytosine |27551 | + |C |h |5hmC |5-Hydroxymethylcytosine |76792 | + |C |f |5fC |5-Formylcytosine |76794 | + |C |c |5caC |5-Carboxylcytosine |76793 | + |C |C | |Ambiguity code; any C mod | | + |T |g |5hmU |5-Hydroxymethyluracil |16964 | + |T |e |5fU |5-Formyluracil |80961 | + |T |b |5caU |5-Carboxyluracil |17477 | + |T |T | |Ambiguity code; any T mod | | + |U |U | |Ambiguity code; any U mod | | + |A |a |6mA |6-Methyladenine |28871 | + |A |A | |Ambiguity code; any A mod | | + |G |o |8oxoG |8-Oxoguanine |44605 | + |G |G | |Ambiguity code; any G mod | | + |N |n |Xao |Xanthosine |18107 | + |N |N | |Ambiguity code; any mod | | + """ MQ = "MQ" """ @@ -421,7 +537,7 @@ class StandardTag(StrEnum): """ PT = "PT" - """ + r""" Read annotations for parts of the padded read sequence. `PT:Z:annotag(\|annotag)*` where each _annotag_ matches