-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CpG mutation models #2221
Comments
Note however that you can simulate variation in mutation rate using a RateMap - so, for instance, a crude thing to do would be to calculate the density of CpG pairs in 100Kb windows, and then use this to adjust the mutation rate. This would account for coarse-scale mutation rate variation, but not the per-nucleotide heterogeneity, and obviously not the context-dependence. |
One way to approximate this would be to designate regions as CpG and non-CpG on the basis of the reference sequence, then apply mutations twice: once at the CpG sites (masking out the non-CpG sites) and again at the non-CpG sites (masking out the CpG sites). For more accuracy you could restrict this to the methylated CpG sites (see e.g. here). It's not quite right, because there should be some context dependence associated with adjacent sites changing state (e.g. it is possible for the G at a CpG to mutate to T, which would make the C no longer part of a CpG site). But that's probably a second-order effect as long as mutation is rare. For reference, there's a table of transition/transversion rates in humans at CpG and nonCpG sites at https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3548427/table/T2/ - you could imagine using these values in the Here's some code to do it. It's not quite right yet (see comments in the code), but I think the approach is reasonable import tarfile
import msprime
import stdpopsim
import re
import os
import warnings
import numpy as np
from urllib.request import urlretrieve
def get_refseq(
chromosome,
max_chars,
ancestral_alleles_prefix="homo_sapiens_ancestor_",
build="GRCh38",
base_url="https://ftp.ensembl.org/pub/release-111/fasta/ancestral_alleles/"
):
"""
Download and return max_chars of the ancestral sequence.
The download can be large (800Mb), so ensure you have space
"""
def get_fasta_seq(file, max_chars=None):
"""Return the first max_chars of a gzipped FASTA file as a single string"""
seq = ""
seq_num = 0
for i, line in enumerate(file):
if seq_num > 0:
break
if max_chars is not None and len(seq) >= max_chars:
break
if line.startswith(b">"):
if i > 0:
seq_num += 1
else:
seq += line.decode().strip()
if max_chars is None:
return seq
else:
return seq[:max_chars]
ancestral_alleles_file = ancestral_alleles_prefix + build
ancestral_alleles_tarfile = ancestral_alleles_file + ".tar.gz"
if not os.path.isfile(ancestral_alleles_tarfile):
## Large download: make sure you have space
print(f"Downloading large tarball of sequences from {ancestral_alleles_tarfile}")
urlretrieve(
ancestral_alleles_tarfile,
ancestral_alleles_tarfile,
)
with tarfile.open(ancestral_alleles_tarfile, "r:gz") as tar:
filehandle=tar.extractfile(os.path.join(
ancestral_alleles_file,
f"{ancestral_alleles_prefix}{chromosome}.fa"
))
return get_fasta_seq(filehandle, max_chars)
def impose_CpG_mutations(
ts, CpG_transition_rate, CpG_transv_rate, nonCpG_transition_rate, nonCpG_transv_rate,
remove_refseq=True,
):
if ts.num_sites > 0:
raise ValueError("Expecting a ts with no sites / mutations")
if not ts.has_reference_sequence():
raise ValueError(
"Expecting a ts with a reference sequence (e.g. from "
"https://ftp.ensembl.org/pub/release-111/fasta/ancestral_alleles/)"
)
tables = ts.dump_tables()
print("Adding CpG mutations based on the reference sequence")
# Add ancestral states for any ACGT/acgt values
for pos, ancestral_allele in enumerate(ts.reference_sequence.data):
if ancestral_allele in "ACGTacgt":
tables.sites.add_row(position=pos, ancestral_state=ancestral_allele.upper())
used_regions = np.zeros(len(ts.reference_sequence.data), dtype=bool)
CpG_regions = np.zeros(len(ts.reference_sequence.data), dtype=bool)
for m in re.finditer(r"[ACGTacgt]+", ts.reference_sequence.data, re.IGNORECASE):
used_regions[m.start(): m.end()] = True
for m in re.finditer(r"(?:CG)+", ts.reference_sequence.data, re.IGNORECASE):
CpG_regions[m.start(): m.end()] = True
assert np.all(used_regions[CpG_regions]) # all CpG regions are in used_regions
nonCpG_regions = used_regions & ~CpG_regions
# find the switchpoints between regions
CpG_regions = np.where(np.diff(np.pad(CpG_regions, (1, 1))) != 0)[0]
assert len(CpG_regions) % 2 == 0
CpG_regions = CpG_regions.reshape((-1, 2))
nonCpG_regions = np.where(np.diff(np.pad(nonCpG_regions, (1, 1))) != 0)[0]
assert len(nonCpG_regions) % 2 == 0
nonCpG_regions = nonCpG_regions.reshape((-1, 2))
regions_and_rates = {
"CpG": (CpG_regions, [CpG_transition_rate , CpG_transv_rate]),
"nonCpG": (nonCpG_regions, [nonCpG_transition_rate , nonCpG_transv_rate])
}
models = {k: msprime.HKY(rate[0]/rate[1]) for k, (_, rate) in regions_and_rates.items()}
ratemaps = {}
for k, (regions, rates) in regions_and_rates.items():
pos = regions.flatten()
use = [np.sum(rates), 0]
if regions[0, 0] != 0:
pos = np.insert(pos, 0, 0)
use = [use[1], use[0]]
if regions[-1, 1] != ts.sequence_length:
pos = np.append(pos, [int(ts.sequence_length)])
ratemaps[k] = msprime.RateMap(position=pos, rate=np.tile(use, len(pos)//2)[0:(len(pos)-1)])
print("Adding CpG and non-CpG mutations separately, in regions of known ancestral state")
if remove_refseq:
tables.reference_sequence.data = ""
mutated_ts = tables.tree_sequence()
for k in ratemaps.keys():
mutated_ts = msprime.sim_mutations(mutated_ts, rate=ratemaps[k], model=models[k], random_seed = 123)
mutated_ts = mutated_ts.simplify()
print(
f"Added {mutated_ts.num_mutations} mutations: av rate =",
mutated_ts.trim().segregating_sites(mode="site") /
mutated_ts.trim().segregating_sites(mode="branch"),
"per bp per gen"
)
return mutated_ts, CpG_regions, nonCpG_regions
## Start simulation
def simulate_OOA_no_muts(samples, chromosome, sim_length):
refseq = get_refseq(chromosome=20, max_chars=sim_length)
start = len(refseq) - len(refseq.lstrip("Nn.*"))
end = len(refseq.rstrip("Nn.*"))
print(f"Simulating chr{chromosome} from position {start} to {end} without mutations")
species = stdpopsim.get_species("HomSap")
model = species.get_demographic_model("OutOfAfrica_3G09")
contig = species.get_contig(f"chr{chromosome}", mutation_rate=0, left=start, right=end)
engine = stdpopsim.get_engine("msprime")
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='.*mutation rate 0\.')
ts = engine.simulate(model, contig, samples, seed=123)
tables = ts.dump_tables()
# until https://github.com/popsim-consortium/stdpopsim/issues/1404 is fixed
# add the deleted region back in at the start, so the refseq lines up
if ts.first().num_edges > 0:
tables.edges.left = tables.edges.left + start
tables.edges.right = tables.edges.right + start
tables.sequence_length = tables.sequence_length + start
assert len(refseq) == tables.sequence_length
tables.reference_sequence.data = refseq
return tables.tree_sequence()
chrom = 20
samples = {"YRI": 10, "CEU": 190}
ts = simulate_OOA_no_muts(samples, chromosome=chrom, sim_length=4_000_000)
mutated_ts, CpGregions, nonCpGregions = impose_CpG_mutations(
ts, # Figures below from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3548427/table/T2/
CpG_transition_rate=1.12e-7,
CpG_transv_rate=9.59e-9,
nonCpG_transition_rate=6.18e-9,
nonCpG_transv_rate=3.76e-9,
) To test: print(sum(1 for s in mutated_ts.sites() if len(s.mutations) > 1),
f"/ {mutated_ts.num_sites} sites with >1 mutation")
nCpG = [0, 0]
for s in mutated_ts.sites():
if np.any((s.position >= CpGregions[:, 0]) & (s.position < CpGregions[:, 1])):
if len(s.mutations) > 0:
nCpG[0] += 1
if len(s.mutations) > 1:
nCpG[1] += 1
print(
f"{nCpG[0]} variable CpG sites ({nCpG[1]} with more than one mutation)") Giving:
|
@jeromekelleher suggested that the dinucleotide model would be a good way to get a version that has the correct dynamics. As @petrelharp points out, a dinucleotide model will miss half of the CpGs. Additionally it may be tedious converting the dinucleotide output into a sensible format: the Nevertheless, something like a dinucleotide model might be the only way to get an equialibrium distribution of CpG mutations. If we have ancestral states and a set of transition / transversion probabilities, then the recipe above should work fine, though. |
For reference, when running this on human chr 20 (GRCh38), with 64MB of a relatively large sample size OOA simulation (10,000 diploids)
So we have about 3.5x more non CpG than CpG sites. In smaller samples, we expected 50x more, and I suspect that as the sample size increases, the number of singletons will increase, and many of these will be CpGs. We have most (77%) of multiple-mutation sites in CpGs, and about 60-70% are recurrent mutations (presumably mostly C->T or T->C). However, the recurrent sites make up only about 0.9% of all sites (about 1.5% of sites have multiple mutations). The general conclusion is that even in relatively large samples, we only expect one or two percent of sites to be recurrent, of which more-or-less a majority will be CpGs, most of which will be recurrent mutations, but a reasonable minority will be trillelic. We can probably make a guess at the expected number of recurrent sites given that we know how many sites are triallellic. Click to see codedef simulate_OOA_no_muts(samples, chromosome, sim_length):
with open("homo_sapiens_ancestor_GRCh38/homo_sapiens_ancestor_{chromosome}.fa", "rb") as fh:
refseq = get_fasta_seq(fh)
start = len(refseq) - len(refseq.lstrip("Nn.*"))
end = len(refseq.rstrip("Nn.*"))
print(f"Simulating chr{chromosome} from position {start} to {end} without mutations")
species = stdpopsim.get_species("HomSap")
model = species.get_demographic_model("OutOfAfrica_3G09")
contig = species.get_contig(f"chr{chromosome}", mutation_rate=0, left=start, right=end)
engine = stdpopsim.get_engine("msprime")
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='.*mutation rate 0\.')
ts = engine.simulate(model, contig, samples, seed=123)
tables = ts.dump_tables()
# until https://github.com/popsim-consortium/stdpopsim/issues/1404 is fixed
# add the deleted region back in at the start, so the refseq lines up
if ts.first().num_edges > 0:
tables.edges.left = tables.edges.left + start
tables.edges.right = tables.edges.right + start
tables.sequence_length = tables.sequence_length + start
assert len(refseq) >=
tables.sequence_length = len(refseq)
tables.reference_sequence.data = refseq
return tables.tree_sequence()
# NB from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3548427/table/T2/ ,
# we expect about 2.583e9 / 48.8e6 == 50 x more non-CpG compared to CpG sites
def n_sites_in_regions(ts, regions):
intervals_iter = iter(regions)
curr_interval = next(intervals_iter)
mult = 0
recur = 0
newallele = 0
n = 0
for site in mutated_ts.sites():
while site.position >= curr_interval[1]:
curr_interval = next(intervals_iter)
if site.position >= curr_interval[0]:
if len(site.mutations) > 1:
mult += 1
if all(m.derived_state == site.mutations[0].derived_state for m in site.mutations):
recur += 1
if any(m.derived_state not in [site.ancestral_state, site.mutations[0].derived_state] for m in site.mutations):
newallele += 1
n += 1
return n, mult, recur, newallele
chrom = 20
samples = {"YRI": 10, "CEU": 190}
ts = simulate_OOA_no_muts(samples, chromosome=chrom, sim_length=4_000_000)
mutated_ts, CpGregions, nonCpGregions = impose_CpG_mutations(
ts, # Figures below from
CpG_transition_rate=1.12e-7,
CpG_transv_rate=9.59e-9,
nonCpG_transition_rate=6.18e-9,
nonCpG_transv_rate=3.76e-9,
)
for name, region in [("CpG", CpGregions), ("non CpG", nonCpGregions)]:
tot, multiple, recur, newallele = n_sites_in_regions(mutated_ts, region)
print(f"{tot} {name} sites, of which {multiple} have > 1 mut ({recur} are identical muts, {newallele} are not biallelic)") |
Elevated mutation rates at CpG dinucleotides are one of the major contributors to mutation rate variation in mammals. These can't easily be simulated by
sim_mutations
, but there are probably reasonable approximations if a reference sequence is defined.@petrelharp says, on slack:
The main time CpG is mentioned on the msprime GitHub repo is in #972, but only in passing, so I have opened this issue for people who are searching for the term in conjunction with
msprime
, and who might want to think about an implementation.The text was updated successfully, but these errors were encountered: