Skip to content

Commit

Permalink
fixed non-positive definite covariance matrix of multivariate normal …
Browse files Browse the repository at this point in the history
…distribution + drafted inplace and not inplace slicing + working on merging genomes
  • Loading branch information
jeffersonfparil committed Nov 13, 2024
1 parent 9925d6d commit 758efda
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 72 deletions.
194 changes: 131 additions & 63 deletions data/genotype.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,88 @@
import numpy as np
from typing import Self
from data.error import IncompatibleParameters
from data.error import IncorrectParameter, IncompatibleParameters


class Genomes:
"""
Genotype data
n: number of entries (an entry maybe an individual diploid genotype, or a tetraploid genotype or a pool of 50 diploid genotypes)
p: total number of loci
m: number of chromosomes
k: number of alleles per locus
entries: vector (nx0; str) of entry names
chromosomes: vector (px0; str) of chromosome names
positions: vector (px0; uint64) of positions per chromosome (i.e. starts at 1 per chromosome)
alleles: matrix (pxk; str) of allele names across p loci and k alleles
loci: vector (p*(k-1)x0; str) of loci names composed of the chromosome name, position, all the allele separated by pipes, and the specific allele
genotypes: matrix (nx(p*(k-1)); np.float64) of allele frequencies across n genotypes and p*(k-1) alleles,
where the k-1 alleles per locus are adjacent each other along the columns
mask: matrix (nx(p*(k-1)); np.bool) of boolean mask
"""

n: int
p: int
k: int
m: int
entries: np.ndarray
chromosomes: np.ndarray
positions: np.ndarray
alleles: np.ndarray
loci: np.ndarray
genotypes: np.ndarray
mask: np.ndarray

def __init__(
self: Self,
entries: np.ndarray,
chromosomes: np.ndarray,
positions: np.ndarray,
alleles: np.ndarray,
genotypes: np.ndarray,
entries: np.ndarray | None = None,
chromosomes: np.ndarray | None = None,
positions: np.ndarray | None = None,
alleles: np.ndarray | None = None,
genotypes: np.ndarray | None = None,
) -> None:
n: int = entries.shape[0]
if n != genotypes.shape[0]:
raise IncompatibleParameters
pk1: int = genotypes.shape[1]
k: int = alleles.shape[1]
if pk1 % (k - 1) > 0:
raise IncompatibleParameters
p: int = int(pk1 / (k - 1))
if (p != chromosomes.shape[0]) or (p != positions.shape[0]) or (p != alleles.shape[0]):
raise IncompatibleParameters
m: int = np.unique(ar=chromosomes).shape[0]
self.n = n
self.p = p
self.k = k
self.m = m
self.entries = entries
self.chromosomes = chromosomes
self.positions = positions
self.alleles = alleles
self.genotypes = genotypes
self.mask = np.ones((n, pk1)).astype(np.bool)
"""
Initialise Genomes with or without data
"""
if (
isinstance(entries, np.ndarray)
and isinstance(chromosomes, np.ndarray)
and isinstance(positions, np.ndarray)
and isinstance(alleles, np.ndarray)
and isinstance(genotypes, np.ndarray)
):
if (
(len(entries.shape) != 1)
or (len(chromosomes.shape) != 1)
or (len(positions.shape) != 1)
or (len(alleles.shape) != 2)
or (len(genotypes.shape) != 2)
):
raise IncorrectParameter
n: int = entries.shape[0]
if n != genotypes.shape[0]:
raise IncompatibleParameters
pk1: int = genotypes.shape[1]
k: int = alleles.shape[1]
if pk1 % (k - 1) > 0:
raise IncompatibleParameters
p: int = int(pk1 / (k - 1))
if (p != chromosomes.shape[0]) or (p != positions.shape[0]) or (p != alleles.shape[0]):
raise IncompatibleParameters
loci: np.ndarray = np.full((pk1,), fill_value='', dtype='<U256', order='C')
for i in range(p):
chr: str = chromosomes[i]
pos: str = str(positions[i])
all_alleles: str = '|'.join(alleles[i, :])
for j in range(k - 1):
ale: str = alleles[i, j]
loci[(i * (k - 1)) + j] = '\t'.join([chr, pos, all_alleles, ale])
self.entries = entries
self.loci = loci
self.genotypes = genotypes
self.mask = np.ones((n, pk1)).astype(np.bool)
else:
self.entries = np.array([''])
self.loci = np.array([''])
self.genotypes = np.array([[0.0]])
self.mask = np.array([[False]])
return None

def __str__(self: Self) -> str:
"""
Print the contents of the class
"""
info: str = (
'{\n\tn: '
+ str(self.n)
+ '\n\t'
+ 'p: '
+ str(self.p)
+ '\n\t'
+ 'k: '
+ str(self.k)
+ '\n\t'
+ 'm: '
+ str(self.m)
+ '\n\t'
+ 'entries: '
'{\n\tentries: '
+ str(self.entries)
+ '\n\t'
+ 'chromosomes: '
+ str(self.chromosomes)
+ '\n\t'
+ 'positions: '
+ str(self.positions)
+ '\n\t'
+ 'alleles: '
+ str(self.alleles)
+ 'loci: '
+ str(self.loci)
+ '\n\t'
+ 'genotypes: '
+ str(self.genotypes)
Expand All @@ -95,10 +93,80 @@ def __str__(self: Self) -> str:
)
return info

def __hash__(self: Self) -> int:
"""
Hash all the fields in the class
"""
hash_int: int = hash(str(self.entries))
hash_int += hash(str(self.loci))
hash_int += hash(str(self.genotypes))
hash_int += hash(str(self.mask))
return hash_int

def __eq__(self: Self, other) -> bool:
"""
Equality test using hashes
"""
return hash(self) == hash(other)

def slice_inplace(self: Self) -> None:
"""
Slice across all rows and columns with at least on True along both axes in self.mask mutating self
"""
idx_entries: list[bool] = (self.mask.sum(axis=1) > 0).tolist()
idx_loci: list[bool] = (self.mask.sum(axis=0) > 0).tolist()
self.entries = self.entries[idx_entries]
self.loci = self.loci[idx_loci]
idx_rows: list[int] = [i for i, j in enumerate(idx_entries) if j]
idx_cols: list[int] = [i for i, j in enumerate(idx_loci) if j]
self.genotypes = self.genotypes[idx_rows, :][:, idx_cols]
self.mask = self.mask[idx_rows, :][:, idx_cols]
return None

def slice(self: Self) -> 'Genomes':
"""
Slice across all rows and columns with at least on True along both axes in self.mask returning a new a Genomes object
"""
idx_entries: list[bool] = (self.mask.sum(axis=1) > 0).tolist()
idx_loci: list[bool] = (self.mask.sum(axis=0) > 0).tolist()
out: Genomes = Genomes()
out.entries = self.entries[idx_entries]
out.loci = self.loci[idx_loci]
idx_rows: list[int] = [i for i, j in enumerate(idx_entries) if j]
idx_cols: list[int] = [i for i, j in enumerate(idx_loci) if j]
out.genotypes = self.genotypes[idx_rows, :][:, idx_cols]
out.mask = self.mask[idx_rows, :][:, idx_cols]
return out

def merge_genotype(self: Self, other: Self, conflict_resolution: tuple[float, float]) -> Self:
entries_intersection: np.ndarray = np.intersect1d(self.entries, other.entries)
loci_intersection: np.ndarray = np.intersect1d(self.loci, other.loci)
print(entries_intersection)
print(loci_intersection)
entries: np.ndarray = np.unique(np.concatenate(self.entries, other.entries))
loci: np.ndarray = np.unique(np.concatenate(self.loci, other.loci))
print(entries)
print(loci)
print(conflict_resolution)

return self


def test_genotype():
    """
    Check that in-place slicing and copy-returning slicing of a simulated
    Genomes object give equal results
    """
    # from data.genotype import *
    import copy
    from data.simulation import simulate

    genomes, _ = simulate()
    print(genomes)
    assert isinstance(genomes, Genomes)
    # Select rows 1-5 and columns 10-19 only.
    genomes.mask[:, :] = False
    genomes.mask[1:6, 10:20] = True
    # Slice a deep copy in place so that the original stays available for slice().
    sliced_genomes = copy.deepcopy(genomes)
    sliced_genomes.slice_inplace()
    sliced_clone = genomes.slice()
    assert sliced_genomes == sliced_clone
    print(sliced_genomes)
    print(sliced_clone)

    # NOTE(review): the two assignments below are unused here; they look like scratch set-up for merge_genotype - confirm.
    self = genomes
    other_genomes = sliced_genomes
25 changes: 23 additions & 2 deletions data/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,13 @@ def simulate_loci_identities(
Raises
------
- IncorrectParameter
- IncompatibleParameters
- RandomSamplingError
- LogicError
"""
if (len(lengths_per_chrom.shape) != 1) or (len(n_loci_per_chrom.shape) != 1):
raise IncorrectParameter
if lengths_per_chrom.shape[0] != n_loci_per_chrom.shape[0]:
raise IncompatibleParameters
p: int = n_loci_per_chrom.sum()
Expand Down Expand Up @@ -187,9 +190,12 @@ def simulate_genotypes(
Raises
------
- IncorrectParameter
- LogicError
- AlleleFreqOverUnderflow
"""
if (len(chromosomes.shape) != 1) or (len(positions.shape) != 1) or (len(alleles.shape) != 2):
raise IncorrectParameter
if (chromosomes.shape[0] != positions.shape[0]) or (chromosomes.shape[0] != alleles.shape[0]):
raise LogicError
uniq_chrom, n_loci_per_chrom = np.unique(chromosomes, return_counts=True)
Expand Down Expand Up @@ -309,11 +315,14 @@ def simulate_allele_effects(
Raises
------
- IncorrectParameter
- IncompatibleParameters
- IncorrectParameter
- FractionOverUnderflow
- RandomSamplingError
"""
if len(genotypes.shape) != 2:
raise IncorrectParameter
pk1: int = genotypes.shape[1]
if neff > pk1:
raise IncompatibleParameters
Expand Down Expand Up @@ -357,12 +366,21 @@ def simulate_allele_effects(
for j in range(i + 1, ntraits):
cor = np.random.uniform(low=min_corr, high=max_corr, size=1)[0]
cov[i, j], cov[j, i] = cor, cor
# Make sure that the variance-covariance matrix is symmetric positive semidefinite; otherwise it does not make sense in the context of a multivariate normal distribution.
# We will simulate negative correlations after sampling the effects.
try:
allele_effects[indexes, range(ntraits)] = np.random.multivariate_normal(
mean=mean, cov=cov, size=neff
sampled_effects: np.ndarray = np.random.multivariate_normal(
mean=mean, cov=np.abs(cov), size=neff
)
except ValueError as err:
raise RandomSamplingError from err
# Insert the effects while simulating some negative correlations
for j in range(sampled_effects.shape[1]):
coef: float = 1.00
if np.random.choice(a=cov[j, :], size=1) < 0.0:
coef *= -1.00
for i in range(sampled_effects.shape[0]):
allele_effects[indexes[i, j], j] = coef * sampled_effects[i, j]
return allele_effects


Expand All @@ -385,10 +403,13 @@ def simulate_phenotypes(
Raises
------
- IncorrectParameter
- IncompatibleParameters
- FractionOverUnderflow
- RandomSamplingError
"""
if (len(genotypes.shape) != 2) or (len(allele_effects.shape) != 2):
raise IncorrectParameter
n: int = genotypes.shape[0]
pk1: int = genotypes.shape[1]
ntraits: int = len(heritabilities)
Expand Down
21 changes: 14 additions & 7 deletions models/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,26 @@
from data.phenotype import Phenomes
import numpy as np
import sklearn.linear_model as lm

# from sklearn.linear_model import RidgeCV, LassoCV
# from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error
# from sklearn.model_selection import RepeatedKFold, KFold, cross_val_predict, cross_validate, KFold, LeaveOneOut, GridSearchCV, RepeatedKFold
# from sklearn.neural_network import MLPRegressor


def ridge(genomes: Genomes, phenomes: Phenomes, penalisations: np.ndarray) -> bool:
    """
    Draft ridge regression of one trait on the genotype matrix (work in progress)

    Fits sklearn's RidgeCV across a grid of penalisation values, then computes
    the mean squared error and R^2 of the fitted values against the observed
    phenotypes. The metrics are not returned yet.

    NOTE(review): all three arguments are currently overwritten inside the
    function with simulated data and a fixed penalisation grid, so the values
    passed by the caller are ignored - confirm this is temporary.

    Returns
    -------
    - always False for now
    """
    from data.simulation import simulate

    genomes, phenomes = simulate()
    # Penalisation grid 1.5**44 down to 1.5**-40, i.e. in decreasing order.
    penalisations = np.array([1.5**i for i in range(-40, 45)][::-1])

    # Use the last column of the phenotype matrix as the response.
    i = -1
    X: np.ndarray = genomes.genotypes
    y: np.ndarray = phenomes.phenotypes[:, i]
    model: lm.RidgeCV = lm.RidgeCV(alphas=penalisations).fit(X=X, y=y)
    # TODO: expose the fit (model.score(X, y)) and the estimated effects (model.coef_).

    y_hat: np.ndarray = model.predict(X)
    # sklearn metrics take (y_true, y_pred); R^2 is not symmetric, so the order matters.
    mean_squared_error(y_true=y, y_pred=y_hat)
    r2_score(y_true=y, y_pred=y_hat)
    return False

0 comments on commit 758efda

Please sign in to comment.