fixed non-positive definite covariance matrix of multivariate normal …

…distribution + drafted inplace and not inplace slicing + working on merging genomes
jeffersonfparil · Nov 13, 2024 · 758efda · 758efda
1 parent 9925d6d
commit 758efda
Show file tree

Hide file tree

Showing 3 changed files with 168 additions and 72 deletions.
diff --git a/data/genotype.py b/data/genotype.py
@@ -1,90 +1,88 @@
 import numpy as np
 from typing import Self
-from data.error import IncompatibleParameters
+from data.error import IncorrectParameter, IncompatibleParameters
 
 
 class Genomes:
     """
     Genotype data
-    n: number of entries (an entry maybe an individual diploid genotype, or a tetraploid genotype or a pool of 50 diploid genotypes)
-    p: total number of loci
-    m: number of chromosomes
-    k: number of alleles per locus
     entries: vector (nx0; str) of entry names
-    chromosomes: vector (px0; str) of chromosome names
-    positions: vector (px0; uint64) of positions per chromosome (i.e. starts at 1 per chromosome)
-    alleles: matrix (pxk; str) of allele names across p loci and k alleles
+    loci: vector (p*(k-1)x0; str) of loci names composed of the chromosome name, position, all the alleles separated by pipes, and the specific allele
     genotypes: matrix (nx(p*(k-1)); np.float64) of allele frequencies across n genotypes and p*(k-1) alleles,
             where the k-1 alleles per locus are adjacent each other along the columns
     mask: matrix (nx(p*(k-1)); np.bool) of boolean mask
     """
 
-    n: int
-    p: int
-    k: int
-    m: int
     entries: np.ndarray
-    chromosomes: np.ndarray
-    positions: np.ndarray
-    alleles: np.ndarray
+    loci: np.ndarray
     genotypes: np.ndarray
     mask: np.ndarray
 
     def __init__(
         self: Self,
-        entries: np.ndarray,
-        chromosomes: np.ndarray,
-        positions: np.ndarray,
-        alleles: np.ndarray,
-        genotypes: np.ndarray,
+        entries: np.ndarray | None = None,
+        chromosomes: np.ndarray | None = None,
+        positions: np.ndarray | None = None,
+        alleles: np.ndarray | None = None,
+        genotypes: np.ndarray | None = None,
     ) -> None:
-        n: int = entries.shape[0]
-        if n != genotypes.shape[0]:
-            raise IncompatibleParameters
-        pk1: int = genotypes.shape[1]
-        k: int = alleles.shape[1]
-        if pk1 % (k - 1) > 0:
-            raise IncompatibleParameters
-        p: int = int(pk1 / (k - 1))
-        if (p != chromosomes.shape[0]) or (p != positions.shape[0]) or (p != alleles.shape[0]):
-            raise IncompatibleParameters
-        m: int = np.unique(ar=chromosomes).shape[0]
-        self.n = n
-        self.p = p
-        self.k = k
-        self.m = m
-        self.entries = entries
-        self.chromosomes = chromosomes
-        self.positions = positions
-        self.alleles = alleles
-        self.genotypes = genotypes
-        self.mask = np.ones((n, pk1)).astype(np.bool)
+        """
+        Initialise Genomes with or without data
+        """
+        if (
+            isinstance(entries, np.ndarray)
+            and isinstance(chromosomes, np.ndarray)
+            and isinstance(positions, np.ndarray)
+            and isinstance(alleles, np.ndarray)
+            and isinstance(genotypes, np.ndarray)
+        ):
+            if (
+                (len(entries.shape) != 1)
+                or (len(chromosomes.shape) != 1)
+                or (len(positions.shape) != 1)
+                or (len(alleles.shape) != 2)
+                or (len(genotypes.shape) != 2)
+            ):
+                raise IncorrectParameter
+            n: int = entries.shape[0]
+            if n != genotypes.shape[0]:
+                raise IncompatibleParameters
+            pk1: int = genotypes.shape[1]
+            k: int = alleles.shape[1]
+            if pk1 % (k - 1) > 0:
+                raise IncompatibleParameters
+            p: int = int(pk1 / (k - 1))
+            if (p != chromosomes.shape[0]) or (p != positions.shape[0]) or (p != alleles.shape[0]):
+                raise IncompatibleParameters
+            loci: np.ndarray = np.full((pk1,), fill_value='', dtype='<U256', order='C')
+            for i in range(p):
+                chr: str = chromosomes[i]
+                pos: str = str(positions[i])
+                all_alleles: str = '|'.join(alleles[i, :])
+                for j in range(k - 1):
+                    ale: str = alleles[i, j]
+                    loci[(i * (k - 1)) + j] = '\t'.join([chr, pos, all_alleles, ale])
+            self.entries = entries
+            self.loci = loci
+            self.genotypes = genotypes
+            self.mask = np.ones((n, pk1)).astype(np.bool)
+        else:
+            self.entries = np.array([''])
+            self.loci = np.array([''])
+            self.genotypes = np.array([[0.0]])
+            self.mask = np.array([[False]])
+        return None
 
     def __str__(self: Self) -> str:
+        """
+        Print the contents of the class
+        """
         info: str = (
-            '{\n\tn: '
-            + str(self.n)
-            + '\n\t'
-            + 'p: '
-            + str(self.p)
-            + '\n\t'
-            + 'k: '
-            + str(self.k)
-            + '\n\t'
-            + 'm: '
-            + str(self.m)
-            + '\n\t'
-            + 'entries: '
+            '{\n\tentries: '
             + str(self.entries)
             + '\n\t'
-            + 'chromosomes: '
-            + str(self.chromosomes)
-            + '\n\t'
-            + 'positions: '
-            + str(self.positions)
-            + '\n\t'
-            + 'alleles: '
-            + str(self.alleles)
+            + 'loci: '
+            + str(self.loci)
             + '\n\t'
             + 'genotypes: '
             + str(self.genotypes)
@@ -95,10 +93,80 @@ def __str__(self: Self) -> str:
         )
         return info
 
+    def __hash__(self: Self) -> int:
+        """
+        Hash all the fields in the class
+        """
+        hash_int: int = hash(str(self.entries))
+        hash_int += hash(str(self.loci))
+        hash_int += hash(str(self.genotypes))
+        hash_int += hash(str(self.mask))
+        return hash_int
+
+    def __eq__(self: Self, other) -> bool:
+        """
+        Equality test using hashes
+        """
+        return hash(self) == hash(other)
+
+    def slice_inplace(self: Self) -> None:
+        """
+        Slice across all rows and columns with at least one True along both axes in self.mask, mutating self
+        """
+        idx_entries: list[bool] = (self.mask.sum(axis=1) > 0).tolist()
+        idx_loci: list[bool] = (self.mask.sum(axis=0) > 0).tolist()
+        self.entries = self.entries[idx_entries]
+        self.loci = self.loci[idx_loci]
+        idx_rows: list[int] = [i for i, j in enumerate(idx_entries) if j]
+        idx_cols: list[int] = [i for i, j in enumerate(idx_loci) if j]
+        self.genotypes = self.genotypes[idx_rows, :][:, idx_cols]
+        self.mask = self.mask[idx_rows, :][:, idx_cols]
+        return None
+
+    def slice(self: Self) -> 'Genomes':
+        """
+        Slice across all rows and columns with at least one True along both axes in self.mask, returning a new Genomes object
+        """
+        idx_entries: list[bool] = (self.mask.sum(axis=1) > 0).tolist()
+        idx_loci: list[bool] = (self.mask.sum(axis=0) > 0).tolist()
+        out: Genomes = Genomes()
+        out.entries = self.entries[idx_entries]
+        out.loci = self.loci[idx_loci]
+        idx_rows: list[int] = [i for i, j in enumerate(idx_entries) if j]
+        idx_cols: list[int] = [i for i, j in enumerate(idx_loci) if j]
+        out.genotypes = self.genotypes[idx_rows, :][:, idx_cols]
+        out.mask = self.mask[idx_rows, :][:, idx_cols]
+        return out
+
+    def merge_genotype(self: Self, other: Self, conflict_resolution: tuple[float, float]) -> Self:
+        entries_intersection: np.ndarray = np.intersect1d(self.entries, other.entries)
+        loci_intersection: np.ndarray = np.intersect1d(self.loci, other.loci)
+        print(entries_intersection)
+        print(loci_intersection)
+        entries: np.ndarray = np.unique(np.concatenate(self.entries, other.entries))
+        loci: np.ndarray = np.unique(np.concatenate(self.loci, other.loci))
+        print(entries)
+        print(loci)
+        print(conflict_resolution)
+
+        return self
+
 
 def test_genotype():
+    # from data.genotype import *
     from data.simulation import simulate
+    import copy
 
     genomes, _ = simulate()
-    print(genomes)
     assert isinstance(genomes, Genomes)
+    genomes.mask[:, :] = False
+    genomes.mask[1:6, 10:20] = True
+    sliced_genomes = copy.deepcopy(genomes)
+    sliced_genomes.slice_inplace()
+    sliced_clone = genomes.slice()
+    assert sliced_genomes == sliced_clone
+    print(sliced_genomes)
+    print(sliced_clone)
+
+    self = genomes
+    other_genomes = sliced_genomes
diff --git a/data/simulation.py b/data/simulation.py
@@ -94,10 +94,13 @@ def simulate_loci_identities(
 
     Raises
     ------
+    - IncorrectParameter
     - IncompatibleParameters
     - RandomSamplingError
     - LogicError
     """
+    if (len(lengths_per_chrom.shape) != 1) or (len(n_loci_per_chrom.shape) != 1):
+        raise IncorrectParameter
     if lengths_per_chrom.shape[0] != n_loci_per_chrom.shape[0]:
         raise IncompatibleParameters
     p: int = n_loci_per_chrom.sum()
@@ -187,9 +190,12 @@ def simulate_genotypes(
 
     Raises
     ------
+    - IncorrectParameter
     - LogicError
     - AlleleFreqOverUnderflow
     """
+    if (len(chromosomes.shape) != 1) or (len(positions.shape) != 1) or (len(alleles.shape) != 2):
+        raise IncorrectParameter
     if (chromosomes.shape[0] != positions.shape[0]) or (chromosomes.shape[0] != alleles.shape[0]):
         raise LogicError
     uniq_chrom, n_loci_per_chrom = np.unique(chromosomes, return_counts=True)
@@ -309,11 +315,14 @@ def simulate_allele_effects(
 
     Raises
     ------
+    - IncorrectParameter
     - IncompatibleParameters
     - IncorrectParameter
     - FractionOverUnderflow
     - RandomSamplingError
     """
+    if len(genotypes.shape) != 2:
+        raise IncorrectParameter
     pk1: int = genotypes.shape[1]
     if neff > pk1:
         raise IncompatibleParameters
@@ -357,12 +366,21 @@ def simulate_allele_effects(
         for j in range(i + 1, ntraits):
             cor = np.random.uniform(low=min_corr, high=max_corr, size=1)[0]
             cov[i, j], cov[j, i] = cor, cor
+    # Make sure that the variance-covariance matrix is symmetric positive semidefinite, or else it does not make sense in the context of multivariate normal distribution.
+    # We will simulate negative correlations after sampling the effects.
     try:
-        allele_effects[indexes, range(ntraits)] = np.random.multivariate_normal(
-            mean=mean, cov=cov, size=neff
+        sampled_effects: np.ndarray = np.random.multivariate_normal(
+            mean=mean, cov=np.abs(cov), size=neff
         )
     except ValueError as err:
         raise RandomSamplingError from err
+    # Insert the effects while simulating some negative correlations
+    for j in range(sampled_effects.shape[1]):
+        coef: float = 1.00
+        if np.random.choice(a=cov[j, :], size=1) < 0.0:
+            coef *= -1.00
+        for i in range(sampled_effects.shape[0]):
+            allele_effects[indexes[i, j], j] = coef * sampled_effects[i, j]
     return allele_effects
 
 
@@ -385,10 +403,13 @@ def simulate_phenotypes(
 
     Raises
     ------
+    - IncorrectParameter
     - IncompatibleParameters
     - FractionOverUnderflow
     - RandomSamplingError
     """
+    if (len(genotypes.shape) != 2) or (len(allele_effects.shape) != 2):
+        raise IncorrectParameter
     n: int = genotypes.shape[0]
     pk1: int = genotypes.shape[1]
     ntraits: int = len(heritabilities)

diff --git a/models/linear.py b/models/linear.py
@@ -2,19 +2,26 @@
 from data.phenotype import Phenomes
 import numpy as np
 import sklearn.linear_model as lm
+
 # from sklearn.linear_model import RidgeCV, LassoCV
-# from sklearn.metrics import r2_score, mean_squared_error
+from sklearn.metrics import r2_score, mean_squared_error
 # from sklearn.model_selection import RepeatedKFold, KFold, cross_val_predict, cross_validate, KFold, LeaveOneOut, GridSearchCV, RepeatedKFold
 # from sklearn.neural_network import MLPRegressor
 
+
 def ridge(genomes: Genomes, phenomes: Phenomes, penalisations: np.ndarray) -> bool:
     from data.simulation import simulate
+
     genomes, phenomes = simulate()
-    penalisations = np.array([1.5**i for i in range(-40,45)][::-1])
-    
-    i = 0
+    penalisations = np.array([1.5**i for i in range(-40, 45)][::-1])
+
+    i = -1
     X: np.ndarray = genomes.genotypes
-    y: np.ndarray = phenomes.phenotypes[:,i]
+    y: np.ndarray = phenomes.phenotypes[:, i]
     model: lm.RidgeCV = lm.RidgeCV(alphas=penalisations).fit(X=X, y=y)
-    model.score(X, y)
-    return False
+    # r2_fit = model.score(X, y)
+    # bhat = model.coef_
+
+    mean_squared_error(model.predict(X), y)
+    r2_score(model.predict(X), y)
+    return False