From 34dcec46ef302c4d0c2c3d9c15c307184b29722a Mon Sep 17 00:00:00 2001 From: daikitag <48062118+daikitag@users.noreply.github.com> Date: Sat, 3 Feb 2024 18:07:41 +0000 Subject: [PATCH] CODE/DOC: normalise_phenotypes Add `normalise_phenotypes` function in tstrait. --- docs/api.md | 16 ++++- docs/quick-start.md | 38 +++++++++++ tests/test_simulate_phenotype.py | 109 ++++++++++++++++++++++++++++++- tstrait/__init__.py | 2 + tstrait/simulate_phenotype.py | 56 ++++++++++++++++ 5 files changed, 219 insertions(+), 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index 2ddc46c..beec100 100644 --- a/docs/api.md +++ b/docs/api.md @@ -38,6 +38,14 @@ This page provides a detailed explanation of all public tstrait objects and func TraitModelMultivariateNormal ``` +### Postprocessing functions + +```{eval-rst} +.. autosummary:: + + normalise_phenotypes +``` + ### Result data classes ```{eval-rst} @@ -100,8 +108,14 @@ This page provides a detailed explanation of all public tstrait objects and func .. autoclass:: tstrait.TraitModelMultivariateNormal ``` +### Postprocessing functions + +```{eval-rst} +.. autofunction:: tstrait.normalise_phenotypes +``` + ### Result data classes ```{eval-rst} .. autoclass:: tstrait.PhenotypeResult -``` +``` \ No newline at end of file diff --git a/docs/quick-start.md b/docs/quick-start.md index 2af9e68..7fe6eaf 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -148,3 +148,41 @@ plt.show() The environmental noise in tstrait follows a normal distribution. Please see [](phenotype_model) for mathematical details on the phenotype model and [](effect_size_dist) for details on specifying the effect size distribution. + +(normalise_phenotype)= + +## Normalise Phenotype + +The simulated phenotypes can be scaled by using the {func}`normalise_phenotypes` function. The function +will first normalise the phenotype by subtracting the mean of the input phenotype from each +value and dividing it by the standard deviation of the input phenotype. 
+Afterwards, it scales the normalised phenotype based on the mean and variance input. +The output of {func}`normalise_phenotypes` is a {class}`pandas.DataFrame` object with the scaled phenotypes. + +An example usage of this function is shown below: + +```{code-cell} + +mean = 0 +var = 1 +normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var) +normalised_df.head() +``` + +We see that the mean and variance of the normalised phenotype are 0 and 1, as we have indicated them +as inputs of {func}`normalise_phenotypes`. + +```{code-cell} + +print("Mean of the normalised phenotype:", mean) +print("Variance of the normalised phenotype:", var) +``` + +The distribution of the normalised phenotype is shown below. + +```{code-cell} + +plt.hist(normalised_df["phenotype"], bins=40) +plt.title("Normalised Phenotype") +plt.show() +``` diff --git a/tests/test_simulate_phenotype.py b/tests/test_simulate_phenotype.py index b489821..7d6deb7 100644 --- a/tests/test_simulate_phenotype.py +++ b/tests/test_simulate_phenotype.py @@ -7,7 +7,7 @@ @pytest.fixture(scope="class") def sample_ts(): - ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1) + ts = msprime.sim_ancestry(1000, sequence_length=100_000, random_seed=1) ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) return ts @@ -221,3 +221,110 @@ def test_causal_sites_multivariate(self, sample_ts, causal_sites): pd.testing.assert_frame_equal(result.trait, trait_df) pd.testing.assert_frame_equal(result.phenotype, phenotype_df) + + +class TestNormalise: + def test_output(self, sample_ts): + mean = 2 + var = 4 + model = tstrait.trait_model(distribution="normal", mean=2, var=6) + sim_result = tstrait.sim_phenotype( + ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1 + ) + phenotype_df = sim_result.phenotype + normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var) + phenotype_array = normalised_df["phenotype"].values + 
np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2) + np.testing.assert_almost_equal(np.var(phenotype_array), var, decimal=2) + pd.testing.assert_series_equal( + normalised_df["trait_id"], phenotype_df["trait_id"] + ) + pd.testing.assert_series_equal( + normalised_df["individual_id"], phenotype_df["individual_id"] + ) + + num_ind = sample_ts.num_individuals + assert len(normalised_df) == num_ind + assert normalised_df.shape[1] == 3 + assert list(normalised_df.columns) == [ + "individual_id", + "trait_id", + "phenotype", + ] + + def test_default(self, sample_ts): + mean = 0 + var = 1 + model = tstrait.trait_model(distribution="normal", mean=2, var=6) + sim_result = tstrait.sim_phenotype( + ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1 + ) + phenotype_df = sim_result.phenotype + normalised_df = tstrait.normalise_phenotypes(phenotype_df) + phenotype_array = normalised_df["phenotype"].values + np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2) + np.testing.assert_almost_equal(np.var(phenotype_array), var, decimal=2) + pd.testing.assert_series_equal( + normalised_df["trait_id"], phenotype_df["trait_id"] + ) + pd.testing.assert_series_equal( + normalised_df["individual_id"], phenotype_df["individual_id"] + ) + + num_ind = sample_ts.num_individuals + assert len(normalised_df) == num_ind + assert normalised_df.shape[1] == 3 + assert list(normalised_df.columns) == [ + "individual_id", + "trait_id", + "phenotype", + ] + + def test_column(self, sample_ts): + model = tstrait.trait_model(distribution="normal", mean=2, var=6) + sim_result = tstrait.sim_phenotype( + ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1 + ) + phenotype_df = sim_result.phenotype + with pytest.raises( + ValueError, match="columns must be included in phenotype_df dataframe" + ): + tstrait.normalise_phenotypes(phenotype_df[["trait_id", "individual_id"]]) + + with pytest.raises( + ValueError, match="columns must be included in 
phenotype_df dataframe" + ): + tstrait.normalise_phenotypes(phenotype_df[["trait_id", "phenotype"]]) + + with pytest.raises( + ValueError, match="columns must be included in phenotype_df dataframe" + ): + tstrait.normalise_phenotypes(phenotype_df[["phenotype", "individual_id"]]) + + @pytest.mark.parametrize("var", [0, -1]) + def test_negative_var(self, sample_ts, var): + model = tstrait.trait_model(distribution="normal", mean=2, var=6) + sim_result = tstrait.sim_phenotype( + ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1 + ) + phenotype_df = sim_result.phenotype + + with pytest.raises(ValueError, match="Variance must be greater than 0."): + tstrait.normalise_phenotypes(phenotype_df, var=var) + + def test_pleiotropy(self, sample_ts): + mean = 0 + var = 1 + model = tstrait.trait_model( + distribution="multi_normal", mean=np.zeros(2), cov=np.identity(2) + ) + sim_result = tstrait.sim_phenotype( + ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1 + ) + phenotype_df = sim_result.phenotype + normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var) + grouped = normalised_df.groupby(["trait_id"])[["phenotype"]] + mean_array = grouped.mean().values.T[0] + var_array = grouped.var().values.T[0] + np.testing.assert_almost_equal(mean_array, np.zeros(2), decimal=2) + np.testing.assert_almost_equal(var_array, np.ones(2), decimal=2) diff --git a/tstrait/__init__.py b/tstrait/__init__.py index f30f0c4..d7b0f70 100644 --- a/tstrait/__init__.py +++ b/tstrait/__init__.py @@ -13,6 +13,7 @@ from .simulate_phenotype import ( PhenotypeResult, sim_phenotype, + normalise_phenotypes, ) # noreorder from .trait_model import ( trait_model, @@ -34,6 +35,7 @@ __all__ = [ "__version__", "sim_trait", + "normalise_phenotypes", "PhenotypeResult", "sim_phenotype", "trait_model", diff --git a/tstrait/simulate_phenotype.py b/tstrait/simulate_phenotype.py index 83170a0..1f56918 100644 --- a/tstrait/simulate_phenotype.py +++ 
def normalise_phenotypes(phenotype_df, mean=0, var=1, ddof=1):
    """Normalise phenotype dataframe.

    The phenotypes of each trait (rows sharing the same ``trait_id``) are
    first standardised by subtracting the mean and dividing by the standard
    deviation of that trait's phenotypes, and are then rescaled so that they
    have the requested mean and variance.

    Parameters
    ----------
    phenotype_df : pandas.DataFrame
        Phenotype dataframe.
    mean : float, default 0
        Mean of the resulting phenotype.
    var : float, default 1
        Variance of the resulting phenotype.
    ddof : int, default 1
        Delta degrees of freedom used when computing the standard deviation
        of the input phenotype. The divisor is ``N - ddof``, where ``N`` is
        the number of rows of a trait. The default matches
        :meth:`pandas.Series.std`; use ``ddof=0`` to make the population
        variance (as computed by :func:`numpy.var`) equal to `var`.

    Returns
    -------
    pandas.DataFrame
        Dataframe with normalised phenotype.

    Raises
    ------
    ValueError
        If `var` <= 0.
    ValueError
        If the phenotypes of any trait have a standard deviation that is
        zero or undefined (e.g. constant phenotypes, or not more than `ddof`
        rows), since they cannot be standardised.

    Notes
    -----
    The following columns must be included in `phenotype_df`:

    * **trait_id**: Trait ID.
    * **individual_id**: Individual ID.
    * **phenotype**: Simulated phenotypes.

    The dataframe output has the following columns:

    * **trait_id**: Trait ID inside the phenotype_df input.
    * **individual_id**: Individual ID inside the phenotype_df input.
    * **phenotype**: Normalised phenotype.

    The normalised phenotypes are returned in a new dataframe; this function
    does not assign into `phenotype_df`.

    Examples
    --------
    See :ref:`normalise_phenotype` section for worked examples.
    """
    # Validate the cheap scalar argument before touching the dataframe.
    if var <= 0:
        raise ValueError("Variance must be greater than 0.")
    # `_check_dataframe` is imported at module level from `.base` (and `np`
    # is the module-level numpy import); it raises if a required column is
    # missing from the input.
    phenotype_df = _check_dataframe(
        phenotype_df, ["individual_id", "trait_id", "phenotype"], "phenotype_df"
    )
    grouped = phenotype_df.groupby("trait_id")["phenotype"]

    # A zero or NaN standard deviation would silently turn every phenotype of
    # that trait into NaN/inf, so fail loudly instead. NaN compares False
    # against 0 and is therefore caught by the same test.
    group_std = grouped.std(ddof=ddof)
    if not (group_std > 0).all():
        raise ValueError(
            "Standard deviation of the phenotype must be greater than 0 for "
            "every trait in phenotype_df."
        )

    standardised = grouped.transform(
        lambda x: (x - x.mean()) / x.std(ddof=ddof)
    )
    scaled = standardised * np.sqrt(var) + mean

    # `assign` builds a new dataframe, so neither the caller's dataframe nor
    # a view of it is written to.
    return phenotype_df.assign(phenotype=scaled)