From 34dcec46ef302c4d0c2c3d9c15c307184b29722a Mon Sep 17 00:00:00 2001
From: daikitag <48062118+daikitag@users.noreply.github.com>
Date: Sat, 3 Feb 2024 18:07:41 +0000
Subject: [PATCH] CODE/DOC: normalise_phenotypes

Add `normalise_phenotypes` function to tstrait.
---
 docs/api.md                      |  16 ++++-
 docs/quick-start.md              |  38 +++++++++++
 tests/test_simulate_phenotype.py | 109 ++++++++++++++++++++++++++++++-
 tstrait/__init__.py              |   2 +
 tstrait/simulate_phenotype.py    |  56 ++++++++++++++++
 5 files changed, 219 insertions(+), 2 deletions(-)

diff --git a/docs/api.md b/docs/api.md
index 2ddc46c..beec100 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -38,6 +38,14 @@ This page provides a detailed explanation of all public tstrait objects and func
    TraitModelMultivariateNormal
 ```
 
+### Postprocessing functions
+
+```{eval-rst}
+.. autosummary::
+
+   normalise_phenotypes
+```
+
 ### Result data classes
 
 ```{eval-rst}
@@ -100,8 +108,14 @@ This page provides a detailed explanation of all public tstrait objects and func
 .. autoclass:: tstrait.TraitModelMultivariateNormal
 ```
 
+### Postprocessing functions
+
+```{eval-rst}
+.. autofunction:: tstrait.normalise_phenotypes
+```
+
 ### Result data classes
 
 ```{eval-rst}
 .. autoclass:: tstrait.PhenotypeResult
-```
+```
\ No newline at end of file
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 2af9e68..7fe6eaf 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -148,3 +148,41 @@ plt.show()
 The environmental noise in tstrait follows a normal distribution. Please see [](phenotype_model)
 for mathematical details on the phenotype model and [](effect_size_dist) for details on
 specifying the effect size distribution.
+
+(normalise_phenotype)=
+
+## Normalise Phenotype
+
+The simulated phenotypes can be scaled by using the {func}`normalise_phenotypes` function. The function
+will first normalise the phenotype by subtracting the mean of the input phenotype from each
+value and dividing it by the standard deviation of the input phenotype.
+Afterwards, it scales the normalised phenotype based on the mean and variance input.
+The output of {func}`normalise_phenotypes` is a {class}`pandas.DataFrame` object with the scaled phenotypes.
+
+An example usage of this function is shown below:
+
+```{code-cell}
+
+mean = 0
+var = 1
+normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
+normalised_df.head()
+```
+
+We see that the mean and variance of the normalised phenotype are 0 and 1, as we have indicated them
+as inputs of {func}`normalise_phenotypes`.
+
+```{code-cell}
+
+print("Mean of the normalised phenotype:", normalised_df["phenotype"].mean())
+print("Variance of the normalised phenotype:", normalised_df["phenotype"].var())
+```
+
+The distribution of the normalised phenotype is shown below.
+
+```{code-cell}
+
+plt.hist(normalised_df["phenotype"], bins=40)
+plt.title("Normalised Phenotype")
+plt.show()
+```
diff --git a/tests/test_simulate_phenotype.py b/tests/test_simulate_phenotype.py
index b489821..7d6deb7 100644
--- a/tests/test_simulate_phenotype.py
+++ b/tests/test_simulate_phenotype.py
@@ -7,7 +7,7 @@
 
 @pytest.fixture(scope="class")
 def sample_ts():
-    ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1)
+    ts = msprime.sim_ancestry(1000, sequence_length=100_000, random_seed=1)
     ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
     return ts
 
@@ -221,3 +221,110 @@ def test_causal_sites_multivariate(self, sample_ts, causal_sites):
 
         pd.testing.assert_frame_equal(result.trait, trait_df)
         pd.testing.assert_frame_equal(result.phenotype, phenotype_df)
+
+
+class TestNormalise:
+    def test_output(self, sample_ts):
+        mean, var = 2, 4
+        model = tstrait.trait_model(distribution="normal", mean=2, var=6)
+        sim_result = tstrait.sim_phenotype(
+            ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1
+        )
+        phenotype_df = sim_result.phenotype
+        normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
+        phenotype_array = normalised_df["phenotype"].values
+        np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2)
+        # ddof=1: normalise_phenotypes scales by the sample standard deviation.
+        np.testing.assert_almost_equal(np.var(phenotype_array, ddof=1), var, decimal=2)
+        pd.testing.assert_series_equal(
+            normalised_df["trait_id"], phenotype_df["trait_id"]
+        )
+        pd.testing.assert_series_equal(
+            normalised_df["individual_id"], phenotype_df["individual_id"]
+        )
+
+        num_ind = sample_ts.num_individuals
+        assert len(normalised_df) == num_ind
+        assert normalised_df.shape[1] == 3
+        assert list(normalised_df.columns) == [
+            "individual_id",
+            "trait_id",
+            "phenotype",
+        ]
+
+    def test_default(self, sample_ts):
+        mean, var = 0, 1
+        model = tstrait.trait_model(distribution="normal", mean=2, var=6)
+        sim_result = tstrait.sim_phenotype(
+            ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1
+        )
+        phenotype_df = sim_result.phenotype
+        normalised_df = tstrait.normalise_phenotypes(phenotype_df)
+        phenotype_array = normalised_df["phenotype"].values
+        np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2)
+        # ddof=1: normalise_phenotypes scales by the sample standard deviation.
+        np.testing.assert_almost_equal(np.var(phenotype_array, ddof=1), var, decimal=2)
+        pd.testing.assert_series_equal(
+            normalised_df["trait_id"], phenotype_df["trait_id"]
+        )
+        pd.testing.assert_series_equal(
+            normalised_df["individual_id"], phenotype_df["individual_id"]
+        )
+
+        num_ind = sample_ts.num_individuals
+        assert len(normalised_df) == num_ind
+        assert normalised_df.shape[1] == 3
+        assert list(normalised_df.columns) == [
+            "individual_id",
+            "trait_id",
+            "phenotype",
+        ]
+
+    def test_column(self, sample_ts):
+        model = tstrait.trait_model(distribution="normal", mean=2, var=6)
+        sim_result = tstrait.sim_phenotype(
+            ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1
+        )
+        phenotype_df = sim_result.phenotype
+        with pytest.raises(
+            ValueError, match="columns must be included in phenotype_df dataframe"
+        ):
+            tstrait.normalise_phenotypes(phenotype_df[["trait_id", "individual_id"]])
+
+        with pytest.raises(
+            ValueError, match="columns must be included in phenotype_df dataframe"
+        ):
+            tstrait.normalise_phenotypes(phenotype_df[["trait_id", "phenotype"]])
+
+        with pytest.raises(
+            ValueError, match="columns must be included in phenotype_df dataframe"
+        ):
+            tstrait.normalise_phenotypes(phenotype_df[["phenotype", "individual_id"]])
+
+    @pytest.mark.parametrize("var", [0, -1])
+    def test_negative_var(self, sample_ts, var):
+        model = tstrait.trait_model(distribution="normal", mean=2, var=6)
+        sim_result = tstrait.sim_phenotype(
+            ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1
+        )
+        phenotype_df = sim_result.phenotype
+
+        with pytest.raises(ValueError, match="Variance must be greater than 0."):
+            tstrait.normalise_phenotypes(phenotype_df, var=var)
+
+    def test_pleiotropy(self, sample_ts):
+        mean, var = 0, 1
+        model = tstrait.trait_model(
+            distribution="multi_normal", mean=np.zeros(2), cov=np.identity(2)
+        )
+        sim_result = tstrait.sim_phenotype(
+            ts=sample_ts, num_causal=100, model=model, h2=0.3, random_seed=1
+        )
+        phenotype_df = sim_result.phenotype
+        normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
+        grouped = normalised_df.groupby(["trait_id"])[["phenotype"]]
+        # Each trait is normalised separately, so check the moments per trait.
+        mean_array = grouped.mean().values.T[0]
+        var_array = grouped.var().values.T[0]
+        np.testing.assert_almost_equal(mean_array, np.zeros(2), decimal=2)
+        np.testing.assert_almost_equal(var_array, np.ones(2), decimal=2)
diff --git a/tstrait/__init__.py b/tstrait/__init__.py
index f30f0c4..d7b0f70 100644
--- a/tstrait/__init__.py
+++ b/tstrait/__init__.py
@@ -13,6 +13,7 @@
 from .simulate_phenotype import (
     PhenotypeResult,
     sim_phenotype,
+    normalise_phenotypes,
 )  # noreorder
 from .trait_model import (
     trait_model,
@@ -34,6 +35,7 @@
 __all__ = [
     "__version__",
     "sim_trait",
+    "normalise_phenotypes",
     "PhenotypeResult",
     "sim_phenotype",
     "trait_model",
diff --git a/tstrait/simulate_phenotype.py b/tstrait/simulate_phenotype.py
index 83170a0..1f56918 100644
--- a/tstrait/simulate_phenotype.py
+++ b/tstrait/simulate_phenotype.py
@@ -1,8 +1,11 @@
 from dataclasses import dataclass
 
+import numpy as np
 import pandas as pd
 import tstrait
 
+from .base import _check_dataframe
+
 
 @dataclass
 class PhenotypeResult:
@@ -133,3 +136,56 @@ def sim_phenotype(
     result = tstrait.PhenotypeResult(trait=trait_df, phenotype=phenotype_df)
 
     return result
+
+
+def normalise_phenotypes(phenotype_df, mean=0, var=1):
+    """Rescale the phenotypes of each trait to a given mean and variance.
+
+    Parameters
+    ----------
+    phenotype_df : pandas.DataFrame
+        Phenotype dataframe with the columns listed in the Notes section.
+    mean : float, default 0
+        Mean of the resulting phenotype within each trait.
+    var : float, default 1
+        Sample variance (ddof=1) of the resulting phenotype within each trait.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Dataframe with normalised phenotype.
+
+    Raises
+    ------
+    ValueError
+        If `var` <= 0 or a required column is missing from `phenotype_df`.
+
+    Notes
+    -----
+    The following columns must be included in `phenotype_df`:
+
+        * **trait_id**: Trait ID.
+        * **individual_id**: Individual ID.
+        * **phenotype**: Simulated phenotypes.
+
+    The dataframe output has the following columns:
+
+        * **trait_id**: Trait ID inside the phenotype_df input.
+        * **individual_id**: Individual ID inside the phenotype_df input.
+        * **phenotype**: Normalised phenotype.
+
+    Examples
+    --------
+    See :ref:`normalise_phenotype` section for worked examples.
+    """
+    if var <= 0:
+        raise ValueError("Variance must be greater than 0.")
+    phenotype_df = _check_dataframe(
+        phenotype_df, ["individual_id", "trait_id", "phenotype"], "phenotype_df"
+    )
+    grouped = phenotype_df.groupby("trait_id")[["phenotype"]]
+    transformed_phenotype = grouped.transform(lambda x: (x - x.mean()) / x.std())
+    transformed_phenotype = transformed_phenotype * np.sqrt(var) + mean
+    phenotype_df.loc[:, "phenotype"] = transformed_phenotype
+
+    return phenotype_df