Skip to content

Commit

Permalink
CODE/DOC: normalise_phenotypes
Browse files Browse the repository at this point in the history
Add `normalise_phenotype` function in tstrait.
  • Loading branch information
daikitag authored and mergify[bot] committed Feb 6, 2024
1 parent b372bdc commit 34dcec4
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 2 deletions.
16 changes: 15 additions & 1 deletion docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ This page provides a detailed explanation of all public tstrait objects and func
TraitModelMultivariateNormal
```

### Postprocessing functions

```{eval-rst}
.. autosummary::
normalise_phenotypes
```

### Result data classes

```{eval-rst}
Expand Down Expand Up @@ -100,8 +108,14 @@ This page provides a detailed explanation of all public tstrait objects and func
.. autoclass:: tstrait.TraitModelMultivariateNormal
```

### Postprocessing functions

```{eval-rst}
.. autofunction:: tstrait.normalise_phenotypes
```

### Result data classes

```{eval-rst}
.. autoclass:: tstrait.PhenotypeResult
```
```
38 changes: 38 additions & 0 deletions docs/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,41 @@ plt.show()
The environmental noise in tstrait follows a normal distribution. Please see [](phenotype_model)
for mathematical details on the phenotype model and [](effect_size_dist) for details on
specifying the effect size distribution.

(normalise_phenotype)=

## Normalise Phenotype

The simulated phenotypes can be scaled by using the {func}`normalise_phenotypes` function. The function
first normalises the phenotype by subtracting the mean of the input phenotype from each
value and dividing the result by the standard deviation of the input phenotype.
Afterwards, it scales the normalised phenotype based on the mean and variance input.
The output of {func}`normalise_phenotypes` is a {class}`pandas.DataFrame` object with the scaled phenotypes.

An example usage of this function is shown below:

```{code-cell}
mean = 0
var = 1
normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
normalised_df.head()
```

We see that the mean and variance of the normalised phenotype are 0 and 1, as we have indicated them
as inputs of {func}`normalise_phenotypes`.

```{code-cell}
# Report the statistics computed from the normalised data itself, rather than
# echoing the `mean` and `var` inputs. pandas' `var()` uses ddof=1, which
# matches the sample standard deviation used by `normalise_phenotypes`.
# The mean may print as a value within floating-point error of 0.
print("Mean of the normalised phenotype:", normalised_df["phenotype"].mean())
print("Variance of the normalised phenotype:", normalised_df["phenotype"].var())
```

The distribution of the normalised phenotype is shown below.

```{code-cell}
plt.hist(normalised_df["phenotype"], bins=40)
plt.title("Normalised Phenotype")
plt.show()
```
109 changes: 108 additions & 1 deletion tests/test_simulate_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

@pytest.fixture(scope="class")
def sample_ts():
    """Return a simulated tree sequence with mutations, shared per test class.

    Ancestry is simulated once with ``msprime.sim_ancestry`` (sample argument
    1000, sequence length 100,000) and mutations are then added with
    ``msprime.sim_mutations`` at rate 0.01. Both calls use ``random_seed=1``
    so the fixture is reproducible.
    """
    # Only one ancestry simulation is needed; a smaller one run beforehand
    # would be overwritten immediately and only waste time.
    ts = msprime.sim_ancestry(1000, sequence_length=100_000, random_seed=1)
    ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
    return ts

Expand Down Expand Up @@ -221,3 +221,110 @@ def test_causal_sites_multivariate(self, sample_ts, causal_sites):

pd.testing.assert_frame_equal(result.trait, trait_df)
pd.testing.assert_frame_equal(result.phenotype, phenotype_df)


class TestNormalise:
    """Tests for ``tstrait.normalise_phenotypes``."""

    @staticmethod
    def _simulate_phenotype(ts, model=None):
        """Simulate phenotypes on `ts` and return the phenotype dataframe.

        When `model` is omitted, a univariate normal trait model with
        mean 2 and variance 6 is used.
        """
        if model is None:
            model = tstrait.trait_model(distribution="normal", mean=2, var=6)
        sim_result = tstrait.sim_phenotype(
            ts=ts, num_causal=100, model=model, h2=0.3, random_seed=1
        )
        return sim_result.phenotype

    @staticmethod
    def _assert_normalised(normalised_df, phenotype_df, ts, mean, var):
        """Check the statistics, ID columns and shape of a normalised frame."""
        phenotype_array = normalised_df["phenotype"].values
        np.testing.assert_almost_equal(np.mean(phenotype_array), mean)
        # The normalisation divides by the pandas standard deviation, which
        # uses ddof=1, so the matching estimator must be used here. With
        # ddof=0 the check only passes approximately and for large samples.
        np.testing.assert_almost_equal(np.var(phenotype_array, ddof=1), var)
        pd.testing.assert_series_equal(
            normalised_df["trait_id"], phenotype_df["trait_id"]
        )
        pd.testing.assert_series_equal(
            normalised_df["individual_id"], phenotype_df["individual_id"]
        )

        assert len(normalised_df) == ts.num_individuals
        assert normalised_df.shape[1] == 3
        assert list(normalised_df.columns) == [
            "individual_id",
            "trait_id",
            "phenotype",
        ]

    def test_output(self, sample_ts):
        """Explicit `mean` and `var` are reproduced by the output."""
        mean = 2
        var = 4
        phenotype_df = self._simulate_phenotype(sample_ts)
        normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
        self._assert_normalised(normalised_df, phenotype_df, sample_ts, mean, var)

    def test_default(self, sample_ts):
        """The defaults normalise to mean 0 and variance 1."""
        phenotype_df = self._simulate_phenotype(sample_ts)
        normalised_df = tstrait.normalise_phenotypes(phenotype_df)
        self._assert_normalised(normalised_df, phenotype_df, sample_ts, 0, 1)

    def test_column(self, sample_ts):
        """Dropping any one required column raises ``ValueError``."""
        phenotype_df = self._simulate_phenotype(sample_ts)
        incomplete_columns = (
            ["trait_id", "individual_id"],
            ["trait_id", "phenotype"],
            ["phenotype", "individual_id"],
        )
        for columns in incomplete_columns:
            with pytest.raises(
                ValueError, match="columns must be included in phenotype_df dataframe"
            ):
                tstrait.normalise_phenotypes(phenotype_df[columns])

    @pytest.mark.parametrize("var", [0, -1])
    def test_negative_var(self, sample_ts, var):
        """A zero or negative `var` is rejected."""
        phenotype_df = self._simulate_phenotype(sample_ts)

        with pytest.raises(ValueError, match="Variance must be greater than 0."):
            tstrait.normalise_phenotypes(phenotype_df, var=var)

    def test_pleiotropy(self, sample_ts):
        """Each trait of a multivariate simulation is normalised separately."""
        mean = 0
        var = 1
        model = tstrait.trait_model(
            distribution="multi_normal", mean=np.zeros(2), cov=np.identity(2)
        )
        phenotype_df = self._simulate_phenotype(sample_ts, model=model)
        normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
        grouped = normalised_df.groupby(["trait_id"])[["phenotype"]]
        mean_array = grouped.mean().values.T[0]
        # pandas' groupby var() already uses ddof=1, matching the normalisation.
        var_array = grouped.var().values.T[0]
        np.testing.assert_almost_equal(mean_array, np.zeros(2), decimal=2)
        np.testing.assert_almost_equal(var_array, np.ones(2), decimal=2)
2 changes: 2 additions & 0 deletions tstrait/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .simulate_phenotype import (
PhenotypeResult,
sim_phenotype,
normalise_phenotypes,
) # noreorder
from .trait_model import (
trait_model,
Expand All @@ -34,6 +35,7 @@
__all__ = [
"__version__",
"sim_trait",
"normalise_phenotypes",
"PhenotypeResult",
"sim_phenotype",
"trait_model",
Expand Down
56 changes: 56 additions & 0 deletions tstrait/simulate_phenotype.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from dataclasses import dataclass

import numpy as np
import pandas as pd
import tstrait

from .base import _check_dataframe


@dataclass
class PhenotypeResult:
Expand Down Expand Up @@ -133,3 +136,56 @@ def sim_phenotype(
result = tstrait.PhenotypeResult(trait=trait_df, phenotype=phenotype_df)

return result


def normalise_phenotypes(phenotype_df, mean=0, var=1, ddof=1):
    """Normalise phenotype dataframe.

    Within each trait (rows sharing the same ``trait_id``), the phenotype is
    standardised by subtracting its mean and dividing by its standard
    deviation. The standardised values are then multiplied by ``sqrt(var)``
    and shifted by `mean`.

    Parameters
    ----------
    phenotype_df : pandas.DataFrame
        Phenotype dataframe.
    mean : float, default 0
        Mean of the resulting phenotype.
    var : float, default 1
        Variance of the resulting phenotype.
    ddof : int, default 1
        Delta degrees of freedom used when computing the standard deviation
        of the input phenotype; the divisor is ``N - ddof``. The output has
        variance `var` when measured with the same `ddof`.

    Returns
    -------
    pandas.DataFrame
        Dataframe with normalised phenotype.

    Raises
    ------
    ValueError
        If `var` <= 0.

    Notes
    -----
    The following columns must be included in `phenotype_df`:

        * **trait_id**: Trait ID.
        * **individual_id**: Individual ID.
        * **phenotype**: Simulated phenotypes.

    The dataframe output has the following columns:

        * **trait_id**: Trait ID inside the phenotype_df input.
        * **individual_id**: Individual ID inside the phenotype_df input.
        * **phenotype**: Normalised phenotype.

    A trait whose phenotypes have zero (or undefined) standard deviation,
    for example a trait with a single row, cannot be standardised; its
    normalised phenotype is not well defined and typically comes out as NaN.

    Examples
    --------
    See :ref:`normalise_phenotype` section for worked examples.
    """
    if var <= 0:
        raise ValueError("Variance must be greater than 0.")
    phenotype_df = _check_dataframe(
        phenotype_df, ["individual_id", "trait_id", "phenotype"], "phenotype_df"
    )
    grouped = phenotype_df.groupby("trait_id")[["phenotype"]]
    standardised = grouped.transform(lambda x: (x - x.mean()) / x.std(ddof=ddof))
    scaled = standardised * np.sqrt(var) + mean

    # Build a new frame instead of assigning through ``.loc``. Whether
    # `_check_dataframe` returns a copy is not visible here, so this ensures
    # the caller's dataframe is never modified and avoids chained-assignment
    # warnings. The existing "phenotype" column is replaced in place, so the
    # column order is unchanged.
    return phenotype_df.assign(phenotype=scaled["phenotype"])

0 comments on commit 34dcec4

Please sign in to comment.