Skip to content

Commit

Permalink
minor updates
Browse files Browse the repository at this point in the history
xinyuejohn committed Apr 15, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 4a3c8ed commit 679be09
Showing 3 changed files with 38 additions and 15 deletions.
35 changes: 28 additions & 7 deletions ehrdata/pl/_omop.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import warnings
from collections.abc import Sequence
from functools import partial
@@ -40,7 +41,7 @@ def feature_counts(
Returns
-------
pd.DataFrame: Dataframe with feature names and counts
Dataframe with feature names and counts
"""
path = adata.uns["filepath_dict"][source]
if isinstance(path, list):
@@ -55,7 +56,8 @@ def feature_counts(
id_key = f"{source}_concept_id"
else:
id_key = source.split("_")[0] + "_concept_id"
df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[id_key])
df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[id_key, "visit_occurrence_id"])
df_source = df_source[df_source["visit_occurrence_id"].isin(set(adata.obs.index))]
feature_counts = df_source[id_key].value_counts()
if use_dask:
feature_counts = feature_counts.compute()
@@ -86,6 +88,10 @@ def plot_timeseries(
value_key: str = "value_as_number",
time_key: str = "measurement_datetime",
x_label: str = None,
y_label: str = None,
title: str = None,
from_time: Optional[Union[str, datetime.datetime]] = None,
to_time: Optional[Union[str, datetime.datetime]] = None,
show: Optional[bool] = None,
):
"""Plot timeseries data using data from adata.obsm.
@@ -97,7 +103,12 @@ def plot_timeseries(
slot (Union[str, None], optional): Slot to use. Defaults to "obsm".
value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
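from_time (Optional[Union[str, datetime.datetime]], optional): Start time for the plot (inclusive); a string is parsed with pd.to_datetime. Defaults to None.
to_time (Optional[Union[str, datetime.datetime]], optional): End time for the plot (inclusive); a string is parsed with pd.to_datetime. Defaults to None.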
x_label (str, optional): x label name. Defaults to None.
y_label (str, optional): y label name. Defaults to None.
title (str, optional): title of the plot. Defaults to None.
show (Optional[bool], optional): Show the plot, do not return axis.
"""
@@ -114,9 +125,18 @@ def plot_timeseries(
_, ax = plt.subplots(figsize=(20, 6))
# Scatter plot
for key in key_list:
df = to_dataframe(adata, key)
x = df[df.visit_occurrence_id == visit_occurrence_id][time_key]
y = df[df.visit_occurrence_id == visit_occurrence_id][value_key]
df = to_dataframe(adata, features=key, visit_occurrence_id=visit_occurrence_id)
if from_time:
if isinstance(from_time, str):
from_time = pd.to_datetime(from_time)
df = df[df[time_key] >= from_time]
if to_time:
if isinstance(to_time, str):
to_time = pd.to_datetime(to_time)
df = df[df[time_key] <= to_time]
df.sort_values(by=time_key, inplace=True)
x = df[time_key]
y = df[value_key]

# Check if x is empty
if not x.empty:
@@ -139,8 +159,9 @@ def plot_timeseries(
# TODO step
# plt.xticks(np.arange(min_x, max_x, step=1))
# Adapt this to input data
plt.xlabel(x_label if x_label else "Hours since ICU admission")

plt.xlabel(x_label if x_label else "Datetime")
plt.ylabel(y_label if y_label else "Value")
plt.title(title if title else f"Timeseries plot for visit_occurrence_id: {visit_occurrence_id}")
plt.tight_layout()
if not show:
return ax
8 changes: 5 additions & 3 deletions ehrdata/tl/_omop.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
from anndata import AnnData
from dateutil.parser import ParserError
from pandas.tseries.offsets import DateOffset as Offset
from rich import print as rprint

@@ -108,12 +109,13 @@ def aggregate_timeseries_in_bins(
if slot == "obsm":
for feature in features_list:
print(f"processing feature [{feature}]")
df = to_dataframe(adata, features)
if pd.api.types.is_datetime64_any_dtype(df[time_key]):
df = to_dataframe(adata, feature)
try:
df[time_key] = pd.to_datetime(df[time_key])
func = getattr(df[time_key].dt, time_binning_method, None)
if func is not None:
df[time_key] = func(bin_size)
else:
except (ParserError, ValueError):
# TODO need to take care of this if it doesn't follow omop standard
if bin_size == "h":
df[time_key] = df[time_key] / 3600
10 changes: 5 additions & 5 deletions ehrdata/utils/_omop_utils.py
Original file line number Diff line number Diff line change
@@ -301,7 +301,7 @@ def read_table(
Returns
-------
Union[pd.DataFrame, dd.DataFrame]: a pandas or dask DataFrame
a pandas or dask DataFrame
"""
path = adata_dict["filepath_dict"][table_name]
if isinstance(path, list):
@@ -436,9 +436,9 @@ def df_to_dict(df: pd.DataFrame, key: str, value: str) -> dict:
"""Convert a DataFrame to a dictionary
Args:
df (pd.DataFrame): a DataFrame
key (str): the column name to be used as the key of the dictionary
value (str): the column name to be used as the value of the dictionary
df: a DataFrame
key: the column name to be used as the key of the dictionary
value: the column name to be used as the value of the dictionary
Returns
-------
@@ -497,7 +497,7 @@ def get_feature_info(
Returns
-------
pd.DataFrame: a DataFrame containing the feature information
a DataFrame containing the feature information
"""
if "concept" in adata_dict["tables"]:
column_types = get_column_types(adata_dict, table_name="concept")

0 comments on commit 679be09

Please sign in to comment.