Skip to content

Commit

Permalink
minor updates
Browse files Browse the repository at this point in the history
xinyuejohn committed Apr 15, 2024
1 parent 4a3c8ed commit 679be09
Showing 3 changed files with 38 additions and 15 deletions.
35 changes: 28 additions & 7 deletions ehrdata/pl/_omop.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import warnings
from collections.abc import Sequence
from functools import partial
@@ -40,7 +41,7 @@ def feature_counts(
Returns
-------
pd.DataFrame: Dataframe with feature names and counts
Dataframe with feature names and counts
"""
path = adata.uns["filepath_dict"][source]
if isinstance(path, list):
@@ -55,7 +56,8 @@ def feature_counts(
id_key = f"{source}_concept_id"
else:
id_key = source.split("_")[0] + "_concept_id"
df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[id_key])
df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[id_key, "visit_occurrence_id"])
df_source = df_source[df_source["visit_occurrence_id"].isin(set(adata.obs.index))]
feature_counts = df_source[id_key].value_counts()
if use_dask:
feature_counts = feature_counts.compute()
@@ -86,6 +88,10 @@ def plot_timeseries(
value_key: str = "value_as_number",
time_key: str = "measurement_datetime",
x_label: str = None,
y_label: str = None,
title: str = None,
from_time: Optional[Union[str, datetime.datetime]] = None,
to_time: Optional[Union[str, datetime.datetime]] = None,
show: Optional[bool] = None,
):
"""Plot timeseries data using data from adata.obsm.
@@ -97,7 +103,12 @@ def plot_timeseries(
slot (Union[str, None], optional): Slot to use. Defaults to "obsm".
value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
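from_time (Optional[Union[str, datetime.datetime]], optional): Start time for the plot (inclusive); a string is parsed with pd.to_datetime. Defaults to None.
to_time (Optional[Union[str, datetime.datetime]], optional): End time for the plot (inclusive); a string is parsed with pd.to_datetime. Defaults to None.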
x_label (str, optional): x label name. Defaults to None.
y_label (str, optional): y label name. Defaults to None.
title (str, optional): title of the plot. Defaults to None.
show (Optional[bool], optional): Show the plot, do not return axis.
"""
@@ -114,9 +125,18 @@ def plot_timeseries(
_, ax = plt.subplots(figsize=(20, 6))
# Scatter plot
for key in key_list:
df = to_dataframe(adata, key)
x = df[df.visit_occurrence_id == visit_occurrence_id][time_key]
y = df[df.visit_occurrence_id == visit_occurrence_id][value_key]
df = to_dataframe(adata, features=key, visit_occurrence_id=visit_occurrence_id)
if from_time:
if isinstance(from_time, str):
from_time = pd.to_datetime(from_time)
df = df[df[time_key] >= from_time]
if to_time:
if isinstance(to_time, str):
to_time = pd.to_datetime(to_time)
df = df[df[time_key] <= to_time]
df.sort_values(by=time_key, inplace=True)
x = df[time_key]
y = df[value_key]

# Check if x is empty
if not x.empty:
@@ -139,8 +159,9 @@ def plot_timeseries(
# TODO step
# plt.xticks(np.arange(min_x, max_x, step=1))
# Adapt this to input data
plt.xlabel(x_label if x_label else "Hours since ICU admission")

plt.xlabel(x_label if x_label else "Datetime")
plt.ylabel(y_label if y_label else "Value")
plt.title(title if title else f"Timeseries plot for visit_occurrence_id: {visit_occurrence_id}")
plt.tight_layout()
if not show:
return ax
8 changes: 5 additions & 3 deletions ehrdata/tl/_omop.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
from anndata import AnnData
from dateutil.parser import ParserError
from pandas.tseries.offsets import DateOffset as Offset
from rich import print as rprint

@@ -108,12 +109,13 @@ def aggregate_timeseries_in_bins(
if slot == "obsm":
for feature in features_list:
print(f"processing feature [{feature}]")
df = to_dataframe(adata, features)
if pd.api.types.is_datetime64_any_dtype(df[time_key]):
df = to_dataframe(adata, feature)
try:
df[time_key] = pd.to_datetime(df[time_key])
func = getattr(df[time_key].dt, time_binning_method, None)
if func is not None:
df[time_key] = func(bin_size)
else:
except (ParserError, ValueError):
# TODO need to take care of this if it doesn't follow omop standard
if bin_size == "h":
df[time_key] = df[time_key] / 3600
10 changes: 5 additions & 5 deletions ehrdata/utils/_omop_utils.py
Original file line number Diff line number Diff line change
@@ -301,7 +301,7 @@ def read_table(
Returns
-------
Union[pd.DataFrame, dd.DataFrame]: a pandas or dask DataFrame
a pandas or dask DataFrame
"""
path = adata_dict["filepath_dict"][table_name]
if isinstance(path, list):
@@ -436,9 +436,9 @@ def df_to_dict(df: pd.DataFrame, key: str, value: str) -> dict:
"""Convert a DataFrame to a dictionary
Args:
df (pd.DataFrame): a DataFrame
key (str): the column name to be used as the key of the dictionary
value (str): the column name to be used as the value of the dictionary
df: a DataFrame
key: the column name to be used as the key of the dictionary
value: the column name to be used as the value of the dictionary
Returns
-------
@@ -497,7 +497,7 @@ def get_feature_info(
Returns
-------
pd.DataFrame: a DataFrame containing the feature information
a DataFrame containing the feature information
"""
if "concept" in adata_dict["tables"]:
column_types = get_column_types(adata_dict, table_name="concept")

0 comments on commit 679be09

Please sign in to comment.