Skip to content

Commit

Permalink
Merge pull request #623 from stan-dev/fix/json-encoding
Browse files Browse the repository at this point in the history
Fix json encoding of NaN/infinity
  • Loading branch information
WardBrian authored Sep 23, 2022
2 parents c2bab85 + 92f5ab3 commit fb3dfe2
Show file tree
Hide file tree
Showing 12 changed files with 98 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:
- id: mypy
# Copied from setup.cfg
exclude: ^test/
additional_dependencies: [ numpy >= 1.22, types-ujson ]
additional_dependencies: [ numpy >= 1.22]
# local uses the user-installed pylint, this allows dependency checking
- repo: local
hooks:
Expand Down
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=ujson
extension-pkg-whitelist=

# Add files or directories to the blacklist. They should be base names, not
# paths.
Expand Down
5 changes: 3 additions & 2 deletions cmdstanpy/model.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
"""CmdStanModel"""

import io
import json
import os
import platform
import re
import shutil
import subprocess
import sys
import threading
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from io import StringIO
from multiprocessing import cpu_count
from pathlib import Path
import threading
from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union

import ujson as json
from tqdm.auto import tqdm

from cmdstanpy import _CMDSTAN_REFRESH, _CMDSTAN_SAMPLING, _CMDSTAN_WARMUP
Expand Down Expand Up @@ -1587,6 +1587,7 @@ def _run_cmdstan(
env=os.environ,
universal_newlines=True,
)
timer: Optional[threading.Timer]
if timeout:

def _timer_target() -> None:
Expand Down
2 changes: 1 addition & 1 deletion cmdstanpy/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def show_versions(output: bool = True) -> str:
except Exception:
deps_info.append(('cmdstan', 'NOT FOUND'))

deps = ['cmdstanpy', 'pandas', 'xarray', 'tqdm', 'numpy', 'ujson']
deps = ['cmdstanpy', 'pandas', 'xarray', 'tqdm', 'numpy']
for module in deps:
try:
if module in sys.modules:
Expand Down
36 changes: 4 additions & 32 deletions cmdstanpy/utils/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,10 @@
Utilities for writing Stan Json files
"""
import json
import math
from collections.abc import Collection
from typing import Any, List, Mapping, Union
from typing import Any, List, Mapping

import numpy as np
import ujson

from .logging import get_logger


def rewrite_inf_nan(
data: Union[float, int, List[Any]]
) -> Union[str, int, float, List[Any]]:
"""Replaces NaN and Infinity with string representations"""
if isinstance(data, float):
if math.isnan(data):
return 'NaN'
if math.isinf(data):
return ('+' if data > 0 else '-') + 'inf'
return data
elif isinstance(data, list):
return [rewrite_inf_nan(item) for item in data]
else:
return data


def serialize_complex(c: Any) -> List[float]:
Expand Down Expand Up @@ -56,7 +36,6 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None:
"""
data_out = {}
for key, val in data.items():
handle_nan_inf = False
if val is not None:
if isinstance(val, (str, bytes)) or (
type(val).__module__ != 'numpy'
Expand All @@ -67,9 +46,9 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None:
+ f"write_stan_json for key '{key}'"
)
try:
handle_nan_inf = not np.all(np.isfinite(val))
except TypeError:
# handles cases like val == ['hello']
np.isfinite(val)
except TypeError:
# pylint: disable=raise-missing-from
raise ValueError(
"Invalid type provided to "
Expand All @@ -86,12 +65,5 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None:
else:
data_out[key] = val

if handle_nan_inf:
data_out[key] = rewrite_inf_nan(data_out[key])

with open(path, 'w') as fd:
try:
ujson.dump(data_out, fd)
except TypeError as e:
get_logger().debug(e)
json.dump(data_out, fd, default=serialize_complex)
json.dump(data_out, fd, default=serialize_complex)
4 changes: 2 additions & 2 deletions cmdstanpy/utils/stancsv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Utility functions for reading the Stan CSV format
"""
import json
import math
import re
from enum import Enum, auto
Expand All @@ -17,7 +18,6 @@

import numpy as np
import pandas as pd
import ujson

from cmdstanpy import _CMDSTAN_SAMPLING, _CMDSTAN_THIN, _CMDSTAN_WARMUP

Expand Down Expand Up @@ -453,7 +453,7 @@ def read_metric(path: str) -> List[int]:
"""
if path.endswith('.json'):
with open(path, 'r') as fd:
metric_dict = ujson.load(fd)
metric_dict = json.load(fd)
if 'inv_metric' in metric_dict:
dims_np: np.ndarray = np.asarray(metric_dict['inv_metric'])
return list(dims_np.shape)
Expand Down
101 changes: 53 additions & 48 deletions docsrc/users-guide/examples/Run Generated Quantities.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generating new quantities of interest.\n",
"\n",
Expand All @@ -19,11 +20,11 @@
"- transform parameters for reporting\n",
"- apply full Bayesian decision theory\n",
"- calculate log likelihoods, deviances, etc. for model comparison"
],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example: add posterior predictive checks to `bernoulli.stan`\n",
"\n",
Expand All @@ -34,12 +35,13 @@
"We instantiate the model `bernoulli`,\n",
"as in the \"Hello World\" section\n",
"of the CmdStanPy [tutorial](https://github.com/stan-dev/cmdstanpy/blob/develop/cmdstanpy_tutorial.ipynb) notebook."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from cmdstanpy import cmdstan_path, CmdStanModel, CmdStanMCMC, CmdStanGQ\n",
Expand All @@ -51,153 +53,151 @@
"# instantiate, compile bernoulli model\n",
"model = CmdStanModel(stan_file=stan_file)\n",
"print(model.code())"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The input data consists of `N` - the number of bernoulli trials and `y` - the list of observed outcomes.\n",
"Inspection of the data shows that on average, there is a 20% chance of success for any given Bernoulli trial."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# examine bernoulli data\n",
"import ujson\n",
"import json\n",
"import statistics\n",
"with open(data_file,'r') as fp:\n",
" data_dict = ujson.load(fp)\n",
" data_dict = json.load(fp)\n",
"print(data_dict)\n",
"print('mean of y: {}'.format(statistics.mean(data_dict['y'])))"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As in the \"Hello World\" tutorial, we produce a sample from the posterior of the model conditioned on the data:"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fit the model to the data\n",
"fit = model.sample(data=data_file)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The fitted model produces an estimate of `theta` - the chance of success"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fit.summary()"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To run a posterior predictive check, we add a `generated quantities` block to the model, in which we generate a new data vector `y_rep` using the current estimate of theta. The resulting model is in file [bernoulli_ppc.stan](https://github.com/stan-dev/cmdstanpy/blob/master/test/data/bernoulli_ppc.stan)"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_ppc = CmdStanModel(stan_file='bernoulli_ppc.stan')\n",
"print(model_ppc.code())"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We run the `generate_quantities` method on `bernoulli_ppc` using existing sample `fit` as input. The `generate_quantities` method takes the values of `theta` in the `fit` sample as the set of draws from the posterior used to generate the corresponding `y_rep` quantities of interest.\n",
"\n",
"The arguments to the `generate_quantities` method are:\n",
" + `data` - the data used to fit the model\n",
" + `mcmc_sample` - either a `CmdStanMCMC` object or a list of stan-csv files\n"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_quantities = model_ppc.generate_quantities(data=data_file, mcmc_sample=fit)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `generate_quantities` method returns a `CmdStanGQ` object which contains the values for all variables in the generated quantities block of the program ``bernoulli_ppc.stan``. Unlike the output from the ``sample`` method, it doesn't contain any information on the joint log probability density, sampler state, or parameters or transformed parameter values.\n",
"\n",
"In this example, each draw consists of an N-length array of replicates of the `bernoulli` model's input variable `y`, which is itself an N-length array of Bernoulli outcomes."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(new_quantities.draws().shape, new_quantities.column_names)\n",
"for i in range(3):\n",
" print (new_quantities.draws()[i,:])"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also use ``draws_pd(inc_sample=True)`` to get a pandas DataFrame which combines the input drawset with the generated quantities."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_plus = new_quantities.draws_pd(inc_sample=True)\n",
"print(type(sample_plus),sample_plus.shape)\n",
"names = list(sample_plus.columns.values[7:18])\n",
"sample_plus.iloc[0:3, :]"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For models as simple as the bernoulli models here, it would be trivial to re-run the sampler and generate a new sample which contains both the estimate of the parameters `theta` as well as `y_rep` values. For models which are difficult to fit, i.e., when producing a sample is computationally expensive, the `generate_quantities` method is preferred."
],
"metadata": {}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.9.5 ('stan')",
"language": "python",
"name": "python3"
},
Expand All @@ -212,8 +212,13 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
},
"vscode": {
"interpreter": {
"hash": "8765ce46b013071999fc1966b52035a7309a0da7551e066cc0f0fa23e83d4f60"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading

0 comments on commit fb3dfe2

Please sign in to comment.