diff --git a/.gitignore b/.gitignore index d52caa1b..3ec3d795 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,133 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + # Virtual environment files. bin/ lib/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ae319c70..36545cc2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,3 +21,18 @@ All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests. + +## Process to add new experiments in BSuite: +1. If you're creating a completely new environment, create a directory in `bsuite/environments` with: `.py`. `.py` should define a new env_class which should be a subclass of `bsuite.environments.base.Environment` (and it should return appropriate info in `bsuite_info()`). + +1. Create directory in `bsuite/experiments` with: `.py`, `sweep.py`, `analysis.py`, `__init__.py`, `_test.py`. + * `.py`: Needs to import the environment used for the experiment that is defined in `bsuite/environments/` + * `.py` and define a load variable in the file that is equal to `.` + * `sweep.py`: Needs to have the parameters that vary for the experiment. e.g., `seed` and `noise_scale` for `cartpole_noise`. Each set of parameters is stored as a dict in a tuple named `SETTINGS`. This file also defines `NUM_EPISODES` and `TAGS` (such as `credit_assignment`, `basic`, `exploration`, etc.). 
In `TAGS`, the 1st tag should be one of the basic "types" from `summary_analysis.py`: `['basic', 'noise', 'scale', 'exploration', 'credit_assignment', 'memory', 'mdp_playground']`. NOTE: Remember to add a comma after the tag in `TAGS` if there is only 1 tag, because the comma is what makes it a tuple in Python. + * `analysis.py`: Needs to define `score()`, `plot_learning()`, `plot_seeds()` (and possibly other functions like `plot_average`) that will be used by `bsuite/analysis/results.ipynb` to analyse and plot recorded data. + +1. `bsuite/bsuite.py`, `bsuite/sweep.py`, `bsuite/experiments/summary_analysis.py` and `bsuite/analysis/results.ipynb` need to be modified for each new experiment added. We need to add code lines specific to the new experiment, e.g., `from bsuite.experiments. import ...`. + +1. Run `pytest` in bsuite directory and ensure no tests fail. + +1. Run `pytype -j "$(grep -c ^processor /proc/cpuinfo)" bsuite` in bsuite directory and ensure no tests fail. diff --git a/bsuite/analysis/results.ipynb b/bsuite/analysis/results.ipynb index c363fedf..aa3396d5 100644 --- a/bsuite/analysis/results.ipynb +++ b/bsuite/analysis/results.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": {}, @@ -30,7 +30,7 @@ "source": [ "#@title Imports\n", "\n", - "! pip install --quiet git+git://github.com/deepmind/bsuite\n", + "#! pip install --quiet git+git://github.com/deepmind/bsuite\n", "import warnings\n", "\n", "from bsuite.experiments import summary_analysis\n", @@ -49,13 +49,8 @@ }, { "cell_type": "code", - "execution_count": 0, - "metadata": { - "cellView": "form", - "colab": {}, - "colab_type": "code", - "id": "ss3Gk6DzZjqO" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#@title Import experiment-specific analysis\n", @@ -73,6 +68,12 @@ "from bsuite.experiments.deep_sea import analysis as deep_sea_analysis\n", "from bsuite.experiments.deep_sea_stochastic import analysis as deep_sea_stochastic_analysis\n", "from bsuite.experiments.discounting_chain import analysis as discounting_chain_analysis\n", + "from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis\n", + "from bsuite.experiments.mdp_playground_delay import analysis as mdp_playground_delay_analysis\n", + "from bsuite.experiments.mdp_playground_p_noise import analysis as mdp_playground_p_noise_analysis\n", + "from bsuite.experiments.mdp_playground_r_noise import analysis as mdp_playground_r_noise_analysis\n", + "from bsuite.experiments.mdp_playground_r_sparse import analysis as mdp_playground_r_sparse_analysis\n", + "from bsuite.experiments.mdp_playground_seq_len import analysis as mdp_playground_seq_len_analysis\n", "from bsuite.experiments.memory_len import analysis as memory_len_analysis\n", "from bsuite.experiments.memory_size import analysis as memory_size_analysis\n", "from bsuite.experiments.mnist import analysis as mnist_analysis\n", @@ -111,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "both", "colab": {}, @@ -122,29 +123,46 @@ "source": [ "#@title loading results from local data:\n", "\n", - "experiments = {} # Add results here\n", - "DF, SWEEP_VARS = sqlite_load.load_bsuite(experiments)\n", + "experiments = {\n", + " \"sonnet_dqn\": \"/home/rajanr/bsuite_runs/more_runs/sonnet_dqn/\", \n", + " \"boot_dqn\": \"/home/rajanr/bsuite_runs/more_runs/boot_dqn\",\n", + " \"a2c\": 
\"/home/rajanr/bsuite_runs/more_runs/a2c\",\n", + " \"a2c_rnn\": \"/home/rajanr/bsuite_runs/more_runs/a2c_rnn\",\n", + " \"random_agent\": \"/home/rajanr/bsuite_runs/more_runs/random_agent\",\n", + "}\n", + "\n", + "experiments = {\n", + " \"sonnet_dqn\": \"/home/rajanr/bsuite_mdpp_10273442\",\n", + "# \"boot_dqn\": \"/home/rajanr/bsuite_mdpp_10273445\",\n", + " \"a2c\": \"/home/rajanr/bsuite_mdpp_10273443\",\n", + " \"a2c_rnn\": \"/home/rajanr/bsuite_mdpp_10273444\",\n", + " \"random_agent\": \"/home/rajanr/bsuite_mdpp_10273446\",\n", + "}\n", + "\n", + "# DF, SWEEP_VARS = sqlite_load.load_bsuite(experiments)\n", "# Or\n", - "# DF, SWEEP_VARS = csv_load.load_bsuite(experiments)" + "DF, SWEEP_VARS = csv_load.load_bsuite(experiments)" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": { "height": 515 }, "colab_type": "code", "id": "plQLUbWPpUhv", - "outputId": "0168d987-13da-415f-f38f-ee94c4214347" + "outputId": "e5bd5ec1-504b-4e35-a8fa-a487c84b0834", + "scrolled": false }, "outputs": [], "source": [ "#@title overall score as radar plot (double-click to show/hide code)\n", "BSUITE_SCORE = summary_analysis.bsuite_score(DF, SWEEP_VARS)\n", "BSUITE_SUMMARY = summary_analysis.ave_score_by_tag(BSUITE_SCORE, SWEEP_VARS)\n", - "__radar_fig__ = summary_analysis.bsuite_radar_plot(BSUITE_SUMMARY, SWEEP_VARS)" + "__radar_fig__ = summary_analysis.bsuite_radar_plot(BSUITE_SUMMARY, SWEEP_VARS)\n", + "print(BSUITE_SUMMARY, SWEEP_VARS)" ] }, { @@ -173,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -186,7 +204,7 @@ "outputs": [], "source": [ "#@title plotting overall score as bar (double-click to show/hide code)\n", - "summary_analysis.bsuite_bar_plot(BSUITE_SCORE, SWEEP_VARS).draw();" + "bar_plt = summary_analysis.bsuite_bar_plot(BSUITE_SCORE, SWEEP_VARS).draw()" ] }, { @@ -206,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -250,6 +268,486 @@ "This section of the report contains specific analysis for each individual `bsuite` experiment." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MDP Playground\n", + "\n", + "MDP Playground consists of many toy MDPs with controllable dimensions of difficulties that can be injected independently of each other (https://arxiv.org/abs/1909.07750). The MDPs are essentially *regularly structured* grid-worlds with the given values for the dimensions (please refer to the paper for more details). Once the structure of the MDP is set, the connections between states are randomly set to one of the possible available actions.\n", + "\n", + "We instantiate here the vanilla MDP, with harder MDPs in the cells further below, wherein we vary the hardness along one dimension of difficulty at a time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_df = DF[DF.bsuite_env == 'mdp_playground'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_analysis.plot_learning(mdp_playground_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the vanilla MDP Playground environment so this tests an agents' basic learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_analysis.plot_seeds(mdp_playground_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour and also analyse stability of its learning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MDP Playground with Delays\n", + "\n", + "In these MDPs, we inject artificial delays of 0, 1, 2, 4 and 8 to test agents' robustness to reward delays." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_delay_df = DF[DF.bsuite_env == 'mdp_playground_delay'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground_delay', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret after learning (lower is better)\n", + "mdp_playground_delay_analysis.plot_average(mdp_playground_delay_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *after* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with delays injected so this tests an agents' robustness to reward delays." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_delay_analysis.plot_learning(mdp_playground_delay_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with delays injected so this tests an agents' robustness to reward delays." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_delay_analysis.plot_seeds(mdp_playground_delay_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MDP Playground with Transition Noise\n", + "\n", + "In these MDPs, a fraction of the transitions in the environment are noisy - the fractions of transitions which are noisy in the instantiated environments are 0, 0.01, 0.02, 0.1 and 0.25 - to test agents' robustness to transition noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_p_noise_df = DF[DF.bsuite_env == 'mdp_playground_p_noise'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground_p_noise', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret after learning (lower is better)\n", + "mdp_playground_p_noise_analysis.plot_average(mdp_playground_p_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *after* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected transition noise so this tests an agents' robustness to transition noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_p_noise_analysis.plot_learning(mdp_playground_p_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected transition noise so this tests an agents' robustness to transition noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_p_noise_analysis.plot_seeds(mdp_playground_p_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MDP Playground with Reward Noise\n", + "\n", + "In these MDPs, we inject Gaussian noise into the reward function with standard deviations of 0, 1, 5, 10 and 15 to test agents' robustness to reward noise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_r_noise_df = DF[DF.bsuite_env == 'mdp_playground_r_noise'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground_r_noise', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret after learning (lower is better)\n", + "mdp_playground_r_noise_analysis.plot_average(mdp_playground_r_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *after* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected reward noise so this tests an agents' robustness to reward noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_r_noise_analysis.plot_learning(mdp_playground_r_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected reward noise so this tests an agents' robustness to reward noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_r_noise_analysis.plot_seeds(mdp_playground_r_noise_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MDP Playground with Sparse Rewards\n", + "\n", + "In these MDPs, we allow only a fraction of the possible transitions to be rewardable - the fractions of transitions which are rewardable in the instantiated environments are 0.17, 0.34, 0.5, 0.67 and 0.84 - to test agents' robustness to reward sparsity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_r_sparse_df = DF[DF.bsuite_env == 'mdp_playground_r_sparse'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground_r_sparse', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret after learning (lower is better)\n", + "mdp_playground_r_sparse_analysis.plot_average(mdp_playground_r_sparse_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *after* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected reward sparsity, so this tests an agent's robustness to reward sparsity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_r_sparse_analysis.plot_learning(mdp_playground_r_sparse_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with injected reward sparsity, so this tests an agent's robustness to reward sparsity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_r_sparse_analysis.plot_seeds(mdp_playground_r_sparse_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MDP Playground with Rewardable Sequences\n", + "\n", + "In these MDPs, we vary the lengths of rewardable sequences of states and actions - the rewardable sequence lengths are set to 1, 2, 3 and 4 for the different environments. This tests agents' robustness to rewardable sequence length and ties in well with Hierarchical RL."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#@title parsing data\n", + "mdp_playground_seq_len_df = DF[DF.bsuite_env == 'mdp_playground_seq_len'].copy()\n", + "summary_analysis.plot_single_experiment(BSUITE_SCORE, 'mdp_playground_seq_len', SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret after learning (lower is better)\n", + "mdp_playground_seq_len_analysis.plot_average(mdp_playground_seq_len_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *after* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with varying rewardable sequence lengths so this tests an agents' robustness to varying rewardable sequence lengths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title average regret during learning (lower is better)\n", + "mdp_playground_seq_len_analysis.plot_learning(mdp_playground_seq_len_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "- Display the average regret *through* 1k episodes (lower is better)\n", + "- Dashed line shows the performance of a random agent.\n", + "- Plots are for the MDP Playground environment with varying rewardable sequence lengths so this tests an agents' robustness to varying rewardable sequence lengths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title plot performance by seed (higher is better)\n", + "mdp_playground_seq_len_analysis.plot_seeds(mdp_playground_seq_len_df, SWEEP_VARS).draw();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Parsing the plot above:**\n", + "\n", + "- Here we can see the performance of each agent individually through time.\n", + "- Higher scores are better, but individual runs may be noisy.\n", + "- Use this plot to diagnose strange agent behaviour." 
+ ] + }, { "cell_type": "markdown", "metadata": { @@ -302,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -321,7 +819,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -354,7 +852,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -402,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -421,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -454,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -517,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -536,7 +1034,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -569,7 +1067,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -630,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -649,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -682,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -747,7 +1245,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -766,7 +1264,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -799,7 +1297,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -873,7 +1371,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -892,7 +1390,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -923,7 +1421,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -954,7 +1452,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1015,7 +1513,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1034,7 +1532,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1065,7 +1563,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1096,7 +1594,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": {}, @@ -1156,7 +1654,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1175,7 +1673,7 @@ }, { "cell_type": "code", - "execution_count": 0, 
+ "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1206,7 +1704,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1237,7 +1735,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1298,7 +1796,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1317,7 +1815,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1348,7 +1846,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1379,7 +1877,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1443,7 +1941,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1462,7 +1960,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1493,7 +1991,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1524,7 +2022,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1600,7 +2098,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1619,7 +2117,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1650,7 +2148,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1681,7 +2179,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1742,7 +2240,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1761,7 +2259,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1792,7 +2290,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1823,7 +2321,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1886,7 +2384,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1905,7 +2403,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1936,7 +2434,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -1967,7 +2465,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2028,7 +2526,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2047,7 +2545,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2078,7 +2576,7 @@ }, { 
"cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2109,7 +2607,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2173,7 +2671,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2192,7 +2690,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2223,7 +2721,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2254,7 +2752,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2334,7 +2832,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2354,7 +2852,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2386,7 +2884,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2418,7 +2916,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2449,7 +2947,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2520,7 +3018,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2540,7 +3038,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2572,7 +3070,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2604,7 +3102,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2635,7 +3133,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2707,7 +3205,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2726,7 +3224,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2757,7 +3255,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2788,7 +3286,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2865,7 +3363,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2884,7 +3382,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2916,7 +3414,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -2949,7 +3447,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3015,7 +3513,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": 
"form", "colab": { @@ -3034,7 +3532,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3066,7 +3564,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3099,7 +3597,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3162,7 +3660,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3181,7 +3679,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "both", "colab": { @@ -3212,7 +3710,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3245,7 +3743,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3320,7 +3818,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3339,7 +3837,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3371,7 +3869,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3404,7 +3902,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3465,7 +3963,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3484,7 +3982,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3516,7 +4014,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3549,7 +4047,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": { @@ -3596,7 +4094,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -3662,9 +4160,22 @@ }, "kernelspec": { "display_name": "Python 3", + "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/bsuite/baselines/tf/actor_critic/run.py b/bsuite/baselines/tf/actor_critic/run.py index e0ed11c2..27dcfb72 100644 --- a/bsuite/baselines/tf/actor_critic/run.py +++ b/bsuite/baselines/tf/actor_critic/run.py @@ -43,8 +43,8 @@ # algorithm flags.DEFINE_integer('seed', 42, 'seed for random number generation') flags.DEFINE_integer('num_hidden_layers', 2, 'number of hidden layers') -flags.DEFINE_integer('num_units', 64, 'number of units per hidden layer') -flags.DEFINE_float('learning_rate', 1e-2, 'the learning rate') +flags.DEFINE_integer('num_units', 50, 'number of units per hidden layer') +flags.DEFINE_float('learning_rate', 1e-3, 'the learning rate') flags.DEFINE_integer('sequence_length', 32, 'mumber of transitions to batch') flags.DEFINE_float('td_lambda', 0.9, 'mixing 
parameter for boostrapping') flags.DEFINE_float('discount', .99, 'discounting on the agent side') diff --git a/bsuite/bsuite.py b/bsuite/bsuite.py index 0085c111..e586593f 100644 --- a/bsuite/bsuite.py +++ b/bsuite/bsuite.py @@ -33,6 +33,12 @@ from bsuite.experiments.deep_sea import deep_sea from bsuite.experiments.deep_sea_stochastic import deep_sea_stochastic from bsuite.experiments.discounting_chain import discounting_chain +from bsuite.experiments.mdp_playground import mdp_playground +from bsuite.experiments.mdp_playground_delay import mdp_playground_delay +from bsuite.experiments.mdp_playground_p_noise import mdp_playground_p_noise +from bsuite.experiments.mdp_playground_r_noise import mdp_playground_r_noise +from bsuite.experiments.mdp_playground_r_sparse import mdp_playground_r_sparse +from bsuite.experiments.mdp_playground_seq_len import mdp_playground_seq_len from bsuite.experiments.memory_len import memory_len from bsuite.experiments.memory_size import memory_size from bsuite.experiments.mnist import mnist @@ -70,6 +76,12 @@ deep_sea=deep_sea.load, deep_sea_stochastic=deep_sea_stochastic.load, discounting_chain=discounting_chain.load, + mdp_playground=mdp_playground.load, + mdp_playground_delay=mdp_playground_delay.load, + mdp_playground_p_noise=mdp_playground_p_noise.load, + mdp_playground_r_noise=mdp_playground_r_noise.load, + mdp_playground_r_sparse=mdp_playground_r_sparse.load, + mdp_playground_seq_len=mdp_playground_seq_len.load, memory_len=memory_len.load, memory_size=memory_size.load, mnist=mnist.load, diff --git a/bsuite/environments/mdp_playground.py b/bsuite/environments/mdp_playground.py new file mode 100644 index 00000000..b29b40b8 --- /dev/null +++ b/bsuite/environments/mdp_playground.py @@ -0,0 +1,108 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""The MDP Playground reinforcement learning environment.""" + +# Import all packages + +from mdp_playground.envs import RLToyEnv #mdp_playground + +# import collections +from bsuite.experiments.mdp_playground import sweep +from bsuite.environments import base +from bsuite.utils.gym_wrapper import DMEnvFromGym, space2spec +import dm_env +from dm_env import specs +from dm_env import StepType +import gym +import numpy as np +from typing import Any + +# def ohe_observation(obs): + +class DM_RLToyEnv(base.Environment): + """A wrapper to convert an RLToyEnv Gym environment from MDP Playground to a + base.Environment which is a subclass of dm_env.Environment. + Based on the DMEnvFromGym in gym_wrapper.py""" + + def __init__(self, max_episode_len: int = 100, **config: Any): + self.gym_env = gym.make("RLToy-v0", **config) + self.dm_env = DMEnvFromGym(self.gym_env) + + self.max_episode_len = max_episode_len + self._raw_return = 0. + self._best_episode = 0. + self._episode_return = 0. 
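+    # Running statistics reported via bsuite_info(): _raw_return accumulates reward across all steps of all episodes, _best_episode keeps the largest episodic return seen when an episode is truncated at max_episode_len, and _episode_return is reset at the start of each episode.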
+ + self.bsuite_num_episodes = sweep.NUM_EPISODES + + super(DM_RLToyEnv, self).__init__() + # Convert gym action and observation spaces to dm_env specs. + # self._observation_spec = space2spec(self.gym_env.observation_space, + # name='observations') + # self._action_spec = space2spec(self.gym_env.action_space, name='actions') + # self._reset_next_step = True + + def reset(self) -> dm_env.TimeStep: + self._episode_return = 0. + dm_env_reset = self.dm_env.reset() + ohe_obs = np.zeros(shape=(self.gym_env.observation_space.n,), dtype=np.float32) #hack + ohe_obs[dm_env_reset.observation] = 1 + # dm_env_reset.observation = ohe_obs + return dm_env.restart(ohe_obs) + + def step(self, action: int) -> dm_env.TimeStep: + dm_env_step = self.dm_env.step(action) + + #hack set reward as 0 if dm_env_step.reward returns None which happens in case of restart() + self._raw_return += 0. if dm_env_step.reward is None else dm_env_step.reward + self._episode_return += 0. if dm_env_step.reward is None else dm_env_step.reward + + if self.gym_env.total_transitions_episode > self.max_episode_len: + self._best_episode = max(self._episode_return, self._best_episode) + dm_env_step = dm_env.truncation(dm_env_step.reward, dm_env_step.observation) + + ohe_obs = np.zeros(shape=(self.gym_env.observation_space.n,), dtype=np.float32) #hack #TODO bsuite/baselines/tf/dqn agent doesn't allow discrete states + ohe_obs[dm_env_step.observation] = 1 + # dm_env_step.observation = ohe_obs + + # return corresponding TimeStep object based on step_type + if dm_env_step.step_type == StepType.FIRST: + return dm_env.restart(ohe_obs) + elif dm_env_step.step_type == StepType.LAST: + return dm_env.termination(dm_env_step.reward, ohe_obs) + else: + return dm_env.transition(dm_env_step.reward, ohe_obs) + + def _step(self, action: int) -> dm_env.TimeStep: + raise NotImplementedError('This environment implements its own auto-reset.') + + def _reset(self) -> dm_env.TimeStep: + raise NotImplementedError('This environment implements its own auto-reset.') + + def close(self): + self.gym_env.close() + + def observation_spec(self): ##TODO changed for OHE #hack + return specs.BoundedArray(shape=(self.gym_env.observation_space.n,), dtype=np.float32, minimum=0.0, + maximum=1.0, name='observations') + # return self.dm_env.observation_spec() + + def action_spec(self): + return self.dm_env.action_spec() + + def bsuite_info(self): + return dict(raw_return=self._raw_return, + best_episode=self._best_episode) diff --git a/bsuite/experiments/mdp_playground/__init__.py b/bsuite/experiments/mdp_playground/__init__.py new file mode 100644 index 00000000..ac46adfa --- /dev/null +++ b/bsuite/experiments/mdp_playground/__init__.py @@ -0,0 +1,15 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ diff --git a/bsuite/experiments/mdp_playground/analysis.py b/bsuite/experiments/mdp_playground/analysis.py new file mode 100644 index 00000000..737a6df9 --- /dev/null +++ b/bsuite/experiments/mdp_playground/analysis.py @@ -0,0 +1,75 @@ +# python3 +# pylint: disable=g-bad-file-header +# Copyright 2020 #TODO ... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + +def score(df: pd.DataFrame) -> float: + """Output a score for MDP Playground.""" + df = mdpp_preprocess(df_in=df) + regret_score = plotting.ave_regret_score( + df, baseline_regret=BASE_REGRET, episode=NUM_EPISODES) + + norm_score = 1.0 * regret_score # 2.5 was heuristically chosen value to get Sonnet DQN to score approx. 0.75, so that better algorithms like Rainbow can get score close to 1. With a bigger NN this would mean an unclipped score of 1.1 for Sonnet DQN, which is fair I think. However, a2c_rnn even reached 2.0 on this scale. DQN may be not performing as well because its epsilon is not annealed to 0. 
+ # print("unclipped score:", norm_score) + norm_score = np.clip(norm_score, 0, 1) + return norm_score + +def mdpp_preprocess(df_in: pd.DataFrame) -> pd.DataFrame: + """Preprocess MDP Playground data for use with regret metrics.""" + df = df_in.copy() + df = df[df.episode <= NUM_EPISODES] + df['total_regret'] = (BASE_REGRET * df.episode) - df.raw_return + return df + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Simple learning curves for MDP Playground.""" + df = mdpp_preprocess(df) + p = plotting.plot_regret_learning( + df, sweep_vars=sweep_vars, max_episode=NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + +def plot_seeds(df_in: pd.DataFrame, + sweep_vars: Sequence[str] = None, + colour_var: str = None) -> gg.ggplot: + """Plot the returns through time individually by run.""" + df = df_in.copy() + df['average_return'] = df.raw_return.diff() / df.episode.diff() + p = plotting.plot_individual_returns( + df_in=df, + max_episode=NUM_EPISODES, + return_column='average_return', + colour_var=colour_var, + yintercept=BASE_REGRET, + sweep_vars=sweep_vars, + ) + return p + gg.ylab('average episodic return') diff --git a/bsuite/experiments/mdp_playground/mdp_playground.py b/bsuite/experiments/mdp_playground/mdp_playground.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground/mdp_playground.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""The MDP Playground reinforcement learning environment.""" + +from bsuite.environments import mdp_playground + +load = mdp_playground.DM_RLToyEnv diff --git a/bsuite/experiments/mdp_playground/mdp_playground_test.py b/bsuite/experiments/mdp_playground/mdp_playground_test.py new file mode 100644 index 00000000..d6bcbb11 --- /dev/null +++ b/bsuite/experiments/mdp_playground/mdp_playground_test.py @@ -0,0 +1,55 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +###TODO change to mdpp stuff below +"""Tests for bsuite.experiments.mdp_playground.""" + +# Import all required packages + +from absl.testing import absltest +from bsuite.environments import mdp_playground +from dm_env import test_utils + +import numpy as np + + +class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase): + + def make_object_under_test(self): + config = {} + config["state_space_type"] = "discrete" + config["action_space_type"] = "discrete" + config["state_space_size"] = 8 + config["action_space_size"] = 8 + config["generate_random_mdp"] = True + config["terminal_state_density"] = 0.25 + config["maximally_connected"] = True + config["repeats_in_sequences"] = False + config["reward_density"] = 0.25 + config["make_denser"] = False + env = mdp_playground.DM_RLToyEnv(**config) + return env + + def make_action_sequence(self): + valid_actions = list(range(8)) + rng = np.random.RandomState(42) + + for _ in range(100): + yield rng.choice(valid_actions) + + +if __name__ == '__main__': + absltest.main() diff --git a/bsuite/experiments/mdp_playground/sweep.py b/bsuite/experiments/mdp_playground/sweep.py new file mode 100644 index 00000000..6516ceeb --- /dev/null +++ b/bsuite/experiments/mdp_playground/sweep.py @@ -0,0 +1,64 @@ +# pylint: disable=g-bad-file-header +###TODO Copyright stuff +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Sweep definition for experiments in the MDP Playground.""" + +import copy + +NUM_EPISODES = 1000 +#NUM_TIMESTEPS = 20000 + +# Need to have full config, including: S, A,; explicitly state all of them for backward compatibility. + +config = {} +# config["seed"] = 0 + +config["state_space_type"] = "discrete" +config["action_space_type"] = "discrete" +config["state_space_size"] = 8 +config["action_space_size"] = 8 +config["delay"] = 0 +config["sequence_length"] = 1 +config["reward_scale"] = 1 +config["reward_shift"] = 0 +# config["reward_noise"] = lambda a: a.normal(0, 0.5) +# config["transition_noise"] = 0.1 +config["reward_density"] = 0.25 +config["make_denser"] = False +config["terminal_state_density"] = 0.25 +config["completely_connected"] = True +config["repeats_in_sequences"] = False +config["generate_random_mdp"] = True +# import logging +# config["log_level"] = logging.DEBUG + +_SETTINGS = [] +for j in range(20): + config_copy = copy.deepcopy(config) + config_copy["seed"] = j + _SETTINGS.append(config_copy) +# delays = [0, 1, 2, 4, 8] +# for i in range(5): +# for j in range(4): +# config_copy = copy.deepcopy(config) +# config_copy["delay"] = delays[i] +# config_copy["seed"] = j +# _SETTINGS.append(config_copy) + + +SETTINGS = tuple(_SETTINGS) # delays, seeds for agents or envs? 
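+# Each entry of SETTINGS is one full RLToyEnv config dict (the vanilla MDP above, varying only the seed over 20 values); bsuite passes each dict as keyword arguments to this experiment's load function, i.e. DM_RLToyEnv(**config).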
+TAGS = ('mdp_playground', ) # ('basic', 'sparsity', 'basic', 'generalization', 'credit_assignment', 'exploration', 'noise', 'scale', 'memory') +# TAGS = ('credit_assignment', 'delay', )#, 'sparsity', 'basic', 'generalization') diff --git a/bsuite/experiments/mdp_playground_delay/__init__.py b/bsuite/experiments/mdp_playground_delay/__init__.py new file mode 100644 index 00000000..ac46adfa --- /dev/null +++ b/bsuite/experiments/mdp_playground_delay/__init__.py @@ -0,0 +1,15 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/bsuite/experiments/mdp_playground_delay/analysis.py b/bsuite/experiments/mdp_playground_delay/analysis.py new file mode 100644 index 00000000..800f1acd --- /dev/null +++ b/bsuite/experiments/mdp_playground_delay/analysis.py @@ -0,0 +1,87 @@ +# python3 +# pylint: disable=g-bad-file-header +# Copyright 2020 #TODO ... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis +from bsuite.experiments.mdp_playground_delay import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + + + +def score(df: pd.DataFrame, scaling_var='delay') -> float: + """Output a single score for experiment = mean - std over scaling_var.""" + return plotting.score_by_scaling( + df=df, + score_fn=mdp_playground_analysis.score, + scaling_var=scaling_var, + ) + +def mdpp_preprocess_delay(df_in: pd.DataFrame) -> pd.DataFrame: + """Preprocess MDP Playground data for use with regret metrics.""" + df = df_in.copy() + df = df[df.episode <= NUM_EPISODES] + df['total_regret'] = (((BASE_REGRET - df.delay) * df.episode) - df.raw_return) * BASE_REGRET/(BASE_REGRET - df.delay) # rescaling because regret differs when delay is present! 
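+  # Algebraically this equals BASE_REGRET * df.episode - df.raw_return * BASE_REGRET / (BASE_REGRET - df.delay): with a delay of d, roughly d rewards per episode fall past the episode end, so the achievable per-episode return is ~(BASE_REGRET - d) and the scaling restores a common BASE_REGRET baseline across delay settings.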
+ return df + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'delay') -> gg.ggplot: + """Plots the average regret through time.""" + df = mdpp_preprocess_delay(df) + p = plotting.plot_regret_learning( + df_in=df, group_col=group_col, sweep_vars=sweep_vars, + max_episode=sweep.NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_average(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'delay') -> gg.ggplot: + """Plots the average regret through time by delay.""" + df = mdpp_preprocess_delay(df) + p = plotting.plot_regret_average( + df_in=df, + group_col=group_col, + episode=sweep.NUM_EPISODES, + sweep_vars=sweep_vars + ) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_seeds(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Plot the performance by individual work unit.""" + return mdp_playground_analysis.plot_seeds( + df_in=df, + sweep_vars=sweep_vars, + colour_var='delay' + ) + gg.ylab('average episodic return') diff --git a/bsuite/experiments/mdp_playground_delay/mdp_playground_delay.py b/bsuite/experiments/mdp_playground_delay/mdp_playground_delay.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground_delay/mdp_playground_delay.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""The MDP Playground reinforcement learning environment.""" + +from bsuite.environments import mdp_playground + +load = mdp_playground.DM_RLToyEnv diff --git a/bsuite/experiments/mdp_playground_delay/mdp_playground_test.py b/bsuite/experiments/mdp_playground_delay/mdp_playground_test.py new file mode 100644 index 00000000..d769754f --- /dev/null +++ b/bsuite/experiments/mdp_playground_delay/mdp_playground_test.py @@ -0,0 +1,56 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +###TODO change to mdpp stuff below +"""Tests for bsuite.experiments.mdp_playground_delay.""" + +# Import all required packages + +from absl.testing import absltest +from bsuite.environments import mdp_playground +from dm_env import test_utils + +import numpy as np + + +class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase): + + def make_object_under_test(self): + config = {} + config["state_space_type"] = "discrete" + config["action_space_type"] = "discrete" + config["state_space_size"] = 8 + config["action_space_size"] = 8 + config["generate_random_mdp"] = True + config["terminal_state_density"] = 0.25 + config["maximally_connected"] = True + config["repeats_in_sequences"] = False + config["reward_density"] = 0.25 + config["delay"] = 2 + config["make_denser"] = False + env = mdp_playground.DM_RLToyEnv(**config) + return env + + def make_action_sequence(self): + valid_actions = list(range(8)) + rng = np.random.RandomState(42) + + for _ in range(100): + yield rng.choice(valid_actions) + + +if __name__ == '__main__': + absltest.main() diff --git a/bsuite/experiments/mdp_playground_delay/sweep.py b/bsuite/experiments/mdp_playground_delay/sweep.py new file mode 100644 index 00000000..d1a16782 --- /dev/null +++ b/bsuite/experiments/mdp_playground_delay/sweep.py @@ -0,0 +1,58 @@ +# pylint: disable=g-bad-file-header +###TODO Copyright stuff +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Sweep definition for experiments in the MDP Playground.""" + +import copy + +NUM_EPISODES = 1000 +#NUM_TIMESTEPS = 20000 + +# Need to have full config, including: S, A,; explicitly state all of them for backward compatibility. + +config = {} +# config["seed"] = 0 + +config["state_space_type"] = "discrete" +config["action_space_type"] = "discrete" +config["state_space_size"] = 8 +config["action_space_size"] = 8 +config["sequence_length"] = 1 +config["reward_scale"] = 1 +config["reward_shift"] = 0 +# config["reward_noise"] = lambda a: a.normal(0, 0.5) +# config["transition_noise"] = 0.1 +config["reward_density"] = 0.25 +config["make_denser"] = False +config["terminal_state_density"] = 0.25 +config["completely_connected"] = True +config["repeats_in_sequences"] = False +config["generate_random_mdp"] = True +# import logging +# config["log_level"] = logging.DEBUG + +_SETTINGS = [] +delays = [0, 1, 2, 4, 8] +for i in range(5): + for j in range(4): + config_copy = copy.deepcopy(config) + config_copy["delay"] = delays[i] + config_copy["seed"] = j + _SETTINGS.append(config_copy) + +SETTINGS = tuple(_SETTINGS) # delays, seeds for agents or envs? 
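+# Illustrative layout (assumption, mirroring the loop above): settings are
+# ordered delay-major, so SETTINGS[i * 4 + j] has delay == delays[i] and
+# seed == j (5 delays x 4 seeds = 20 settings), which bsuite/sweep.py should
+# expose as 'mdp_playground_delay/0' ... 'mdp_playground_delay/19'.
+# e.g.: assert SETTINGS[7]["delay"] == 1 and SETTINGS[7]["seed"] == 3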
+TAGS = ('mdp_playground',) +# TAGS = ('credit_assignment',)#, 'sparsity', 'basic', 'generalization') diff --git a/bsuite/experiments/mdp_playground_p_noise/__init__.py b/bsuite/experiments/mdp_playground_p_noise/__init__.py new file mode 100644 index 00000000..ac46adfa --- /dev/null +++ b/bsuite/experiments/mdp_playground_p_noise/__init__.py @@ -0,0 +1,15 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/bsuite/experiments/mdp_playground_p_noise/analysis.py b/bsuite/experiments/mdp_playground_p_noise/analysis.py new file mode 100644 index 00000000..3ff67a86 --- /dev/null +++ b/bsuite/experiments/mdp_playground_p_noise/analysis.py @@ -0,0 +1,81 @@ +# python3 +# pylint: disable=g-bad-file-header +# Copyright 2020 #TODO ... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis +from bsuite.experiments.mdp_playground_p_noise import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + + + +def score(df: pd.DataFrame, scaling_var='transition_noise') -> float: + """Output a single score for experiment = mean - std over scaling_var.""" + return plotting.score_by_scaling( + df=df, + score_fn=mdp_playground_analysis.score, + scaling_var=scaling_var, + ) + + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'transition_noise') -> gg.ggplot: + """Plots the average regret through time.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_learning( + df_in=df, group_col=group_col, sweep_vars=sweep_vars, + max_episode=sweep.NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_average(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'transition_noise') -> gg.ggplot: + """Plots the average regret through time by transition_noise.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_average( + df_in=df, + group_col=group_col, + episode=sweep.NUM_EPISODES, + sweep_vars=sweep_vars + ) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_seeds(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Plot the performance by individual work unit.""" + return mdp_playground_analysis.plot_seeds( + df_in=df, + sweep_vars=sweep_vars, + colour_var='transition_noise' + ) + gg.ylab('average episodic return (removing noise)') diff --git a/bsuite/experiments/mdp_playground_p_noise/mdp_playground_p_noise.py b/bsuite/experiments/mdp_playground_p_noise/mdp_playground_p_noise.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground_p_noise/mdp_playground_p_noise.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""The MDP Playground reinforcement learning environment.""" + +from bsuite.environments import mdp_playground + +load = mdp_playground.DM_RLToyEnv diff --git a/bsuite/experiments/mdp_playground_p_noise/mdp_playground_test.py b/bsuite/experiments/mdp_playground_p_noise/mdp_playground_test.py new file mode 100644 index 00000000..71a370b0 --- /dev/null +++ b/bsuite/experiments/mdp_playground_p_noise/mdp_playground_test.py @@ -0,0 +1,56 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +###TODO change to mdpp stuff below +"""Tests for bsuite.experiments.mdp_playground_p_noise.""" + +# Import all required packages + +from absl.testing import absltest +from bsuite.environments import mdp_playground +from dm_env import test_utils + +import numpy as np + + +class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase): + + def make_object_under_test(self): + config = {} + config["state_space_type"] = "discrete" + config["action_space_type"] = "discrete" + config["state_space_size"] = 8 + config["action_space_size"] = 8 + config["generate_random_mdp"] = True + config["terminal_state_density"] = 0.25 + config["maximally_connected"] = True + config["repeats_in_sequences"] = False + config["reward_density"] = 0.25 + config["transition_noise"] = 0.25 + config["make_denser"] = False + env = mdp_playground.DM_RLToyEnv(**config) + return env + + def make_action_sequence(self): + valid_actions = list(range(8)) + rng = np.random.RandomState(42) + + for _ in range(100): + yield rng.choice(valid_actions) + + +if __name__ == '__main__': + absltest.main() diff --git a/bsuite/experiments/mdp_playground_p_noise/sweep.py b/bsuite/experiments/mdp_playground_p_noise/sweep.py new file mode 100644 index 00000000..f4667c0f --- /dev/null +++ b/bsuite/experiments/mdp_playground_p_noise/sweep.py @@ -0,0 +1,61 @@ +# pylint: disable=g-bad-file-header +###TODO Copyright stuff +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Sweep definition for experiments in the MDP Playground.""" + +import copy + +NUM_EPISODES = 1000 +#NUM_TIMESTEPS = 20000 + +# Need to have full config, including: S, A,; explicitly state all of them for backward compatibility. 
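+# Note (assumption about the bsuite loading pipeline): the keys written into
+# each setting below -- in particular "transition_noise" and "seed" -- end up
+# as columns of the results dataframe, which is what lets this experiment's
+# analysis.py group its plots with group_col='transition_noise'.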
+
+config = {}
+# config["seed"] = 0
+
+config["state_space_type"] = "discrete"
+config["action_space_type"] = "discrete"
+config["state_space_size"] = 8
+config["action_space_size"] = 8
+config["delay"] = 0
+config["sequence_length"] = 1
+config["reward_scale"] = 1
+config["reward_shift"] = 0
+# config["reward_noise"] = lambda a: a.normal(0, 0.5)
+# config["transition_noise"] = 0.1
+config["reward_density"] = 0.25
+config["make_denser"] = False
+config["terminal_state_density"] = 0.25
+config["completely_connected"] = True
+config["repeats_in_sequences"] = False
+config["generate_random_mdp"] = True
+# import logging
+# config["log_level"] = logging.DEBUG
+
+## transition noise experiment settings
+_SETTINGS = []
+p_noise = [0, 0.01, 0.02, 0.10, 0.25]
+num_seeds = 4
+for i in range(len(p_noise)):
+  for j in range(num_seeds):
+    config_copy = copy.deepcopy(config)
+    config_copy["transition_noise"] = p_noise[i]
+    config_copy["seed"] = j
+    _SETTINGS.append(config_copy)
+
+#TODO: change the tags
+SETTINGS = tuple(_SETTINGS)  # transition-noise levels x seeds
+TAGS = ('mdp_playground',)#, 'sparsity', 'basic', 'generalization')
diff --git a/bsuite/experiments/mdp_playground_r_noise/__init__.py b/bsuite/experiments/mdp_playground_r_noise/__init__.py
new file mode 100644
index 00000000..ac46adfa
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_noise/__init__.py
@@ -0,0 +1,15 @@
+# pylint: disable=g-bad-file-header
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/bsuite/experiments/mdp_playground_r_noise/analysis.py b/bsuite/experiments/mdp_playground_r_noise/analysis.py
new file mode 100644
index 00000000..59acb902
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_noise/analysis.py
@@ -0,0 +1,81 @@
+# python3
+# pylint: disable=g-bad-file-header
+# Copyright 2020 #TODO ... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis +from bsuite.experiments.mdp_playground_r_noise import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + + + +def score(df: pd.DataFrame, scaling_var='r_noise') -> float: + """Output a single score for experiment = mean - std over scaling_var.""" + return plotting.score_by_scaling( + df=df, + score_fn=mdp_playground_analysis.score, + scaling_var=scaling_var, + ) + + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'r_noise') -> gg.ggplot: + """Plots the average regret through time.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_learning( + df_in=df, group_col=group_col, sweep_vars=sweep_vars, + max_episode=sweep.NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_average(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'r_noise') -> gg.ggplot: + """Plots the average regret through time by reward noise.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_average( + df_in=df, + group_col=group_col, + episode=sweep.NUM_EPISODES, + sweep_vars=sweep_vars + ) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_seeds(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Plot the performance by individual work unit.""" + return mdp_playground_analysis.plot_seeds( + df_in=df, + sweep_vars=sweep_vars, + colour_var='r_noise' + ) + gg.ylab('average episodic return (removing noise)') diff --git a/bsuite/experiments/mdp_playground_r_noise/mdp_playground_r_noise.py b/bsuite/experiments/mdp_playground_r_noise/mdp_playground_r_noise.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground_r_noise/mdp_playground_r_noise.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""The MDP Playground reinforcement learning environment.""" + +from bsuite.environments import mdp_playground + +load = mdp_playground.DM_RLToyEnv diff --git a/bsuite/experiments/mdp_playground_r_noise/mdp_playground_test.py b/bsuite/experiments/mdp_playground_r_noise/mdp_playground_test.py new file mode 100644 index 00000000..2680ef36 --- /dev/null +++ b/bsuite/experiments/mdp_playground_r_noise/mdp_playground_test.py @@ -0,0 +1,56 @@ +# pylint: disable=g-bad-file-header +# Copyright .... All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +###TODO change to mdpp stuff below +"""Tests for bsuite.experiments.mdp_playground_r_noise.""" + +# Import all required packages + +from absl.testing import absltest +from bsuite.environments import mdp_playground +from dm_env import test_utils + +import numpy as np + + +class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase): + + def make_object_under_test(self): + config = {} + config["state_space_type"] = "discrete" + config["action_space_type"] = "discrete" + config["state_space_size"] = 8 + config["action_space_size"] = 8 + config["generate_random_mdp"] = True + config["terminal_state_density"] = 0.25 + config["maximally_connected"] = True + config["repeats_in_sequences"] = False + config["reward_density"] = 0.25 + config["reward_noise"] = lambda a: a.normal(0, 0.5) + config["make_denser"] = False + env = mdp_playground.DM_RLToyEnv(**config) + return env + + def make_action_sequence(self): + valid_actions = list(range(8)) + rng = np.random.RandomState(42) + + for _ in range(100): + yield rng.choice(valid_actions) + + +if __name__ == '__main__': + absltest.main() diff --git a/bsuite/experiments/mdp_playground_r_noise/sweep.py b/bsuite/experiments/mdp_playground_r_noise/sweep.py new file mode 100644 index 00000000..154bcc9c --- /dev/null +++ b/bsuite/experiments/mdp_playground_r_noise/sweep.py @@ -0,0 +1,65 @@ +# pylint: disable=g-bad-file-header +###TODO Copyright stuff +# Copyright .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Sweep definition for experiments in the MDP Playground.""" + +import copy + +NUM_EPISODES = 1000 +#NUM_TIMESTEPS = 20000 + +# Need to have full config, including: S, A,; explicitly state all of them for backward compatibility. 
+
+config = {}
+# config["seed"] = 0
+
+config["state_space_type"] = "discrete"
+config["action_space_type"] = "discrete"
+config["state_space_size"] = 8
+config["action_space_size"] = 8
+config["delay"] = 0
+config["sequence_length"] = 1
+config["reward_scale"] = 1
+config["reward_shift"] = 0
+# config["reward_noise"] = lambda a: a.normal(0, 0.5)
+# config["transition_noise"] = 0.1
+config["reward_density"] = 0.25
+config["make_denser"] = False
+config["terminal_state_density"] = 0.25
+config["completely_connected"] = True
+config["repeats_in_sequences"] = False
+config["generate_random_mdp"] = True
+# import logging
+# config["log_level"] = logging.DEBUG
+
+## reward noise experiment settings
+_SETTINGS = []
+r_noise = [0, 1, 5, 10, 15]
+num_seeds = 4
+from functools import partial
+
+
+def reward_noise(scale, numpy_random_state):
+  # Reward-noise callable: given a numpy RandomState, return a noise sample
+  # (cf. the commented-out reward_noise lambda in the base config above).
+  return numpy_random_state.normal(0, scale)
+
+
+for i in range(len(r_noise)):
+  for j in range(num_seeds):
+    config_copy = copy.deepcopy(config)
+    # Bind the current scale with functools.partial rather than a lambda: a
+    # lambda capturing the loop index would be late-bound, so every setting
+    # would end up with the last noise scale.
+    config_copy["reward_noise"] = partial(reward_noise, r_noise[i])
+    config_copy["r_noise"] = r_noise[i]  # hack to expose "r_noise" as a variable the plots can use
+    config_copy["seed"] = j
+    _SETTINGS.append(config_copy)
+
+SETTINGS = tuple(_SETTINGS)  # reward-noise scales x seeds
+TAGS = ('mdp_playground',)#, 'sparsity', 'basic', 'generalization')
diff --git a/bsuite/experiments/mdp_playground_r_sparse/__init__.py b/bsuite/experiments/mdp_playground_r_sparse/__init__.py
new file mode 100644
index 00000000..ac46adfa
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_sparse/__init__.py
@@ -0,0 +1,15 @@
+# pylint: disable=g-bad-file-header
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/bsuite/experiments/mdp_playground_r_sparse/analysis.py b/bsuite/experiments/mdp_playground_r_sparse/analysis.py
new file mode 100644
index 00000000..73ff7a45
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_sparse/analysis.py
@@ -0,0 +1,81 @@
+# python3
+# pylint: disable=g-bad-file-header
+# Copyright 2020 #TODO ... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis +from bsuite.experiments.mdp_playground_r_sparse import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + + + +def score(df: pd.DataFrame, scaling_var='reward_density') -> float: + """Output a single score for experiment = mean - std over scaling_var.""" + return plotting.score_by_scaling( + df=df, + score_fn=mdp_playground_analysis.score, + scaling_var=scaling_var, + ) + + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'reward_density') -> gg.ggplot: + """Plots the average regret through time.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_learning( + df_in=df, group_col=group_col, sweep_vars=sweep_vars, + max_episode=sweep.NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_average(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'reward_density') -> gg.ggplot: + """Plots the average regret through time by reward_density.""" + df = mdp_playground_analysis.mdpp_preprocess(df) + p = plotting.plot_regret_average( + df_in=df, + group_col=group_col, + episode=sweep.NUM_EPISODES, + sweep_vars=sweep_vars + ) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_seeds(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Plot the performance by individual work unit.""" + return mdp_playground_analysis.plot_seeds( + df_in=df, + sweep_vars=sweep_vars, + colour_var='reward_density' + ) + gg.ylab('average episodic return (removing noise)') diff --git a/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_r_sparse.py b/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_r_sparse.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_r_sparse.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+"""The MDP Playground reinforcement learning environment."""
+
+from bsuite.environments import mdp_playground
+
+load = mdp_playground.DM_RLToyEnv
diff --git a/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_test.py b/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_test.py
new file mode 100644
index 00000000..04a66c7f
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_sparse/mdp_playground_test.py
@@ -0,0 +1,55 @@
+# pylint: disable=g-bad-file-header
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+###TODO change to mdpp stuff below
+"""Tests for bsuite.experiments.mdp_playground_r_sparse."""
+
+# Import all required packages
+
+from absl.testing import absltest
+from bsuite.environments import mdp_playground
+from dm_env import test_utils
+
+import numpy as np
+
+
+class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase):
+
+  def make_object_under_test(self):
+    config = {}
+    config["state_space_type"] = "discrete"
+    config["action_space_type"] = "discrete"
+    config["state_space_size"] = 8
+    config["action_space_size"] = 8
+    config["generate_random_mdp"] = True
+    config["terminal_state_density"] = 0.25
+    config["maximally_connected"] = True
+    config["repeats_in_sequences"] = False
+    config["reward_density"] = 0.75
+    config["make_denser"] = False
+    env = mdp_playground.DM_RLToyEnv(**config)
+    return env
+
+  def make_action_sequence(self):
+    valid_actions = list(range(8))
+    rng = np.random.RandomState(42)
+
+    for _ in range(100):
+      yield rng.choice(valid_actions)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/bsuite/experiments/mdp_playground_r_sparse/sweep.py b/bsuite/experiments/mdp_playground_r_sparse/sweep.py
new file mode 100644
index 00000000..8ac444c1
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_r_sparse/sweep.py
@@ -0,0 +1,61 @@
+# pylint: disable=g-bad-file-header
+###TODO Copyright stuff
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Sweep definition for experiments in the MDP Playground."""
+
+import copy
+
+NUM_EPISODES = 1000
+#NUM_TIMESTEPS = 20000
+
+# Need the full config here, including the state (S) and action (A) space settings; state all of them explicitly for backward compatibility.
+
+config = {}
+# config["seed"] = 0
+
+config["state_space_type"] = "discrete"
+config["action_space_type"] = "discrete"
+config["state_space_size"] = 8
+config["action_space_size"] = 8
+config["delay"] = 0
+config["sequence_length"] = 1
+config["reward_scale"] = 1
+config["reward_shift"] = 0
+# config["reward_noise"] = lambda a: a.normal(0, 0.5)
+# config["transition_noise"] = 0.1
+#config["reward_density"] = 0.25  # swept per setting below
+config["make_denser"] = False
+config["terminal_state_density"] = 0.25
+config["completely_connected"] = True
+config["repeats_in_sequences"] = False
+config["generate_random_mdp"] = True
+# import logging
+# config["log_level"] = logging.DEBUG
+
+
+## sparse reward experiment settings
+_SETTINGS = []
+r_density = [0.17, 0.34, 0.5, 0.67, 0.84]
+num_seeds = 4
+for i in range(len(r_density)):
+  for j in range(num_seeds):
+    config_copy = copy.deepcopy(config)
+    config_copy["reward_density"] = r_density[i]
+    config_copy["seed"] = j
+    _SETTINGS.append(config_copy)
+
+SETTINGS = tuple(_SETTINGS)  # reward-density levels x seeds
+TAGS = ('mdp_playground',)#, 'sparsity', 'basic', 'generalization')
diff --git a/bsuite/experiments/mdp_playground_seq_len/__init__.py b/bsuite/experiments/mdp_playground_seq_len/__init__.py
new file mode 100644
index 00000000..ac46adfa
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_seq_len/__init__.py
@@ -0,0 +1,15 @@
+# pylint: disable=g-bad-file-header
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/bsuite/experiments/mdp_playground_seq_len/analysis.py b/bsuite/experiments/mdp_playground_seq_len/analysis.py
new file mode 100644
index 00000000..74a5aaaa
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_seq_len/analysis.py
@@ -0,0 +1,87 @@
+# python3
+# pylint: disable=g-bad-file-header
+# Copyright 2020 #TODO ... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ +"""Analysis for MDP Playground.""" + +###TODO change to mdpp stuff below +from typing import Sequence + +from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis +from bsuite.experiments.mdp_playground_seq_len import sweep +from bsuite.utils import plotting +import numpy as np +import pandas as pd +import plotnine as gg + +NUM_EPISODES = sweep.NUM_EPISODES +BASE_REGRET = 100 +GOOD_EPISODE = 50 +TAGS = sweep.TAGS + + + +def score(df: pd.DataFrame, scaling_var='sequence_length') -> float: + """Output a single score for experiment = mean - std over scaling_var.""" + return plotting.score_by_scaling( + df=df, + score_fn=mdp_playground_analysis.score, + scaling_var=scaling_var, + ) + +def mdpp_preprocess_seq_len(df_in: pd.DataFrame) -> pd.DataFrame: + """Preprocess MDP Playground data for use with regret metrics.""" + df = df_in.copy() + df = df[df.episode <= NUM_EPISODES] + df['total_regret'] = (((BASE_REGRET / df.sequence_length) * df.episode) - df.raw_return) * df.sequence_length # Rescaling depending on seq_len since max. reward achievable is diff. for diff. seq_lens + return df + +def plot_learning(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'sequence_length') -> gg.ggplot: + """Plots the average regret through time.""" + df = mdpp_preprocess_seq_len(df) + p = plotting.plot_regret_learning( + df_in=df, group_col=group_col, sweep_vars=sweep_vars, + max_episode=sweep.NUM_EPISODES) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_average(df: pd.DataFrame, + sweep_vars: Sequence[str] = None, + group_col: str = 'sequence_length') -> gg.ggplot: + """Plots the average regret through time by sequence_length.""" + df = mdpp_preprocess_seq_len(df) + p = plotting.plot_regret_average( + df_in=df, + group_col=group_col, + episode=sweep.NUM_EPISODES, + sweep_vars=sweep_vars + ) + p += gg.geom_hline(gg.aes(yintercept=BASE_REGRET), + linetype='dashed', alpha=0.4, size=1.75) + return p + + +def plot_seeds(df: pd.DataFrame, + sweep_vars: Sequence[str] = None) -> gg.ggplot: + """Plot the performance by individual work unit.""" + return mdp_playground_analysis.plot_seeds( + df_in=df, + sweep_vars=sweep_vars, + colour_var='sequence_length' + ) + gg.ylab('average episodic return (removing noise)') diff --git a/bsuite/experiments/mdp_playground_seq_len/mdp_playground_seq_len.py b/bsuite/experiments/mdp_playground_seq_len/mdp_playground_seq_len.py new file mode 100644 index 00000000..2e5d6acb --- /dev/null +++ b/bsuite/experiments/mdp_playground_seq_len/mdp_playground_seq_len.py @@ -0,0 +1,20 @@ +# pylint: disable=g-bad-file-header +# Copyright 2019 .... All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+"""The MDP Playground reinforcement learning environment."""
+
+from bsuite.environments import mdp_playground
+
+load = mdp_playground.DM_RLToyEnv
diff --git a/bsuite/experiments/mdp_playground_seq_len/mdp_playground_test.py b/bsuite/experiments/mdp_playground_seq_len/mdp_playground_test.py
new file mode 100644
index 00000000..7aabe252
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_seq_len/mdp_playground_test.py
@@ -0,0 +1,56 @@
+# pylint: disable=g-bad-file-header
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+###TODO change to mdpp stuff below
+"""Tests for bsuite.experiments.mdp_playground_seq_len."""
+
+# Import all required packages
+
+from absl.testing import absltest
+from bsuite.environments import mdp_playground
+from dm_env import test_utils
+
+import numpy as np
+
+
+class InterfaceTest(test_utils.EnvironmentTestMixin, absltest.TestCase):
+
+  def make_object_under_test(self):
+    config = {}
+    config["state_space_type"] = "discrete"
+    config["action_space_type"] = "discrete"
+    config["state_space_size"] = 8
+    config["action_space_size"] = 8
+    config["generate_random_mdp"] = True
+    config["terminal_state_density"] = 0.25
+    config["maximally_connected"] = True
+    config["repeats_in_sequences"] = False
+    config["reward_density"] = 0.25
+    config["sequence_length"] = 3
+    config["make_denser"] = False
+    env = mdp_playground.DM_RLToyEnv(**config)
+    return env
+
+  def make_action_sequence(self):
+    valid_actions = list(range(8))
+    rng = np.random.RandomState(42)
+
+    for _ in range(100):
+      yield rng.choice(valid_actions)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/bsuite/experiments/mdp_playground_seq_len/sweep.py b/bsuite/experiments/mdp_playground_seq_len/sweep.py
new file mode 100644
index 00000000..809d554e
--- /dev/null
+++ b/bsuite/experiments/mdp_playground_seq_len/sweep.py
@@ -0,0 +1,61 @@
+# pylint: disable=g-bad-file-header
+###TODO Copyright stuff
+# Copyright .... All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Sweep definition for experiments in the MDP Playground."""
+
+import copy
+
+NUM_EPISODES = 1000
+#NUM_TIMESTEPS = 20000
+
+# Need the full config here, including the state (S) and action (A) space settings; state all of them explicitly for backward compatibility.
+
+config = {}
+# config["seed"] = 0
+
+config["state_space_type"] = "discrete"
+config["action_space_type"] = "discrete"
+config["state_space_size"] = 8
+config["action_space_size"] = 8
+config["delay"] = 0
+#config["sequence_length"] = 1  # swept per setting below
+config["reward_every_n_steps"] = True
+config["reward_scale"] = 1
+config["reward_shift"] = 0
+# config["reward_noise"] = lambda a: a.normal(0, 0.5)
+# config["transition_noise"] = 0.1
+config["reward_density"] = 0.25
+config["make_denser"] = False
+config["terminal_state_density"] = 0.25
+config["completely_connected"] = True
+config["repeats_in_sequences"] = False
+config["generate_random_mdp"] = True
+# import logging
+# config["log_level"] = logging.DEBUG
+
+## sequence length experiment settings
+_SETTINGS = []
+seq_len = [1, 2, 3, 4]
+num_seeds = 5
+for i in range(len(seq_len)):
+  for j in range(num_seeds):
+    config_copy = copy.deepcopy(config)
+    config_copy["sequence_length"] = seq_len[i]
+    config_copy["seed"] = j
+    _SETTINGS.append(config_copy)
+
+SETTINGS = tuple(_SETTINGS)  # sequence lengths x seeds
+TAGS = ('mdp_playground',)#, 'sparsity', 'basic', 'generalization')
diff --git a/bsuite/experiments/summary_analysis.py b/bsuite/experiments/summary_analysis.py
index e634db8b..1aa7ff38 100644
--- a/bsuite/experiments/summary_analysis.py
+++ b/bsuite/experiments/summary_analysis.py
@@ -31,6 +31,12 @@
 from bsuite.experiments.deep_sea import analysis as deep_sea_analysis
 from bsuite.experiments.deep_sea_stochastic import analysis as deep_sea_stochastic_analysis
 from bsuite.experiments.discounting_chain import analysis as discounting_chain_analysis
+from bsuite.experiments.mdp_playground import analysis as mdp_playground_analysis
+from bsuite.experiments.mdp_playground_delay import analysis as mdp_playground_delay_analysis
+from bsuite.experiments.mdp_playground_p_noise import analysis as mdp_playground_p_noise_analysis
+from bsuite.experiments.mdp_playground_r_noise import analysis as mdp_playground_r_noise_analysis
+from bsuite.experiments.mdp_playground_r_sparse import analysis as mdp_playground_r_sparse_analysis
+from bsuite.experiments.mdp_playground_seq_len import analysis as mdp_playground_seq_len_analysis
 from bsuite.experiments.memory_len import analysis as memory_len_analysis
 from bsuite.experiments.memory_size import analysis as memory_size_analysis
 from bsuite.experiments.mnist import analysis as mnist_analysis
@@ -84,6 +90,12 @@ def _parse_bsuite(package) -> BSuiteSummary:
     deep_sea=_parse_bsuite(deep_sea_analysis),
     deep_sea_stochastic=_parse_bsuite(deep_sea_stochastic_analysis),
     discounting_chain=_parse_bsuite(discounting_chain_analysis),
+    mdp_playground=_parse_bsuite(mdp_playground_analysis),
+    mdp_playground_delay=_parse_bsuite(mdp_playground_delay_analysis),
+    mdp_playground_p_noise=_parse_bsuite(mdp_playground_p_noise_analysis),
+    mdp_playground_r_noise=_parse_bsuite(mdp_playground_r_noise_analysis),
+    mdp_playground_r_sparse=_parse_bsuite(mdp_playground_r_sparse_analysis),
+    mdp_playground_seq_len=_parse_bsuite(mdp_playground_seq_len_analysis),
     memory_len=_parse_bsuite(memory_len_analysis),
     memory_size=_parse_bsuite(memory_size_analysis),
     mnist=_parse_bsuite(mnist_analysis),
@@ -185,12 +197,13 @@ def _gen_ordered_experiments() -> Sequence[str]:
   scale = [env + '_scale' for env in basics]
   explore = ['deep_sea', 'deep_sea_stochastic', 'cartpole_swingup']
   credit = ['umbrella_length', 'umbrella_distract', 'discounting_chain']
+  mdp_playground = ['mdp_playground', 'mdp_playground_delay', 'mdp_playground_p_noise',
'mdp_playground_r_noise', 'mdp_playground_r_sparse', 'mdp_playground_seq_len'] memory = ['memory_len', 'memory_size'] - return basics + noise + scale + explore + credit + memory + return basics + noise + scale + explore + credit + memory + mdp_playground _ORDERED_EXPERIMENTS = _gen_ordered_experiments() _ORDERED_TYPES = [ - 'basic', 'noise', 'scale', 'exploration', 'credit_assignment', 'memory'] + 'basic', 'noise', 'scale', 'exploration', 'credit_assignment', 'memory', 'mdp_playground'] def _clean_bar_plot_data(df_in: pd.DataFrame, diff --git a/bsuite/sweep.py b/bsuite/sweep.py index e370e687..70a31ece 100644 --- a/bsuite/sweep.py +++ b/bsuite/sweep.py @@ -49,6 +49,12 @@ from bsuite.experiments.deep_sea import sweep as deep_sea_sweep from bsuite.experiments.deep_sea_stochastic import sweep as deep_sea_stochastic_sweep from bsuite.experiments.discounting_chain import sweep as discounting_chain_sweep +from bsuite.experiments.mdp_playground import sweep as mdp_playground_sweep +from bsuite.experiments.mdp_playground_delay import sweep as mdp_playground_delay_sweep +from bsuite.experiments.mdp_playground_p_noise import sweep as mdp_playground_p_noise_sweep +from bsuite.experiments.mdp_playground_r_noise import sweep as mdp_playground_r_noise_sweep +from bsuite.experiments.mdp_playground_r_sparse import sweep as mdp_playground_r_sparse_sweep +from bsuite.experiments.mdp_playground_seq_len import sweep as mdp_playground_seq_len_sweep from bsuite.experiments.memory_len import sweep as memory_len_sweep from bsuite.experiments.memory_size import sweep as memory_size_sweep from bsuite.experiments.mnist import sweep as mnist_sweep @@ -120,6 +126,12 @@ def _parse_sweep(experiment_package) -> Tuple[BSuiteId, ...]: DEEP_SEA = _parse_sweep(deep_sea_sweep) DEEP_SEA_STOCHASTIC = _parse_sweep(deep_sea_stochastic_sweep) DISCOUNTING_CHAIN = _parse_sweep(discounting_chain_sweep) +MDP_PLAYGROUND = _parse_sweep(mdp_playground_sweep) +MDP_PLAYGROUND_DELAY = _parse_sweep(mdp_playground_delay_sweep) +MDP_PLAYGROUND_P_NOISE = _parse_sweep(mdp_playground_p_noise_sweep) +MDP_PLAYGROUND_R_NOISE = _parse_sweep(mdp_playground_r_noise_sweep) +MDP_PLAYGROUND_R_SPARSE = _parse_sweep(mdp_playground_r_sparse_sweep) +MDP_PLAYGROUND_SEQ_LEN = _parse_sweep(mdp_playground_seq_len_sweep) MEMORY_LEN = _parse_sweep(memory_len_sweep) MEMORY_SIZE = _parse_sweep(memory_size_sweep) MNIST = _parse_sweep(mnist_sweep) diff --git a/setup.py b/setup.py index e317829a..059fc326 100755 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ 'frozendict', 'gym', 'matplotlib', + 'mdp-playground', 'numpy', 'pandas', 'plotnine',