diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a81c8ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/__pycache__/word_count_cat.cpython-38.pyc b/__pycache__/word_count_cat.cpython-38.pyc deleted file mode 100644 index 2712f42..0000000 Binary files a/__pycache__/word_count_cat.cpython-38.pyc and /dev/null differ diff --git a/notebooks/DollarValueFeature_debugging.ipynb b/notebooks/DollarValueFeature_debugging.ipynb new file mode 100644 index 0000000..8f2ad6e --- /dev/null +++ b/notebooks/DollarValueFeature_debugging.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f138e41b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/mwharton/Code/DeepREI\n" + ] + } + ], + "source": [ + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6f164266", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from src.features.dollar_value_feature import DollarValueFeature" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "95adc587", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"realtor_sample_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "84c44114", + "metadata": {}, + "outputs": [], + "source": [ + "dvf = DollarValueFeature(df[\"AssociationFee\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "19dfe96e", + "metadata": {}, + "outputs": [], + "source": [ + "dvf.run_etl()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7cee4596", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 19\n", + "2 None\n", + "3 None\n", + "4 None\n", + " ... \n", + "120 None\n", + "121 None\n", + "122 None\n", + "123 None\n", + "124 711\n", + "Name: AssociationFee, Length: 125, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dvf.col_series" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f0d7b3ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dvf.col_etl.values[0] is None" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dead8d74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | AssociationFee_value | \n", + "AssociationFee_is_null | \n", + "
---|---|---|
0 | \n", + "0.0 | \n", + "0.0 | \n", + "
1 | \n", + "19.0 | \n", + "19.0 | \n", + "
2 | \n", + "0.0 | \n", + "0.0 | \n", + "
3 | \n", + "0.0 | \n", + "0.0 | \n", + "
4 | \n", + "0.0 | \n", + "0.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
120 | \n", + "0.0 | \n", + "0.0 | \n", + "
121 | \n", + "0.0 | \n", + "0.0 | \n", + "
122 | \n", + "0.0 | \n", + "0.0 | \n", + "
123 | \n", + "0.0 | \n", + "0.0 | \n", + "
124 | \n", + "711.0 | \n", + "711.0 | \n", + "
125 rows × 2 columns
\n", + "