diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a81c8ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/__pycache__/word_count_cat.cpython-38.pyc b/__pycache__/word_count_cat.cpython-38.pyc deleted file mode 100644 index 2712f42..0000000 Binary files a/__pycache__/word_count_cat.cpython-38.pyc and /dev/null differ diff --git a/notebooks/DollarValueFeature_debugging.ipynb b/notebooks/DollarValueFeature_debugging.ipynb new file mode 100644 index 0000000..8f2ad6e --- /dev/null +++ b/notebooks/DollarValueFeature_debugging.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f138e41b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/mwharton/Code/DeepREI\n" + ] + } + ], + "source": [ + "%cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6f164266", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from src.features.dollar_value_feature import DollarValueFeature" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "95adc587", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"realtor_sample_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "84c44114", + "metadata": {}, + "outputs": [], + "source": [ + "dvf = DollarValueFeature(df[\"AssociationFee\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "19dfe96e", + "metadata": {}, + "outputs": [], + "source": [ + "dvf.run_etl()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7cee4596", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 19\n", + "2 None\n", + "3 None\n", + "4 None\n", + " ... \n", + "120 None\n", + "121 None\n", + "122 None\n", + "123 None\n", + "124 711\n", + "Name: AssociationFee, Length: 125, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dvf.col_series" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f0d7b3ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dvf.col_etl.values[0] is None" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dead8d74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AssociationFee_valueAssociationFee_is_null
00.00.0
119.019.0
20.00.0
30.00.0
40.00.0
.........
1200.00.0
1210.00.0
1220.00.0
1230.00.0
124711.0711.0
\n", + "

125 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " AssociationFee_value AssociationFee_is_null\n", + "0 0.0 0.0\n", + "1 19.0 19.0\n", + "2 0.0 0.0\n", + "3 0.0 0.0\n", + "4 0.0 0.0\n", + ".. ... ...\n", + "120 0.0 0.0\n", + "121 0.0 0.0\n", + "122 0.0 0.0\n", + "123 0.0 0.0\n", + "124 711.0 711.0\n", + "\n", + "[125 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dvf.df_etl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8948482", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Realtor Sample Data.csv b/realtor_sample_data.csv similarity index 100% rename from Realtor Sample Data.csv rename to realtor_sample_data.csv diff --git a/requrements.txt b/requrements.txt new file mode 100644 index 0000000..fd98253 --- /dev/null +++ b/requrements.txt @@ -0,0 +1,3 @@ +numpy +pandas +jupyter \ No newline at end of file diff --git a/src/features/dollar_value_feature.py b/src/features/dollar_value_feature.py new file mode 100644 index 0000000..fe84966 --- /dev/null +++ b/src/features/dollar_value_feature.py @@ -0,0 +1,27 @@ +from src.features.feature import Feature +import pandas as pd + + +class DollarValueFeature(Feature): + def __init__(self, col_series): + """Instantiate new dollar value feature.""" + super().__init__(col_series) + + def run_etl(self): + """Run ETL of the column.""" + self._replace_nans() + self._cast() + self._add_is_null_column() + + def _cast(self): + """Cast all values in the column to ensure correct type.""" + self.col_etl = 
self.col_etl.astype(float)
+
+    def _add_is_null_column(self):
+        """Add the zero-filled value column and a 0/1 is_null flag column."""
+        is_null = pd.isna(self.col_etl)
+        value_key = "_".join([self.name, "value"])
+        null_key = "_".join([self.name, "is_null"])
+        self.df_etl[value_key] = self.col_etl.fillna(0)
+        # Build the flag from the mask itself: 1.0 where missing, else 0.0.
+        self.df_etl[null_key] = is_null.astype(float)
\ No newline at end of file
diff --git a/src/features/feature.py b/src/features/feature.py
new file mode 100644
index 0000000..04ea4a0
--- /dev/null
+++ b/src/features/feature.py
@@ -0,0 +1,25 @@
+import pandas as pd
+
+
+class Feature:
+    """Base class that keeps a raw column and the results of its ETL."""
+
+    def __init__(self, col_series):
+        """Instantiate new feature class.
+
+        'col_series' must be a pandas Series object. A copy of it is
+        kept in 'col_etl' for the ETL steps to work on.
+        """
+        self.col_series = col_series
+        self.name = self.col_series.name
+        self.col_etl = col_series.copy()
+        self.df_etl = pd.DataFrame()
+
+    def _replace_nans(self, values_to_replace=("None",)):
+        """Replace placeholder values in 'col_etl' with None.
+
+        The default is a tuple rather than a list so that the default
+        argument cannot be mutated between calls.
+        """
+        for value in values_to_replace:
+            self.col_etl.loc[self.col_etl == value] = None