Merge pull request #2810 from martinholmer/revise-taxcalcio-logic

Revise TaxCalcIO logic; remove TMD data files and test
PSLmodels · Sep 25, 2024 · c698a38 · c698a38
2 parents 0936659 + fa2e518
commit c698a38
Show file tree

Hide file tree

Showing 10 changed files with 53 additions and 125 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,5 +7,3 @@ include taxcalc/policy_current_law.json
 include taxcalc/puf_weights.csv.gz
 include taxcalc/puf_ratios.csv
 include taxcalc/records_variables.json
-include taxcalc/tmd_weights.csv.gz
-include taxcalc/tmd_growfactors.csv
diff --git a/Makefile b/Makefile
@@ -13,7 +13,7 @@ help:
 	@echo "clean      : remove .pyc files and local taxcalc package"
 	@echo "package    : build and install local package"
 	@echo "pytest-cps : generate report for and cleanup after"
-	@echo "             pytest -m 'not requires_pufcsv and not requires_tmdcsv and not pre_release'"
+	@echo "             pytest -m 'not requires_pufcsv and not pre_release'"
 	@echo "pytest     : generate report for and cleanup after"
 	@echo "             pytest -m 'not pre_release'"
 	@echo "pytest-all : generate report for and cleanup after"
@@ -51,7 +51,7 @@ endef
 .PHONY=pytest-cps
 pytest-cps:
 	@$(pytest-setup)
-	@cd taxcalc ; pytest -n4 --disable-warnings --durations=0 --durations-min=2 -m "not requires_pufcsv and not requires_tmdcsv and not pre_release"
+	@cd taxcalc ; pytest -n4 --disable-warnings --durations=0 --durations-min=2 -m "not requires_pufcsv and not pre_release"
 	@$(pytest-cleanup)
 
 .PHONY=pytest
@@ -103,7 +103,7 @@ define coverage-cleanup
 rm -f .coverage htmlcov/*
 endef
 
-COVMARK = "not requires_pufcsv and not requires_tmdcsv and not pre_release"
+COVMARK = "not requires_pufcsv and not pre_release"
 
 OS := $(shell uname -s)
 

diff --git a/docs/usage/data.md b/docs/usage/data.md
@@ -61,13 +61,16 @@ file.
 
 The [tax-microdata
 repository](https://github.com/PSLmodels/tax-microdata-benchmarking)
-produces an input variables file (`tmd.csv`) and a
-`tmd_weights.csv.gz` file that is included in the Tax-Calculator
+produces an input variables file (`tmd.csv`), a national weights file
+(`tmd_weights.csv.gz`), and a variable growth factors file
+(`tmd_growfactors.csv`) that can be used with the Tax-Calculator
 package beginning with the 3.6.0 release.  The `tmd.csv` file is
 available only to Tax-Calculator users who have purchased their own
-version of the 2015 IRS-SOI PUF.  For those users, the
-`Records.tmd_constructor()` method creates a `Records` class object
-containing the `tmd` variables and weights.
+version of the 2015 IRS-SOI PUF.  For those users, those three files
+are available from the tax-microdata repository.  These three tmd files
+can be used with the Tax-Calculator Python API (using the 
+`Records.tmd_constructor()` static method) or with the Tax-Calculator
+CLI tool, `tc`.
 
 ## Using other data with Tax-Calculator
 

diff --git a/pytest.ini b/pytest.ini
@@ -3,7 +3,6 @@ testpaths =
     taxcalc
 markers =
     requires_pufcsv
-    requires_tmdcsv
     pre_release
     compatible_data
     local

diff --git a/taxcalc.egg-info/SOURCES.txt b/taxcalc.egg-info/SOURCES.txt
@@ -121,8 +121,6 @@ taxcalc/puf_weights.csv.gz
 taxcalc/records.py
 taxcalc/records_variables.json
 taxcalc/taxcalcio.py
-taxcalc/tmd_growfactors.csv
-taxcalc/tmd_weights.csv.gz
 taxcalc/utils.py
 taxcalc/utilsprvt.py
 taxcalc.egg-info/PKG-INFO
@@ -214,7 +212,6 @@ taxcalc/tests/test_records.py
 taxcalc/tests/test_reforms.py
 taxcalc/tests/test_responses.py
 taxcalc/tests/test_taxcalcio.py
-taxcalc/tests/test_tmdcsv.py
 taxcalc/tests/test_utils.py
 taxcalc/validation/CSV_INPUT_VARS.md
 taxcalc/validation/CSV_OUTPUT_VARS.md

diff --git a/taxcalc/records.py b/taxcalc/records.py
@@ -240,6 +240,9 @@ def tmd_constructor(
         eliminate the need to specify all the details of the PUF input
         data.
         """
+        assert isinstance(data_path, Path)
+        assert isinstance(weights_path, Path)
+        assert isinstance(growfactors_path, Path)
         return Records(
             data=pd.read_csv(data_path),
             start_year=Records.TMDCSV_YEAR,

diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py
@@ -74,6 +74,8 @@ def __init__(self, input_data, tax_year, baseline, reform, assump,
         self.puf_input_data = False
         self.cps_input_data = False
         self.tmd_input_data = False
+        self.tmd_weights = None
+        self.tmd_gfactor = None
         if isinstance(input_data, str):
             # remove any leading directory path from INPUT filename
             fname = os.path.basename(input_data)
@@ -90,6 +92,23 @@ def __init__(self, input_data, tax_year, baseline, reform, assump,
             if not self.cps_input_data and not os.path.isfile(input_data):
                 msg = 'INPUT file could not be found'
                 self.errmsg += 'ERROR: {}\n'.format(msg)
+            # if tmd_input_data is True, construct weights and gfactor paths
+            if self.tmd_input_data:  # pragma: no cover
+                tmd_dir = os.path.dirname(input_data)
+                if 'TMD_AREA' in os.environ:
+                    area = os.environ['TMD_AREA']
+                    wfile = f'{area}_tmd_weights.csv.gz'
+                    inp = f'{fname[:-4]}_{area}-{str(tax_year)[2:]}'
+                else:  # using national weights
+                    wfile = 'tmd_weights.csv.gz'
+                self.tmd_weights = os.path.join(tmd_dir, wfile)
+                self.tmd_gfactor = os.path.join(tmd_dir, 'tmd_growfactors.csv')
+                if not os.path.isfile(self.tmd_weights):
+                    msg = f'weights file {self.tmd_weights} could not be found'
+                    self.errmsg += 'ERROR: {}\n'.format(msg)
+                if not os.path.isfile(self.tmd_gfactor):
+                    msg = f'gfactor file {self.tmd_gfactor} could not be found'
+                    self.errmsg += 'ERROR: {}\n'.format(msg)
         elif isinstance(input_data, pd.DataFrame):
             inp = 'df-{}'.format(str(tax_year)[2:])
         else:
@@ -123,7 +142,7 @@ def __init__(self, input_data, tax_year, baseline, reform, assump,
         elif isinstance(reform, str):
             self.specified_reform = True
             # split any compound reform into list of simple reforms
-            refnames = list()
+            refnames = []
             reforms = reform.split('+')
             for rfm in reforms:
                 # remove any leading directory path from rfm filename
@@ -206,7 +225,7 @@ def __init__(self, input_data, tax_year, baseline, reform, assump,
         self.calc = None
         self.calc_base = None
         self.param_dict = None
-        self.policy_dicts = list()
+        self.policy_dicts = []
 
     def init(self, input_data, tax_year, baseline, reform, assump,
              aging_input_data, exact_calculations):
@@ -234,7 +253,7 @@ def init(self, input_data, tax_year, baseline, reform, assump,
         # get assumption sub-dictionaries
         paramdict = Calculator.read_json_param_objects(None, assump)
         # get policy parameter dictionaries from --reform file(s)
-        policydicts = list()
+        policydicts = []
         if self.specified_reform:
             reforms = reform.split('+')
             for ref in reforms:
@@ -252,9 +271,7 @@ def init(self, input_data, tax_year, baseline, reform, assump,
             self.errmsg += valerr_msg.__str__()
         # create GrowFactors base object that incorporates gdiff_baseline
         if self.tmd_input_data:
-            gfactors_base = GrowFactors(  # pragma: no cover
-                Records.TMD_GROWFACTORS_FILENAME
-            )
+            gfactors_base = GrowFactors(self.tmd_gfactor)  # pragma: no cover
         else:
             gfactors_base = GrowFactors()
         gdiff_baseline.apply_to(gfactors_base)
@@ -266,9 +283,7 @@ def init(self, input_data, tax_year, baseline, reform, assump,
             self.errmsg += valerr_msg.__str__()
         # create GrowFactors ref object that has all gdiff objects applied
         if self.tmd_input_data:
-            gfactors_ref = GrowFactors(  # pragma: no cover
-                Records.TMD_GROWFACTORS_FILENAME
-            )
+            gfactors_ref = GrowFactors(self.tmd_gfactor)  # pragma: no cover
         else:
             gfactors_ref = GrowFactors()
         gdiff_baseline.apply_to(gfactors_ref)
@@ -333,14 +348,20 @@ def init(self, input_data, tax_year, baseline, reform, assump,
                     exact_calculations=exact_calculations
                 )
             elif self.tmd_input_data:
-                recs = Records.tmd_constructor(
-                    data=input_data,
+                recs = Records(
+                    data=pd.read_csv(input_data),
+                    start_year=Records.TMDCSV_YEAR,
+                    weights=self.tmd_weights,
                     gfactors=gfactors_ref,
+                    adjust_ratios=None,
                     exact_calculations=exact_calculations
                 )  # pragma: no cover
-                recs_base = Records.tmd_constructor(
-                    data=input_data,
+                recs_base = Records(
+                    data=pd.read_csv(input_data),
+                    start_year=Records.TMDCSV_YEAR,
+                    weights=self.tmd_weights,
                     gfactors=gfactors_base,
+                    adjust_ratios=None,
                     exact_calculations=exact_calculations
                 )  # pragma: no cover
             else:  # if not {cps|tmd}_input_data but aging_input_data
@@ -541,7 +562,7 @@ def write_doc_file(self):
             doc = Calculator.reform_documentation(self.param_dict,
                                                   self.policy_dicts[1:])
         doc_fname = self._output_filename.replace('.csv', '-doc.text')
-        with open(doc_fname, 'w') as dfile:
+        with open(doc_fname, 'w', encoding='utf-8') as dfile:
             dfile.write(doc)
 
     def write_sqldb_file(self, dump_varset, mtr_paytax, mtr_inctax,
@@ -575,7 +596,7 @@ def write_tables_file(self):
         tab_fname = self._output_filename.replace('.csv', '-tab.text')
         # skip tables if there are not some positive weights
         if self.calc_base.total_weight() <= 0.:
-            with open(tab_fname, 'w') as tfile:
+            with open(tab_fname, 'w', encoding='utf-8') as tfile:
                 msg = 'No tables because sum of weights is not positive\n'
                 tfile.write(msg)
             return
@@ -597,7 +618,7 @@ def write_tables_file(self):
         diff = nontax + change  # using expanded_income under baseline policy
         diffdf = pd.DataFrame(data=np.column_stack(diff), columns=all_vars)
         # write each kind of distributional table
-        with open(tab_fname, 'w') as tfile:
+        with open(tab_fname, 'w', encoding='utf-8') as tfile:
             TaxCalcIO.write_decile_table(distdf, tfile, tkind='Reform Totals')
             tfile.write('\n')
             TaxCalcIO.write_decile_table(diffdf, tfile, tkind='Differences')
@@ -730,15 +751,15 @@ def write_empty_graph_file(fname, title, reason):
                '<head><title>{}</title></head>\n'
                '<body><center<h1>{}</h1></center></body>\n'
                '</html>\n').format(title, reason)
-        with open(fname, 'w') as gfile:
+        with open(fname, 'w', encoding='utf-8') as gfile:
             gfile.write(txt)
 
     def minimal_output(self):
         """
         Extract minimal output and return it as Pandas DataFrame.
         """
         varlist = ['RECID', 'YEAR', 'WEIGHT', 'INCTAX', 'LSTAX', 'PAYTAX']
-        odict = dict()
+        odict = {}
         scalc = self.calc
         odict['RECID'] = scalc.array('RECID')  # id for tax filing unit
         odict['YEAR'] = self.tax_year()  # tax calculation year

diff --git a/taxcalc/tests/test_tmdcsv.py b/taxcalc/tests/test_tmdcsv.py