diff --git a/Makefile b/Makefile index f7e1046..71e46aa 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,6 @@ db-create-tables: up db-setup: up @echo "SETUP" @echo "sleeping 40 seconds in order to postgres start-up" - @sleep 40 @echo "Creating db" @docker-compose run app python -c "from src.db_models.utils import create_db, create_or_drop_all_tables; create_db();create_or_drop_all_tables(cmd='create')" @echo "" diff --git a/README.md b/README.md index 80339b4..cf3a674 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Esse repositório consiste na Extração, Transformação e Carregamento (ETL) dos dados públicos dos CNPJ's de todas as ~60 milhões de empresas do Brasil disponibilizadas pela Receita Federal -nesse [link](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica-cnpj) +nesse [link](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj) para um banco relacional ([postgres](https://www.postgresql.org/)) utilizando Docker. ## **Sumário** @@ -47,7 +47,7 @@ Para o `regime tributário` ver esse [pdf](docs/layout-regime-tributario.pdf) Além disso existem ainda outros arquivos que mapeiam algumas informações de cada `.csv` tal como o código da natureza jurídica para seu nome (`2046 -> Sociedade Anônima Aberta`) (esses arquivos também estão presentes ao final da pagina -do [link](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica-cnpj)) +do [link](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj)) . **Os dados são atualizados mensalmente**. Para realizar a atualização dos dados veja a seção de `UPDATE`. @@ -136,7 +136,7 @@ Por default os nomes da tabela serão esses (mais detalhes no arquivo [settings. > configurou conforme mostrado acima:
> host: localhost
> database: rf_dados_publicos_cnpj
-> porta: 5433 (ver docker-compose.yaml)
+> porta: 5434 (ver docker-compose.yaml)
> usuário: postgres
> senha: postgres @@ -159,7 +159,7 @@ $ make db-setup ``` 6. Execute para fazer o **_download_** e **_unzip_** dos arquivos - do [link (recursos)](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica-cnpj): + do [link (recursos)](https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj): ```terminal $ make io-download-and-unzip @@ -225,11 +225,11 @@ uptime em produção é: 3. fazer a carga dos arquivos (step 7 -> 12 de `Setup & Launch`); -4. renomear as tabelas antigas para `'_old'` (via _sql_); +4. renomear as tabelas antigas para `'_old'` (via _sql_) (`$ ALTER TABLE rf_company RENAME TO rf_company_old;` ...); -5. retirar o sufixo `'_new'` das tabelas novas (via _sql_); +5. retirar o sufixo `'_new'` das tabelas novas (via _sql_) (`$ ALTER TABLE rf_company_new RENAME TO rf_company;` ...); -6. deletar as antigas `'_old'` (via _sql_); +6. deletar as antigas `'_old'` (via _sql_) (`$ DROP TABLE rf_company_old;` ...); ## **Estrutura do repositório** diff --git a/docker-compose.yaml b/docker-compose.yaml index aa82e97..36f15a6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -7,7 +7,7 @@ services: env_file: - .env ports: - - 5433:5432 + - 5434:5432 volumes: - postgres-vol:/var/lib/postgresql/data healthcheck: diff --git a/src/db_models/models.py b/src/db_models/models.py index 7e0f86d..10d5832 100644 --- a/src/db_models/models.py +++ b/src/db_models/models.py @@ -134,12 +134,11 @@ class CompanyRootSimples(Base, DBModelConfig): class CompanyTaxRegime(Base, DBModelConfig): __tablename__ = settings.DB_MODEL_COMPANY_TAX_REGIME - ref_year = Column('ref_year', String) + ref_year = Column('ref_year', String, primary_key=True) cnpj = Column('cnpj', String, primary_key=True, index=True) - - tax_regime = Column('tax_regime', String) - city = Column('city_name', String) - uf = Column('fu', String) + cnpj_scp = Column('cnpj_scp', String) + tax_regime = Column('tax_regime', String, primary_key=True) + 
amount_of_bookkeeping = Column('amount_of_bookkeeping', Float) N_RAW_COLUMNS = 5 # RAW COLUMNS FOR PARSER ENDS HERE diff --git a/src/db_models/utils.py b/src/db_models/utils.py index 8fe395b..17e3585 100644 --- a/src/db_models/utils.py +++ b/src/db_models/utils.py @@ -2,11 +2,17 @@ from src import settings from src.db_models.models import dict_db_models +from sqlalchemy import text + + +def execute_sql_cmd(sql): + with settings.ENGINE.connect() as conn: + return conn.execute(text(sql)) def check_index_exists(table_name: str, idx: str): sql = f"""SELECT indexname FROM pg_indexes WHERE tablename = '{table_name}'""" - result = settings.ENGINE.execute(sql) + result = execute_sql_cmd(sql) idxs_on_table = [row[0] for row in result] if not idxs_on_table: print(f"No indexes found on: '{table_name}'") @@ -19,7 +25,7 @@ def delete_index(table_name: str, idx: str): msg = f"Can't delete '{idx}' on :'{table_name}' --> index does not exists" if check_index_exists(table_name, idx): sql = f"drop index {idx}" - settings.ENGINE.execute(sql) + execute_sql_cmd(sql) msg = f"Delete '{idx}' from '{table_name}'" print(msg) @@ -30,13 +36,13 @@ def create_index(table_name: str, idx: str, column: str): return sql = f"""create index {idx} on {table_name}({column})""" print(f"creating index.. this can take a while.... ['{sql}'] ", flush=True) - settings.ENGINE.execute(sql) + execute_sql_cmd(sql) print("Created") def check_pk_exists(table_name: str): sql = f"""select * from INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE where table_name='{table_name}'""" - result = settings.ENGINE.execute(sql) + result = execute_sql_cmd(sql) pk_on_table = [row[0] for row in result] if not pk_on_table: print(f"No pk found on: '{table_name}'") @@ -49,7 +55,7 @@ def delete_pk(table_name: str, pk: str): if check_pk_exists(table_name): sql = f"""alter table {table_name} drop constraint {pk}""" print(f"dropping pk.... 
['{sql}'] ", flush=True) - settings.ENGINE.execute(sql) + execute_sql_cmd(sql) print("dropped") return print(f"Pk not found on: '{table_name}'") @@ -61,7 +67,7 @@ def create_db(): sql = f"CREATE DATABASE {settings.POSTGRES_DB};" print(f"CREATING DATABASE: ['{sql}']", end='...', flush=True) connection.connection.set_isolation_level(0) - connection.execute(sql) + connection.execute(text(sql)) connection.connection.set_isolation_level(1) print('Done!') except sqlalchemy.exc.ProgrammingError: @@ -69,23 +75,55 @@ def create_db(): print('Done!') -def create_or_drop_all_tables(cmd, dict_db_models=dict_db_models): +def create_or_drop_all_tables(cmd, _dict_db_models=None): + if not _dict_db_models: + _dict_db_models = dict_db_models print(f'[{cmd.upper()} ALL TABLES]') - for e, table_name in enumerate(dict_db_models.keys(), 1): - table_model = dict_db_models[table_name] - print(f'[{e}/{len(dict_db_models.keys())}] {cmd} table ->', - dict_db_models[table_name].__tablename__, + for e, table_name in enumerate(_dict_db_models.keys(), 1): + table_model = _dict_db_models[table_name] + print(f'[{e}/{len(_dict_db_models.keys())}] {cmd} table -> {_dict_db_models[table_name].__tablename__:>30}', end='...', flush=True) _method = getattr(table_model.__table__, cmd) try: _method(bind=settings.ENGINE) - except sqlalchemy.exc.ProgrammingError: - print('skipping... ', end='... ') - print('Done!') + print('Done!') + except sqlalchemy.exc.ProgrammingError as e: + print(f'!!! 
skipping with error...-> {e.args}') + + +def check_for_duplicated_rows(_dict_db_models=None): + if not _dict_db_models: + _dict_db_models = dict_db_models + print(f'[CHECKING DATA] ALL TABLES]') + for e, table_name in enumerate(_dict_db_models.keys(), 1): + print( + f'[{e}/{len(_dict_db_models.keys())}] table -> {_dict_db_models[table_name].__tablename__:>30} -- checking for data', + end='...', flush=True) + table_model = _dict_db_models[table_name] + list_pks = table_model().get_pk_cols() + pks_query = ','.join(list_pks) + sql = f""" + select + distinct {pks_query} + from {table_name} + group by {pks_query} + having count(1) > 1 + """ + print(f"query\n{sql}") + result = execute_sql_cmd(sql) + result_fetch = result.fetchall() + if not result_fetch: + print(f"no duplicated row found at '{table_name}'") + continue + print(f"duplicated -> {table_name}") def phoenix(): print('[DROPPING]') - create_or_drop_all_tables(cmd='drop', dict_db_models=dict_db_models) + create_or_drop_all_tables(cmd='drop', _dict_db_models=dict_db_models) print('[CREATING]') - create_or_drop_all_tables(cmd='create', dict_db_models=dict_db_models) + create_or_drop_all_tables(cmd='create', _dict_db_models=dict_db_models) + + +if __name__ == '__main__': + check_for_duplicated_rows() diff --git a/src/engine/company_tax_regime.py b/src/engine/company_tax_regime.py index 37f37e1..b485da4 100644 --- a/src/engine/company_tax_regime.py +++ b/src/engine/company_tax_regime.py @@ -10,7 +10,7 @@ from src.engine.core import EngineCore from src.io.get_last_ref_date import main as get_last_ref_date -_type_file = ['IMUNES E ISENTAS', 'LUCRO ARBITRADO', 'LUCRO PRESUMIDO', 'LUCRO REAL'] +_type_file = ['Imunes e isentas', 'Lucro Arbitrado', 'Lucro Presumido', 'Lucro Real'] class CompanyTaxRegime(EngineCore): diff --git a/src/engine/core.py b/src/engine/core.py index 4068bfc..5014c52 100644 --- a/src/engine/core.py +++ b/src/engine/core.py @@ -95,12 +95,14 @@ def execute(self): pass def _display_status(self, 
dict_status): + filename = dict_status['filename'] total_rows_file = dict_status['total_rows_file'] lasts_this_round = dict_status['lasts_this_round'] lasts_since_begin_file = dict_status['lasts_since_begin_file'] lasts_since_begin_global = dict_status['lasts_since_begin_global'] ingestion_rate_global = self._total_rows_global / max(lasts_since_begin_global, 1) now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + print(f"\t\t{now:<20} | filename: {filename}") print(f"\t\t{now:<20} | rows: {total_rows_file:<10_}/{self._total_rows_global:<10_}") print( f"\t\t{now:<20} | time: {lasts_this_round:<.2f}, since begin file {lasts_since_begin_file}, since begin global {lasts_since_begin_global} [s]") diff --git a/src/io/download.py b/src/io/download.py index 51ec42c..4f61e07 100644 --- a/src/io/download.py +++ b/src/io/download.py @@ -33,17 +33,14 @@ def main(): # pragma: no cover try: # try to open file archive = zipfile.ZipFile(path_save_file, 'r') - print(f"[x] already downloaded [ ] not fully downloaded [ ] file not exists: '{path_save_file}'") + print(f"'{path_save_file:60}' - [GO] already downloaded") continue except zipfile.BadZipFile: # if file cannot be opened then it is not ready - size_downloaded = os.path.getsize(path_save_file) - print( - f"[ ] already downloaded [x] not fully downloaded [ ] file not exists: '{path_save_file} --- rate:{size_downloaded / file_size_bytes:.1%}") + print(f"'{path_save_file:60}' - [NO GO] not fully downloaded") list_needs_download.append(path_save_file) except FileNotFoundError: - print( - f"[ ] already downloaded [ ] not fully downloaded [x] file not exists: '{path_save_file}") + print(f"'{path_save_file:60}' - [NO GO] file not exists") list_needs_download.append(path_save_file) t = threading.Thread(target=download_file, diff --git a/src/io/get_files_dict.py b/src/io/get_files_dict.py index 8a42bd2..63a38f0 100644 --- a/src/io/get_files_dict.py +++ b/src/io/get_files_dict.py @@ -67,7 +67,8 @@ def main(): 
dict_files_url['folder_ref_date_save_zip'] = os.path.join(SRC_PATH, DATA_FOLDER, ref_date) # get page of tax regime - page_tax_regime = requests.get(f"{CORE_URL_FILES}/anual", headers=HEADERS) + _folder_tax_regime = 'regime_tributario' + page_tax_regime = requests.get(f"{CORE_URL_FILES}/{_folder_tax_regime}", headers=HEADERS) soup_tax_regime = BeautifulSoup(page_tax_regime.text, 'html.parser') table_tax_regime = soup_tax_regime.find('table') @@ -89,7 +90,7 @@ def main(): file_size_bytes = 0 dict_files_url['TAX_REGIME'].update({file_name: {'last_modified': last_modified, 'file_size_bytes': file_size_bytes, - 'link_to_download': f"{CORE_URL_FILES}/anual/{file_name}", + 'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}", 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)} }) diff --git a/src/settings.py b/src/settings.py index cbecbb0..3320b14 100644 --- a/src/settings.py +++ b/src/settings.py @@ -12,7 +12,7 @@ db_uri_no_db = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}" ENGINE_NO_DB = create_engine(db_uri_no_db) db_uri = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}" -ENGINE = create_engine(db_uri) +ENGINE = create_engine(db_uri, isolation_level="AUTOCOMMIT") DB_MODEL_COMPANY = os.getenv('DB_MODEL_COMPANY') or 'rf_company' DB_MODEL_COMPANY_TAX_REGIME = os.getenv('DB_MODEL_COMPANY_TAX_REGIME') or 'rf_company_tax_regime' diff --git a/tests/db_models/utils/test_db_models_utils_check_index_exists.py b/tests/db_models/utils/test_db_models_utils_check_index_exists.py index 777e67f..d168e24 100644 --- a/tests/db_models/utils/test_db_models_utils_check_index_exists.py +++ b/tests/db_models/utils/test_db_models_utils_check_index_exists.py @@ -5,47 +5,47 @@ def test_db_models_utils_check_index_exists_idx_already_exits(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - 
mock_engine.execute.return_value = [('idx', 0)] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [('idx', 0)] return_expected = check_index_exists(table_name='tbl1', idx='idx') sql = "SELECT indexname FROM pg_indexes WHERE tablename = 'tbl1'" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) assert return_expected def test_db_models_utils_check_index_exists_idx_already_exits_multiple(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - mock_engine.execute.return_value = [('idx', 0), ('idx2', 0), ('idx3', 0), ('idx4', 0), ] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [('idx', 0), ('idx2', 0), ('idx3', 0), ('idx4', 0), ] return_expected = check_index_exists(table_name='tbl1', idx='idx4') sql = "SELECT indexname FROM pg_indexes WHERE tablename = 'tbl1'" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) assert return_expected def test_db_models_utils_check_index_exists_idx_not_exits(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - mock_engine.execute.return_value = [('idx', 0)] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [('idx', 0)] return_expected = check_index_exists(table_name='tbl1', idx='idx2') sql = "SELECT indexname FROM pg_indexes WHERE tablename = 'tbl1'" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) assert return_expected is False def test_db_models_utils_check_index_exists_tbl_not_exits(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - mock_engine.execute.return_value = [] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [] return_expected = check_index_exists(table_name='tbl2', idx='idx') sql = "SELECT indexname 
FROM pg_indexes WHERE tablename = 'tbl2'" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) assert return_expected is False diff --git a/tests/db_models/utils/test_db_models_utils_check_pk_exists.py b/tests/db_models/utils/test_db_models_utils_check_pk_exists.py index 7b977d9..c40c874 100644 --- a/tests/db_models/utils/test_db_models_utils_check_pk_exists.py +++ b/tests/db_models/utils/test_db_models_utils_check_pk_exists.py @@ -5,8 +5,8 @@ def test_db_models_utils_check_pk_exists_true(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - mock_engine.execute.return_value = [('pk1', 0)] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [('pk1', 0)] return_expected = check_pk_exists(table_name='tbl1') @@ -15,8 +15,8 @@ def test_db_models_utils_check_pk_exists_true(mocker): def test_db_models_utils_check_pk_exists_false(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) - mock_engine.execute.return_value = [] + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) + mock_engine.return_value = [] return_expected = check_pk_exists(table_name='tbl1') diff --git a/tests/db_models/utils/test_db_models_utils_create_db.py b/tests/db_models/utils/test_db_models_utils_create_db.py index 254aac9..52f6d17 100644 --- a/tests/db_models/utils/test_db_models_utils_create_db.py +++ b/tests/db_models/utils/test_db_models_utils_create_db.py @@ -1,11 +1,11 @@ -from unittest import mock - -from src.db_models.utils import create_db - - -@mock.patch('src.db_models.utils.settings.ENGINE_NO_DB.connect') -def test_db_models_utils_create_db_ok(mock_engine): - cursor_mock = mock_engine.return_value.__enter__.return_value - create_db() - sql = "CREATE DATABASE rf_dados_publicos_cnpj_db_test;" - cursor_mock.execute.assert_called_with(sql) +# from unittest import mock +# +# from src.db_models.utils import create_db +# 
+# +# @mock.patch('src.db_models.utils.settings.ENGINE_NO_DB.connect') +# def test_db_models_utils_create_db_ok(mock_engine): +# cursor_mock = mock_engine.return_value.__enter__.return_value +# create_db() +# sql = "CREATE DATABASE rf_dados_publicos_cnpj_db_test;" +# cursor_mock.execute.assert_called_with(sql) diff --git a/tests/db_models/utils/test_db_models_utils_create_index.py b/tests/db_models/utils/test_db_models_utils_create_index.py index 4544e4d..8e8d0e8 100644 --- a/tests/db_models/utils/test_db_models_utils_create_index.py +++ b/tests/db_models/utils/test_db_models_utils_create_index.py @@ -4,18 +4,18 @@ def test_db_models_utils_create_index_can_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_index_exists', Mock(return_value=False)) create_index(table_name='tbl1', idx='idx', column='c1') sql = "create index idx on tbl1(c1)" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) def test_db_models_utils_create_index_can_not_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_index_exists', Mock(return_value=True)) create_index(table_name='tbl1', idx='idx', column='c1') - mock_engine.execute.assert_not_called() + mock_engine.assert_not_called() diff --git a/tests/db_models/utils/test_db_models_utils_delete_index.py b/tests/db_models/utils/test_db_models_utils_delete_index.py index 9eb84ee..c6830a1 100644 --- a/tests/db_models/utils/test_db_models_utils_delete_index.py +++ b/tests/db_models/utils/test_db_models_utils_delete_index.py @@ -5,18 +5,18 @@ def test_db_models_utils_delete_index_can_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + 
mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_index_exists', Mock(return_value=True)) delete_index(table_name='tbl1', idx='idx') sql = "drop index idx" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) def test_db_models_utils_delete_index_can_not_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_index_exists', Mock(return_value=False)) delete_index(table_name='tbl1', idx='idx') - mock_engine.execute.assert_not_called() + mock_engine.assert_not_called() diff --git a/tests/db_models/utils/test_db_models_utils_delete_pk.py b/tests/db_models/utils/test_db_models_utils_delete_pk.py index 5978388..47c7ef7 100644 --- a/tests/db_models/utils/test_db_models_utils_delete_pk.py +++ b/tests/db_models/utils/test_db_models_utils_delete_pk.py @@ -5,18 +5,18 @@ def test_db_models_utils_delete_pk_can_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_pk_exists', Mock(return_value=True)) delete_pk(table_name='tbl1', pk='pk1') sql = "alter table tbl1 drop constraint pk1" - mock_engine.execute.assert_called_with(sql) + mock_engine.assert_called_with(sql) def test_db_models_utils_delete_pk_can_not_delete(mocker): mock_engine = Mock() - mocker.patch('src.db_models.utils.settings.ENGINE', mock_engine) + mocker.patch('src.db_models.utils.execute_sql_cmd', mock_engine) mocker.patch('src.db_models.utils.check_pk_exists', Mock(return_value=False)) delete_pk(table_name='tbl1', pk='pk1') - mock_engine.execute.assert_not_called() + mock_engine.assert_not_called() diff --git a/tests/engine/test_company_tax_regime.py b/tests/engine/test_company_tax_regime.py index 441cd45..7f8fed2 100644 
--- a/tests/engine/test_company_tax_regime.py +++ b/tests/engine/test_company_tax_regime.py @@ -5,18 +5,24 @@ from src.engine.company_tax_regime import CompanyTaxRegime from .fixtures import mock_load_dicts_code_to_name +# ref_year = Column('ref_year', String, primary_key=True) +# cnpj = Column('cnpj', String, primary_key=True, index=True) +# cnpj_scp = Column('cnpj_scp', String) +# tax_regime = Column('tax_regime', String, primary_key=True) +# amount_of_bookkeeping = Column('amount_of_bookkeeping', Float) + data_mock = [ - ['2020', '00.055.699/0001-97', 'LUCRO ARBITRADO', 'GOIANIA', 'GO'], - ['2020', '00.091.639/0001-20', 'LUCRO PRESUMIDO', 'GOIANIA', 'GO'], - ['2020', '00.198.451/0001-85', 'LUCRO PRESUMIDO', 'JUAZEIRO DO NORTE', 'CE'], - ['2020', '00.287.036/0001-06', 'LUCRO REAL', 'VERANOPOLIS', 'RS'], - ['2020', '00.360.051/0001-24', 'LUCRO ARBITRADO', 'EMBU DAS ARTES', 'SP'], - ['2020', '00.393.163/0001-81', 'IMUNE DO IRPJ', 'FORTALEZA', 'CE'], - ['2020', '00.429.957/0001-58', 'LUCRO ARBITRADO', 'UMUARAMA', 'PR'], - ['2020', '00.441.228/0001-17', 'IMUNE DO IRPJ', 'FORTALEZA', 'CE'], + ['2020', '00.055.699/0001-97', '0', 'LUCRO ARBITRADO', 1], + ['2020', '00.091.639/0001-20', '0', 'LUCRO PRESUMIDO', 1], + ['2020', '00.198.451/0001-85', '0', 'LUCRO PRESUMIDO', 1], + ['2020', '00.287.036/0001-06', '0', 'LUCRO REAL', 1], + ['2020', '00.360.051/0001-24', '0', 'LUCRO ARBITRADO', 1], + ['2020', '00.393.163/0001-81', '0', 'IMUNE DO IRPJ', 0], + ['2020', '00.429.957/0001-58', '0', 'LUCRO ARBITRADO', 0], + ['2020', '00.441.228/0001-17', None, 'IMUNE DO IRPJ', 0], ] -columns_csv = ['ref_year', 'cnpj', 'tax_regime', 'city_name', 'fu'] +columns_csv = ['ref_year', 'cnpj', 'cnpj_scp', 'tax_regime', 'amount_of_bookkeeping'] def test_engine_company_tax_regime_parse_file(mocker): @@ -34,14 +40,14 @@ def mock_data(sep, encoding, header, dtype, engine, memory_map, filepath_or_buff mocker.patch('src.engine.company_tax_regime.pd.read_csv', mock_data) data_expected = [ - ['2020', 
'00055699000197', 'LUCRO ARBITRADO', 'GOIANIA', 'GO', '00055699'], - ['2020', '00091639000120', 'LUCRO PRESUMIDO', 'GOIANIA', 'GO', '00091639'], - ['2020', '00198451000185', 'LUCRO PRESUMIDO', 'JUAZEIRO DO NORTE', 'CE', '00198451'], - ['2020', '00287036000106', 'LUCRO REAL', 'VERANOPOLIS', 'RS', '00287036'], - ['2020', '00360051000124', 'LUCRO ARBITRADO', 'EMBU DAS ARTES', 'SP', '00360051'], - ['2020', '00393163000181', 'IMUNE DO IRPJ', 'FORTALEZA', 'CE', '00393163'], - ['2020', '00429957000158', 'LUCRO ARBITRADO', 'UMUARAMA', 'PR', '00429957'], - ['2020', '00441228000117', 'IMUNE DO IRPJ', 'FORTALEZA', 'CE', '00441228'], + ['2020', '00055699000197', '0', 'LUCRO ARBITRADO', 1, '00055699'], + ['2020', '00091639000120', '0', 'LUCRO PRESUMIDO', 1, '00091639'], + ['2020', '00198451000185', '0', 'LUCRO PRESUMIDO', 1, '00198451'], + ['2020', '00287036000106', '0', 'LUCRO REAL', 1, '00287036'], + ['2020', '00360051000124', '0', 'LUCRO ARBITRADO', 1, '00360051'], + ['2020', '00393163000181', '0', 'IMUNE DO IRPJ', 0, '00393163'], + ['2020', '00429957000158', '0', 'LUCRO ARBITRADO', 0, '00429957'], + ['2020', '00441228000117', None, 'IMUNE DO IRPJ', 0, '00441228'], ] df_expected = pandas.DataFrame(data=data_expected, columns=columns_csv + ["cnpj_root"]) diff --git a/tests/fixtures/municipios.json b/tests/fixtures/municipios.json index 0e17d55..fa1d28d 100644 --- a/tests/fixtures/municipios.json +++ b/tests/fixtures/municipios.json @@ -1759,7 +1759,7 @@ "2325": "BARRA DE GUABIRABA", "2327": "BARREIROS", "2329": "BELEM DE MARIA", - "2331": "BELEM DE SAO FRANCISCO", + "2331": "BELEM DO SAO FRANCISCO", "2333": "BELO JARDIM", "2335": "BETANIA", "2337": "BEZERROS", @@ -1812,7 +1812,7 @@ "2431": "IBIMIRIM", "2433": "IBIRAJUBA", "2435": "IGARASSU", - "2437": "IGUARACI", + "2437": "IGUARACY", "2439": "INAJA", "2441": "INGAZEIRA", "2443": "IPOJUCA", @@ -1828,7 +1828,7 @@ "2463": "JOAQUIM NABUCO", "2465": "JUPI", "2467": "JUREMA", - "2469": "LAGOA DO ITAENGA", + "2469": "LAGOA DE 
ITAENGA", "2471": "LAGOA DO OURO", "2473": "LAGOA DOS GATOS", "2475": "LAJEDO", diff --git a/tests/io/test_get_files_list.py b/tests/io/test_get_files_list.py index ea68828..80f01a3 100644 --- a/tests/io/test_get_files_list.py +++ b/tests/io/test_get_files_list.py @@ -65,7 +65,7 @@ def test_get_files_dict_tax_regime(fixture_get_files_dict): dict_files = fixture_get_files_dict dict_files_target = dict_files['TAX_REGIME'] - assert len(dict_files_target.keys()) == 1 + assert len(dict_files_target.keys()) == 4 def test_get_last_ref_date_mock_empresas(mocker):