From 9d4b6c57720eb93628c08b95e3b3a21eb138295f Mon Sep 17 00:00:00 2001 From: Robson Date: Mon, 21 Oct 2024 15:10:18 -0300 Subject: [PATCH] feat: fix get data from receita federal --- Makefile | 58 ++++++++++++++++++------------------- src/io/get_files_dict.py | 6 ++-- src/io/get_last_ref_date.py | 20 +++++++------ 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/Makefile b/Makefile index 71e46aa..cff7630 100644 --- a/Makefile +++ b/Makefile @@ -7,9 +7,9 @@ all: @echo "" @echo "########################################################################################################################" @echo "[SESSION] Launch" - @echo "make up ......................................... docker-compose up -d" - @echo "make stop ....................................... docker-compose stop" - @echo "make down ....................................... docker-compose down" + @echo "make up ......................................... docker compose up -d" + @echo "make stop ....................................... docker compose stop" + @echo "make down ....................................... docker compose down" @echo "make app ........................................ run container app" @echo "make rm ......................................... remove all exited containers and all dangling volumes" @echo "" @@ -47,25 +47,25 @@ build-img: up: @echo "---------------------------------------" - @echo "docker-compose up -d" - @docker-compose up -d + @echo "docker compose up -d" + @docker compose up -d @echo "" stop: @echo "---------------------------------------" - @echo "docker-compose stop" - @docker-compose stop + @echo "docker compose stop" + @docker compose stop @echo "" down: @echo "---------------------------------------" - @echo "docker-compose down" - @docker-compose down + @echo "docker compose down" + @docker compose down @echo "" app: up @echo "compose-up run app container" - @docker-compose run --rm app + @docker compose run --rm app @echo "" rm: down @@ -82,24 +82,24 @@ rm: down db-create: up @echo "PHOENIX" - @docker-compose run app python -c "from src.db_models.utils import create_db; create_db()" + @docker compose run app python -c "from src.db_models.utils import create_db; create_db()" @echo "" db-create-tables: up @echo "Creating tables" - @docker-compose run app python -c "from src.db_models.utils import create_or_drop_all_tables; create_or_drop_all_tables(cmd='create')" + @docker compose run app python -c "from src.db_models.utils import create_or_drop_all_tables; create_or_drop_all_tables(cmd='create')" @echo "" db-setup: up @echo "SETUP" @echo "sleeping 40 seconds in order to postgres start-up" @echo "Creating db" - @docker-compose run app python -c "from src.db_models.utils import create_db, create_or_drop_all_tables; create_db();create_or_drop_all_tables(cmd='create')" + @docker compose run app python -c "from src.db_models.utils import create_db, create_or_drop_all_tables; create_db();create_or_drop_all_tables(cmd='create')" @echo "" db-phoenix: up @echo "PHOENIX" - @docker-compose run app python -c "from src.db_models.utils import phoenix; phoenix()" + @docker compose run app python -c "from src.db_models.utils import phoenix; phoenix()" @echo "" db-enter: up @@ -107,29 +107,29 @@ db-enter: up tests: up @echo "compose-up run app & [PYTEST]" - @docker-compose run app python -m pytest + @docker compose run app python -m pytest @echo "" io-download: up @echo "compose-up run app container & [DOWNLOAD]" - @docker-compose run app python src/io/download.py + @docker compose run app python src/io/download.py @echo "" io-unzip: up @echo "compose-up run app container & [UNZIP]" - @docker-compose run app python src/io/unzip.py + @docker compose run app python src/io/unzip.py @echo "" @echo "[CREATE JSONS]" - @docker-compose run app python src/io/create_jsons_from_csv.py + @docker compose run app python src/io/create_jsons_from_csv.py io-create-jsons: up @echo "[CREATE JSONS]" - @docker-compose run app python src/io/create_jsons_from_csv.py + @docker compose run app python src/io/create_jsons_from_csv.py io-download-and-unzip: up @echo "compose-up run app container & [DOWNLOAD]" - @docker-compose run app python src/io/download.py + @docker compose run app python src/io/download.py @echo "" @echo "------------------------" @echo "sleep for 30 seconds to take a breath" @@ -137,42 +137,42 @@ io-download-and-unzip: up @echo "" @echo "------------------------" @echo "[UNZIP]" - @docker-compose run app python src/io/unzip.py + @docker compose run app python src/io/unzip.py @echo "" @echo "[CREATE JSONS]" - @docker-compose run app python src/io/create_jsons_from_csv.py + @docker compose run app python src/io/create_jsons_from_csv.py engine-company: up @echo "compose-up run app container & [ENGINE COMPANY]" - @docker-compose run app python src/engine/company.py + @docker compose run app python src/engine/company.py @echo "" engine-company-tax-regime: up @echo "compose-up run app container & [ENGINE COMPANY TAX REGIME]" - @docker-compose run app python src/engine/company_tax_regime.py + @docker compose run app python src/engine/company_tax_regime.py @echo "" engine-company-root: up @echo "compose-up run app container & [ENGINE COMPANY ROOT]" - @docker-compose run app python src/engine/company_root.py + @docker compose run app python src/engine/company_root.py @echo "" engine-company-root-simples: up @echo "compose-up run app container & [ENGINE COMPANY ROOT SIMPLES]" - @docker-compose run app python src/engine/company_root_simples.py + @docker compose run app python src/engine/company_root_simples.py @echo "" engine-partners: up @echo "compose-up run app container & [ENGINE PARTNERS]" - @docker-compose run app python src/engine/partners.py + @docker compose run app python src/engine/partners.py @echo "" engine-ref-date: up @echo "compose-up run app container & [ENGINE REF DATE]" - @docker-compose run app python src/engine/ref_date.py + @docker compose run app python src/engine/ref_date.py @echo "" engine-main: up @echo "compose-up run app container & engine main" - @docker-compose run app python src/engine/main.py + @docker compose run app python src/engine/main.py @echo "" diff --git a/src/io/get_files_dict.py b/src/io/get_files_dict.py index 63a38f0..6ca7054 100644 --- a/src/io/get_files_dict.py +++ b/src/io/get_files_dict.py @@ -19,7 +19,9 @@ def main(): ref_date = get_last_ref_date() # get page content - page = requests.get(CORE_URL_FILES, headers=HEADERS) + _folder_open_date = 'dados_abertos_cnpj' + CORE_URL = f'{CORE_URL_FILES}/{_folder_open_date}/{ref_date}' + page = requests.get(CORE_URL, headers=HEADERS) # BeautifulSoup object soup = BeautifulSoup(page.text, 'html.parser') @@ -52,7 +54,7 @@ def main(): dict_core = {file_name: {'last_modified': last_modified, 'file_size_bytes': file_size_bytes, - 'link_to_download': f"{CORE_URL_FILES}/{file_name}", + 'link_to_download': f"{CORE_URL}/{file_name}", 'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)} } if 'Socios' in file_name: diff --git a/src/io/get_last_ref_date.py b/src/io/get_last_ref_date.py index 4c4409f..86cf931 100644 --- a/src/io/get_last_ref_date.py +++ b/src/io/get_last_ref_date.py @@ -14,7 +14,8 @@ def main(): :return: dict with urls from files as well as last modified date and size in bytes """ # get page content - page = requests.get(CORE_URL_FILES, headers=HEADERS) + _folder_open_date = 'dados_abertos_cnpj' + page = requests.get(f'{CORE_URL_FILES}/{_folder_open_date}', headers=HEADERS) # BeautifulSoup object soup = BeautifulSoup(page.text, 'html.parser') @@ -26,16 +27,17 @@ def main(): print('creating dict files url') for row in rows: if row.find_all('td'): - if row.find_all('td')[1].find('a')['href'].endswith('.zip'): + if row.find_all('td')[1].find('a')['href']: # get last modified time and parse to date (ex: '2021-07-19') - list_last_modified_at.append( - datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime( - '%Y-%m-%d')) - + try: + list_last_modified_at.append(row.find_all('td')[1].find('a')['href'].replace('/', '')) + except ValueError as e: + print('not a date: ', e) # get the most common on 'last_modified' from source - ref_date, occurences = Counter(list_last_modified_at).most_common(1)[0] - print( - f"ref date will be: '{ref_date}' with {occurences} out of {len(list_last_modified_at)} ({occurences / len(list_last_modified_at):.1%}) ") + list_last_modified_at.remove('CNPJ') + ref_date = max(list_last_modified_at) + print('last updated date is ', ref_date) + return ref_date