Merge pull request #28 from libercapital/fix_get_data
Issue #27: Fix get data
Robso-creator authored Nov 22, 2024
2 parents aa28a42 + b93f2a0 commit 8f71a26
Showing 8 changed files with 68 additions and 235 deletions.
11 changes: 8 additions & 3 deletions .github/workflows/tests.yaml
@@ -11,12 +11,17 @@ env:

 jobs:
   run_tests:

     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@5a4ac9002d0be2fb38bd78e4b4dbde5606d7042f
-        name: clone this repo
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        id: checkout
+
+      - name: Set up Docker Compose
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y docker-compose
       - name: build img
         run: make build-img
2 changes: 1 addition & 1 deletion Dockerfile
@@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements/requirements.txt \
     -name '*.jpeg' -name '*.js.map' -name '*.pyc' -name '*.c' -name '*.pxc' \
     -name '*.pyc' -delete \
     && find /usr/local/lib/python3.8 -name '__pycache__' | xargs rm -r
-ENV LANG C.UTF-8
+ENV LANG=C.UTF-8

 COPY src/ src/
 COPY tests/ tests/
2 changes: 1 addition & 1 deletion src/io/__init__.py
@@ -1,5 +1,5 @@
 CORE_URL = "https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/dados-publicos-cnpj"
-CORE_URL_FILES = "http://200.152.38.155/CNPJ"
+CORE_URL_FILES = "https://arquivos.receitafederal.gov.br/dados/cnpj"
 CNAE_JSON_NAME = 'cnaes.json'
 NATJU_JSON_NAME = 'natju.json'
 QUAL_SOCIO_JSON_NAME = 'qual_socio.json'
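
The hard-coded IP mirror gives way to the Receita Federal archive host. A minimal sketch (not part of the commit) of how the new base URL is composed downstream, mirroring the f-strings added in get_files_dict.py below; the ref_date value here is hypothetical:

CORE_URL_FILES = 'https://arquivos.receitafederal.gov.br/dados/cnpj'

ref_date = '2024-11'  # hypothetical; resolved at runtime by get_last_ref_date()
listing_url = f'{CORE_URL_FILES}/dados_abertos_cnpj/{ref_date}'
print(listing_url)
# https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2024-11
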
73 changes: 46 additions & 27 deletions src/io/get_files_dict.py
@@ -3,10 +3,11 @@

 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry

-from src import SRC_PATH, DATA_FOLDER
-from src.io import CORE_URL_FILES, HEADERS
+from src import DATA_FOLDER
+from src import SRC_PATH
+from src.io import CORE_URL_FILES
+from src.io import HEADERS
 from src.io.get_last_ref_date import main as get_last_ref_date


@@ -19,18 +20,22 @@ def main():
     ref_date = get_last_ref_date()

     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    CORE_URL = f'{CORE_URL_FILES}/{_folder_open_date}/{ref_date}'
+    page = requests.get(CORE_URL, headers=HEADERS)

     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')

     table = soup.find('table')
     rows = table.find_all('tr')
-    dict_files_url = {'SOCIOS': {},
-                      'EMPRESAS': {},
-                      'ESTABELECIMENTOS': {},
-                      'TAX_REGIME': {},
-                      'TABELAS': {}}
+    dict_files_url = {
+        'SOCIOS': {},
+        'EMPRESAS': {},
+        'ESTABELECIMENTOS': {},
+        'TAX_REGIME': {},
+        'TABELAS': {},
+    }

     print('creating dict files url')
     for row in rows:
@@ -40,21 +45,25 @@
             file_name = row.find_all('td')[1].find('a')['href']
             # get last modified time and parse to date (ex: '2021-07-19')
             last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                '%Y-%m-%d')
+                '%Y-%m-%d',
+            )
             # get size file_name
             file_size = row.find_all('td')[3].text.strip()
             if 'K' in file_size:
-                file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                file_size_bytes = float(file_size.replace('K', '')) * 2**10
             elif 'M' in file_size:
-                file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                file_size_bytes = float(file_size.replace('M', '')) * 2**20
             else:
                 file_size_bytes = 0

-            dict_core = {file_name: {'last_modified': last_modified,
-                                     'file_size_bytes': file_size_bytes,
-                                     'link_to_download': f"{CORE_URL_FILES}/{file_name}",
-                                     'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)}
-                         }
+            dict_core = {
+                file_name: {
+                    'last_modified': last_modified,
+                    'file_size_bytes': file_size_bytes,
+                    'link_to_download': f"{CORE_URL}/{file_name}",
+                    'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name),
+                },
+            }
             if 'Socios' in file_name:
                 dict_files_url['SOCIOS'].update(dict_core)
             elif 'Empresas' in file_name:
@@ -79,21 +88,31 @@
             file_name = row.find_all('td')[1].find('a')['href']
             # get last modified time and parse to date (ex: '2021-07-19')
             last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                '%Y-%m-%d')
+                '%Y-%m-%d',
+            )
             # get size file_name
             file_size = row.find_all('td')[3].text.strip()
             if 'K' in file_size:
-                file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                file_size_bytes = float(file_size.replace('K', '')) * 2**10
             elif 'M' in file_size:
-                file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                file_size_bytes = float(file_size.replace('M', '')) * 2**20
             else:
                 file_size_bytes = 0
-            dict_files_url['TAX_REGIME'].update({file_name: {'last_modified': last_modified,
-                                                             'file_size_bytes': file_size_bytes,
-                                                             'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
-                                                             'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER,
-                                                                                            ref_date, file_name)}
-                                                 })
+            dict_files_url['TAX_REGIME'].update(
+                {
+                    file_name: {
+                        'last_modified': last_modified,
+                        'file_size_bytes': file_size_bytes,
+                        'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
+                        'path_save_file': os.path.join(
+                            SRC_PATH,
+                            DATA_FOLDER,
+                            ref_date,
+                            file_name,
+                        ),
+                    },
+                },
+            )

     print('Done')

@@ -102,4 +121,4 @@

 if __name__ == '__main__':
     dict_files_url = main()
-    print(dict_files_url)
\ No newline at end of file
+    print(dict_files_url)
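
For reference, a hypothetical sketch (not part of the commit) of the structure main() now returns. The file name, date, size, and local path are invented for illustration; a '1.2M' size cell would parse to float('1.2') * 2**20 bytes per the branch above:

dict_files_url = {
    'SOCIOS': {},
    'EMPRESAS': {
        'Empresas0.zip': {
            'last_modified': '2024-11-14',
            'file_size_bytes': 1.2 * 2**20,  # from a hypothetical '1.2M' listing cell
            'link_to_download': 'https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2024-11/Empresas0.zip',
            'path_save_file': 'src/data/2024-11/Empresas0.zip',  # os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name); values hypothetical
        },
    },
    'ESTABELECIMENTOS': {},
    'TAX_REGIME': {},
    'TABELAS': {},
}
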
26 changes: 10 additions & 16 deletions src/io/get_last_ref_date.py
@@ -1,11 +1,8 @@
-from collections import Counter
-from datetime import datetime
-
 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry

-from src.io import CORE_URL_FILES, HEADERS
+from src.io import CORE_URL_FILES
+from src.io import HEADERS


 def main():
@@ -14,30 +11,27 @@ def main():
     :return: dict with urls from files as well as last modified date and size in bytes
     """
     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    page = requests.get(f'{CORE_URL_FILES}/{_folder_open_date}', headers=HEADERS)

     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')

     table = soup.find('table')
     rows = table.find_all('tr')
     list_last_modified_at = []

     print('creating dict files url')
     for row in rows:
         if row.find_all('td'):
-            if row.find_all('td')[1].find('a')['href'].endswith('.zip'):
+            if row.find_all('td')[1].find('a')['href'].replace('-', '').replace('/', '').isdigit():
                 # get last modified time and parse to date (ex: '2021-07-19')
-                list_last_modified_at.append(
-                    datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                        '%Y-%m-%d'))
-
+                list_last_modified_at.append(row.find_all('td')[1].find('a')['href'].replace('/', ''))
     # get the most common on 'last_modified' from source
-    ref_date, occurences = Counter(list_last_modified_at).most_common(1)[0]
-    print(
-        f"ref date will be: '{ref_date}' with {occurences} out of {len(list_last_modified_at)} ({occurences / len(list_last_modified_at):.1%}) ")
+    ref_date = max(list_last_modified_at)
+    print('last updated date is ', ref_date)

     return ref_date


 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
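
The listing under dados_abertos_cnpj now exposes one date-named subfolder per monthly drop, so the fix stops voting on the most common 'last modified' timestamp of .zip rows and instead collects folder names that are purely digits once '-' and '/' are stripped, then takes the maximum. A minimal sketch (not part of the commit), with hypothetical hrefs:

hrefs = ['../', '2024-09/', '2024-10/', '2024-11/', 'regime_tributario/']

candidates = [
    h.replace('/', '')                                # '2024-11/' -> '2024-11'
    for h in hrefs
    if h.replace('-', '').replace('/', '').isdigit()  # keep only date-named folders
]
ref_date = max(candidates)
print(ref_date)  # 2024-11

Lexicographic max coincides with chronological max here only because the folder names are zero-padded.
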
4 changes: 2 additions & 2 deletions tests/fixtures/municipios.json
@@ -3530,7 +3530,7 @@
"5869": "NOVA IGUACU",
"5871": "PARACAMBI",
"5873": "PARAIBA DO SUL",
"5875": "PARATY",
"5875": "PARATI",
"5877": "PETROPOLIS",
"5879": "PIRAI",
"5881": "PORCIUNCULA",
@@ -5207,7 +5207,7 @@
"9263": "BARRO ALTO",
"9265": "BELA VISTA DE GOIAS",
"9267": "BOM JARDIM DE GOIAS",
"9269": "BOM JESUS",
"9269": "BOM JESUS DE GOIAS",
"9271": "BRAZABRANTES",
"9273": "BREJINHO DE NAZARE",
"9275": "BRITANIA",
143 changes: 0 additions & 143 deletions tests/io/test_get_files_list.py

This file was deleted.

(diff for one more changed file not shown)
