Merge pull request #28 from libercapital/fix_get_data
Issue #27: Fix get data
Robso-creator authored Nov 22, 2024
2 parents aa28a42 + b93f2a0 commit 8f71a26
Showing 8 changed files with 68 additions and 235 deletions.
11 changes: 8 additions & 3 deletions .github/workflows/tests.yaml
@@ -11,12 +11,17 @@ env:

 jobs:
   run_tests:

     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@5a4ac9002d0be2fb38bd78e4b4dbde5606d7042f
-        name: clone this repo
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        id: checkout
+
+      - name: Set up Docker Compose
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y docker-compose
       - name: build img
         run: make build-img
2 changes: 1 addition & 1 deletion Dockerfile
@@ -20,7 +20,7 @@ RUN pip install --user --no-cache-dir -r requirements/requirements.txt \
     -name '*.jpeg' -name '*.js.map' -name '*.pyc' -name '*.c' -name '*.pxc' \
     -name '*.pyc' -delete \
     && find /usr/local/lib/python3.8 -name '__pycache__' | xargs rm -r
-ENV LANG C.UTF-8
+ENV LANG=C.UTF-8

 COPY src/ src/
 COPY tests/ tests/
2 changes: 1 addition & 1 deletion src/io/__init__.py
@@ -1,5 +1,5 @@
 CORE_URL = "https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/dados-publicos-cnpj"
-CORE_URL_FILES = "http://200.152.38.155/CNPJ"
+CORE_URL_FILES = "https://arquivos.receitafederal.gov.br/dados/cnpj"
 CNAE_JSON_NAME = 'cnaes.json'
 NATJU_JSON_NAME = 'natju.json'
 QUAL_SOCIO_JSON_NAME = 'qual_socio.json'
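
The hard-coded IP mirror gives way to the Receita Federal archive host. A minimal sketch (not part of the commit) of how the new base URL is composed downstream, mirroring the f-strings added in get_files_dict.py below; the ref_date value here is hypothetical:

CORE_URL_FILES = 'https://arquivos.receitafederal.gov.br/dados/cnpj'

ref_date = '2024-11'  # hypothetical; resolved at runtime by get_last_ref_date()
listing_url = f'{CORE_URL_FILES}/dados_abertos_cnpj/{ref_date}'
print(listing_url)
# https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2024-11
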
73 changes: 46 additions & 27 deletions src/io/get_files_dict.py
@@ -3,10 +3,11 @@

 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry

-from src import SRC_PATH, DATA_FOLDER
-from src.io import CORE_URL_FILES, HEADERS
+from src import DATA_FOLDER
+from src import SRC_PATH
+from src.io import CORE_URL_FILES
+from src.io import HEADERS
 from src.io.get_last_ref_date import main as get_last_ref_date


@@ -19,18 +20,22 @@ def main():
     ref_date = get_last_ref_date()

     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    CORE_URL = f'{CORE_URL_FILES}/{_folder_open_date}/{ref_date}'
+    page = requests.get(CORE_URL, headers=HEADERS)

     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')

     table = soup.find('table')
     rows = table.find_all('tr')
-    dict_files_url = {'SOCIOS': {},
-                      'EMPRESAS': {},
-                      'ESTABELECIMENTOS': {},
-                      'TAX_REGIME': {},
-                      'TABELAS': {}}
+    dict_files_url = {
+        'SOCIOS': {},
+        'EMPRESAS': {},
+        'ESTABELECIMENTOS': {},
+        'TAX_REGIME': {},
+        'TABELAS': {},
+    }

     print('creating dict files url')
     for row in rows:
@@ -40,21 +45,25 @@
             file_name = row.find_all('td')[1].find('a')['href']
             # get last modified time and parse to date (ex: '2021-07-19')
             last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                '%Y-%m-%d')
+                '%Y-%m-%d',
+            )
             # get size file_name
             file_size = row.find_all('td')[3].text.strip()
             if 'K' in file_size:
-                file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                file_size_bytes = float(file_size.replace('K', '')) * 2**10
             elif 'M' in file_size:
-                file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                file_size_bytes = float(file_size.replace('M', '')) * 2**20
             else:
                 file_size_bytes = 0

-            dict_core = {file_name: {'last_modified': last_modified,
-                                     'file_size_bytes': file_size_bytes,
-                                     'link_to_download': f"{CORE_URL_FILES}/{file_name}",
-                                     'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name)}
-                         }
+            dict_core = {
+                file_name: {
+                    'last_modified': last_modified,
+                    'file_size_bytes': file_size_bytes,
+                    'link_to_download': f"{CORE_URL}/{file_name}",
+                    'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name),
+                },
+            }
             if 'Socios' in file_name:
                 dict_files_url['SOCIOS'].update(dict_core)
             elif 'Empresas' in file_name:
@@ -79,21 +88,31 @@
             file_name = row.find_all('td')[1].find('a')['href']
             # get last modified time and parse to date (ex: '2021-07-19')
             last_modified = datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                '%Y-%m-%d')
+                '%Y-%m-%d',
+            )
             # get size file_name
             file_size = row.find_all('td')[3].text.strip()
             if 'K' in file_size:
-                file_size_bytes = float(file_size.replace('K', '')) * 2 ** 10
+                file_size_bytes = float(file_size.replace('K', '')) * 2**10
             elif 'M' in file_size:
-                file_size_bytes = float(file_size.replace('M', '')) * 2 ** 20
+                file_size_bytes = float(file_size.replace('M', '')) * 2**20
             else:
                 file_size_bytes = 0
-            dict_files_url['TAX_REGIME'].update({file_name: {'last_modified': last_modified,
-                                                             'file_size_bytes': file_size_bytes,
-                                                             'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
-                                                             'path_save_file': os.path.join(SRC_PATH, DATA_FOLDER,
-                                                                                            ref_date, file_name)}
-                                                 })
+            dict_files_url['TAX_REGIME'].update(
+                {
+                    file_name: {
+                        'last_modified': last_modified,
+                        'file_size_bytes': file_size_bytes,
+                        'link_to_download': f"{CORE_URL_FILES}/{_folder_tax_regime}/{file_name}",
+                        'path_save_file': os.path.join(
+                            SRC_PATH,
+                            DATA_FOLDER,
+                            ref_date,
+                            file_name,
+                        ),
+                    },
+                },
+            )

     print('Done')

@@ -102,4 +121,4 @@

 if __name__ == '__main__':
     dict_files_url = main()
-    print(dict_files_url)
\ No newline at end of file
+    print(dict_files_url)
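
For reference, a hypothetical sketch (not part of the commit) of the structure main() now returns. The file name, date, size, and local path are invented for illustration; a '1.2M' size cell would parse to float('1.2') * 2**20 bytes per the branch above:

dict_files_url = {
    'SOCIOS': {},
    'EMPRESAS': {
        'Empresas0.zip': {
            'last_modified': '2024-11-14',
            'file_size_bytes': 1.2 * 2**20,  # from a hypothetical '1.2M' listing cell
            'link_to_download': 'https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2024-11/Empresas0.zip',
            'path_save_file': 'src/data/2024-11/Empresas0.zip',  # os.path.join(SRC_PATH, DATA_FOLDER, ref_date, file_name); values hypothetical
        },
    },
    'ESTABELECIMENTOS': {},
    'TAX_REGIME': {},
    'TABELAS': {},
}
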
26 changes: 10 additions & 16 deletions src/io/get_last_ref_date.py
@@ -1,11 +1,8 @@
-from collections import Counter
-from datetime import datetime
-
 import requests
 from bs4 import BeautifulSoup
-from requests.adapters import HTTPAdapter, Retry

-from src.io import CORE_URL_FILES, HEADERS
+from src.io import CORE_URL_FILES
+from src.io import HEADERS


 def main():
@@ -14,30 +11,27 @@ def main():
     :return: dict with urls from files as well as last modified date and size in bytes
     """
     # get page content
-    page = requests.get(CORE_URL_FILES, headers=HEADERS)
+    _folder_open_date = 'dados_abertos_cnpj'
+    page = requests.get(f'{CORE_URL_FILES}/{_folder_open_date}', headers=HEADERS)

     # BeautifulSoup object
     soup = BeautifulSoup(page.text, 'html.parser')

     table = soup.find('table')
     rows = table.find_all('tr')
     list_last_modified_at = []

     print('creating dict files url')
     for row in rows:
         if row.find_all('td'):
-            if row.find_all('td')[1].find('a')['href'].endswith('.zip'):
+            if row.find_all('td')[1].find('a')['href'].replace('-', '').replace('/', '').isdigit():
                 # get last modified time and parse to date (ex: '2021-07-19')
-                list_last_modified_at.append(
-                    datetime.strptime(row.find_all('td')[2].text.strip(), '%Y-%m-%d %H:%M').strftime(
-                        '%Y-%m-%d'))
-
+                list_last_modified_at.append(row.find_all('td')[1].find('a')['href'].replace('/', ''))
     # get the most common on 'last_modified' from source
-    ref_date, occurences = Counter(list_last_modified_at).most_common(1)[0]
-    print(
-        f"ref date will be: '{ref_date}' with {occurences} out of {len(list_last_modified_at)} ({occurences / len(list_last_modified_at):.1%}) ")
+    ref_date = max(list_last_modified_at)
+    print('last updated date is ', ref_date)

     return ref_date


 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
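
The listing under dados_abertos_cnpj now exposes one date-named subfolder per monthly drop, so the fix stops voting on the most common 'last modified' timestamp of .zip rows and instead collects folder names that are purely digits once '-' and '/' are stripped, then takes the maximum. A minimal sketch (not part of the commit), with hypothetical hrefs:

hrefs = ['../', '2024-09/', '2024-10/', '2024-11/', 'regime_tributario/']

candidates = [
    h.replace('/', '')                                # '2024-11/' -> '2024-11'
    for h in hrefs
    if h.replace('-', '').replace('/', '').isdigit()  # keep only date-named folders
]
ref_date = max(candidates)
print(ref_date)  # 2024-11

Lexicographic max coincides with chronological max here only because the folder names are zero-padded.
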
4 changes: 2 additions & 2 deletions tests/fixtures/municipios.json
@@ -3530,7 +3530,7 @@
"5869": "NOVA IGUACU",
"5871": "PARACAMBI",
"5873": "PARAIBA DO SUL",
"5875": "PARATY",
"5875": "PARATI",
"5877": "PETROPOLIS",
"5879": "PIRAI",
"5881": "PORCIUNCULA",
@@ -5207,7 +5207,7 @@
"9263": "BARRO ALTO",
"9265": "BELA VISTA DE GOIAS",
"9267": "BOM JARDIM DE GOIAS",
"9269": "BOM JESUS",
"9269": "BOM JESUS DE GOIAS",
"9271": "BRAZABRANTES",
"9273": "BREJINHO DE NAZARE",
"9275": "BRITANIA",
143 changes: 0 additions & 143 deletions tests/io/test_get_files_list.py

This file was deleted.

(diff for one more changed file not shown)
