Skip to content

Commit

Permalink
Updates download for Federal Revenue files
Browse files Browse the repository at this point in the history
Fix #234
  • Loading branch information
cuducos committed Aug 17, 2024
1 parent 8681a53 commit 6253d41
Show file tree
Hide file tree
Showing 17 changed files with 278 additions and 145 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gofmt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: "1.21"
go-version: "1.22"
- run: if [ "$(gofmt -s -l . | wc -l)" -gt 0 ]; then exit 1; fi
4 changes: 2 additions & 2 deletions .github/workflows/golint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: "1.21.x"
- run: "go install honnef.co/go/tools/cmd/staticcheck@2023.1.6"
go-version: "1.22.x"
- run: "go install honnef.co/go/tools/cmd/staticcheck@v0.5.1"
- run: "staticcheck ./..."
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest]
go: [1.20.x, 1.21.x]
go: [1.21.x, 1.22.x]

runs-on: ${{ matrix.os }}

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.21-bookworm AS build
FROM golang:1.22-bookworm AS build
WORKDIR /minha-receita
COPY go.mod .
COPY go.sum .
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ services:
- ./data:/mnt/data

postgres:
image: postgres:14-alpine
image: postgres:16.1-bookworm
ports:
- 5432:5432
volumes:
Expand All @@ -26,7 +26,7 @@ services:
retries: 5

postgres_test:
image: postgres:14-alpine
image: postgres:16.1-bookworm
ports:
- 5555:5432
environment: *credentials
Expand Down
2 changes: 1 addition & 1 deletion docs/instalacao.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ $ docker pull ghcr.io/cuducos/minha-receita:main

#### A partir do código fonte

* [Go](https://golang.org/) versão 1.21
* [Go](https://golang.org/) versão 1.22

Depois de clonar o repositório, baixe as dependências e compile a aplicação para um diretório incluído no `PATH`, por exemplo:

Expand Down
13 changes: 8 additions & 5 deletions download/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ import (
"time"
)

type getURLsHandler func(url, dir string) ([]string, error)
type getURLsHandler func(url string) ([]string, error)

func getURLs(url string, handler getURLsHandler, dir string, skip bool) ([]string, error) {
urls, err := handler(url, dir)
urls, err := handler(url)
if err != nil {
return nil, fmt.Errorf("error getting urls: %w", err)
}
Expand Down Expand Up @@ -72,6 +72,9 @@ func Download(dir string, timeout time.Duration, skip, restart bool, parallel in
if err := download(dir, urls, parallel, retries, chunkSize, timeout, restart); err != nil {
return fmt.Errorf("error downloading files from the federal revenue: %w", err)
}
if err := federalRevenueGetMetadata(federalRevenueMetadataURL, dir); err != nil {
return fmt.Errorf("error getting metadata: %w", err)
}
return nil
}

Expand All @@ -90,7 +93,7 @@ func DownloadFromMirror(mirror string, dir string, timeout time.Duration, skip,
// URLs shows the URLs to be downloaded.
func URLs(dir string, skip bool) error {
urls := []string{federalRevenueURL, nationalTreasureBaseURL}
handlers := []getURLsHandler{federalRevenueGetURLsNoUpdatedAt, nationalTreasureGetURLs}
handlers := []getURLsHandler{federalRevenueGetURLs, nationalTreasureGetURLs}
var out []string
for idx := range urls {
u, err := getURLs(urls[idx], handlers[idx], dir, skip)
Expand All @@ -106,7 +109,7 @@ func URLs(dir string, skip bool) error {

// UpdatedAt shows the updated at of the files to be downloaded.
func UpdatedAt() error {
u, err := fetchUpdatedAt(federalRevenueURL)
u, err := fetchUpdatedAt(federalRevenueMetadataURL)
if err != nil {
return fmt.Errorf("error getting updated at: %w", err)
}
Expand All @@ -116,7 +119,7 @@ func UpdatedAt() error {

// HasUpdate checks if there is an update available.
func HasUpdate(dir string) error {
h, err := hasUpdate(federalRevenueURL, dir)
h, err := hasUpdate(federalRevenueMetadataURL, dir)
if err != nil {
return fmt.Errorf("error getting updated at: %w", err)
}
Expand Down
18 changes: 12 additions & 6 deletions download/download_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ import (
func TestGetURLs(t *testing.T) {
for _, tc := range []struct {
name string
fixture string
fixture []string
handler getURLsHandler
expected int
}{
{"federal revenue", "cadastro-nacional-de-pessoa-juridica-cnpj.json", federalRevenueGetURLs, 37},
{"national treasure", "national-treasure.json", nationalTreasureGetURLs, 1},
{"federal revenue", []string{"dados_abertos_cnpj.html", "2024-08.html"}, federalRevenueGetURLs, 37},
{"national treasure", []string{"national-treasure.json"}, nationalTreasureGetURLs, 1},
} {
ts := httpTestServer(t, tc.fixture)
defer ts.Close()
Expand Down Expand Up @@ -47,16 +47,22 @@ func loadFixture(t *testing.T, n string) (*os.File, int64) {
return f, i.Size()
}

func httpTestServer(t *testing.T, n string) *httptest.Server {
func httpTestServer(t *testing.T, cs []string) *httptest.Server {
if len(cs) == 0 {
panic("no content provided to the test server")
}
var c int
return httptest.NewServer(
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
idx := c % len(cs)
c += 1
if r.Method == http.MethodHead {
f, s := loadFixture(t, n)
f, s := loadFixture(t, cs[idx])
defer f.Close()
w.Header().Add("Content-Length", fmt.Sprint(s))
return
}
http.ServeFile(w, r, path.Join("..", "testdata", n))
http.ServeFile(w, r, path.Join("..", "testdata", cs[idx]))
}))
}

Expand Down
2 changes: 1 addition & 1 deletion download/downloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
)

func TestDownloader(t *testing.T) {
ts := httpTestServer(t, "cadastro-nacional-de-pessoa-juridica-cnpj.json")
ts := httpTestServer(t, []string{"cadastro-nacional-de-pessoa-juridica-cnpj.json"})
defer ts.Close()

f, s := loadFixture(t, "cadastro-nacional-de-pessoa-juridica-cnpj.json")
Expand Down
111 changes: 78 additions & 33 deletions download/federal_revenue.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,84 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"strings"
"time"
)

const (
userAgent = "Minha Receita/0.0.1 (minhareceita.org)"

// FederalRevenueUpdatedAt is a file that contains the date the data was
// extracted by the Federal Revenue
FederalRevenueUpdatedAt = "updated_at.txt"

federalRevenueURL = "https://dados.gov.br/api/publico/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj"
federalRevenueFormat = "zip+csv"
// Metadata source
federalRevenueMetadataURL = "https://dados.gov.br/api/publico/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj"
federalRevenueDateFormat = "02/01/2006 15:04:05"
federalRevenueDateFormatNotes = "02/01/2006"

userAgent = "Minha Receita/0.0.1 (minhareceita.org)"
// Zipped CSV source
federalRevenueURL = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj"
)

// datePattern captures, in the named group updatedAt, the dd/mm/yyyy date
// that follows the label "Data da última extração" (optional colon, then one
// or more spaces).
var datePattern = regexp.MustCompile(`Data da última extração:? +(?P<updatedAt>\d{2}/\d{2}/\d{4})`)
// yearMonthPattern captures href attribute values shaped like "YYYY-MM/"
// (four digits, a dash, two digits and a trailing slash).
var yearMonthPattern = regexp.MustCompile(`href="(\d{4}-\d{2}/)"`)
// filePattern captures href attribute values made of word characters and
// ending in ".zip".
var filePattern = regexp.MustCompile(`href="(\w+\d?\.zip)"`)

// httpGet performs a GET request to url, sending userAgent as the User-Agent
// header, and returns the whole response body as a string. Any response
// status other than 200 OK is reported as an error.
func httpGet(url string) (string, error) {
	request, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("error creating request %s: %w", url, err)
	}
	request.Header.Set("User-Agent", userAgent)

	client := http.Client{}
	response, err := client.Do(request)
	if err != nil {
		return "", fmt.Errorf("error getting %s: %w", url, err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("%s responded with %s", url, response.Status)
	}

	payload, err := io.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("could not read %s response body: %w", url, err)
	}
	return string(payload), nil
}

// federalRevenueGetMostRecentURL fetches the listing at url, collects every
// "YYYY-MM/" href matched by yearMonthPattern and returns url joined with a
// "/" to the greatest of them in lexical order. Because the matches are
// fixed-width and zero-padded, lexical order is also chronological order.
// An error is returned when the listing has no such href.
func federalRevenueGetMostRecentURL(url string) (string, error) {
	listing, err := httpGet(url)
	if err != nil {
		return "", fmt.Errorf("error getting %s: %w", url, err)
	}

	matches := yearMonthPattern.FindAllStringSubmatch(listing, -1)
	if len(matches) == 0 {
		return "", fmt.Errorf("no batches found in %s", url)
	}

	batches := make([]string, 0, len(matches))
	for _, match := range matches {
		batches = append(batches, match[1])
	}
	// slices.Max panics on an empty slice; the guard above rules that out.
	return url + "/" + slices.Max(batches), nil
}

// federalRevenueGetURLs resolves the most recent batch listed under url,
// fetches that batch's listing and returns one URL for each .zip href matched
// by filePattern. The returned slice is nil when the listing has no match.
//
// Errors name the step and the address that actually failed: locating the
// most recent batch reports url, while fetching the batch listing reports the
// batch URL that was requested.
func federalRevenueGetURLs(url string) ([]string, error) {
	batchURL, err := federalRevenueGetMostRecentURL(url)
	if err != nil {
		return nil, fmt.Errorf("error getting the most recent batch from %s: %w", url, err)
	}
	listing, err := httpGet(batchURL)
	if err != nil {
		return nil, fmt.Errorf("error getting %s: %w", batchURL, err)
	}
	var urls []string
	for _, match := range filePattern.FindAllStringSubmatch(listing, -1) {
		// batchURL already ends in "/" (the year-month href carries the
		// trailing slash), so the file name is appended directly.
		urls = append(urls, batchURL+match[1])
	}
	return urls, nil
}

// federalRevenueTime wraps a time.Time. It defines its own UnmarshalJSON and
// is the type of the metadata_modified field in the metadata resources.
type federalRevenueTime struct{ Time time.Time }

Expand All @@ -44,18 +104,18 @@ func (t *federalRevenueTime) UnmarshalJSON(b []byte) error {
return nil
}

type federalRevenueResource struct {
type federalRevenueMetadataResource struct {
Format string `json:"format"`
URL string `json:"url"`
MetadataModified federalRevenueTime `json:"metadata_modified"`
}

type federalRevenueResponse struct {
Resources []federalRevenueResource `json:"resources"`
Notes string `json:"notes"`
type federalRevenueMetadata struct {
Resources []federalRevenueMetadataResource `json:"resources"`
Notes string `json:"notes"`
}

func (r *federalRevenueResponse) updatedAt() (t time.Time) {
func (r *federalRevenueMetadata) updatedAt() (t time.Time) {
m := datePattern.FindStringSubmatch(r.Notes)
if len(m) == 2 {
t, err := time.Parse(federalRevenueDateFormatNotes, m[1])
Expand All @@ -72,7 +132,7 @@ func (r *federalRevenueResponse) updatedAt() (t time.Time) {
return t
}

func newFederalRevenueResponse(url string) (*federalRevenueResponse, error) {
func newFederalRevenueMetadata(url string) (*federalRevenueMetadata, error) {
c := http.Client{}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
Expand All @@ -91,42 +151,27 @@ func newFederalRevenueResponse(url string) (*federalRevenueResponse, error) {
if err != nil {
return nil, fmt.Errorf("could not read %s response body: %w", url, err)
}
var data federalRevenueResponse
var data federalRevenueMetadata
if err := json.Unmarshal(b, &data); err != nil {
return nil, fmt.Errorf("could not unmarshal %s json response: %w", url, err)
}
return &data, nil
}

func federalRevenueGetURLsBase(url, dir string, updatedAt bool) ([]string, error) {
data, err := newFederalRevenueResponse(url)
func federalRevenueGetMetadata(url, dir string) error {
data, err := newFederalRevenueMetadata(url)
if err != nil {
return nil, fmt.Errorf("error getting federal revenue data: %w", err)
}
var u []string
for _, v := range data.Resources {
if v.Format == federalRevenueFormat {
u = append(u, v.URL)
}
}
if updatedAt {
if err := saveUpdatedAt(dir, data.updatedAt()); err != nil {
return nil, fmt.Errorf("could not save the update at date: %w", err)
}
return fmt.Errorf("error getting federal revenue data: %w", err)
}
return u, nil
}

func federalRevenueGetURLs(url, dir string) ([]string, error) {
return federalRevenueGetURLsBase(url, dir, true)
}
if err := saveUpdatedAt(dir, data.updatedAt()); err != nil {
return fmt.Errorf("could not save the update at date: %w", err)

func federalRevenueGetURLsNoUpdatedAt(url, dir string) ([]string, error) {
return federalRevenueGetURLsBase(url, dir, false)
}
return nil
}

func fetchUpdatedAt(url string) (string, error) {
data, err := newFederalRevenueResponse(url)
data, err := newFederalRevenueMetadata(url)
if err != nil {
return "", fmt.Errorf("error getting federal revenue data: %w", err)
}
Expand Down
Loading

0 comments on commit 6253d41

Please sign in to comment.