Skip to content

Commit

Permalink
Merge pull request #59 from mjanez/latest
Browse files Browse the repository at this point in the history
Add retry mechanism for HTTP requests and update dependencies
  • Loading branch information
mjanez authored Oct 31, 2024
2 parents b20d86e + 4a23688 commit b8b2be5
Show file tree
Hide file tree
Showing 5 changed files with 440 additions and 731 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/docker-manual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,20 @@ jobs:
labels: |
org.opencontainers.image.documentation=https://github.com/${{ github.repository }}/blob/${{ env.BRANCH }}/README.md
org.opencontainers.image.version=${{ env.BRANCH }}
annotations: |
org.opencontainers.image.description=Docker compose environment (based on pycsw) for development and testing with CKAN Open Data portals.
org.opencontainers.image.source=https://github.com/${{ github.repository }}
- name: Replace slashes in BRANCH to avoid errors
run: echo "BRANCH=${BRANCH////_}" >> $GITHUB_ENV

- name: Build and push
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
push: true
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
context: ${{ env.CONTEXT }}
file: ${{ env.CONTEXT }}${{ env.DOCKERFILE_PATH }}/${{ env.DOCKERFILE }}

Expand All @@ -60,7 +64,7 @@ jobs:
no-fail: true

- name: Run Trivy container image vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
uses: aquasecurity/trivy-action@0.28.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
format: sarif
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
run: echo "BRANCH=${BRANCH////_}" >> $GITHUB_ENV

- name: Build and push
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
push: true
tags: ghcr.io/${{ github.repository }}:${{ github.head_ref }}
Expand All @@ -70,7 +70,7 @@ jobs:
no-fail: true

- name: Run Trivy container image vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
uses: aquasecurity/trivy-action@0.28.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH }}
format: sarif
Expand Down
48 changes: 35 additions & 13 deletions ckan2pycsw/ckan2pycsw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from datetime import datetime, time
import subprocess
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# third-party libraries
import psutil
Expand Down Expand Up @@ -58,6 +60,18 @@
SSL_UNVERIFIED_MODE = os.environ.get("SSL_UNVERIFIED_MODE", False)


# Retry policy for transient upstream failures: at most 5 retries with
# backoff between attempts, triggered by 502/503/504 responses and limited
# to HEAD, GET and OPTIONS requests.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retries)

# Module-level session used for the CKAN API calls; the same retrying
# adapter is mounted for both plain-HTTP and HTTPS URLs.
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)


def get_datasets(base_url):
"""
Retrieve a generator of CKAN datasets from the specified CKAN instance.
Expand All @@ -78,30 +92,38 @@ def get_datasets(base_url):
if not base_url.endswith("/"):
base_url += "/"

if SSL_UNVERIFIED_MODE == True or SSL_UNVERIFIED_MODE == "True":
logging.warning(f"[INSECURE] SSL_UNVERIFIED_MODE:'{SSL_UNVERIFIED_MODE}'. Only if you trust the CKAN_URL: {base_url}.")
if SSL_UNVERIFIED_MODE in [True, "True"]:
logging.warning(f"[INSECURE] SSL_UNVERIFIED_MODE:'{SSL_UNVERIFIED_MODE}'. Solo si confías en CKAN_URL: {base_url}.")

package_search = urljoin(base_url, "api/3/action/package_search")
res = requests.get(package_search, params={"rows": 0}, verify=not SSL_UNVERIFIED_MODE)
res.raise_for_status() # Raises a HTTPError if the response is not 200

# Use the session configured with retries and a timeout
res = session.get(package_search, params={"rows": 0}, verify=not SSL_UNVERIFIED_MODE, timeout=10)
res.raise_for_status()
end = res.json().get("result", {}).get("count", 0)
rows = 10
rows = 100 # Number of files
for start in range(0, end, rows):
res = requests.get(package_search, params={"start": start, "rows": rows}, verify=not SSL_UNVERIFIED_MODE)
res.raise_for_status() # Check response status
logging.info(f"Fetching datasets with start={start} and rows={rows}") # Log de progreso
try:
res = session.get(package_search, params={"start": start, "rows": rows}, verify=not SSL_UNVERIFIED_MODE, timeout=30)
res.raise_for_status()
datasets = res.json()["result"]["results"]
except ValueError as e: # Catch JSON decode error
logging.error(f"Error decoding JSON from response: {e}")
continue # Skip to the next iteration
except ValueError as e:
logging.error(f"Error al decodificar JSON: {e}")
continue
except requests.exceptions.RequestException as e:
logging.error(f"Request error: {e}", exc_info=True)
continue

for dataset in datasets:
if dataset.get("type") == "dataset":
yield dataset
except requests.exceptions.RequestException as e:
logging.error(f"Request error while communicating with CKAN instance {base_url}: {e}")
except requests.exceptions.Timeout:
logging.error(f"Timeout error for request starting at {start}", exc_info=True)
except requests.exceptions.ConnectionError:
logging.error(f"Connection error for request starting at {start}", exc_info=True)
except Exception as e:
logging.error(f"Unexpected error: {e}")
logging.error(f"Unexpected error at start={start}: {e}", exc_info=True)

def main():
"""
Expand Down
Loading

0 comments on commit b8b2be5

Please sign in to comment.