Skip to content

Commit

Permalink
Merge pull request #5 from soilwise-he/use_catalogue_base
Browse files Browse the repository at this point in the history
use single catalogue base, and change to use soilwise one
  • Loading branch information
vgole001 authored Jun 5, 2024
2 parents 63ec572 + 720226e commit 6b41937
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions src/linkchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
# Load environment variables from .env file
load_dotenv()

# base catalog

base = "https://soilwise-he.containers.wur.nl/cat/"

# Remove comment
ejp_catalogue_json_url = "https://catalogue.ejpsoil.eu/collections/metadata:main/items?f=json"
catalogue_json_url = base + "collections/metadata:main/items?f=json"

def setup_database():
# Connect to the database
Expand Down Expand Up @@ -60,7 +64,7 @@ def setup_database():

def get_pagination_info(url):
try:
# Fetch ejpsoil JSON
# Fetch catalogue JSON
response = requests.get(url)
response.raise_for_status() # Raise exception for HTTP errors
data = response.json()
Expand Down Expand Up @@ -122,12 +126,12 @@ def main():
conn, cur = setup_database()

print("Time started processing links.")
print("Loading EJP SOIL Catalogue links...")
print("Loading Catalogue links...")
filename = "soil_catalogue_link.csv"
total_pages, numbers_returned = get_pagination_info(ejp_catalogue_json_url)
total_pages, numbers_returned = get_pagination_info(catalogue_json_url)

# Base URL
base_url = 'https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset='
base_url = base + 'collections/metadata:main/items?offset='

# Generate URLs for each page
urls = [base_url + str(i * numbers_returned) + "&f=html" for i in range(total_pages)]
Expand All @@ -141,12 +145,14 @@ def main():

# Define the formats to be removed
formats_to_remove = [
'https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset',
'collections/metadata:main/items?offset',
'?f=json'
]

# Filter out links with the specified formats
filtered_links = {link for link in all_links if not any(format_to_remove in link for format_to_remove in formats_to_remove)}
print(all_links)
print(formats_to_remove)
filtered_links = {link for link in all_links if not any(format_to_remove in (link or "") for format_to_remove in formats_to_remove)}

# Remove the existing file if it exists
if os.path.exists(filename):
Expand Down

0 comments on commit 6b41937

Please sign in to comment.