Initial commit

basm92 · Aug 22, 2023 · e5bfa20 · e5bfa20
commit e5bfa20
Show file tree

Hide file tree

Showing 30 changed files with 403 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,9 @@
+
+Copyright 2003 Bas Machielsen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.toml
+
diff --git a/README.md b/README.md
@@ -0,0 +1,32 @@
+# cbs_hist_downloader
+
+A Python-based tool that uses Selenium to download books to your hard drive, rather than accessing the pictures through the API at historisch.cbs.nl.
+
+## Dependencies
+
+Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`)
+
+## Example how to use:
+
+In the console:
+
+```
+pip install cbs_hist_downloader
+```
+
+After installation, in a Python environment, start with the URL of the first page of the book/volume you want to download:
+
+```{python}
+import cbs_hist_downloader as chd
+
+url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"
+
+chd.scrape_book(url)
+```
+
+The files will be downloaded to your Downloads folder. 
+
+
+## Suggestions / Comments
+
+a dot h dot machielsen at uu dot nl
diff --git a/build/lib/cbs_hist_downloader/__init__.py b/build/lib/cbs_hist_downloader/__init__.py
@@ -0,0 +1,2 @@
+from .scrape_book import scrape_book
+
diff --git a/build/lib/cbs_hist_downloader/find_next_page.py b/build/lib/cbs_hist_downloader/find_next_page.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 22 09:34:30 2023
+
+@author: baswork
+"""
+
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+import os
+
+def find_next_page(driver, con_tinue):
+
+    elements = driver.find_elements(By.CSS_SELECTOR, '#collection-navigation > div > a, #collection-navigation > div > span')
+    # Iterate through the elements to find the one with aria-current="true"
+    current_element = None
+    for element in elements:
+        aria_current = element.get_attribute('aria-current')
+        if aria_current == 'true':
+            current_element = element
+            break
+    # Find where we are in the list of elements 
+    current_index = elements.index(current_element)
+
+    # Find the next element if there is any
+    if current_index < len(elements) - 3:
+        next_element = elements[current_index + 1]
+        # Click the next element if it had been found
+        next_element.click()
+    else:
+        # Otherwise terminate the loop
+        con_tinue = False
+        print("No next element found.")
+        driver.quit()
+    return con_tinue
diff --git a/build/lib/cbs_hist_downloader/scrape_book.py b/build/lib/cbs_hist_downloader/scrape_book.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+"""
+Download all pages of a book from historisch.cbs.nl.
+
+Drives a Chrome browser through Selenium and clicks the download link on each page in turn.
+"""
+
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+import os
+from .find_next_page import find_next_page
+
+def scrape_book(start_url):
+
+    if "historisch.cbs.nl" not in start_url:
+        raise ValueError("Please insert a CBS historisch link")
+
+    service = Service()
+    options = webdriver.ChromeOptions()
+    driver = webdriver.Chrome(service=service, options=options)
+
+    # Open the initial URL
+    driver.get(start_url)
+
+    image_element = driver.find_element(By.CSS_SELECTOR,'a.cb-enable')
+    image_element.click()
+
+    con_tinue = True
+    # Main loop to navigate through pages
+    while con_tinue:
+        cur_url = driver.current_url
+        try: 
+            # Find the frame
+            frames = driver.find_elements(By.CSS_SELECTOR,'iframe')
+            driver.switch_to.frame(frames[0])
+
+            # Locate the image element
+            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
+            step1.click()
+
+            # Just leave the default resolution and click download
+            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
+            step2.click()
+
+            # Locate and click the 'next' link
+            driver.switch_to.default_content()
+            # Draft: change the next code to be robust
+            con_tinue = find_next_page(driver, con_tinue)
+
+            # Add a short delay to ensure the page loads completely before proceeding
+            time.sleep(2)  # You can adjust this delay as needed
+        except:
+            print("An error occurred with the Selenium package. Starting again from", cur_url)
+            driver.quit()
+            scrape_book(cur_url)
+
diff --git a/cbs_hist_downloader.egg-info/PKG-INFO b/cbs_hist_downloader.egg-info/PKG-INFO
@@ -0,0 +1,47 @@
+Metadata-Version: 2.1
+Name: cbs-hist-downloader
+Version: 1.0.2
+Summary: A python program for downloading images from CBS historisch.
+Home-page: https://github.com/basm92/cbs_hist_downloader
+Author: Bas Machielsen
+Author-email: [email protected]
+Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# cbs_hist_downloader
+
+A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl
+
+## Dependencies
+
+Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`)
+
+## Example how to use:
+
+In the console:
+
+```
+pip install cbs_hist_downloader
+```
+
+After installation, in a python environment, you start off with the first URL of the book/volume you want to download:
+
+```{python}
+import cbs_hist_downloader as chd
+
+url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"
+
+chd.scrape_book(url)
+```
+
+The files will be downloaded to your Downloads folder. 
+
+
+## Suggestions / Comments
+
+a dot h dot machielsen at uu dot nl
diff --git a/cbs_hist_downloader.egg-info/SOURCES.txt b/cbs_hist_downloader.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+setup.cfg
+setup.py
+cbs_hist_downloader/__init__.py
+cbs_hist_downloader/find_next_page.py
+cbs_hist_downloader/scrape_book.py
+cbs_hist_downloader.egg-info/PKG-INFO
+cbs_hist_downloader.egg-info/SOURCES.txt
+cbs_hist_downloader.egg-info/dependency_links.txt
+cbs_hist_downloader.egg-info/requires.txt
+cbs_hist_downloader.egg-info/top_level.txt
diff --git a/cbs_hist_downloader.egg-info/dependency_links.txt b/cbs_hist_downloader.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/cbs_hist_downloader.egg-info/requires.txt b/cbs_hist_downloader.egg-info/requires.txt
@@ -0,0 +1,2 @@
+selenium
+webdriver_manager
diff --git a/cbs_hist_downloader.egg-info/top_level.txt b/cbs_hist_downloader.egg-info/top_level.txt
@@ -0,0 +1 @@
+cbs_hist_downloader
diff --git a/cbs_hist_downloader/__init__.py b/cbs_hist_downloader/__init__.py
@@ -0,0 +1,2 @@
+from .scrape_book import scrape_book
+
diff --git a/cbs_hist_downloader/__pycache__/__init__.cpython-310.pyc b/cbs_hist_downloader/__pycache__/__init__.cpython-310.pyc
diff --git a/cbs_hist_downloader/__pycache__/find_next_page.cpython-310.pyc b/cbs_hist_downloader/__pycache__/find_next_page.cpython-310.pyc
diff --git a/cbs_hist_downloader/__pycache__/scrape_book.cpython-310.pyc b/cbs_hist_downloader/__pycache__/scrape_book.cpython-310.pyc
diff --git a/cbs_hist_downloader/cbs_hist_downloader.egg-info/PKG-INFO b/cbs_hist_downloader/cbs_hist_downloader.egg-info/PKG-INFO
@@ -0,0 +1,47 @@
+Metadata-Version: 2.1
+Name: cbs-hist-downloader
+Version: 1.0.0
+Summary: A python program for downloading images from CBS historisch.
+Home-page: https://github.com/basm92/cbs_hist_downloader
+Author: Bas Machielsen
+Author-email: [email protected]
+Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# cbs_hist_downloader
+
+A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl
+
+## Dependencies
+
+Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`)
+
+## Example how to use:
+
+In the console:
+
+```
+pip install cbs_historisch_downloader
+```
+
+After installation, in a python environment, you start off with the first URL of the book/volume you want to download:
+
+```{python}
+import cbs_historisch_downloader as chd
+
+url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"
+
+chd.scrape_book(url)
+```
+
+The files will be downloaded to your Downloads folder. 
+
+
+## Suggestions / Comments
+
+a dot h dot machielsen at uu dot nl
diff --git a/cbs_hist_downloader/cbs_hist_downloader.egg-info/SOURCES.txt b/cbs_hist_downloader/cbs_hist_downloader.egg-info/SOURCES.txt
diff --git a/cbs_hist_downloader/cbs_hist_downloader.egg-info/dependency_links.txt b/cbs_hist_downloader/cbs_hist_downloader.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/cbs_hist_downloader/cbs_hist_downloader.egg-info/requires.txt b/cbs_hist_downloader/cbs_hist_downloader.egg-info/requires.txt
@@ -0,0 +1,2 @@
+selenium
+webdriver_manager
diff --git a/cbs_hist_downloader/cbs_hist_downloader.egg-info/top_level.txt b/cbs_hist_downloader/cbs_hist_downloader.egg-info/top_level.txt
@@ -0,0 +1 @@
+cbs_hist_downloader
diff --git a/cbs_hist_downloader/find_next_page.py b/cbs_hist_downloader/find_next_page.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 22 09:34:30 2023
+
+@author: baswork
+"""
+
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+import os
+
+def find_next_page(driver, con_tinue):
+
+    elements = driver.find_elements(By.CSS_SELECTOR, '#collection-navigation > div > a, #collection-navigation > div > span')
+    # Iterate through the elements to find the one with aria-current="true"
+    current_element = None
+    for element in elements:
+        aria_current = element.get_attribute('aria-current')
+        if aria_current == 'true':
+            current_element = element
+            break
+    # Find where we are in the list of elements 
+    current_index = elements.index(current_element)
+
+    # Find the next element if there is any
+    if current_index < len(elements) - 3:
+        next_element = elements[current_index + 1]
+        # Click the next element if it had been found
+        next_element.click()
+    else:
+        # Otherwise terminate the loop
+        con_tinue = False
+        print("No next element found.")
+        driver.quit()
+    return con_tinue
diff --git a/cbs_hist_downloader/scrape_book.py b/cbs_hist_downloader/scrape_book.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+"""
+Download all pages of a book from historisch.cbs.nl.
+
+Drives a Chrome browser through Selenium and clicks the download link on each page in turn.
+"""
+
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+import os
+from .find_next_page import find_next_page
+
+def scrape_book(start_url):
+
+    if "historisch.cbs.nl" not in start_url:
+        raise ValueError("Please insert a CBS historisch link")
+
+    service = Service()
+    options = webdriver.ChromeOptions()
+    driver = webdriver.Chrome(service=service, options=options)
+
+    # Open the initial URL
+    driver.get(start_url)
+
+    image_element = driver.find_element(By.CSS_SELECTOR,'a.cb-enable')
+    image_element.click()
+
+    con_tinue = True
+    # Main loop to navigate through pages
+    while con_tinue:
+        cur_url = driver.current_url
+        try: 
+            # Find the frame
+            frames = driver.find_elements(By.CSS_SELECTOR,'iframe')
+            driver.switch_to.frame(frames[0])
+
+            # Locate the image element
+            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
+            step1.click()
+
+            # Just leave the default resolution and click download
+            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
+            step2.click()
+
+            # Locate and click the 'next' link
+            driver.switch_to.default_content()
+            # Draft: change the next code to be robust
+            con_tinue = find_next_page(driver, con_tinue)
+
+            # Add a short delay to ensure the page loads completely before proceeding
+            time.sleep(2)  # You can adjust this delay as needed
+        except:
+            print("An error occurred with the Selenium package. Starting again from", cur_url)
+            driver.quit()
+            scrape_book(cur_url)
+
diff --git a/dist/cbs-hist-downloader-1.0.1.tar.gz b/dist/cbs-hist-downloader-1.0.1.tar.gz
diff --git a/dist/cbs-hist-downloader-1.0.2.tar.gz b/dist/cbs-hist-downloader-1.0.2.tar.gz
diff --git a/dist/cbs_hist_downloader-1.0.1-py3-none-any.whl b/dist/cbs_hist_downloader-1.0.1-py3-none-any.whl
diff --git a/dist/cbs_hist_downloader-1.0.2-py3-none-any.whl b/dist/cbs_hist_downloader-1.0.2-py3-none-any.whl
diff --git a/instructions.txt b/instructions.txt
@@ -0,0 +1,6 @@
+1. Make changes.
+2. Change the version number in setup.py.
+3. Run `python setup.py sdist bdist_wheel` (or `python -m build`).
+4. Run `twine upload dist/*`.
+5. Install the new version of the package from PyPI (`pip install cbs-hist-downloader==new_vers`).
+6. Commit the changes to GitHub.
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"