-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e5bfa20
Showing
30 changed files
with
403 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
Copyright 2003 Bas Machielsen | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
include *.toml | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# cbs_hist_downloader | ||
|
||
A Python-based tool that uses Selenium to download books to your hard drive, rather than accessing the pictures through the API at historisch.cbs.nl. | ||
|
||
## Dependencies | ||
|
||
Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`) | ||
|
||
## Example how to use: | ||
|
||
In the console: | ||
|
||
``` | ||
pip install cbs_hist_downloader | ||
``` | ||
|
||
After installation, in a python environment, you start off with the first URL of the book/volume you want to download: | ||
|
||
```{python} | ||
import cbs_hist_downloader as chd | ||
url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474" | ||
chd.scrape_book(url) | ||
``` | ||
|
||
The files will be downloaded to your Downloads folder. | ||
|
||
|
||
## Suggestions / Comments | ||
|
||
a dot h dot machielsen at uu dot nl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .scrape_book import scrape_book | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Tue Aug 22 09:34:30 2023 | ||
@author: baswork | ||
""" | ||
|
||
import time | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.common.by import By | ||
import os | ||
|
||
def find_next_page(driver, con_tinue):
    """Click through to the next page, or close the browser on the last one.

    Args:
        driver: Selenium WebDriver whose current page contains a
            ``#collection-navigation`` bar.
        con_tinue: Loop flag supplied by the caller; handed back unchanged
            when a next page was clicked.

    Returns:
        ``con_tinue`` as passed in after clicking the next entry, or
        ``False`` once no next entry is left (the driver is quit first).

    Raises:
        ValueError: Raised by ``list.index`` when no navigation entry
            carries ``aria-current="true"``.
    """
    nav_items = driver.find_elements(
        By.CSS_SELECTOR,
        '#collection-navigation > div > a, #collection-navigation > div > span',
    )

    # The entry flagged aria-current="true" is the page currently shown.
    active = next(
        (item for item in nav_items
         if item.get_attribute('aria-current') == 'true'),
        None,
    )
    position = nav_items.index(active)

    # NOTE(review): the last two entries are never used as a "next page" --
    # presumably trailing navigation controls; confirm against the page markup.
    if position >= len(nav_items) - 3:
        print("No next element found.")
        driver.quit()
        return False

    nav_items[position + 1].click()
    return con_tinue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Spyder Editor | ||
This is a temporary script file. | ||
""" | ||
|
||
import time | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.common.by import By | ||
import os | ||
from .find_next_page import find_next_page | ||
|
||
def scrape_book(start_url):
    """Download every page image of a CBS historisch book via Chrome.

    Opens ``start_url`` in a Selenium-driven Chrome session, triggers the
    viewer's direct-download links for the current page, then moves on to
    the next page until ``find_next_page`` reports that none is left.

    If anything fails while handling a page, the browser is closed and the
    scrape restarts from the URL of the page that failed.

    Args:
        start_url: URL of the first page to download; must contain
            ``historisch.cbs.nl``.

    Raises:
        ValueError: If ``start_url`` is not a CBS historisch link.
    """
    if "historisch.cbs.nl" not in start_url:
        raise ValueError("Please insert a CBS historisch link")

    service = Service()
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # Open the initial URL
    driver.get(start_url)

    # NOTE(review): 'a.cb-enable' is presumably a consent/enable button that
    # has to be clicked before the viewer is usable -- confirm on the site.
    enable_link = driver.find_element(By.CSS_SELECTOR, 'a.cb-enable')
    enable_link.click()

    con_tinue = True
    # Main loop to navigate through pages
    while con_tinue:
        # Remember where we are so a failure can resume from this page.
        cur_url = driver.current_url
        try:
            # The download controls live inside the first iframe on the page.
            frames = driver.find_elements(By.CSS_SELECTOR, 'iframe')
            driver.switch_to.frame(frames[0])

            # Open the direct-download dialog
            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
            step1.click()

            # Just leave the default resolution and click download
            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
            step2.click()

            # Leave the iframe again: the page navigation is in the top document.
            driver.switch_to.default_content()
            con_tinue = find_next_page(driver, con_tinue)

            # Add a short delay to ensure the page loads completely before proceeding
            time.sleep(2)  # You can adjust this delay as needed
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop
            # the program instead of restarting the scrape.
            print("An error occurred with the Selenium package. Starting again from", cur_url)
            driver.quit()
            # The retry finishes the rest of the book with a fresh browser.
            # Return right away: this call's driver is closed, so running the
            # loop again would fail on driver.current_url.
            return scrape_book(cur_url)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
Metadata-Version: 2.1 | ||
Name: cbs-hist-downloader | ||
Version: 1.0.2 | ||
Summary: A python program for downloading images from CBS historisch. | ||
Home-page: https://github.com/basm92/cbs_hist_downloader | ||
Author: Bas Machielsen | ||
Author-email: [email protected] | ||
Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues | ||
Classifier: Programming Language :: Python :: 3 | ||
Classifier: License :: OSI Approved :: MIT License | ||
Classifier: Operating System :: OS Independent | ||
Requires-Python: >=3.6 | ||
Description-Content-Type: text/markdown | ||
License-File: LICENSE | ||
|
||
# cbs_hist_downloader | ||
|
||
A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl | ||
|
||
## Dependencies | ||
|
||
Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`) | ||
|
||
## Example how to use: | ||
|
||
In the console: | ||
|
||
``` | ||
pip install cbs_hist_downloader | ||
``` | ||
|
||
After installation, in a python environment, you start off with the first URL of the book/volume you want to download: | ||
|
||
```{python} | ||
import cbs_hist_downloader as chd | ||
|
||
url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474" | ||
|
||
chd.scrape_book(url) | ||
``` | ||
|
||
The files will be downloaded to your Downloads folder. | ||
|
||
|
||
## Suggestions / Comments | ||
|
||
a dot h dot machielsen at uu dot nl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
LICENSE | ||
MANIFEST.in | ||
README.md | ||
pyproject.toml | ||
setup.cfg | ||
setup.py | ||
cbs_hist_downloader/__init__.py | ||
cbs_hist_downloader/find_next_page.py | ||
cbs_hist_downloader/scrape_book.py | ||
cbs_hist_downloader.egg-info/PKG-INFO | ||
cbs_hist_downloader.egg-info/SOURCES.txt | ||
cbs_hist_downloader.egg-info/dependency_links.txt | ||
cbs_hist_downloader.egg-info/requires.txt | ||
cbs_hist_downloader.egg-info/top_level.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
selenium | ||
webdriver_manager |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
cbs_hist_downloader |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .scrape_book import scrape_book | ||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
Metadata-Version: 2.1 | ||
Name: cbs-hist-downloader | ||
Version: 1.0.0 | ||
Summary: A python program for downloading images from CBS historisch. | ||
Home-page: https://github.com/basm92/cbs_hist_downloader | ||
Author: Bas Machielsen | ||
Author-email: [email protected] | ||
Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues | ||
Classifier: Programming Language :: Python :: 3 | ||
Classifier: License :: OSI Approved :: MIT License | ||
Classifier: Operating System :: OS Independent | ||
Requires-Python: >=3.6 | ||
Description-Content-Type: text/markdown | ||
License-File: LICENSE | ||
|
||
# cbs_hist_downloader | ||
|
||
A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl | ||
|
||
## Dependencies | ||
|
||
Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`) | ||
|
||
## Example how to use: | ||
|
||
In the console: | ||
|
||
``` | ||
pip install cbs_hist_downloader | ||
``` | ||
|
||
After installation, in a python environment, you start off with the first URL of the book/volume you want to download: | ||
|
||
```{python} | ||
import cbs_hist_downloader as chd | ||
|
||
url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474" | ||
|
||
chd.scrape_book(url) | ||
``` | ||
|
||
The files will be downloaded to your Downloads folder. | ||
|
||
|
||
## Suggestions / Comments | ||
|
||
a dot h dot machielsen at uu dot nl |
Empty file.
1 change: 1 addition & 0 deletions
1
cbs_hist_downloader/cbs_hist_downloader.egg-info/dependency_links.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
2 changes: 2 additions & 0 deletions
2
cbs_hist_downloader/cbs_hist_downloader.egg-info/requires.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
selenium | ||
webdriver_manager |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
cbs_hist_downloader |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Tue Aug 22 09:34:30 2023 | ||
@author: baswork | ||
""" | ||
|
||
import time | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.common.by import By | ||
import os | ||
|
||
def find_next_page(driver, con_tinue):
    """Click through to the next page, or close the browser on the last one.

    Args:
        driver: Selenium WebDriver whose current page contains a
            ``#collection-navigation`` bar.
        con_tinue: Loop flag supplied by the caller; handed back unchanged
            when a next page was clicked.

    Returns:
        ``con_tinue`` as passed in after clicking the next entry, or
        ``False`` once no next entry is left (the driver is quit first).

    Raises:
        ValueError: Raised by ``list.index`` when no navigation entry
            carries ``aria-current="true"``.
    """
    nav_items = driver.find_elements(
        By.CSS_SELECTOR,
        '#collection-navigation > div > a, #collection-navigation > div > span',
    )

    # The entry flagged aria-current="true" is the page currently shown.
    active = next(
        (item for item in nav_items
         if item.get_attribute('aria-current') == 'true'),
        None,
    )
    position = nav_items.index(active)

    # NOTE(review): the last two entries are never used as a "next page" --
    # presumably trailing navigation controls; confirm against the page markup.
    if position >= len(nav_items) - 3:
        print("No next element found.")
        driver.quit()
        return False

    nav_items[position + 1].click()
    return con_tinue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Spyder Editor | ||
This is a temporary script file. | ||
""" | ||
|
||
import time | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.common.by import By | ||
import os | ||
from .find_next_page import find_next_page | ||
|
||
def scrape_book(start_url):
    """Download every page image of a CBS historisch book via Chrome.

    Opens ``start_url`` in a Selenium-driven Chrome session, triggers the
    viewer's direct-download links for the current page, then moves on to
    the next page until ``find_next_page`` reports that none is left.

    If anything fails while handling a page, the browser is closed and the
    scrape restarts from the URL of the page that failed.

    Args:
        start_url: URL of the first page to download; must contain
            ``historisch.cbs.nl``.

    Raises:
        ValueError: If ``start_url`` is not a CBS historisch link.
    """
    if "historisch.cbs.nl" not in start_url:
        raise ValueError("Please insert a CBS historisch link")

    service = Service()
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # Open the initial URL
    driver.get(start_url)

    # NOTE(review): 'a.cb-enable' is presumably a consent/enable button that
    # has to be clicked before the viewer is usable -- confirm on the site.
    enable_link = driver.find_element(By.CSS_SELECTOR, 'a.cb-enable')
    enable_link.click()

    con_tinue = True
    # Main loop to navigate through pages
    while con_tinue:
        # Remember where we are so a failure can resume from this page.
        cur_url = driver.current_url
        try:
            # The download controls live inside the first iframe on the page.
            frames = driver.find_elements(By.CSS_SELECTOR, 'iframe')
            driver.switch_to.frame(frames[0])

            # Open the direct-download dialog
            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
            step1.click()

            # Just leave the default resolution and click download
            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
            step2.click()

            # Leave the iframe again: the page navigation is in the top document.
            driver.switch_to.default_content()
            con_tinue = find_next_page(driver, con_tinue)

            # Add a short delay to ensure the page loads completely before proceeding
            time.sleep(2)  # You can adjust this delay as needed
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop
            # the program instead of restarting the scrape.
            print("An error occurred with the Selenium package. Starting again from", cur_url)
            driver.quit()
            # The retry finishes the rest of the book with a fresh browser.
            # Return right away: this call's driver is closed, so running the
            # loop again would fail on driver.current_url.
            return scrape_book(cur_url)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
1. Make changes | ||
2. Change the version number in setup.py | ||
3. run python setup.py sdist bdist_wheel (or python -m build) | ||
4. run twine upload dist/* | ||
5. download the new version of the package from pip again (pip install cbs-hist-downloader==new_vers) | ||
6. Commit the stuff to github |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[build-system] | ||
requires = [ | ||
"setuptools>=42", | ||
"wheel" | ||
] | ||
build-backend = "setuptools.build_meta" |
Oops, something went wrong.