Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
basm92 committed Aug 22, 2023
0 parents commit e5bfa20
Show file tree
Hide file tree
Showing 30 changed files with 403 additions and 0 deletions.
9 changes: 9 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

Copyright 2023 Bas Machielsen

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
include *.toml

32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# cbs_hist_downloader

A Python-based tool that uses Selenium to download books to your hard drive, rather than accessing the pictures through the API at historisch.cbs.nl.

## Dependencies

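Selenium (`pip install selenium`) and webdriver-manager (`pip install webdriver-manager`). The `time` and `os` modules are also used, but they are part of the Python standard library and need no installation.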

## Usage example

In the console:

```
pip install cbs_hist_downloader
```

After installation, in a python environment, you start off with the first URL of the book/volume you want to download:

```python
import cbs_hist_downloader as chd
url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"
chd.scrape_book(url)
```

The files will be downloaded to your Downloads folder.


## Suggestions / Comments

a dot h dot machielsen at uu dot nl
2 changes: 2 additions & 0 deletions build/lib/cbs_hist_downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .scrape_book import scrape_book

39 changes: 39 additions & 0 deletions build/lib/cbs_hist_downloader/find_next_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 22 09:34:30 2023
@author: baswork
"""

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os

def find_next_page(driver, con_tinue):
    """Advance the browser to the next page of the book, if there is one.

    The entries of the ``#collection-navigation`` bar are collected, the entry
    marked ``aria-current="true"`` is located, and the entry directly after it
    is clicked. When the current entry is too close to the end of the bar, the
    browser is closed instead.

    Args:
        driver: Selenium WebDriver positioned on a page of the book (top-level
            document, not inside an iframe).
        con_tinue: Current value of the caller's loop flag.

    Returns:
        ``con_tinue`` unchanged when a next page was clicked, ``False`` when
        no next page exists (in which case ``driver.quit()`` has been called).

    Raises:
        ValueError: If no navigation entry is marked ``aria-current="true"``.
    """
    elements = driver.find_elements(
        By.CSS_SELECTOR,
        '#collection-navigation > div > a, #collection-navigation > div > span',
    )

    # Single pass: index of the first entry flagged as the current page.
    current_index = next(
        (
            index
            for index, element in enumerate(elements)
            if element.get_attribute('aria-current') == 'true'
        ),
        None,
    )
    if current_index is None:
        # Previously surfaced as the opaque "None is not in list".
        raise ValueError(
            "Could not find the current page in the collection navigation."
        )

    # NOTE(review): the trailing entries of the navigation bar are presumably
    # non-page controls (e.g. next/last arrows), hence the offset of 3 --
    # confirm against the site's markup.
    if current_index < len(elements) - 3:
        elements[current_index + 1].click()
    else:
        # No further page: tell the caller to stop and close the browser.
        con_tinue = False
        print("No next element found.")
        driver.quit()
    return con_tinue
59 changes: 59 additions & 0 deletions build/lib/cbs_hist_downloader/scrape_book.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
Download the pages of a book/volume from historisch.cbs.nl, one by one,
using Selenium.
"""

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os
from .find_next_page import find_next_page

def scrape_book(start_url):
    """Download every page of a book, starting from ``start_url``.

    A Chrome session is opened on ``start_url``. For each page the download
    links inside the first iframe are clicked, after which
    ``find_next_page`` moves on to the next page. When anything goes wrong on
    a page, the browser is closed and the scrape restarts from the URL of
    that page.

    Note: restarts are recursive and not capped, so a page that fails
    persistently keeps restarting until Python's recursion limit is reached.

    Args:
        start_url: URL of the first page to download; must contain
            ``historisch.cbs.nl``.

    Raises:
        ValueError: If ``start_url`` is not a historisch.cbs.nl link.
    """
    if "historisch.cbs.nl" not in start_url:
        raise ValueError("Please insert a CBS historisch link")

    service = Service()
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # Open the initial URL
    driver.get(start_url)

    # NOTE(review): 'a.cb-enable' looks like a cookie/consent button that has
    # to be accepted once per browser session -- confirm.
    image_element = driver.find_element(By.CSS_SELECTOR, 'a.cb-enable')
    image_element.click()

    con_tinue = True
    # Main loop to navigate through pages
    while con_tinue:
        cur_url = driver.current_url
        try:
            # The download controls live inside the first iframe of the page
            frames = driver.find_elements(By.CSS_SELECTOR, 'iframe')
            driver.switch_to.frame(frames[0])

            # Open the download dialog
            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
            step1.click()

            # Just leave the default resolution and click download
            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
            step2.click()

            # Back to the top-level document to use the page navigation
            driver.switch_to.default_content()
            con_tinue = find_next_page(driver, con_tinue)

            # Add a short delay to ensure the page loads completely before proceeding
            time.sleep(2)  # You can adjust this delay as needed
        except Exception:
            # `Exception` rather than a bare `except`, so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit stop the scrape instead of
            # restarting it.
            print("An error occurred with the Selenium package. Starting again from", cur_url)
            driver.quit()
            # The restarted run handles the remainder of the book. Return
            # here so this frame does not keep looping on the closed driver.
            return scrape_book(cur_url)

47 changes: 47 additions & 0 deletions cbs_hist_downloader.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
Metadata-Version: 2.1
Name: cbs-hist-downloader
Version: 1.0.2
Summary: A python program for downloading images from CBS historisch.
Home-page: https://github.com/basm92/cbs_hist_downloader
Author: Bas Machielsen
Author-email: [email protected]
Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE

# cbs_hist_downloader

A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl

## Dependencies

Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`)

## Example how to use:

In the console:

```
pip install cbs_hist_downloader
```

After installation, in a python environment, you start off with the first URL of the book/volume you want to download:

```{python}
import cbs_hist_downloader as chd

url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"

chd.scrape_book(url)
```

The files will be downloaded to your Downloads folder.


## Suggestions / Comments

a dot h dot machielsen at uu dot nl
14 changes: 14 additions & 0 deletions cbs_hist_downloader.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
LICENSE
MANIFEST.in
README.md
pyproject.toml
setup.cfg
setup.py
cbs_hist_downloader/__init__.py
cbs_hist_downloader/find_next_page.py
cbs_hist_downloader/scrape_book.py
cbs_hist_downloader.egg-info/PKG-INFO
cbs_hist_downloader.egg-info/SOURCES.txt
cbs_hist_downloader.egg-info/dependency_links.txt
cbs_hist_downloader.egg-info/requires.txt
cbs_hist_downloader.egg-info/top_level.txt
1 change: 1 addition & 0 deletions cbs_hist_downloader.egg-info/dependency_links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

2 changes: 2 additions & 0 deletions cbs_hist_downloader.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
selenium
webdriver_manager
1 change: 1 addition & 0 deletions cbs_hist_downloader.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cbs_hist_downloader
2 changes: 2 additions & 0 deletions cbs_hist_downloader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .scrape_book import scrape_book

Binary file not shown.
Binary file not shown.
Binary file not shown.
47 changes: 47 additions & 0 deletions cbs_hist_downloader/cbs_hist_downloader.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
Metadata-Version: 2.1
Name: cbs-hist-downloader
Version: 1.0.0
Summary: A python program for downloading images from CBS historisch.
Home-page: https://github.com/basm92/cbs_hist_downloader
Author: Bas Machielsen
Author-email: [email protected]
Project-URL: Bug Tracker, https://github.com/basm92/cbs_hist_downloader/issues
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE

# cbs_hist_downloader

A Python-based tool using Selenium to download books to your harddrive rather than accessing the pictures through the API at historisch.cbs.nl

## Dependencies

Selenium (`pip install selenium`), time, os, webdriver-manager (`pip install webdriver-manager`)

## Example how to use:

In the console:

```
pip install cbs_hist_downloader
```

After installation, in a python environment, you start off with the first URL of the book/volume you want to download:

```{python}
import cbs_hist_downloader as chd

url = "https://historisch.cbs.nl/detail.php?nav_id=2-1&index=10&id=395291474"

chd.scrape_book(url)
```

The files will be downloaded to your Downloads folder.


## Suggestions / Comments

a dot h dot machielsen at uu dot nl
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

2 changes: 2 additions & 0 deletions cbs_hist_downloader/cbs_hist_downloader.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
selenium
webdriver_manager
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cbs_hist_downloader
39 changes: 39 additions & 0 deletions cbs_hist_downloader/find_next_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 22 09:34:30 2023
@author: baswork
"""

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os

def find_next_page(driver, con_tinue):
    """Advance the browser to the next page of the book, if there is one.

    The entries of the ``#collection-navigation`` bar are collected, the entry
    marked ``aria-current="true"`` is located, and the entry directly after it
    is clicked. When the current entry is too close to the end of the bar, the
    browser is closed instead.

    Args:
        driver: Selenium WebDriver positioned on a page of the book (top-level
            document, not inside an iframe).
        con_tinue: Current value of the caller's loop flag.

    Returns:
        ``con_tinue`` unchanged when a next page was clicked, ``False`` when
        no next page exists (in which case ``driver.quit()`` has been called).

    Raises:
        ValueError: If no navigation entry is marked ``aria-current="true"``.
    """
    elements = driver.find_elements(
        By.CSS_SELECTOR,
        '#collection-navigation > div > a, #collection-navigation > div > span',
    )

    # Single pass: index of the first entry flagged as the current page.
    current_index = next(
        (
            index
            for index, element in enumerate(elements)
            if element.get_attribute('aria-current') == 'true'
        ),
        None,
    )
    if current_index is None:
        # Previously surfaced as the opaque "None is not in list".
        raise ValueError(
            "Could not find the current page in the collection navigation."
        )

    # NOTE(review): the trailing entries of the navigation bar are presumably
    # non-page controls (e.g. next/last arrows), hence the offset of 3 --
    # confirm against the site's markup.
    if current_index < len(elements) - 3:
        elements[current_index + 1].click()
    else:
        # No further page: tell the caller to stop and close the browser.
        con_tinue = False
        print("No next element found.")
        driver.quit()
    return con_tinue
59 changes: 59 additions & 0 deletions cbs_hist_downloader/scrape_book.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
Download the pages of a book/volume from historisch.cbs.nl, one by one,
using Selenium.
"""

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os
from .find_next_page import find_next_page

def scrape_book(start_url):
    """Download every page of a book, starting from ``start_url``.

    A Chrome session is opened on ``start_url``. For each page the download
    links inside the first iframe are clicked, after which
    ``find_next_page`` moves on to the next page. When anything goes wrong on
    a page, the browser is closed and the scrape restarts from the URL of
    that page.

    Note: restarts are recursive and not capped, so a page that fails
    persistently keeps restarting until Python's recursion limit is reached.

    Args:
        start_url: URL of the first page to download; must contain
            ``historisch.cbs.nl``.

    Raises:
        ValueError: If ``start_url`` is not a historisch.cbs.nl link.
    """
    if "historisch.cbs.nl" not in start_url:
        raise ValueError("Please insert a CBS historisch link")

    service = Service()
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # Open the initial URL
    driver.get(start_url)

    # NOTE(review): 'a.cb-enable' looks like a cookie/consent button that has
    # to be accepted once per browser session -- confirm.
    image_element = driver.find_element(By.CSS_SELECTOR, 'a.cb-enable')
    image_element.click()

    con_tinue = True
    # Main loop to navigate through pages
    while con_tinue:
        cur_url = driver.current_url
        try:
            # The download controls live inside the first iframe of the page
            frames = driver.find_elements(By.CSS_SELECTOR, 'iframe')
            driver.switch_to.frame(frames[0])

            # Open the download dialog
            step1 = driver.find_element(By.CSS_SELECTOR, 'a#downloadDirect')
            step1.click()

            # Just leave the default resolution and click download
            step2 = driver.find_element(By.CSS_SELECTOR, 'a#downloadResLink')
            step2.click()

            # Back to the top-level document to use the page navigation
            driver.switch_to.default_content()
            con_tinue = find_next_page(driver, con_tinue)

            # Add a short delay to ensure the page loads completely before proceeding
            time.sleep(2)  # You can adjust this delay as needed
        except Exception:
            # `Exception` rather than a bare `except`, so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit stop the scrape instead of
            # restarting it.
            print("An error occurred with the Selenium package. Starting again from", cur_url)
            driver.quit()
            # The restarted run handles the remainder of the book. Return
            # here so this frame does not keep looping on the closed driver.
            return scrape_book(cur_url)

Binary file added dist/cbs-hist-downloader-1.0.1.tar.gz
Binary file not shown.
Binary file added dist/cbs-hist-downloader-1.0.2.tar.gz
Binary file not shown.
Binary file added dist/cbs_hist_downloader-1.0.1-py3-none-any.whl
Binary file not shown.
Binary file added dist/cbs_hist_downloader-1.0.2-py3-none-any.whl
Binary file not shown.
6 changes: 6 additions & 0 deletions instructions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
1. Make changes
2. Change the version number in setup.py
3. run python setup.py sdist bdist_wheel (or python -m build)
4. run twine upload dist/*
5. download the new version of the package from pip again (pip install cbs-hist-downloader==new_vers)
6. Commit the stuff to github
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"
Loading

0 comments on commit e5bfa20

Please sign in to comment.