Skip to content

Commit 7724242

Browse files
committed
first commit
0 parents  commit 7724242

File tree

5 files changed

+159
-0
lines changed

5 files changed

+159
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/venv
2+
/downloads
3+

requirements.txt

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
appnope==0.1.4
2+
asttokens==3.0.0
3+
attrs==25.1.0
4+
beautifulsoup4==4.13.3
5+
bs4==0.0.2
6+
certifi==2025.1.31
7+
charset-normalizer==3.4.1
8+
comm==0.2.2
9+
debugpy==1.8.12
10+
decorator==5.1.1
11+
et_xmlfile==2.0.0
12+
executing==2.2.0
13+
h11==0.14.0
14+
idna==3.10
15+
ipykernel==6.29.5
16+
ipython==8.32.0
17+
jedi==0.19.2
18+
jupyter_client==8.6.3
19+
jupyter_core==5.7.2
20+
matplotlib-inline==0.1.7
21+
nest-asyncio==1.6.0
22+
numpy==2.2.2
23+
openpyxl==3.1.5
24+
outcome==1.3.0.post0
25+
packaging==24.2
26+
pandas==2.2.3
27+
parso==0.8.4
28+
pexpect==4.9.0
29+
platformdirs==4.3.6
30+
prompt_toolkit==3.0.50
31+
psutil==6.1.1
32+
ptyprocess==0.7.0
33+
pure_eval==0.2.3
34+
Pygments==2.19.1
35+
PySocks==1.7.1
36+
python-dateutil==2.9.0.post0
37+
python-dotenv==1.0.1
38+
pytz==2025.1
39+
pyzmq==26.2.1
40+
requests==2.32.3
41+
selenium==4.28.1
42+
six==1.17.0
43+
sniffio==1.3.1
44+
sortedcontainers==2.4.0
45+
soupsieve==2.6
46+
stack-data==0.6.3
47+
tornado==6.4.2
48+
traitlets==5.14.3
49+
trio==0.28.0
50+
trio-websocket==0.11.1
51+
typing_extensions==4.12.2
52+
tzdata==2025.1
53+
urllib3==2.3.0
54+
wcwidth==0.2.13
55+
webdriver-manager==4.0.2
56+
websocket-client==1.8.0
57+
wsproto==1.2.0

results.xlsx

15.5 KB
Binary file not shown.

scaper.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import os
2+
import time
3+
import requests
4+
from bs4 import BeautifulSoup
5+
from selenium import webdriver
6+
from selenium.webdriver.common.by import By
7+
from selenium.webdriver.chrome.service import Service
8+
from webdriver_manager.chrome import ChromeDriverManager
9+
import pandas as pd
10+
11+
def download_pdd(reference_number, download_folder="downloads"):
    """Download the PDD document for one CDM project reference number.

    Drives a headless Chrome session: opens the CDM project search page,
    searches for ``reference_number``, follows the first project link found
    in the results, takes the first link inside the project page's
    ``formTable`` and saves its target as
    ``<download_folder>/<reference_number>_PDD.pdf``.

    Progress and failures are reported with ``print``; the function returns
    ``None`` in every case.

    Args:
        reference_number: Project reference to search for. It is converted
            with ``str()`` before being typed into the search field and is
            also used to build the output filename.
        download_folder: Directory the PDF is written to; created if missing.

    Raises:
        selenium.common.exceptions.WebDriverException: If the browser cannot
            be started or an expected page element is missing.
        requests.exceptions.RequestException: If the download request fails
            or exceeds the timeout.
    """
    # Function-scope import so this block does not depend on edits elsewhere.
    from urllib.parse import urljoin

    request_timeout = 60  # seconds; without a timeout a stalled server hangs the batch

    # Set up Selenium WebDriver in headless mode.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        # Step 1: Open the CDM project search page.
        search_url = "https://cdm.unfccc.int/Projects/projsearch.html"
        driver.get(search_url)
        time.sleep(2)

        # Step 2: Enter the reference number into the search field.
        search_input = driver.find_element(By.NAME, "Ref")
        search_input.send_keys(str(reference_number))

        # Step 3: Click the "Search" button.
        search_button = driver.find_element(By.NAME, "button")
        search_button.click()
        time.sleep(3)  # Allow time for the search results to load

        # Step 4: Take the first link in the title column (2nd cell) of any
        # row of any 'formTable'.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        project_url = None
        for table in soup.find_all("table", class_="formTable"):
            for row in table.find_all("tr"):
                columns = row.find_all("td")
                if len(columns) > 1:
                    project_link = columns[1].find("a", href=True)
                    if project_link:
                        project_url = project_link["href"]
                        break
            # Stop at the first table that yielded a link; otherwise later
            # tables would silently overwrite the match.
            if project_url:
                break

        if not project_url:
            print(f"No project found for reference number {reference_number}.")
            return

        # Resolve relative hrefs against the page they were found on.
        project_url = urljoin(driver.current_url, project_url)
        print(f"Project page found: {project_url}")

        # Step 5: Navigate to the project page.
        driver.get(project_url)
        time.sleep(2)

        # Step 6: Find the first link inside the project table.
        # NOTE(review): this assumes the first link in the table is the PDD
        # document -- confirm against the live page layout.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        project_table = soup.find("table", class_="formTable")

        if not project_table:
            print("Could not find the project table on the project page.")
            return

        first_link = project_table.find("a", href=True)
        if not first_link:
            print("PDD document link not found.")
            return

        pdd_url = urljoin(driver.current_url, first_link["href"])
        print(f"Downloading PDD from: {pdd_url}")

        # Step 7: Download the PDD PDF. The body is fetched in full before
        # the file is opened, so a failed transfer leaves no partial file.
        with requests.get(pdd_url, timeout=request_timeout) as response:
            if response.status_code == 200:
                os.makedirs(download_folder, exist_ok=True)
                filename = os.path.join(download_folder, f"{reference_number}_PDD.pdf")
                with open(filename, "wb") as file:
                    file.write(response.content)
                print(f"PDD downloaded successfully: {filename}")
            else:
                print("Failed to download PDD.")

    finally:
        # Always shut the browser down, including on early return or error.
        driver.quit()
93+
94+
# Read the results.xlsx file
df = pd.read_excel("results.xlsx")

# Loop through the reference numbers in the Ref column, skipping blank cells
# (pandas loads them as NaN, which would otherwise be searched as "nan").
for reference_number in df["Ref"].dropna():
    # A blank cell anywhere in the column makes pandas load it as float;
    # turn 1234.0 back into 1234 so the search text and filename stay clean.
    if isinstance(reference_number, float) and reference_number.is_integer():
        reference_number = int(reference_number)
    download_pdd(reference_number)

0 commit comments

Comments
 (0)