import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

def download_pdd(reference_number, download_folder="downloads"):
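    """Look up a CDM project by reference number on cdm.unfccc.int and download its PDD PDF."""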
    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # Step 1: Open the CDM project search page
        search_url = "https://cdm.unfccc.int/Projects/projsearch.html"
        driver.get(search_url)
        time.sleep(2)

        # Step 2: Enter the reference number into the search field
        search_input = driver.find_element(By.NAME, "Ref")
        search_input.send_keys(str(reference_number))

        # Step 3: Click the "Search" button
        search_button = driver.find_element(By.NAME, "button")
        search_button.click()
        time.sleep(3)  # Allow time for the search results to load

        # Step 4: Locate the table containing the project details
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.find_all("table", class_="formTable")  # Find all tables with class 'formTable'

        project_url = None
        for table in tables:
            # Find the first row whose title column links to a project page
            for row in table.find_all("tr"):
                columns = row.find_all("td")
                if len(columns) > 1:  # Ensure the row has multiple columns
                    project_link = columns[1].find("a")  # Title column (2nd column)
                    if project_link and "href" in project_link.attrs:
                        project_url = project_link["href"]
                        break
            if project_url:
                break  # Stop scanning further tables once a link has been found

        if not project_url:
            print(f"No project found for reference number {reference_number}.")
            return

        if not project_url.startswith("http"):
            project_url = "https://cdm.unfccc.int" + project_url  # Ensure full URL

        print(f"Project page found: {project_url}")

        # Step 5: Navigate to the project page
        driver.get(project_url)
        time.sleep(2)

        # Step 6: Find the first available <a> inside the project table (PDD link)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        project_table = soup.find("table", class_="formTable")  # Locate the main project table

        if not project_table:
            print("Could not find the project table on the project page.")
            return

        first_link = project_table.find("a", href=True)  # Find the first link that has an href

        if not first_link:
            print("PDD document link not found.")
            return

        pdd_url = first_link["href"]
        if not pdd_url.startswith("http"):
            pdd_url = "https://cdm.unfccc.int" + pdd_url  # Ensure full URL

        print(f"Downloading PDD from: {pdd_url}")

        # Step 7: Download the PDD PDF
        response = requests.get(pdd_url, stream=True)
        if response.status_code == 200:
            os.makedirs(download_folder, exist_ok=True)
            filename = os.path.join(download_folder, f"{reference_number}_PDD.pdf")
            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):  # Stream the file to disk in chunks
                    file.write(chunk)
            print(f"PDD downloaded successfully: {filename}")
        else:
            print(f"Failed to download PDD (HTTP {response.status_code}).")

    finally:
        driver.quit()

# Read the results.xlsx file
df = pd.read_excel("results.xlsx")

# Loop through the reference numbers in the Ref column
for reference_number in df['Ref']:
    download_pdd(reference_number)