webscraper_idsp.py
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 21 14:59:48 2019
@author: soumy
"""
# the IDSP site's SSL certificate can fail verification in Python,
# so certificate checking is disabled for this session
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import wget
from urllib import request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
import os
# connect to the IDSP page and collect all anchor tags that link to PDFs
base_url = "https://idsp.nic.in/index4.php?lang=1&level=0&linkid=406&lid=3689"
response = request.urlopen(base_url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'\.pdf'))
# clean the pdf link names, resolving relative hrefs against the page URL
url_list = []
for el in links:
    url_list.append(urljoin(base_url, el['href']))
print(url_list)
# download the pdfs to a specified location
folder_location = r'D:\webscraping_idsp_2019'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

for pdf_url in url_list[0:37]:  # change the slice as per your wish
    print(pdf_url)
    # keep only the file's own name, dropping any query string
    filename = os.path.basename(pdf_url.split('?')[0])
    fullfilename = os.path.join(folder_location, filename)
    wget.download(pdf_url, out=fullfilename)
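
# Optional alternative: the same download step can be done with the standard
# library's urllib.request.urlretrieve instead of the third-party wget package.
# This is only a sketch; the helper name download_pdfs_stdlib is not part of the
# original script, and it assumes url_list and folder_location as defined above.
from urllib.request import urlretrieve

def download_pdfs_stdlib(urls, dest_folder):
    """Save each PDF URL into dest_folder, keeping the remote file name."""
    for pdf_url in urls:
        filename = os.path.basename(pdf_url.split('?')[0])
        urlretrieve(pdf_url, os.path.join(dest_folder, filename))

# Uncomment to use the standard-library variant instead of wget.download above:
# download_pdfs_stdlib(url_list[0:37], folder_location)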