scrapping.py
import os

import requests
from bs4 import BeautifulSoup
import numpy as np

# Fetch the index page that lists the links to the individual table pages.
gral_page = requests.get("https://biblioteca.colmex.mx/index.php/induccion")
content = BeautifulSoup(gral_page.content, 'html.parser')

# The links of interest live inside the "ja-slideshow" container.
div_colums = content.find(id="ja-slideshow")
links = div_colums.find_all("a")

# Make sure the output directory exists before writing any CSV files.
os.makedirs('csv', exist_ok=True)

for link in links:
    page = requests.get("https://biblioteca.colmex.mx" + link.get('href'))
    soup = BeautifulSoup(page.content, 'html.parser')
    tabla = soup.find(id="tabla")

    # Collect the header row (<th> cells), quoting each value.
    all_texts = []
    td_texts = []
    for th_cell in tabla.find_all('th'):
        td_texts.append('"' + th_cell.text.strip() + '"')
    all_texts.append(td_texts)

    # Collect every data row (<td> cells), skipping rows that have no <td>.
    for tr_row in tabla.find_all('tr'):
        td_texts = []
        for td_cell in tr_row.find_all('td'):
            td_texts.append('"' + td_cell.text.strip() + '"')
        if len(td_texts) != 0:
            all_texts.append(td_texts)

    # Write one CSV per link, named after the link text.
    # np.savetxt assumes every row has the same number of columns.
    np.savetxt('csv/%s.csv' % link.get_text(), all_texts, delimiter=',', fmt='%s')