'''
To use this script, first install the three third-party packages it relies on
(beautifulsoup4, requests, and lxml) with pip or another install method; csv and
datetime are part of the standard library.
'''
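# A minimal install command, using the standard PyPI names for these imports:
#   pip install beautifulsoup4 requests lxml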
import csv
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# download the raw HTML of the news listing page
response = requests.get('https://www.kennesaw.edu/news/news-releases/index.php?&p=1', timeout=30)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
source = response.text
# parse the page text with the lxml parser
soup = BeautifulSoup(source, 'lxml')
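# (html.parser, which ships with Python, would also work as a drop-in
# alternative here if lxml is unavailable: BeautifulSoup(source, 'html.parser'))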
# create a dated csv file in "w" write mode so we can add content to it;
# newline="" stops the csv module from writing blank rows on Windows
ksu_news_csv = open("ksu_news {:%B %d, %Y}.csv".format(datetime.now()), "w", newline="", encoding="utf-8")
csv_writer = csv.writer(ksu_news_csv)
# write the header row into our csv file
csv_writer.writerow(["Number", "Title", "URL", "Date"])
# uncomment to print the retrieved HTML so we can find the unique tags around
# the content we want
#print(soup.prettify())

# the news items live in a <ul class="blog_listing">
blog_list = soup.find('ul', class_='blog_listing')
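# find() returns None when the tag is missing; this guard is an addition
# (assuming the page keeps the blog_listing class) so the script fails with a
# clear message instead of an AttributeError below
if blog_list is None:
    raise SystemExit("Could not find the 'blog_listing' list on the page")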
#print(blog_list.prettify())
# each news item is an <li> inside that list
blog_posts = blog_list.find_all('li')

# walk the posts, numbering each row from 1
for i, blog_post in enumerate(blog_posts, start=1):

    #print(i)
    # the headline is the post's <h3>
    title = blog_post.h3.text
    #print(title)

    # the date is the post's first <p>; strip whitespace and stray quote marks
    date = blog_post.p.text.strip().strip('"').strip()
    #print(date)

    # the link to the full story is the first <a> tag's href attribute
    URL = blog_post.find('a')['href']
    #print(URL)
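    # NOTE: if the site ever emits relative hrefs, resolve them against the
    # base URL first; a sketch, assuming relative links are possible here:
    #   from urllib.parse import urljoin
    #   URL = urljoin('https://www.kennesaw.edu', URL)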

    # write this post as one row; enumerate() above supplies the row number,
    # so no manual counter is needed
    csv_writer.writerow([i, title, URL, date])

ksu_news_csv.close()
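# The p=1 query parameter suggests the listing is paginated. A minimal sketch
# for scraping several pages (an assumption: later pages use p=2, p=3, ... with
# the same markup):
#
#   for page in range(1, 6):
#       source = requests.get(
#           'https://www.kennesaw.edu/news/news-releases/index.php?&p=' + str(page),
#           timeout=30,
#       ).text
#       soup = BeautifulSoup(source, 'lxml')
#       # ...then repeat the parsing steps above for each page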