|
| 1 | +import feedparser |
| 2 | +import itertools |
| 3 | +import requests |
| 4 | +import schedule |
| 5 | +import re |
| 6 | + |
| 7 | +try: |
| 8 | + from bs4 import BeautifulSoup |
| 9 | +except ImportError: |
| 10 | + from BeautifulSoup import BeautifulSoup |
| 11 | +import pandas as pd |
| 12 | + |
# Times of India RSS feeds to crawl, keyed by a human-readable section name.
# Only the URLs are consumed by the crawl; the names are labels for readers of
# this file. Keys must stay unique: a repeated key in a dict literal silently
# discards the earlier value, so that feed would never be fetched.
feeds = {
    'Home': "https://timesofindia.indiatimes.com/rss.cms",
    'Top stories': "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",
    # NOTE(review): this URL is identical to the 'India' entry below, so the
    # same feed is requested twice -- confirm the intended URL for this section.
    'Most Recent Stroies': "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
    'India': "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
    # This entry used to be keyed 'NRI', colliding with the 'NRI' entry further
    # down, which overwrote it; the URL was therefore never crawled.
    # NOTE(review): 296589292 is believed to be the World feed -- confirm.
    'World': "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
    'Business': "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",
    'Cricket': "https://timesofindia.indiatimes.com/rssfeeds/4719161.cms",
    'Sports': "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",
    'Health': "https://timesofindia.indiatimes.com/rssfeeds/3908999.cms",
    'Science': "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms",
    'Environment': "https://timesofindia.indiatimes.com/rssfeeds/2647163.cms",
    'Tech': "https://timesofindia.indiatimes.com/rssfeeds/5880659.cms",
    'Education': "https://timesofindia.indiatimes.com/rssfeeds/913168846.cms",
    'Mumbai': "https://timesofindia.indiatimes.com/rssfeeds/-2128838597.cms",
    'Delhi': "https://timesofindia.indiatimes.com/rssfeeds/-2128839596.cms",
    'Bangalore': "https://timesofindia.indiatimes.com/rssfeeds/-2128833038.cms",
    'Hyderabad': "https://timesofindia.indiatimes.com/rssfeeds/-2128816011.cms",
    'Chennai': "https://timesofindia.indiatimes.com/rssfeeds/2950623.cms",
    'Ahemdabad': "https://timesofindia.indiatimes.com/rssfeeds/-2128821153.cms",
    'Allahabad': "https://timesofindia.indiatimes.com/rssfeeds/3947060.cms",
    'Bhubaneswar': "https://timesofindia.indiatimes.com/rssfeeds/4118235.cms",
    'Coimbatore': "https://timesofindia.indiatimes.com/rssfeeds/7503091.cms",
    'Gurgaon': "https://timesofindia.indiatimes.com/rssfeeds/6547154.cms",
    'Guwahati': "https://timesofindia.indiatimes.com/rssfeeds/4118215.cms",
    'Hubli': "https://timesofindia.indiatimes.com/rssfeeds/3942695.cms",
    'Kanpur': "https://timesofindia.indiatimes.com/rssfeeds/3947067.cms",
    'Kolkata': "https://timesofindia.indiatimes.com/rssfeeds/-2128830821.cms",
    'Ludhiana': "https://timesofindia.indiatimes.com/rssfeeds/3947051.cms",
    'Mangalore': "https://timesofindia.indiatimes.com/rssfeeds/3942690.cms",
    'Mysore': "https://timesofindia.indiatimes.com/rssfeeds/3942693.cms",
    'Noida': "https://timesofindia.indiatimes.com/rssfeeds/8021716.cms",
    'Pune': "https://timesofindia.indiatimes.com/rssfeeds/-2128821991.cms",
    'Goa': "https://timesofindia.indiatimes.com/rssfeeds/3012535.cms",
    'Chandigarh': "https://timesofindia.indiatimes.com/rssfeeds/-2128816762.cms",
    'Lucknow': "https://timesofindia.indiatimes.com/rssfeeds/-2128819658.cms",
    'Patna': "https://timesofindia.indiatimes.com/rssfeeds/-2128817995.cms",
    'Jaipur': "https://timesofindia.indiatimes.com/rssfeeds/3012544.cms",
    'Nagpur': "https://timesofindia.indiatimes.com/rssfeeds/442002.cms",
    'Rajkot': "https://timesofindia.indiatimes.com/rssfeeds/3942663.cms",
    'Ranchi': "https://timesofindia.indiatimes.com/rssfeeds/4118245.cms",
    'Surat': "https://timesofindia.indiatimes.com/rssfeeds/3942660.cms",
    'Vadodara': "https://timesofindia.indiatimes.com/rssfeeds/3942666.cms",
    'Varanasi': "https://timesofindia.indiatimes.com/rssfeeds/3947071.cms",
    'Thane': "https://timesofindia.indiatimes.com/rssfeeds/3831863.cms",
    'Thiruvananthapuram': "https://timesofindia.indiatimes.com/rssfeeds/878156304.cms",
    'US': "https://timesofindia.indiatimes.com/rssfeeds/30359486.cms",
    'NRI': "https://timesofindia.indiatimes.com/rssfeeds/7098551.cms",
    'Pakistan': "https://timesofindia.indiatimes.com/rssfeeds/30359534.cms",
    'South Asia': "https://timesofindia.indiatimes.com/rssfeeds/3907412.cms",
    'UK': "https://timesofindia.indiatimes.com/rssfeeds/2177298.cms",
    'Europe': "https://timesofindia.indiatimes.com/rssfeeds/1898274.cms",
    'China': "https://timesofindia.indiatimes.com/rssfeeds/1898184.cms",
    'Middle East': "https://timesofindia.indiatimes.com/rssfeeds/1898272.cms",
    'Rest of World': "https://timesofindia.indiatimes.com/rssfeeds/671314.cms",
}

# Parallel accumulators filled while crawling; index i of each list describes
# the same article.
all_links = []     # article URLs taken from each feed item's 'link'
all_category = []  # feed item titles (despite the name, not section names)
all_labels = []    # one "REAL" label per collected article
| 73 | + |
# Download the RSS feed behind a URL and hand back feedparser's parsed result.
def parseRSS(rss_url):
    """Parse the feed located at ``rss_url`` with feedparser and return it."""
    parsed_feed = feedparser.parse(rss_url)
    return parsed_feed
| 77 | + |
# Collect the link and headline (title) of every item in one RSS feed.
def get(rss_url):
    """Record each item of the feed at ``rss_url`` in the module-level lists.

    For every feed item that carries both a 'link' and a 'title', the link is
    appended to ``all_links`` and the title to ``all_category``. Items missing
    either field are skipped as a whole, so the two lists always stay the same
    length and index-aligned.

    Args:
        rss_url: URL of the RSS feed to read.

    Returns:
        list: the titles appended by this call, in feed order.
    """
    feed = parseRSS(rss_url)
    titles = []
    for newsitem in feed['items']:
        # Read both fields before touching the shared lists: appending the
        # link and then failing on the title would leave them misaligned.
        try:
            link = newsitem['link']
            title = newsitem['title']
        except KeyError:
            continue
        all_links.append(link)
        all_category.append(title)
        titles.append(title)
    return titles
| 86 | + |
| 87 | + |
# Iterate over the feed urls (the section names are not needed here).
for url in feeds.values():
    get(url)

# The same article can be listed by several feeds (and one feed URL appears
# twice in `feeds`). Keep only the first occurrence of each link so an article
# is downloaded once and written as a single row.
seen_links = set()
unique_items = []
for link, title in zip(all_links, all_category):
    if link in seen_links:
        continue
    seen_links.add(link)
    unique_items.append((link, title))

# Slice-assign so the existing list objects are updated in place.
all_links[:] = [link for link, _ in unique_items]
all_category[:] = [title for _, title in unique_items]

# Every collected article gets the label "REAL".
all_labels.extend(["REAL"] * len(all_category))
| 94 | + |
| 95 | + |
def _div_texts(soup, css_class):
    """Return the text of every <div> whose class is ``css_class``, skipping
    divs whose text is empty or a lone non-breaking space."""
    texts = []
    for div in soup.findAll("div", {"class": css_class}):
        text = div.get_text()
        if text not in ['', '\xa0']:
            texts.append(text)
    return texts


def _fetch_article_text(url):
    """Download one article page and return its body text.

    Returns '' when the page cannot be downloaded or when neither of the known
    body containers yields text, so the caller always gets exactly one value
    per URL.
    """
    try:
        # NOTE(security): verify=False turns off TLS certificate validation.
        # It is kept because some of the sites lack a valid certificate, but
        # it means the downloaded content is not authenticated.
        # The timeout keeps one unresponsive server from stalling the crawl.
        r = requests.get(url, verify=False, timeout=30)
    except requests.RequestException as exc:
        print("skipped", url, exc)
        return ''

    soup = BeautifulSoup(r.content, 'lxml')
    # Try the first container class; fall back to "Normal" if it gave nothing.
    # NOTE(review): the trailing space in the first class string is kept from
    # the original -- confirm it matches the live page markup.
    body = _div_texts(soup, "_3WlLe clearfix ")
    if not body:
        body = _div_texts(soup, "Normal")
    return ''.join(body)


# One entry per link, in the same order as all_links. An entry is appended
# even when a page fails, so the lists stay index-aligned when zipped into rows.
content = []
for x in all_links:
    print(x)
    content.append(_fetch_article_text(x))
| 120 | + |
# Build one (link, text, title, label) row per article; zip stops at the
# shortest of the four lists.
list_of_tuples = [
    (link, text, title, label)
    for link, text, title, label in zip(all_links, content, all_category, all_labels)
]

# Tabulate the rows and export them to an Excel workbook without the index.
df = pd.DataFrame(
    data=list_of_tuples,
    columns=['all_links', 'text', 'title', 'label'],
)
df.to_excel("times of india.xlsx", index=False)
0 commit comments