import feedparser
import requests

# BeautifulSoup 4 lives in the bs4 package; fall back to the legacy
# BeautifulSoup 3 import if bs4 is not installed
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import pandas as pd
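
# Note: the scraper below calls requests.get(..., verify=False), so urllib3
# emits an InsecureRequestWarning for every request. Silencing it is optional
# (an extra tweak, not part of the original flow):
#   import urllib3
#   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)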


# Function to fetch the RSS feed and return the parsed result
def parseRSS(rss_url):
    return feedparser.parse(rss_url)
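
# feedparser.parse returns a dict-like result; the fields used below look
# roughly like this (illustrative values, not real feed data):
#   feed['items'][0]['title'] -> 'Some headline'
#   feed['items'][0]['link']  -> 'http://www.fakingnews.com/some-post'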


# Function grabs the RSS feed links and headlines (titles) and returns them as two lists
def get(rss_url):
    all_links = []
    all_titles = []
    feed = parseRSS(rss_url)
    for newsitem in feed['items']:
        all_links.append(newsitem['link'])
        all_titles.append(newsitem['title'])
    return all_links, all_titles
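
# Example of the return shape (hypothetical feed and values):
#   links, titles = get('http://example.com/feed')
#   links  -> ['http://example.com/post-1', ...]
#   titles -> ['Post 1 headline', ...]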


def fakenewsgenerator():

    # Fetch the article links and titles from the feed
    all_links, all_titles = get('http://www.fakingnews.com/feed')
    print(all_links)
    content = []
    for x in all_links:
        # verify=False skips TLS certificate validation for this site
        r = requests.get(x, verify=False)
        soup = BeautifulSoup(r.content, 'lxml')
        print(x)
        body = []
        try:
            # Collect the article text, skipping empty and whitespace-only
            # blocks ('\xa0' is a non-breaking space)
            for i in soup.findAll("div", {"class": "article-content"}):
                if i.get_text() not in ['', '\xa0']:
                    body.append(i.get_text())
        except Exception:
            # If the markup does not parse as expected, record an empty body
            # so the links and content lists stay aligned
            content.append('')
            continue

        body = ''.join(body)
        content.append(body)
        print(body)

    # Every article from this satire feed gets the same label
    all_labels = ["FAKE"] * len(all_titles)

    list_of_tuples = list(zip(all_links, content, all_titles, all_labels))
    df = pd.DataFrame(list_of_tuples, columns=['all_links', 'text', 'title', 'label'])
    # Writing .xlsx requires an Excel engine such as openpyxl to be installed
    df.to_excel("fakenewsgenerator.xlsx", index=False)


if __name__ == "__main__":
    fakenewsgenerator()
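
# A minimal sketch of running the scraper periodically with the `schedule`
# library, left commented out so a single run stays the default; the daily
# run time below is an assumption:
#   import schedule
#   import time
#   schedule.every().day.at("06:00").do(fakenewsgenerator)
#   while True:
#       schedule.run_pending()
#       time.sleep(60)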