-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathguardian.py
103 lines (81 loc) · 3.21 KB
/
guardian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
Author : Vishak Raj
E-mail ID : [email protected]
Fetches news articles from The Guardian newspaper and saves them.
The Guardian API has usage limitations; to get full access, contact The Guardian.
"""
import json
import requests
import html2text,pandas as pd
import csv,xlsxwriter
from bs4 import BeautifulSoup
from openpyxl import load_workbook
import re
import schedule
import time
import xlwt
# Refer to the docs - https://open-platform.theguardian.com/documentation/
def guardian():
    """Download Guardian articles for a fixed query and save them to Excel.

    Pages through the Guardian Content API search endpoint using the
    hard-coded query below, converts every article's ``bodyText`` with
    ``html2text``, strips link remnants, and writes one row per article --
    ``(cleaned body, query term)`` -- to ``q.xlsx`` in the current working
    directory.

    NOTE: ``q.xlsx`` is overwritten on every run; rows from earlier runs
    are not kept.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example while ``API_KEY`` is still the empty placeholder).
        requests.RequestException: on connection problems or timeouts.
    """
    API_KEY = ""  # get your api-key from https://open-platform.theguardian.com/access/
    # HTTPS so the api-key query parameter is not sent in clear text.
    API_ENDPOINT = 'https://content.guardianapis.com/search'
    # Key order is kept as before so the generated request URL has the same
    # parameter order.
    my_params = {
        'q': "bank",  # q is query
        'from-date': "2019-08-19",  # begin-date
        'to-date': "2019-08-19",  # finish-date
        'order-by': "oldest",
        'show-fields': "all",
        'page-size': 10,
        'api-key': API_KEY,
        'tag': "politics/politics",
        'section': "politics",
        'lang': "en",
    }

    # Removes "(https:..." link targets and any whitespace directly followed
    # by ")" from the converted text; compiled once for all articles.
    link_pattern = re.compile(r"\(https\:\S+|\s+\)")

    rows = []
    current_page = 1
    total_pages = 1  # real value is only known after the first response
    while current_page <= total_pages:
        print("...page", current_page)
        my_params['page'] = current_page
        # A timeout keeps a stalled connection from blocking the job forever.
        resp = requests.get(API_ENDPOINT, params=my_params, timeout=30)
        print(resp.url)
        print("\n")
        # Fail with a clear HTTP error instead of a KeyError on the payload.
        resp.raise_for_status()
        payload = resp.json()['response']
        current_page += 1
        total_pages = payload['pages']
        for item in payload["results"]:
            body = item.get("fields", {}).get("bodyText")
            if body is None:
                # Result without body text: skip it rather than abort the run.
                continue
            # The earlier code called body.replace("\n", "") here but threw
            # the result away, so it never had any effect; it is dropped to
            # keep the saved text unchanged.
            text = link_pattern.sub("", html2text.html2text(body))
            rows.append((text, my_params["q"]))

    df = pd.DataFrame(rows, index=None, columns=None)
    df.to_excel("q.xlsx", index=False, header=False)  # it overwrites the excel file
# Specify the time of day at which guardian() is run automatically.
daily_job = schedule.every().day.at("17:21")
daily_job.do(guardian)

# Keep the process alive, checking once per second whether the job is due.
while True:
    schedule.run_pending()
    time.sleep(1)