From 7fc8e8d0afdeb3556f9149bc0034521fe762e6ed Mon Sep 17 00:00:00 2001
From: KMicha
Date: Mon, 14 Nov 2022 16:45:22 +0100
Subject: [PATCH] add code to use already collected data

---
 .github/workflows/manually.yml |  41 +++++++++
 manually.py                    | 157 +++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 .github/workflows/manually.yml
 create mode 100644 manually.py

diff --git a/.github/workflows/manually.yml b/.github/workflows/manually.yml
new file mode 100644
index 0000000000..da98e55b7d
--- /dev/null
+++ b/.github/workflows/manually.yml
@@ -0,0 +1,41 @@
+# This is a basic workflow to help you get started with Actions
+
+name: manually:0.0
+
+on:
+  workflow_dispatch:
+
+jobs:
+  docker-run-action:
+    runs-on: ubuntu-latest
+    container:
+      image: tamboraorg/crecoding:2020.0
+      env:
+        NEWSAPI_KEY: ${{ secrets.NEWSAPI_KEY }}
+      volumes:
+        - ${{ github.workspace }}:/cre/python
+    steps:
+      - uses: actions/checkout@v3
+#        with:
+#          ref: master
+      - name: Set ownership
+        run: |
+          # this is to fix git not liking the owner of the checkout dir
+          chown -R $(id -u):$(id -g) $PWD
+      - name: install aiohttp
+        run: (pip3 install aiohttp)
+      - name: install asyncio
+        run: (pip3 install asyncio)
+      # every step already starts in the checkout directory, which the container mounts as /cre/python
+      - name: cp mysecrets.py
+        run: (cp mysecrets.orig.py mysecrets.py)
+      - name: Run manually.py
+        env:
+          NEWSAPI_KEY: ${{ secrets.NEWSAPI_KEY }}
+        run: (python3 manually.py)
+      - name: Submit changes
+        uses: EndBug/add-and-commit@v9
+        with:
+#          pull: '--rebase --autostash ...'
+          add: 'csv/*.csv'
+          tag_push: '--force'
diff --git a/manually.py b/manually.py
new file mode 100644
index 0000000000..3a4deb95c3
--- /dev/null
+++ b/manually.py
@@ -0,0 +1,157 @@
+import pandas as pd
+import io
+import os
+import sys
+
+from pathlib import Path
+import os.path
+
+import aiohttp
+import asyncio
+import requests
+from urllib.parse import urlparse
+import json
+import time
+import smtplib
+import random
+import hashlib
+
+
+import datetime
+from dateutil import parser
+import re
+
+from bs4 import BeautifulSoup
+
+from deep_translator import GoogleTranslator
+from deep_translator import single_detection
+
+DATA_PATH = Path.cwd()
+
+# keywords.csv lists the search keywords together with their language and topic
+keywordsDF = pd.read_csv(DATA_PATH / 'keywords.csv', delimiter=',') #,index_col='keyword'
+keywordsDF['uniqueString'] = keywordsDF['keyword'] + "_" + keywordsDF['language'] + "_" + keywordsDF['topic']
+keywordsDF['crc'] = keywordsDF['uniqueString'].apply(
+    lambda x:
+    hashlib.sha256(x.encode()).hexdigest()
+)
+keywordsDF = keywordsDF.sort_values(by=['ratioNew'], ascending=False)
+
+
+
+# collected articles grouped per month: {'news_YYYY_MM.csv': {url: row, ...}}
+collectedNews = {}
+
+# add one article to the collection of its publication month;
+# returns True if the url has not been collected before
+def addNewsToCollection(data):
+    global collectedNews
+
+    year_month = '1970_01'
+    pubDate = None
+    try:
+        pubDate = parser.parse(data['published'])
+    except Exception:
+        print('date parse error 1: ' + str(data['published']))
+    if(not pubDate):
+        try:
+            pubDate = parser.isoparse(data['published'])
+        except Exception:
+            print('date parse error 2: ' + str(data['published']))
+    if(pubDate):
+        year_month = pubDate.strftime('%Y_%m')
+
+
+#    if(not data['language'] in collectedNews):
+#        collectedNews[data['language']] = {}
+    # load the month's csv file only once, so already stored articles are kept
+    fileDate = 'news_'+year_month+'.csv'
+    if(not fileDate in collectedNews):
+        if(os.path.isfile(DATA_PATH / 'csv' / fileDate)):
+            #df = pd.read_csv(DATA_PATH / fileDate, delimiter=',' ,index_col='url')
+            df = pd.read_csv(DATA_PATH / 'csv' / fileDate, delimiter=',', index_col='index')
+            collectedNews[fileDate] = df.to_dict('index')
+        else:
+            collectedNews[fileDate] = {}
+    if(not data['url'] in collectedNews[fileDate]):
+        #data = translateNews(data)
+        #print(data['en'])
+        #data = archiveUrl(data)
+        collectedNews[fileDate][data['url']] = data
+        return True
+    return False
+
+# index,url,valid,domain,title,description,image,published,archive,content,quote,language,keyword
+# write each month's collection back to csv/news_YYYY_MM.csv and reset the in-memory collection
+def storeCollection():
+    global collectedNews
+    cols = ['url','valid','domain','title','description','image','published','archive','content','quote','language','keyword']
+    for dateFile in collectedNews:
+        df = pd.DataFrame.from_dict(collectedNews[dateFile], orient='index', columns=cols)
+        #df.to_csv(DATA_PATH / dateFile, index=True)
+        df.to_csv(DATA_PATH / 'csv' / dateFile, index_label='index')
+    collectedNews = {}
+
+
+# download a csv file from the GitHub repository and return it sorted by publication date
+def getDFfromGitHub(url, delimiter=','):
+    stream = requests.get(url).content
+    dataframe = pd.read_csv(io.StringIO(stream.decode('utf-8')), delimiter=delimiter)
+    dataframe = dataframe.sort_values(by=['published'], ascending=True)
+    return dataframe
+
+# merge the already collected monthly csv files for 2022 into one DataFrame
+manualDF = pd.DataFrame(None)
+gitNames = ["news_2022_01.csv","news_2022_02.csv","news_2022_03.csv","news_2022_04.csv","news_2022_05.csv","news_2022_06.csv",
+            "news_2022_07.csv","news_2022_08.csv","news_2022_09.csv","news_2022_10.csv","news_2022_11.csv","news_2022_12.csv"]
+for gitName in gitNames:
+    gitUrl = "https://raw.githubusercontent.com/newsWhisperer/winterWeapon/main/csv/" + gitName
+    df = getDFfromGitHub(gitUrl)
+
+    if(manualDF.empty):
+        manualDF = df
+    else:
+        manualDF = pd.concat([manualDF, df])
+manualDF = manualDF.sort_values(by=['published'], ascending=True)
+manualDF['title'] = manualDF['title'].fillna('')
+manualDF['description'] = manualDF['description'].fillna('')
+print(manualDF)
+
+# keyword
+#
+
+# walk over the already collected articles; for a random sample (roughly a quarter
+# of the rows) pick a matching keyword and hand the article to addNewsToCollection
+counter = 0
+notFoundUrls = []
+for index, column in manualDF.iterrows():
+    #newData = {'url': column['url'], 'language':'de', 'valid':0, 'quote':'',
+    #           'content':'', 'archive':'', 'title':'','description':'', 'published':'1970-01-01T00:00:00'}
+    counter += 1
+    if((counter % 100) == 0):
+        print(counter)
+        storeCollection()
+    if(random.random() > 0.75):
+        newData = column
+        #print(column)
+        searchQuote = newData['title'] + " " + newData['description']
+        foundKeywords = []
+        found = False
+        # keep every keyword whose words all appear in title + description
+        for index2, column2 in keywordsDF.iterrows():
+            keyword = column2['keyword']
+            allFound = True
+            keywords = keyword.strip("'").split(" ")
+            for keyw in keywords:
+                allFound = allFound and (keyw in searchQuote)
+            if(allFound):
+                foundKeywords.append(keyword)
+                found = True
+        if(found):
+            newData['keyword'] = random.choice(foundKeywords)
+            addNewsToCollection(newData)
+
+storeCollection()
+#print(notFoundUrls)
+for xx in notFoundUrls:
+    print(xx)
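
A minimal sketch of how the monthly files written by this patch could be checked locally; it is an illustrative example, not part of the patch. It assumes pandas is installed, that manually.py has already been run in the repository root, and that csv/news_2022_11.csv therefore exists (the month is only an example). The column layout follows the header comment above storeCollection(), with each article's url stored in the 'index' column.

from pathlib import Path

import pandas as pd

DATA_PATH = Path.cwd()

# storeCollection() writes each article's url into the 'index' column of the monthly file
df = pd.read_csv(DATA_PATH / 'csv' / 'news_2022_11.csv', delimiter=',', index_col='index')

# how many articles were stored per keyword in that month
print(df['keyword'].value_counts())

# articles whose publication date could not be parsed end up in news_1970_01.csv
fallback = DATA_PATH / 'csv' / 'news_1970_01.csv'
if fallback.is_file():
    print('articles with unparsed dates:', len(pd.read_csv(fallback, index_col='index')))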