add code to use already collected data
KMicha committed Nov 14, 2022
1 parent 0117779 commit 7fc8e8d
Showing 2 changed files with 192 additions and 0 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/manually.yml
@@ -0,0 +1,42 @@
# This is a basic workflow to help you get started with Actions

name: manually:0.0

on:
  workflow_dispatch:

jobs:
  docker-run-action:
    runs-on: ubuntu-latest
    container:
      image: tamboraorg/crecoding:2020.0
      env:
        NEWSAPI_KEY: ${{ secrets.NEWSAPI_KEY }}
      volumes:
        - ${{ github.workspace }}:/cre/python
    steps:
      - uses: actions/checkout@v3
      - name: Set ownership
        run: |
          # this is to fix GIT not liking owner of the checkout dir
          chown -R $(id -u):$(id -g) $PWD
      #  with:
      #    ref: master
      - name: install aiohttp
        run: (pip3 install aiohttp)
      - name: install asyncio
        run: (pip3 install asyncio)
      - name: cd /cre/python/
        run: (cd /cre/python/)
      - name: cp mysecrets.py
        run: (cp mysecrets.orig.py mysecrets.py)
      - name: Run newsapi
        env:
          NEWSAPI_KEY: ${{ secrets.NEWSAPI_KEY }}
        run: (python3 manually.py)
      - name: Submit changes
        uses: EndBug/add-and-commit@v9
        with:
          # pull: '--rebase --autostash ...'
          add: 'csv/*.csv'
          tag_push: '--force'
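
Since the workflow only declares a workflow_dispatch trigger, it never runs on push or on a schedule: it has to be started by hand from the repository's Actions tab, via the GitHub CLI (gh workflow run manually.yml), or through the REST API. A minimal sketch of the REST call, assuming the workflow lives in the newsWhisperer/winterWeapon repository and that a token with permission to run workflows is available in GITHUB_TOKEN (both assumptions for illustration):

import os
import requests

# Assumptions for illustration: repository slug, branch name and token variable.
REPO = "newsWhisperer/winterWeapon"
TOKEN = os.environ["GITHUB_TOKEN"]   # token with permission to run workflows

resp = requests.post(
    f"https://api.github.com/repos/{REPO}/actions/workflows/manually.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {TOKEN}",
    },
    json={"ref": "main"},            # branch the workflow run should use
)
resp.raise_for_status()              # GitHub responds with 204 No Content on success
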
150 changes: 150 additions & 0 deletions manually.py
@@ -0,0 +1,150 @@
import pandas as pd
import io
import os
import sys

from pathlib import Path
import os.path

import aiohttp
import asyncio
import requests
from urllib.parse import urlparse
import json
import time
import smtplib
import random
import hashlib


import datetime
from dateutil import parser
import re

from bs4 import BeautifulSoup

from deep_translator import GoogleTranslator
from deep_translator import single_detection

DATA_PATH = Path.cwd()

keywordsDF = pd.read_csv(DATA_PATH / 'keywords.csv', delimiter=',') #,index_col='keyword'
keywordsDF['uniqueString'] = keywordsDF['keyword'] + "_" + keywordsDF['language'] + "_" + keywordsDF['topic']
keywordsDF['crc'] = keywordsDF['uniqueString'].apply(
    lambda x:
        hashlib.sha256(x.encode()).hexdigest()
)
keywordsDF = keywordsDF.sort_values(by=['ratioNew'], ascending=False)
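# keywords.csv has to provide at least the columns used here: 'keyword',
# 'language', 'topic' and 'ratioNew'. The 'crc' column is a SHA-256 digest of
# "keyword_language_topic", i.e. a stable identifier for each keyword row.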



collectedNews = {}

def addNewsToCollection(data):
    global collectedNews

    year_month = '1970_01'
    pubDate = None
    try:
        pubDate = parser.parse(data['published'])
    except:
        print('date parse error 1')
    if(not pubDate):
        try:
            pubDate = parser.isoparse(data['published'])
        except:
            print('date parse error 2')
    if(pubDate):
        year_month = pubDate.strftime('%Y_%m')


    # if(not data['language'] in collectedNews):
    #     collectedNews[data['language']] = {}
    fileDate = 'news_'+year_month+'.csv'
    if(not fileDate in collectedNews):
        if(os.path.isfile(DATA_PATH / 'csv' / fileDate)):
            #df = pd.read_csv(DATA_PATH / fileDate, delimiter=',' ,index_col='url')
            df = pd.read_csv(DATA_PATH / 'csv' / fileDate, delimiter=',', index_col='index')
            collectedNews[fileDate] = df.to_dict('index')
        else:
            collectedNews[fileDate] = {}
    if(not data['url'] in collectedNews[fileDate]):
        #data = translateNews(data)
        #print(data['en'])
        #data = archiveUrl(data)
        collectedNews[fileDate][data['url']] = data
        return True
    return False
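
# addNewsToCollection() expects a mapping (dict or DataFrame row) keyed like the
# CSV columns listed in storeCollection() below; 'url' serves as the unique index
# and 'published' selects the monthly file news_YYYY_MM.csv. It returns True only
# when the URL has not been stored yet. A hypothetical record for illustration:
#   {'url': 'https://example.org/article', 'valid': 1, 'domain': 'example.org',
#    'title': '...', 'description': '...', 'image': '', 'published': '2022-11-14T12:00:00Z',
#    'archive': '', 'content': '', 'quote': '', 'language': 'en', 'keyword': 'example keyword'}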

# index,url,valid,domain,title,description,image,published,archive,content,quote,language,keyword
def storeCollection():
    global collectedNews
    cols = ['url','valid','domain','title','description','image','published','archive','content','quote','language','keyword']
    for dateFile in collectedNews:
        df = pd.DataFrame.from_dict(collectedNews[dateFile], orient='index', columns=cols)
        #df.to_csv(DATA_PATH / dateFile, index=True)
        df.to_csv(DATA_PATH / 'csv' / dateFile, index_label='index')
    collectedNews = {}


def getDFfromGitHub(url, delimiter=','):
    stream = requests.get(url).content
    dataframe = pd.read_csv(io.StringIO(stream.decode('utf-8')), delimiter=delimiter)
    dataframe = dataframe.sort_values(by=['published'], ascending=True)
    return dataframe

manualDF = pd.DataFrame(None)
gitNames = ["news_2022_01.csv","news_2022_02.csv","news_2022_03.csv","news_2022_04.csv","news_2022_05.csv","news_2022_06.csv",
            "news_2022_07.csv","news_2022_08.csv","news_2022_09.csv","news_2022_10.csv","news_2022_11.csv","news_2022_12.csv"]
for gitName in gitNames:
    gitUrl = "https://raw.githubusercontent.com/newsWhisperer/winterWeapon/main/csv/" + gitName
    df = getDFfromGitHub(gitUrl)

    if(manualDF.empty):
        manualDF = df
    else:
        manualDF = pd.concat([manualDF, df])

manualDF = manualDF.sort_values(by=['published'], ascending=True)
manualDF['title'] = manualDF['title'].fillna('')
manualDF['description'] = manualDF['description'].fillna('')
print(manualDF)

# keyword
#

counter = 0
notFoundUrls = []
for index, column in manualDF.iterrows():
    #newData = {'url': column['url'], 'language':'de', 'valid':0, 'quote':'',
    #           'content':'', 'archive':'', 'title':'','description':'', 'published':'1970-01-01T00:00:00'}
    counter += 1
    if((counter % 100) == 0):
        print(counter)
        storeCollection()
    if(random.random() > 0.75):
        newData = column
        #print(column)
        searchQuote = newData['title'] + " " + newData['description']
        foundKeywords = []
        found = False
        for index2, column2 in keywordsDF.iterrows():
            keyword = column2['keyword']
            allFound = True
            keywords = keyword.strip("'").split(" ")
            for keyw in keywords:
                allFound = allFound and (keyw in searchQuote)
            if(allFound):
                foundKeywords.append(keyword)
                found = True
        if(found):
            newData['keyword'] = random.choice(foundKeywords)
            addNewsToCollection(newData)
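
# Note: each run re-checks roughly a quarter of the already collected rows
# (random.random() > 0.75), matches title + description against keywords.csv,
# picks one of the matching keywords at random and re-files the article into
# its monthly CSV via addNewsToCollection(); storeCollection() below flushes
# whatever is still buffered.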

storeCollection()
#print(notFoundUrls)
for xx in notFoundUrls:
    print(xx)



