Skip to content

Commit

Permalink
fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
KMicha committed Nov 24, 2022
1 parent ad4209d commit d8fada7
Show file tree
Hide file tree
Showing 13 changed files with 55 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bayes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
- name: cd /cre/python/
run: (cd /cre/python/)
- name: Install HanTa textblob_de
run: (pip3 install HanTa textblob_de)
run: (pip3 install HanTa textblob_de scikit-learn)
- name: Run bayes
run: (python3 bayes.py)
- name: Submit changes
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/diagrams.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ jobs:
# ref: main
- name: cd /cre/python/
run: (cd /cre/python/)
- name: Install nltk
run: (pip3 install nltk scikit-learn)
- name: Run diagrams
run: (python3 diagrams.py)
- name: Submit changes
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/entities.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
- name: cd /cre/python/
run: (cd /cre/python/)
- name: Install spacy textblob_de
run: (pip3 install spacy textblob_de)
run: (pip3 install spacy textblob_de nltk)
- name: Install download de_core_news_md
run: (python3 -m spacy download de_core_news_md)
- name: Run entities
Expand Down
22 changes: 21 additions & 1 deletion HOWTO.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ The harvester works best with about 20 search-terms (called keyword here) organi

![image of organisations](images/gh_organisation.png)

* Create a new organization with unique name - i.e. choose your username and append '_news', like 'kmicha_news'
* Create a new organization with a unique name - e.g. choose your username and append '-news', like 'kmicha-news'

![image of organizations](images/gh_new_org.png)

Expand Down Expand Up @@ -84,5 +84,25 @@ Attention: You can adapt the keywords later, but for now you should not remove a

### 7. Remove existing news_20yy_mm.csv files

Inside your repository, go to Code, then inside the csv folder, select each news_2022_mm.csv file (one after the other) and delete it.
After each deletion, a commit must be made.

![goto csv](images/gh_csv_folder.png)

![select csv](images/gh_select_news_csv.png)

![delete csv](images/gh_delete_news_csv.png)

![submit csv](images/gh_submit_delete.png)


### 8. Add the newsapi API-key to your organization

Switch to your organization (the one ending with "-news") and go to the Settings tab.
Scroll down until you find the Security section in the left side panel; there, open the Secrets entry and click on Actions.

![new secret](images/gh_secrets_new.png)

Press the "New organization secret" button and enter the newsapi key in the Value field. Use "NEWSAPI_KEY" as the name.

![add secret](images/gh_secrets_add.png)
1 change: 1 addition & 0 deletions bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@


import nltk
import sklearn
from nltk.corpus import stopwords
from HanTa import HanoverTagger as ht
from textblob_de import TextBlobDE
Expand Down
9 changes: 6 additions & 3 deletions diagrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
#from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import axes3d, Axes3D
import matplotlib.cm as cm

from nltk.corpus import stopwords
Expand Down Expand Up @@ -473,8 +474,10 @@ def getDay(dateString):
ca.append(column2['topicColor'])
p += 1
fig = plt.figure(figsize=(30, 20))
ax = fig.gca(projection='3d')
fig.subplots_adjust(left=0, right=1, bottom=0, top=1.5)
## ax = Axes3D(fig)
## ax = fig.gca(projection='3d')
ax = fig.add_subplot(projection='3d')
#fig.subplots_adjust(left=0, right=1, bottom=0, top=1.5)
ticksx = germanTopicsDate.index.values.tolist()
plt.xticks(ticksx, germanTopicsDate['Unnamed: 0'],rotation=63, fontsize=18)
ticksy = np.arange(1, len(topicsColorsDF)+1, 1)
Expand Down
3 changes: 2 additions & 1 deletion entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
# python3 -m spacy download de_core_news_md
#pip3 install textblob_de

import nltk
import spacy
import de_core_news_md
from textblob_de import TextBlobDE

nlp = de_core_news_md.load()

nltk.download('punkt')


DATA_PATH = Path.cwd()
Expand Down
Binary file added images/gh_csv_folder.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/gh_delete_news_csv.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/gh_select_news_csv.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/gh_submit_delete.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions manually.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
from dateutil import parser
import re

from bs4 import BeautifulSoup
#from bs4 import BeautifulSoup

from deep_translator import GoogleTranslator
from deep_translator import single_detection
#from deep_translator import GoogleTranslator
#from deep_translator import single_detection

DATA_PATH = Path.cwd()

Expand Down
27 changes: 18 additions & 9 deletions newsapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ def getNewsDFbyList(files):
newsDF = df
else:
newsDF = pd.concat([newsDF, df])
newsDF = newsDF.sort_values(by=['published'], ascending=True)
if(not newsDF.empty):
newsDF = newsDF.sort_values(by=['published'], ascending=True)
return newsDF

def getNewsDF():
Expand All @@ -60,19 +61,24 @@ def getNewsDF():
return newsDF

newsDf = getNewsDF()
keywordsNewsDF = newsDf.groupby('keyword').count()
keywordsNewsDF = keywordsNewsDF.drop(columns = ['language'])

keywordsNewsDF = pd.DataFrame(None)
if(not newsDf.empty):
keywordsNewsDF = newsDf.groupby('keyword').count()
keywordsNewsDF = keywordsNewsDF.drop(columns = ['language'])

'''
newsDf['age'] = newsDf['published'].apply(
lambda x:
datetime.datetime.now(datetime.timezone.utc) - parser.parse(x)
)
'''
keywordsNewsDF2 = pd.merge(keywordsDF, keywordsNewsDF, how='left', left_on=['keyword'], right_on=['keyword'])
keywordsNewsDF2['index'] = keywordsNewsDF2['index'].fillna(0)
keywordsNewsDF2['index'] = keywordsNewsDF2['index'] - keywordsNewsDF2['ratioNew']
keywordsNewsDF2 = keywordsNewsDF2.sort_values(by=['index'], ascending=True)
keywordsNewsDF2 = pd.DataFrame(None)
if(not keywordsNewsDF.empty):
keywordsNewsDF2 = pd.merge(keywordsDF, keywordsNewsDF, how='left', left_on=['keyword'], right_on=['keyword'])
keywordsNewsDF2['index'] = keywordsNewsDF2['index'].fillna(0)
keywordsNewsDF2['index'] = keywordsNewsDF2['index'] - keywordsNewsDF2['ratioNew']
keywordsNewsDF2 = keywordsNewsDF2.sort_values(by=['index'], ascending=True)

rows20 = int(math.ceil(keywordsNewsDF2.shape[0]/5))
keywordsNewsDF2 = keywordsNewsDF2.head(rows20)
Expand Down Expand Up @@ -379,11 +385,14 @@ def inqRandomNews():

rndKey = keywordsDF.sample()
randomNumber = random.random()

print(['randomNumber: ',randomNumber])
if(randomNumber>0.8):
if(not keywordsNewsDF2.empty):
if(randomNumber>0.8):
print("DF2 seldoms")
rndKey = keywordsNewsDF2.sample()
if(randomNumber<0.4):
if(not keywordsDF3.empty):
if(randomNumber<0.4):
print("DF3 successors")
rndKey = keywordsDF3.sample()
#if FoundAny: newLimit = minimum(currPage+1,limitPage)
Expand Down

0 comments on commit d8fada7

Please sign in to comment.