fix bugs

MalinSt98 · Nov 24, 2022 · d8fada7 · d8fada7
1 parent ad4209d
commit d8fada7
Show file tree

Hide file tree

Showing 13 changed files with 55 additions and 19 deletions.
diff --git a/.github/workflows/bayes.yml b/.github/workflows/bayes.yml
@@ -33,7 +33,7 @@ jobs:
       - name: cd /cre/python/
         run: (cd /cre/python/)
       - name: Install HanTa textblob_de
-        run: (pip3 install HanTa textblob_de)
+        run: (pip3 install HanTa textblob_de scikit-learn)
       - name: Run bayes
         run: (python3 bayes.py)
       - name: Submit changes       

diff --git a/.github/workflows/diagrams.yml b/.github/workflows/diagrams.yml
@@ -32,6 +32,8 @@ jobs:
 #          ref: main
       - name: cd /cre/python/
         run: (cd /cre/python/)
+      - name: Install nltk
+        run: (pip3 install nltk scikit-learn)
       - name: Run diagrams
         run: (python3 diagrams.py)
       - name: Submit changes       

diff --git a/.github/workflows/entities.yml b/.github/workflows/entities.yml
@@ -33,7 +33,7 @@ jobs:
       - name: cd /cre/python/
         run: (cd /cre/python/)
       - name: Install spacy textblob_de
-        run: (pip3 install spacy textblob_de)
+        run: (pip3 install spacy textblob_de nltk)
       - name: Install download de_core_news_md
         run: (python3 -m spacy download de_core_news_md)
       - name: Run entities

diff --git a/HOWTO.md b/HOWTO.md
@@ -28,7 +28,7 @@ The harvester works best with about 20 search-terms (called keyword here) organi
 
 ![image of organisations](images/gh_organisation.png)
 
-* Create a new organization with unique name - i.e. choose your username and append '_news', like 'kmicha_news'
+* Create a new organization with unique name - i.e. choose your username and append '-news', like 'kmicha-news'
 
 ![image of organizations](images/gh_new_org.png)
 
@@ -84,5 +84,25 @@ Attention: You can adapt the keywords later, but for now you should not remove a
 
 ### 7. Remove existing news_20yy_mm.csv files
 
+Inside your repository, go to Code, open the csv folder, then select each news_2022_mm.csv file (one after the other) and delete it.
+Each deletion must be committed before you delete the next file.
+
+![goto csv](images/gh_csv_folder.png)
+
+![select csv](images/gh_select_news_csv.png)
+
+![delete csv](images/gh_delete_news_csv.png)
+
+![submit csv](images/gh_submit_delete.png)
+
+
 ### 8.) Add the newsapi API-key to your organization
 
+Switch to your organization (the one ending with "-news") and go to the Settings tab.
+Scroll down until you find the Security section in the left side panel; there, open the Secrets section and click on Actions.
+
+![new secret](images/gh_secrets_new.png)
+
+Press the "New organization secret" button and add the newsapi-key inside the Value field. For the name use "NEWSAPI_KEY".
+
+![add secret](images/gh_secrets_add.png)
diff --git a/bayes.py b/bayes.py
@@ -8,6 +8,7 @@
 
 
 import nltk
+import sklearn
 from nltk.corpus import stopwords
 from HanTa import HanoverTagger as ht
 from textblob_de import TextBlobDE

diff --git a/diagrams.py b/diagrams.py
@@ -14,7 +14,8 @@
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 import matplotlib.gridspec as gridspec
-from mpl_toolkits.mplot3d import Axes3D
+#from mpl_toolkits.mplot3d import Axes3D
+from mpl_toolkits.mplot3d import axes3d, Axes3D
 import matplotlib.cm as cm
 
 from nltk.corpus import stopwords
@@ -473,8 +474,10 @@ def getDay(dateString):
         ca.append(column2['topicColor'])
         p += 1
 fig = plt.figure(figsize=(30, 20))
-ax = fig.gca(projection='3d')
-fig.subplots_adjust(left=0, right=1, bottom=0, top=1.5)
+## ax = Axes3D(fig)
+## ax = fig.gca(projection='3d')
+ax = fig.add_subplot(projection='3d')
+#fig.subplots_adjust(left=0, right=1, bottom=0, top=1.5)
 ticksx = germanTopicsDate.index.values.tolist()
 plt.xticks(ticksx, germanTopicsDate['Unnamed: 0'],rotation=63, fontsize=18)
 ticksy = np.arange(1, len(topicsColorsDF)+1, 1)

diff --git a/entities.py b/entities.py
@@ -13,12 +13,13 @@
 # python3 -m spacy download de_core_news_md
 #pip3 install textblob_de
 
+import nltk
 import spacy
 import de_core_news_md
 from textblob_de import TextBlobDE
 
 nlp = de_core_news_md.load()
-
+nltk.download('punkt')
 
 
 DATA_PATH = Path.cwd()

diff --git a/images/gh_csv_folder.png b/images/gh_csv_folder.png
diff --git a/images/gh_delete_news_csv.png b/images/gh_delete_news_csv.png
diff --git a/images/gh_select_news_csv.png b/images/gh_select_news_csv.png
diff --git a/images/gh_submit_delete.png b/images/gh_submit_delete.png
diff --git a/manually.py b/manually.py
@@ -21,10 +21,10 @@
 from dateutil import parser
 import re
 
-from bs4 import BeautifulSoup
+#from bs4 import BeautifulSoup
 
-from deep_translator import GoogleTranslator
-from deep_translator import single_detection
+#from deep_translator import GoogleTranslator
+#from deep_translator import single_detection
 
 DATA_PATH = Path.cwd()
 

diff --git a/newsapi.py b/newsapi.py
@@ -51,7 +51,8 @@ def getNewsDFbyList(files):
             newsDF = df
         else:
             newsDF = pd.concat([newsDF, df])
-    newsDF = newsDF.sort_values(by=['published'], ascending=True)        
+    if(not newsDF.empty):
+        newsDF = newsDF.sort_values(by=['published'], ascending=True)        
     return newsDF 
 
 def getNewsDF():
@@ -60,19 +61,24 @@ def getNewsDF():
     return newsDF     
 
 newsDf = getNewsDF()
-keywordsNewsDF = newsDf.groupby('keyword').count()
-keywordsNewsDF = keywordsNewsDF.drop(columns = ['language'])
+
+keywordsNewsDF = pd.DataFrame(None) 
+if(not newsDf.empty):
+  keywordsNewsDF = newsDf.groupby('keyword').count()
+  keywordsNewsDF = keywordsNewsDF.drop(columns = ['language'])
 
 '''
 newsDf['age'] = newsDf['published'].apply(
     lambda x: 
         datetime.datetime.now(datetime.timezone.utc) - parser.parse(x)
 )
 '''
-keywordsNewsDF2 = pd.merge(keywordsDF, keywordsNewsDF, how='left', left_on=['keyword'], right_on=['keyword'])
-keywordsNewsDF2['index'] = keywordsNewsDF2['index'].fillna(0)
-keywordsNewsDF2['index'] = keywordsNewsDF2['index'] - keywordsNewsDF2['ratioNew']
-keywordsNewsDF2 = keywordsNewsDF2.sort_values(by=['index'], ascending=True)  
+keywordsNewsDF2 = pd.DataFrame(None) 
+if(not keywordsNewsDF.empty):
+  keywordsNewsDF2 = pd.merge(keywordsDF, keywordsNewsDF, how='left', left_on=['keyword'], right_on=['keyword'])
+  keywordsNewsDF2['index'] = keywordsNewsDF2['index'].fillna(0)
+  keywordsNewsDF2['index'] = keywordsNewsDF2['index'] - keywordsNewsDF2['ratioNew']
+  keywordsNewsDF2 = keywordsNewsDF2.sort_values(by=['index'], ascending=True)  
 
 rows20 = int(math.ceil(keywordsNewsDF2.shape[0]/5))
 keywordsNewsDF2 = keywordsNewsDF2.head(rows20)
@@ -379,11 +385,14 @@ def inqRandomNews():
 
     rndKey = keywordsDF.sample()
     randomNumber = random.random()
+
     print(['randomNumber: ',randomNumber])
-    if(randomNumber>0.8):
+    if(not keywordsNewsDF2.empty):
+      if(randomNumber>0.8):
         print("DF2 seldoms")
         rndKey = keywordsNewsDF2.sample()
-    if(randomNumber<0.4): 
+    if(not keywordsDF3.empty):
+      if(randomNumber<0.4): 
         print("DF3 successors")
         rndKey = keywordsDF3.sample()
     #if FoundAny: newLimit = minimum(currPage+1,limitPage)