Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
KMicha committed Nov 22, 2022
1 parent 7fc8e8d commit 20657e6
Show file tree
Hide file tree
Showing 28 changed files with 6,842 additions and 62 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/bayes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

# This is a basic workflow to help you get started with Actions

name: bayes:0.0

on:
  push:
    branches:
      - 'main'
  workflow_dispatch:
  schedule:
    # * is a special character in YAML so you have to quote this string :
    # weekly #m h d/m m d/w
    # - cron: '30 1 * * 1'
    # monthly, on the 3rd day
    - cron: '40 8 3 * *'

jobs:
  docker-run-action:
    runs-on: ubuntu-latest
    container:
      image: tamboraorg/crecoding:2020.0
      volumes:
        # mount the checkout into the container at /cre/python
        - ${{ github.workspace }}:/cre/python
    steps:
      - uses: actions/checkout@v3
      - name: Set ownership
        run: |
          # this is to fix GIT not liking owner of the checkout dir
          chown -R $(id -u):$(id -g) $PWD
      # with:
      #   ref: main
      # NOTE(review): the cd below runs in a throwaway subshell and does NOT
      # change the working directory of later steps — confirm intent
      # (later steps appear to rely on the default workdir instead).
      - name: cd /cre/python/
        run: (cd /cre/python/)
      - name: Run diagrams
        run: (python3 bayes.py)
      - name: Submit changes
        uses: EndBug/add-and-commit@v9
        with:
          # pull: '--rebase --autostash ...'
          add: 'img/words_bayes_*.png csv/words_bayes_*.csv'
          tag_push: '--force'
6 changes: 4 additions & 2 deletions .github/workflows/diagrams.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ on:
schedule:
# * is a special character in YAML so you have to quote this string :
# weekly #m h d/m m d/w
- cron: '30 1 * * 1'
# - cron: '30 1 * * 1'
# daily #m h d/m m d/w
- cron: '30 1 * * *'

jobs:
docker-run-action:
Expand All @@ -36,5 +38,5 @@ jobs:
uses: EndBug/add-and-commit@v9
with:
# pull: '--rebase --autostash ...'
add: 'img/*.png'
add: 'img/*.png csv/topics_date.csv'
tag_push: '--force'
42 changes: 42 additions & 0 deletions .github/workflows/entities.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

# This is a basic workflow to help you get started with Actions

name: entities:0.0

on:
  push:
    branches:
      - 'main'
  workflow_dispatch:
  schedule:
    # * is a special character in YAML so you have to quote this string :
    # weekly #m h d/m m d/w
    - cron: '10 1 * * 1'
    # daily #m h d/m m d/w
    # - cron: '30 1 * * *'

jobs:
  docker-run-action:
    runs-on: ubuntu-latest
    container:
      image: tamboraorg/crecoding:2020.0
      volumes:
        # mount the checkout into the container at /cre/python
        - ${{ github.workspace }}:/cre/python
    steps:
      - uses: actions/checkout@v3
      - name: Set ownership
        run: |
          # this is to fix GIT not liking owner of the checkout dir
          chown -R $(id -u):$(id -g) $PWD
      # with:
      #   ref: main
      # NOTE(review): the cd below runs in a throwaway subshell and does NOT
      # change the working directory of later steps — confirm intent.
      - name: cd /cre/python/
        run: (cd /cre/python/)
      - name: Run diagrams
        run: (python3 entities.py)
      - name: Submit changes
        uses: EndBug/add-and-commit@v9
        with:
          # pull: '--rebase --autostash ...'
          add: 'csv/sentiments_*.csv'
          tag_push: '--force'
10 changes: 6 additions & 4 deletions .github/workflows/harvest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ on:
workflow_dispatch:
schedule:
# * is a special character in YAML so you have to quote this string :
# daily,every 6 hours #m h d/m m d/w
- cron: '00 */6 * * *'
# daily,every 5 hours #m h d/m m d/w
- cron: '00 */5 * * *'
# every 2 hour
# - cron: '20 */2 * * *'

# - cron: '10 */2 * * *'
# every month 2.nd day
# - cron: '40 8 2 * *'

jobs:
docker-run-action:
runs-on: ubuntu-latest
Expand Down
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,30 @@ To set up your own harvester read the [HOWTO.md](https://github.com/newsWhisperer/

# winterIsComing

# Topics & Keywords

![Pie](img/keywords_pie_all.png)

# Topic Analysis

![LDA](img/topics_lda.png)

![NMF](img/topics_nmf.png)

# Entities

![Domains](img/domains_count.png)

![Persons](img/persons_count.png)

![Organizations](img/organizations_count.png)

![Locations](img/locations_count.png)

# Timeline

![DateTopic](img/dates_topics_article_count.png)




231 changes: 231 additions & 0 deletions bayes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import pandas as pd

from pathlib import Path
import os.path
import io
#import requests
import glob


import nltk
from nltk.corpus import stopwords
from HanTa import HanoverTagger as ht
from textblob_de import TextBlobDE
import math
import re
import random

from sklearn.decomposition import PCA

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm

# All inputs/outputs live under the current working directory; make sure the
# csv/ and img/ output folders exist before anything tries to write there.
DATA_PATH = Path.cwd()
for subdir in ('csv', 'img'):
    target = DATA_PATH / subdir
    if not os.path.exists(target):
        os.mkdir(target)

def getNewsFiles():
    """Return the paths of all harvested monthly news files (./csv/news_YYYY_MM.csv)."""
    return glob.glob('./csv/news_????_??.csv')

def getNewsDFbyList(files):
    """Load the given news CSV files into one DataFrame sorted by 'published'.

    files: iterable of CSV paths, each with (at least) a 'published' column.
    Returns an empty DataFrame when *files* is empty (the previous version
    raised KeyError from sort_values in that case).
    """
    # Read everything first and concatenate once — repeated pd.concat inside
    # the loop copies all previously-read rows each iteration (quadratic).
    frames = [pd.read_csv(file, delimiter=',') for file in files]
    if not frames:
        return pd.DataFrame(None)
    newsDF = pd.concat(frames)
    return newsDF.sort_values(by=['published'], ascending=True)

def getNewsDF():
    """Load every harvested news CSV into one DataFrame, sorted by publish date."""
    return getNewsDFbyList(getNewsFiles())

# Hand-maintained keyword -> topic/color table shipped with the repo.
keywordsColorsDF = pd.read_csv(DATA_PATH / 'keywords.csv', delimiter=',')
# One row per distinct topic (first occurrence wins) — used for colors/legend.
topicsColorsDF = keywordsColorsDF.drop_duplicates(subset=['topic'])

newsDf = getNewsDF()
print(newsDf)

# German NLP pipeline: NLTK tokenizers/stopwords + HanTa lemmatizer.
language = 'ger'
nltk.download('punkt')
nltk.download('stopwords')
tagger = ht.HanoverTagger('morphmodel_'+language+'.pgz')
german_stop_words = set(stopwords.words('german'))

def generateTokensWithPosition(quote):
    """Yield [token, charOffset] pairs for the content words of *quote*.

    Tokenizes sentence-by-sentence with NLTK, lemmatizes via the HanTa
    tagger, and skips stop words and tokens of <= 2 characters.  Besides
    each surface word it also yields the bigram 'previousLemma currentLemma'
    at the current word's offset.  Offsets come from str.find, so repeated
    sentences/words map to their first occurrence — TODO confirm acceptable.
    """
    sentences = nltk.sent_tokenize(quote,language='german')
    for sentence in sentences:
        positionSentence = quote.find(sentence)
        lastWord = None
        tokens = nltk.tokenize.word_tokenize(sentence,language='german')
        # taglevel=2: tag_sent yields (original, lemma, tag) triples —
        # confirm against HanTa docs.
        lemmata = tagger.tag_sent(tokens,taglevel = 2)
        for (orig,lemma,gramma) in lemmata:
            if(len(orig)>2):
                if(not orig in german_stop_words):
                    positionWord = sentence.find(orig)
                    yield [orig, positionSentence+positionWord]
                    if(lastWord):
                        # bigram of the two most recently kept lemmas
                        yield [(lastWord+' '+lemma), positionSentence+positionWord]
                    lastWord = lemma


# Template counter: one bucket per topic plus a running 'summary' total.
emptyTopics = {'summary':0}
for index2, column2 in keywordsColorsDF.iterrows():
    topic = column2['topic']
    emptyTopics[topic] = 0

i=0
# topicWordsAbs[token][topic] accumulates distance-weighted evidence that
# *token* co-occurs with that topic's keywords; the special key
# 'summaryOfAllWords' holds corpus-wide totals.
topicWordsAbs = {'summaryOfAllWords': emptyTopics.copy()}
for index, column in newsDf.iterrows():
    i += 1
    if(i % 50 == 0):
        print(i)  # progress heartbeat

    quote = str(column.title)+' ' +str(column.description)+' '+str(column.content)
    #quote = str(column.title)+' ' +str(column.description)
    for tokenAndPosition in generateTokensWithPosition(quote):
        token = tokenAndPosition[0]
        tokenPosition = tokenAndPosition[1]
        if(not token in topicWordsAbs):
            topicWordsAbs[token] = emptyTopics.copy()
        for index2, column2 in keywordsColorsDF.iterrows():
            found = 0.0
            # Articles were harvested for a specific keyword: give a small
            # baseline score when this table row matches the article's keyword.
            if(column2['keyword'] == column['keyword']):
                found = 0.05
            keywords = column2['keyword'].strip("'").split(" ")
            topic = column2['topic']
            for keyword in keywords:
                if(keyword in quote):
                    # Weight decays with char distance between token and the
                    # nearest keyword hit; keep the best (max) factor.
                    # NOTE(review): keyword is used as a regex pattern here —
                    # metacharacters would change matching; consider re.escape.
                    for keyPosition in [m.start() for m in re.finditer(keyword, quote)]:
                        distance = abs(tokenPosition - keyPosition)
                        factor = math.sqrt(1/(1+distance*0.25))/len(keywords)
                        if(factor>found):
                            found = factor
            topicWordsAbs[token][topic] += found
            topicWordsAbs[token]['summary'] += found
            topicWordsAbs['summaryOfAllWords'][topic] += found
            topicWordsAbs['summaryOfAllWords']['summary'] += found

# Corpus-wide baseline probability of each topic (share of total evidence).
overallProbability = emptyTopics.copy()
for topic in overallProbability:
    if(not topic == 'summary'):
        if(topicWordsAbs['summaryOfAllWords']['summary'] > 0):
            overallProbability[topic] = float(topicWordsAbs['summaryOfAllWords'][topic])/float(topicWordsAbs['summaryOfAllWords']['summary'])

## now increase all counting by sqrt(n), but minimum of overall probability
# Additive smoothing: bump each per-word topic count by sqrt(count + baseline)
# so rare words don't get extreme ratios; totals are kept consistent.
for word in topicWordsAbs:
    if(word != 'summaryOfAllWords'):
        data = topicWordsAbs[word]
        for topic in overallProbability:
            if(not topic == 'summary'):
                frac = overallProbability[topic]
                delta = math.sqrt(frac+topicWordsAbs[word][topic])
                topicWordsAbs[word][topic] += delta
                topicWordsAbs['summaryOfAllWords'][topic] += delta
                topicWordsAbs[word]['summary'] += delta
                topicWordsAbs['summaryOfAllWords']['summary'] += delta

# Build the per-word table of log Bayes ratios:
#   log( P(topic|word) / P(topic) )  — positive means the word is evidence
# for the topic relative to the corpus baseline.
emptyCol = emptyTopics.copy()
emptyCol['word'] = 'oneWord'
topicWordsRel = {}
for word in topicWordsAbs:
    if(word == 'summaryOfAllWords'):
        relData = topicWordsAbs[word].copy()
    else:
        data = topicWordsAbs[word]
        relData = emptyCol.copy()
        relData['word'] = word
        relData['summary'] = topicWordsAbs[word]['summary']
        for topic in data:
            if(not topic in ['word','summary']):
                # guard against log(0)/division by zero for unseen topics
                if(not topicWordsAbs['summaryOfAllWords'][topic] == 0):
                    if(topicWordsAbs['summaryOfAllWords'][topic]*topicWordsAbs[word]['summary'] > 0):
                        relValue = topicWordsAbs[word][topic]*topicWordsAbs['summaryOfAllWords']['summary']/(topicWordsAbs['summaryOfAllWords'][topic]*topicWordsAbs[word]['summary']) #Bayes
                        relData[topic] = math.log(relValue)
    topicWordsRel[word] = relData
# Index = word; columns = topics + 'summary' + 'word'.
topicWordsRelDF = pd.DataFrame.from_dict(topicWordsRel, orient='index', columns=emptyCol.keys())
topicWordsRelDF.to_csv(DATA_PATH / 'csv' / "words_bayes_topic_all.csv", index=True)

#PCA
# Project the per-word topic log-ratios down to a few components for plotting.
numberComponents = 5 #0.5*len(topics), minimum: 4
dfn = topicWordsRelDF.drop(columns = ['word'])

dfn['const0'] = 1.0
pca = PCA(n_components=numberComponents)
# NOTE(review): fit() followed by fit_transform() fits the model twice;
# fit_transform alone would suffice — confirm no reliance on the double fit.
pca.fit(dfn)
apca = pca.fit_transform(dfn)
dfpca = pd.DataFrame(apca)
dfpca['word'] = topicWordsRelDF.index
dfpca['summary'] = topicWordsRelDF['summary'].values
dfpca.to_csv(DATA_PATH / "csv" /"words_bayes_topic_pca.csv", index=False)

def combine_hex_values(d):
    """Blend several 'RRGGBB' hex colors (no leading '#') into one.

    d maps hex color strings to numeric weights; returns the weighted
    average color as a lowercase 'RRGGBB' string.
    """
    d_items = sorted(d.items())
    tot_weight = sum(d.values())
    # Average each 8-bit channel independently (string slices RR, GG, BB),
    # truncating to int exactly as before.
    mixed = (
        int(sum(int(color[lo:hi], 16) * weight for color, weight in d_items) / tot_weight)
        for lo, hi in ((0, 2), (2, 4), (4, 6))
    )
    # f-string ':02x' replaces the old hand-rolled zero-pad lambda.
    return ''.join(f'{channel:02x}' for channel in mixed)

# Word-map plot: each single-token word is drawn as text at its (jittered)
# PCA position, colored by the topic with the highest log-Bayes score.
plt.figure( figsize=(20,15) )
plt.xlim([-4, 4])
plt.ylim([-4, 4])
i=0
for index, column in dfpca.iterrows():
    i += 1
    if(i % 50 == 0):
        print(i)  # progress heartbeat
    # skip bigrams (they contain a space); plot single words only
    if(not " " in str(column['word'])):
        maxColor = '#000000'
        nxtColor = '#555555'
        maxprobabiliyty = -15 #log!
        nxtprobabiliyty = -15

        # First pass: find the topic with the highest log-ratio for this word.
        # ('in' on a Series checks the index, i.e. the word labels.)
        for index2, column2 in topicsColorsDF.iterrows():
            topic = column2['topic']
            if(str(column['word']) in topicWordsRelDF[topic]):
                if(topicWordsRelDF[topic][str(column['word'])]> maxprobabiliyty):
                    maxprobabiliyty = topicWordsRelDF[topic][str(column['word'])]
                    maxColor = column2['topicColor']
        # Second pass: runner-up topic (only used by the gray-out test below;
        # the color-blend line is commented out).
        for index2, column2 in topicsColorsDF.iterrows():
            topic = column2['topic']
            if(str(column['word']) in topicWordsRelDF[topic]):
                if(maxprobabiliyty > topicWordsRelDF[topic][str(column['word'])] > nxtprobabiliyty):
                    nxtprobabiliyty = topicWordsRelDF[topic][str(column['word'])]
                    nxtColor = column2['topicColor']
        # Words with no strong topic affinity at all are grayed out.
        # (bitwise & on the two comparison results — boolean operands here)
        if((maxprobabiliyty < -12) & (nxtprobabiliyty < -12)):
            maxColor = '#555555'
            nxtColor = '#555555'

        ##maxColor = '#'+combine_hex_values({maxColor: math.exp(maxprobabiliyty) , nxtColor: math.exp(nxtprobabiliyty)})

        # Jittered position from dfpca integer columns 2 and 3 —
        # NOTE(review): these are PCA components 2/3, not 0/1; confirm intended.
        x = random.uniform(-0.1, 0.1)+column[2]
        y = random.uniform(-0.1, 0.1)+column[3]
        # font size grows slowly with the word's overall evidence mass
        s = (2+math.sqrt(1+math.sqrt(column['summary'])))
        # white copy drawn at a zorder epsilon below acts as a halo/outline
        plt.text(x, y, column['word'], color='#ffffff', fontsize=s, ha='center', va='center', zorder=s-1E-7, fontweight='bold')
        plt.text(x, y, column['word'], color=maxColor, fontsize=s, ha='center', va='center', zorder=s)

# Legend: one colored dot per topic, reversed for top-down reading order.
colorLeg = list(topicsColorsDF['topicColor'])#.reverse()
colorLeg.reverse()
labelLeg = list(topicsColorsDF['topic'])#.reverse()
labelLeg.reverse()
custom_lines = [plt.Line2D([],[], ls="", marker='.',
                mec='k', mfc=c, mew=.1, ms=20) for c in colorLeg]

leg = plt.legend(custom_lines, labelLeg,
                 loc='center left', fontsize=10, bbox_to_anchor=(0.9, .80))
leg.set_title("Topics", prop = {'size':12})

plt.savefig(DATA_PATH / 'img' / 'words_bayes_topic_pca.png', dpi=300)

Loading

0 comments on commit 20657e6

Please sign in to comment.