Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
KMicha committed Nov 22, 2022
1 parent 7fc8e8d commit 20657e6
Show file tree
Hide file tree
Showing 28 changed files with 6,842 additions and 62 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/bayes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

# This is a basic workflow to help you get started with Actions

name: bayes:0.0

on:
  push:
    branches:
      - 'main'
  workflow_dispatch:
  schedule:
    # * is a special character in YAML so you have to quote this string :
    # weekly #m h d/m m d/w
    # - cron: '30 1 * * 1'
    # monthly, on the 3rd day
    - cron: '40 8 3 * *'

jobs:
  docker-run-action:
    runs-on: ubuntu-latest
    container:
      image: tamboraorg/crecoding:2020.0
      volumes:
        # mount the checkout into the container at /cre/python
        - ${{ github.workspace }}:/cre/python
    steps:
      - uses: actions/checkout@v3
      - name: Set ownership
        run: |
          # this is to fix GIT not liking owner of the checkout dir
          chown -R $(id -u):$(id -g) $PWD
      # with:
      #   ref: main
      # NOTE(review): the cd below runs in a throwaway subshell and does NOT
      # change the working directory of later steps — confirm intent
      # (later steps appear to rely on the default workdir instead).
      - name: cd /cre/python/
        run: (cd /cre/python/)
      - name: Run diagrams
        run: (python3 bayes.py)
      - name: Submit changes
        uses: EndBug/add-and-commit@v9
        with:
          # pull: '--rebase --autostash ...'
          add: 'img/words_bayes_*.png csv/words_bayes_*.csv'
          tag_push: '--force'
6 changes: 4 additions & 2 deletions .github/workflows/diagrams.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ on:
schedule:
# * is a special character in YAML so you have to quote this string :
# weekly #m h d/m m d/w
- cron: '30 1 * * 1'
# - cron: '30 1 * * 1'
# daily #m h d/m m d/w
- cron: '30 1 * * *'

jobs:
docker-run-action:
Expand All @@ -36,5 +38,5 @@ jobs:
uses: EndBug/add-and-commit@v9
with:
# pull: '--rebase --autostash ...'
add: 'img/*.png'
add: 'img/*.png csv/topics_date.csv'
tag_push: '--force'
42 changes: 42 additions & 0 deletions .github/workflows/entities.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

# This is a basic workflow to help you get started with Actions

name: entities:0.0

on:
  push:
    branches:
      - 'main'
  workflow_dispatch:
  schedule:
    # * is a special character in YAML so you have to quote this string :
    # weekly #m h d/m m d/w
    - cron: '10 1 * * 1'
    # daily #m h d/m m d/w
    # - cron: '30 1 * * *'

jobs:
  docker-run-action:
    runs-on: ubuntu-latest
    container:
      image: tamboraorg/crecoding:2020.0
      volumes:
        # mount the checkout into the container at /cre/python
        - ${{ github.workspace }}:/cre/python
    steps:
      - uses: actions/checkout@v3
      - name: Set ownership
        run: |
          # this is to fix GIT not liking owner of the checkout dir
          chown -R $(id -u):$(id -g) $PWD
      # with:
      #   ref: main
      # NOTE(review): the cd below runs in a throwaway subshell and does NOT
      # change the working directory of later steps — confirm intent.
      - name: cd /cre/python/
        run: (cd /cre/python/)
      - name: Run diagrams
        run: (python3 entities.py)
      - name: Submit changes
        uses: EndBug/add-and-commit@v9
        with:
          # pull: '--rebase --autostash ...'
          add: 'csv/sentiments_*.csv'
          tag_push: '--force'
10 changes: 6 additions & 4 deletions .github/workflows/harvest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ on:
workflow_dispatch:
schedule:
# * is a special character in YAML so you have to quote this string :
# daily,every 6 hours #m h d/m m d/w
- cron: '00 */6 * * *'
# daily,every 5 hours #m h d/m m d/w
- cron: '00 */5 * * *'
# every 2 hour
# - cron: '20 */2 * * *'

# - cron: '10 */2 * * *'
# every month 2.nd day
# - cron: '40 8 2 * *'

jobs:
docker-run-action:
runs-on: ubuntu-latest
Expand Down
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,30 @@ To set up your own harvester read the [HOWTO.md](https://github.com/newsWhisperer/

# winterIsComing

# Topics & Keywords

![Pie](img/keywords_pie_all.png)

# Topic Analysis

![LDA](img/topics_lda.png)

![NMF](img/topics_nmf.png)

# Entities

![Domains](img/domains_count.png)

![Persons](img/persons_count.png)

![Organizations](img/organizations_count.png)

![Locations](img/locations_count.png)

# Timeline

![DateTopic](img/dates_topics_article_count.png)




231 changes: 231 additions & 0 deletions bayes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import pandas as pd

from pathlib import Path
import os.path
import io
#import requests
import glob


import nltk
from nltk.corpus import stopwords
from HanTa import HanoverTagger as ht
from textblob_de import TextBlobDE
import math
import re
import random

from sklearn.decomposition import PCA

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm

# All inputs/outputs live under the current working directory; make sure the
# csv/ and img/ output folders exist before anything tries to write there.
DATA_PATH = Path.cwd()
for subdir in ('csv', 'img'):
    target = DATA_PATH / subdir
    if not os.path.exists(target):
        os.mkdir(target)

def getNewsFiles():
    """Return the paths of all harvested monthly news files (./csv/news_YYYY_MM.csv)."""
    return glob.glob('./csv/news_????_??.csv')

def getNewsDFbyList(files):
    """Load the given news CSV files into one DataFrame sorted by 'published'.

    files: iterable of CSV paths, each with (at least) a 'published' column.
    Returns an empty DataFrame when *files* is empty (the previous version
    raised KeyError from sort_values in that case).
    """
    # Read everything first and concatenate once — repeated pd.concat inside
    # the loop copies all previously-read rows each iteration (quadratic).
    frames = [pd.read_csv(file, delimiter=',') for file in files]
    if not frames:
        return pd.DataFrame(None)
    newsDF = pd.concat(frames)
    return newsDF.sort_values(by=['published'], ascending=True)

def getNewsDF():
    """Load every harvested news CSV into one DataFrame, sorted by publish date."""
    return getNewsDFbyList(getNewsFiles())

# Hand-maintained keyword -> topic/color table shipped with the repo.
keywordsColorsDF = pd.read_csv(DATA_PATH / 'keywords.csv', delimiter=',')
# One row per distinct topic (first occurrence wins) — used for colors/legend.
topicsColorsDF = keywordsColorsDF.drop_duplicates(subset=['topic'])

newsDf = getNewsDF()
print(newsDf)

# German NLP pipeline: NLTK tokenizers/stopwords + HanTa lemmatizer.
language = 'ger'
nltk.download('punkt')
nltk.download('stopwords')
tagger = ht.HanoverTagger('morphmodel_'+language+'.pgz')
german_stop_words = set(stopwords.words('german'))

def generateTokensWithPosition(quote):
    """Yield [token, charOffset] pairs for the content words of *quote*.

    Tokenizes sentence-by-sentence with NLTK, lemmatizes via the HanTa
    tagger, and skips stop words and tokens of <= 2 characters.  Besides
    each surface word it also yields the bigram 'previousLemma currentLemma'
    at the current word's offset.  Offsets come from str.find, so repeated
    sentences/words map to their first occurrence — TODO confirm acceptable.
    """
    sentences = nltk.sent_tokenize(quote,language='german')
    for sentence in sentences:
        positionSentence = quote.find(sentence)
        lastWord = None
        tokens = nltk.tokenize.word_tokenize(sentence,language='german')
        # taglevel=2: tag_sent yields (original, lemma, tag) triples —
        # confirm against HanTa docs.
        lemmata = tagger.tag_sent(tokens,taglevel = 2)
        for (orig,lemma,gramma) in lemmata:
            if(len(orig)>2):
                if(not orig in german_stop_words):
                    positionWord = sentence.find(orig)
                    yield [orig, positionSentence+positionWord]
                    if(lastWord):
                        # bigram of the two most recently kept lemmas
                        yield [(lastWord+' '+lemma), positionSentence+positionWord]
                    lastWord = lemma


# Template counter: one bucket per topic plus a running 'summary' total.
emptyTopics = {'summary':0}
for index2, column2 in keywordsColorsDF.iterrows():
    topic = column2['topic']
    emptyTopics[topic] = 0

i=0
# topicWordsAbs[token][topic] accumulates distance-weighted evidence that
# *token* co-occurs with that topic's keywords; the special key
# 'summaryOfAllWords' holds corpus-wide totals.
topicWordsAbs = {'summaryOfAllWords': emptyTopics.copy()}
for index, column in newsDf.iterrows():
    i += 1
    if(i % 50 == 0):
        print(i)  # progress heartbeat

    quote = str(column.title)+' ' +str(column.description)+' '+str(column.content)
    #quote = str(column.title)+' ' +str(column.description)
    for tokenAndPosition in generateTokensWithPosition(quote):
        token = tokenAndPosition[0]
        tokenPosition = tokenAndPosition[1]
        if(not token in topicWordsAbs):
            topicWordsAbs[token] = emptyTopics.copy()
        for index2, column2 in keywordsColorsDF.iterrows():
            found = 0.0
            # Articles were harvested for a specific keyword: give a small
            # baseline score when this table row matches the article's keyword.
            if(column2['keyword'] == column['keyword']):
                found = 0.05
            keywords = column2['keyword'].strip("'").split(" ")
            topic = column2['topic']
            for keyword in keywords:
                if(keyword in quote):
                    # Weight decays with char distance between token and the
                    # nearest keyword hit; keep the best (max) factor.
                    # NOTE(review): keyword is used as a regex pattern here —
                    # metacharacters would change matching; consider re.escape.
                    for keyPosition in [m.start() for m in re.finditer(keyword, quote)]:
                        distance = abs(tokenPosition - keyPosition)
                        factor = math.sqrt(1/(1+distance*0.25))/len(keywords)
                        if(factor>found):
                            found = factor
            topicWordsAbs[token][topic] += found
            topicWordsAbs[token]['summary'] += found
            topicWordsAbs['summaryOfAllWords'][topic] += found
            topicWordsAbs['summaryOfAllWords']['summary'] += found

# Corpus-wide baseline probability of each topic (share of total evidence).
overallProbability = emptyTopics.copy()
for topic in overallProbability:
    if(not topic == 'summary'):
        if(topicWordsAbs['summaryOfAllWords']['summary'] > 0):
            overallProbability[topic] = float(topicWordsAbs['summaryOfAllWords'][topic])/float(topicWordsAbs['summaryOfAllWords']['summary'])

## now increase all counting by sqrt(n), but minimum of overall probability
# Additive smoothing: bump each per-word topic count by sqrt(count + baseline)
# so rare words don't get extreme ratios; totals are kept consistent.
for word in topicWordsAbs:
    if(word != 'summaryOfAllWords'):
        data = topicWordsAbs[word]
        for topic in overallProbability:
            if(not topic == 'summary'):
                frac = overallProbability[topic]
                delta = math.sqrt(frac+topicWordsAbs[word][topic])
                topicWordsAbs[word][topic] += delta
                topicWordsAbs['summaryOfAllWords'][topic] += delta
                topicWordsAbs[word]['summary'] += delta
                topicWordsAbs['summaryOfAllWords']['summary'] += delta

# Build the per-word table of log Bayes ratios:
#   log( P(topic|word) / P(topic) )  — positive means the word is evidence
# for the topic relative to the corpus baseline.
emptyCol = emptyTopics.copy()
emptyCol['word'] = 'oneWord'
topicWordsRel = {}
for word in topicWordsAbs:
    if(word == 'summaryOfAllWords'):
        relData = topicWordsAbs[word].copy()
    else:
        data = topicWordsAbs[word]
        relData = emptyCol.copy()
        relData['word'] = word
        relData['summary'] = topicWordsAbs[word]['summary']
        for topic in data:
            if(not topic in ['word','summary']):
                # guard against log(0)/division by zero for unseen topics
                if(not topicWordsAbs['summaryOfAllWords'][topic] == 0):
                    if(topicWordsAbs['summaryOfAllWords'][topic]*topicWordsAbs[word]['summary'] > 0):
                        relValue = topicWordsAbs[word][topic]*topicWordsAbs['summaryOfAllWords']['summary']/(topicWordsAbs['summaryOfAllWords'][topic]*topicWordsAbs[word]['summary']) #Bayes
                        relData[topic] = math.log(relValue)
    topicWordsRel[word] = relData
# Index = word; columns = topics + 'summary' + 'word'.
topicWordsRelDF = pd.DataFrame.from_dict(topicWordsRel, orient='index', columns=emptyCol.keys())
topicWordsRelDF.to_csv(DATA_PATH / 'csv' / "words_bayes_topic_all.csv", index=True)

#PCA
# Project the per-word topic log-ratios down to a few components for plotting.
numberComponents = 5 #0.5*len(topics), minimum: 4
dfn = topicWordsRelDF.drop(columns = ['word'])

dfn['const0'] = 1.0
pca = PCA(n_components=numberComponents)
# NOTE(review): fit() followed by fit_transform() fits the model twice;
# fit_transform alone would suffice — confirm no reliance on the double fit.
pca.fit(dfn)
apca = pca.fit_transform(dfn)
dfpca = pd.DataFrame(apca)
dfpca['word'] = topicWordsRelDF.index
dfpca['summary'] = topicWordsRelDF['summary'].values
dfpca.to_csv(DATA_PATH / "csv" /"words_bayes_topic_pca.csv", index=False)

def combine_hex_values(d):
    """Blend several 'RRGGBB' hex colors (no leading '#') into one.

    d maps hex color strings to numeric weights; returns the weighted
    average color as a lowercase 'RRGGBB' string.
    """
    d_items = sorted(d.items())
    tot_weight = sum(d.values())
    # Average each 8-bit channel independently (string slices RR, GG, BB),
    # truncating to int exactly as before.
    mixed = (
        int(sum(int(color[lo:hi], 16) * weight for color, weight in d_items) / tot_weight)
        for lo, hi in ((0, 2), (2, 4), (4, 6))
    )
    # f-string ':02x' replaces the old hand-rolled zero-pad lambda.
    return ''.join(f'{channel:02x}' for channel in mixed)

# Word-map plot: each single-token word is drawn as text at its (jittered)
# PCA position, colored by the topic with the highest log-Bayes score.
plt.figure( figsize=(20,15) )
plt.xlim([-4, 4])
plt.ylim([-4, 4])
i=0
for index, column in dfpca.iterrows():
    i += 1
    if(i % 50 == 0):
        print(i)  # progress heartbeat
    # skip bigrams (they contain a space); plot single words only
    if(not " " in str(column['word'])):
        maxColor = '#000000'
        nxtColor = '#555555'
        maxprobabiliyty = -15 #log!
        nxtprobabiliyty = -15

        # First pass: find the topic with the highest log-ratio for this word.
        # ('in' on a Series checks the index, i.e. the word labels.)
        for index2, column2 in topicsColorsDF.iterrows():
            topic = column2['topic']
            if(str(column['word']) in topicWordsRelDF[topic]):
                if(topicWordsRelDF[topic][str(column['word'])]> maxprobabiliyty):
                    maxprobabiliyty = topicWordsRelDF[topic][str(column['word'])]
                    maxColor = column2['topicColor']
        # Second pass: runner-up topic (only used by the gray-out test below;
        # the color-blend line is commented out).
        for index2, column2 in topicsColorsDF.iterrows():
            topic = column2['topic']
            if(str(column['word']) in topicWordsRelDF[topic]):
                if(maxprobabiliyty > topicWordsRelDF[topic][str(column['word'])] > nxtprobabiliyty):
                    nxtprobabiliyty = topicWordsRelDF[topic][str(column['word'])]
                    nxtColor = column2['topicColor']
        # Words with no strong topic affinity at all are grayed out.
        # (bitwise & on the two comparison results — boolean operands here)
        if((maxprobabiliyty < -12) & (nxtprobabiliyty < -12)):
            maxColor = '#555555'
            nxtColor = '#555555'

        ##maxColor = '#'+combine_hex_values({maxColor: math.exp(maxprobabiliyty) , nxtColor: math.exp(nxtprobabiliyty)})

        # Jittered position from dfpca integer columns 2 and 3 —
        # NOTE(review): these are PCA components 2/3, not 0/1; confirm intended.
        x = random.uniform(-0.1, 0.1)+column[2]
        y = random.uniform(-0.1, 0.1)+column[3]
        # font size grows slowly with the word's overall evidence mass
        s = (2+math.sqrt(1+math.sqrt(column['summary'])))
        # white copy drawn at a zorder epsilon below acts as a halo/outline
        plt.text(x, y, column['word'], color='#ffffff', fontsize=s, ha='center', va='center', zorder=s-1E-7, fontweight='bold')
        plt.text(x, y, column['word'], color=maxColor, fontsize=s, ha='center', va='center', zorder=s)

# Legend: one colored dot per topic, reversed for top-down reading order.
colorLeg = list(topicsColorsDF['topicColor'])#.reverse()
colorLeg.reverse()
labelLeg = list(topicsColorsDF['topic'])#.reverse()
labelLeg.reverse()
custom_lines = [plt.Line2D([],[], ls="", marker='.',
                mec='k', mfc=c, mew=.1, ms=20) for c in colorLeg]

leg = plt.legend(custom_lines, labelLeg,
                 loc='center left', fontsize=10, bbox_to_anchor=(0.9, .80))
leg.set_title("Topics", prop = {'size':12})

plt.savefig(DATA_PATH / 'img' / 'words_bayes_topic_pca.png', dpi=300)

Loading

0 comments on commit 20657e6

Please sign in to comment.