-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
90 lines (77 loc) · 3.36 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def save_bow(texts, name):
    """Write a bag-of-words corpus to ``./corpora/<name>.json``.

    Args:
        texts: JSON-serializable object; it is passed unchanged to
            ``json.dump``.
        name: File stem. The output path is ``./corpora/{name}.json``,
            relative to the current working directory.

    Raises:
        TypeError: If ``texts`` cannot be serialized to JSON.
        OSError: If the directory or the file cannot be created or written.
    """
    import json
    import os

    target = f'./corpora/{name}.json'
    # Create the output directory on first use; without this the open() call
    # fails with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    # json.dump emits ASCII-only text by default, so pinning UTF-8 does not
    # change the bytes written; it only removes the locale dependency.
    with open(target, 'w', encoding='utf-8') as filehandle:
        json.dump(texts, filehandle)
def load_bow(name):
    """Read back the JSON corpus stored at ``./corpora/<name>.json``.

    Args:
        name: File stem; the file read is ``./corpora/{name}.json``,
            relative to the current working directory.

    Returns:
        The object decoded by ``json.load`` from that file.
    """
    import json

    corpus_path = f'./corpora/{name}.json'
    with open(corpus_path, 'r') as source:
        texts = json.load(source)
    return texts
def dataset():
    """Build the combined publications table from the ``./database`` folder.

    Reads ``./database/query_dataframe.csv`` and keeps the rows whose
    ``Incluído`` column equals ``'Yes'``. Every file in ``./database/venues``
    whose name, minus its last four characters (the ``.csv`` suffix), appears
    in the ``SIGLA`` column of those rows is loaded, tagged with a ``Venue``
    column and concatenated into one frame.

    The date columns are then normalized:

    * ``Publication Date`` becomes a three-letter month abbreviation with
      only the first letter capitalized (e.g. ``'Mar'``). Missing values are
      filled from ``Conference Date`` first, then from ``Early Access Date``.
      Values that are still missing stay NaN.
    * ``Publication Year`` becomes a string truncated to four characters.
      Missing values are filled from ``Early Access Date`` where it can be
      parsed; otherwise the value ends up as the string ``'nan'``.

    Returns:
        pandas.DataFrame: The combined, normalized table. An empty DataFrame
        is returned when no venue file is selected.
    """
    import os

    import numpy as np
    import pandas as pd

    # Errors that mean "this value is not a parseable date string": a
    # non-string cell such as NaN (AttributeError/TypeError), too few parts
    # (IndexError), or a non-numeric year (ValueError).
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def extract_month(date_string):
        """Return the upper-cased 3-letter month of a date string, or NaN."""
        try:
            first_part = date_string.split("-")
            if first_part[0].isdigit():
                # "<day>-<month ...>": the month follows the dash.
                month = first_part[1].upper()[:3]
            else:
                # "<month> ...": the month is the first word.
                month = first_part[0].split()[0].upper()[:3]
            return month
        except parse_errors:
            return np.nan

    def extract_year(date_string):
        """Return the year of a date string as a string, or NaN."""
        try:
            first_part = date_string.split("-")
            if len(first_part) > 1:
                # "<x>-<yy>": two-digit year, offset from 2000.
                year = str(2000 + int(first_part[1]))
            else:
                # "<month> <year>": the year is the second word.
                year = date_string.split()[1]
            return year
        except parse_errors:
            return np.nan

    def normalize_month(value):
        """Reduce a value to 3 characters, first upper-case, rest lower-case."""
        abbrev = str(value)[:3].upper()
        # Slice with [:1] rather than index with [0] so an empty string
        # (e.g. from a conference date like "12-") does not raise IndexError.
        return abbrev[:1].upper() + abbrev[1:].lower()

    path = './database'
    query_dataframe = pd.read_csv(path + '/query_dataframe.csv')
    included_df = query_dataframe[query_dataframe['Incluído'] == 'Yes']
    included_venues = set(included_df['SIGLA'])

    # Collect the frames and concatenate once: concatenating inside the loop
    # copies all accumulated rows on every iteration, and seeding the result
    # with an empty DataFrame relies on deprecated pandas concat behavior.
    frames = []
    for source in os.listdir(path + '/venues'):
        venue = source[:-4]  # drop the 4-character ".csv" suffix
        if venue not in included_venues:
            continue
        ven = pd.read_csv(path + '/venues/' + source)
        ven['Venue'] = venue
        frames.append(ven)

    if not frames:
        # Nothing selected: there are no date columns to normalize.
        return pd.DataFrame()

    # ignore_index=True already yields a fresh 0..n-1 index.
    df = pd.concat(frames, ignore_index=True, axis=0)

    # Treatment
    df['Publication Date'] = df.apply(
        lambda row: extract_month(row['Conference Date'])
        if pd.isna(row['Publication Date'])
        else row['Publication Date'],
        axis=1,
    )
    df['Publication Date'] = df['Publication Date'].fillna(df['Early Access Date'])
    df['Publication Date'] = df['Publication Date'].apply(normalize_month)
    # str(NaN) normalizes to 'Nan'; turn those back into real missing values.
    df['Publication Date'] = df['Publication Date'].replace('Nan', np.nan)

    df['Publication Year'] = df.apply(
        lambda row: extract_year(row['Early Access Date'])
        if pd.isna(row['Publication Year'])
        else row['Publication Year'],
        axis=1,
    )
    df['Publication Year'] = df['Publication Year'].astype(str)
    # Keep the first four characters, e.g. '2021.0' -> '2021'.
    df['Publication Year'] = df['Publication Year'].apply(lambda x: x[:4])
    return df
def train_lda_model(corpus, id2word, K, alpha, eta, df, tfidf):
    """Load the cached LDA model for this configuration, training on a miss.

    The cache file is ``models/lda_models/{K}_{alpha}_{eta}_{df}_{tfidf}``,
    relative to the current working directory. If it exists it is loaded and
    returned; otherwise a model is trained with 20 passes, saved to that
    path and returned.

    Args:
        corpus: Corpus passed to the gensim model constructor.
        id2word: Id-to-token mapping passed to the gensim model constructor.
        K: Number of topics (``num_topics``).
        alpha: Value forwarded as ``alpha``. ``'auto'`` selects the
            single-core ``LdaModel``; any other value selects
            ``LdaMulticore``.
        eta: Value forwarded as ``eta``.
        df: Used only as part of the cache file name.
        tfidf: Used only as part of the cache file name.

    Returns:
        The loaded or newly trained gensim LDA model.
    """
    import os

    from gensim.models import LdaModel, LdaMulticore

    path = f'models/lda_models/{K}_{alpha}_{eta}_{df}_{tfidf}'
    try:
        return LdaModel.load(path)
    except FileNotFoundError:
        # Cache miss: train below, outside the handler, so a training error
        # is not reported as chained to this FileNotFoundError.
        pass

    # NOTE(review): 'auto' is routed to LdaModel, presumably because
    # LdaMulticore does not support alpha auto-tuning; confirm against the
    # gensim documentation.
    model_class = LdaModel if alpha == 'auto' else LdaMulticore
    lda_model = model_class(
        corpus=corpus,
        id2word=id2word,
        num_topics=K,
        alpha=alpha,
        eta=eta,
        passes=20,
    )

    # Make sure the cache directory exists; otherwise the save fails and the
    # freshly trained model is lost.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    lda_model.save(path)
    return lda_model