-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_for_training.py
80 lines (59 loc) · 2.5 KB
/
prepare_for_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import spacy as spacy_en
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import os
# spaCy English pipeline ('en_core_web_md'), loaded once at import time so
# that clean_my_text() can reuse it for every song instead of reloading it.
model = spacy_en.load('en_core_web_md')
def clean_my_text(song):
    """Return the lemmas of the meaningful tokens of a song text.

    The text is processed with the module-level spaCy pipeline ``model``.
    Stop words, punctuation tokens and numeral tokens are discarded; the
    lemma of every remaining token is written out followed by one space,
    so a non-empty result always ends with a trailing space.
    """
    excluded_pos = ('PUNCT', 'NUM')
    lemmas = [
        token.lemma_
        for token in model(song)
        if not token.is_stop and token.pos_ not in excluded_pos
    ]
    # Each lemma carries its own trailing space, exactly one per kept token.
    return ''.join(lemma + ' ' for lemma in lemmas)
def keep_english_for_spacy_nn(df):
    """Keep only the songs whose main language is English.

    The English spaCy pipeline is used for word preprocessing, so only rows
    whose 'Main Language' column equals 'en' are kept. (spaCy pipelines for
    other languages such as French, Spanish or Portuguese also exist.)
    The 'English Translation Lyrics' column is removed because it is not
    needed for lyrics that are already in English.

    Parameters:
        df: DataFrame of songs with at least the columns 'Main Language'
            and 'English Translation Lyrics'.

    Returns:
        A new DataFrame holding the English rows (original index labels are
        preserved) without the translation column. The input frame is left
        unmodified.

    Raises:
        KeyError: if one of the two required columns is missing.
    """
    english_songs = df.loc[df['Main Language'] == 'en', :]
    # Use a non-inplace drop: dropping in place on the filtered selection
    # triggers pandas' SettingWithCopyWarning and leaves the result flagged
    # as a copy, so later column assignments on it would warn as well.
    return english_songs.drop(columns=['English Translation Lyrics'])
def apply_spacy_nn_to_DataFrame(df):
    """Replace every entry of the 'Text Lyrics' column by its cleaned form.

    Each lyric is passed through clean_my_text. The column is overwritten
    on the frame that was passed in, and that same frame is returned.
    """
    lyrics_column = 'Text Lyrics'
    cleaned_lyrics = df[lyrics_column].apply(clean_my_text)
    df[lyrics_column] = cleaned_lyrics
    return df
def save_transform_to_disk(cv, tf, folder_save):
    """Pickle the two fitted transform objects into ``folder_save``.

    Parameters:
        cv: object written to '<folder_save>/countvector.sav'.
        tf: object written to '<folder_save>/Tfidfile.sav'.
        folder_save: path of an existing directory receiving both files.

    Returns:
        None.

    Raises:
        OSError: if a file cannot be opened for writing (for example when
            ``folder_save`` does not exist).
    """
    countvectorfile = os.path.join(folder_save, 'countvector.sav')
    # Context managers guarantee the handles are flushed and closed, even
    # if pickling fails part-way through.
    with open(countvectorfile, 'wb') as handle:
        pickle.dump(cv, handle)

    Tfidfile = os.path.join(folder_save, 'Tfidfile.sav')
    with open(Tfidfile, 'wb') as handle:
        pickle.dump(tf, handle)
def prepare_training(df_read, folder_save):
    """Turn the songs DataFrame into TF-IDF features and labels for training.

    Steps: keep the English songs, clean the lyrics with spaCy, count the
    words with a CountVectorizer, re-weight the counts with a
    TfidfTransformer, and save both fitted objects to ``folder_save`` so
    they can be reused for predictions.

    Parameters:
        df_read: DataFrame of songs; the 'Text Lyrics' and 'Name' columns
            are read here, further columns are required by the filtering
            step.
        folder_save: directory in which the fitted CountVectorizer and
            TfidfTransformer are pickled.

    Returns:
        (X, y): X is a dense DataFrame with one column per vocabulary word
        holding the TF-IDF weights; y is the 'Name' column of the prepared
        songs.
    """
    # Songs in English for the spaCy pipeline (disable if multi-language)
    df_prep = keep_english_for_spacy_nn(df_read)
    # Apply spaCy to reduce the dimension of the text
    df_prep = apply_spacy_nn_to_DataFrame(df_prep)

    # Count vectorizer of words
    cv = CountVectorizer()
    corpus_vec = cv.fit_transform(df_prep['Text Lyrics'])

    # Tfidf transform
    tf = TfidfTransformer()
    transform_vec = tf.fit_transform(corpus_vec)

    # Save transforms to disk to reuse them for predictions
    save_transform_to_disk(cv, tf, folder_save)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    # toarray() removes the sparse formatting and yields a plain ndarray
    # (todense() would produce the discouraged numpy.matrix type).
    X = pd.DataFrame(transform_vec.toarray(), columns=feature_names)
    # NOTE: X gets a fresh 0..n-1 index whereas y keeps the index labels of
    # df_prep, so the two are aligned by position, not necessarily by label.
    y = df_prep['Name']
    return X, y