-
Notifications
You must be signed in to change notification settings - Fork 0
/
tb.py
100 lines (79 loc) · 3.01 KB
/
tb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from fuzzywuzzy import process
import sqlite3
from gi.repository import GLib
from pydbus import SessionBus
from time import sleep
import subprocess
import time
#TF-IDF approach
import pandas as pd, numpy as np, re
from sparse_dot_topn import awesome_cossim_topn
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
# Title the matchers are hoped to return; it is only echoed in the
# "Matched ... expected ..." report lines and never asserted against.
expected_song = "Thank You!!"
# Query string handed to every matcher in the benchmarks below
# (looks like "artist - title" form -- TODO confirm).
test_song = "Home Made Kazoku - Thank You!!"
# Noise removed before n-gram extraction: comma, hyphen, period and slash,
# plus a whitespace-prefixed "BD" token. The hyphen is escaped so the class
# lists its members explicitly instead of relying on the ",-." range.
_NGRAM_NOISE = re.compile(r'[,\-./]|\sBD')


def ngrams(string, n=4):
    """Return the overlapping, upper-cased character n-grams of *string*.

    The text is first stripped of the characters matched by ``_NGRAM_NOISE``
    and upper-cased; every window of *n* consecutive characters is then
    returned, in order. A cleaned string shorter than *n* yields ``[]``.

    Used as the ``analyzer`` callable of the TF-IDF vectorizer in this file.

    Args:
        string: Text to analyse.
        n: Window size in characters (default 4).

    Returns:
        A list of ``n``-character strings.

    Raises:
        TypeError: If the input cannot be processed (e.g. *string* is not a
            ``str``). The offending value is printed before re-raising.
    """
    try:
        cleaned = _NGRAM_NOISE.sub('', string).upper()
        # n progressively shifted copies zipped together produce one tuple
        # of characters per window; zip stops at the shortest copy.
        shifted = [cleaned[offset:] for offset in range(n)]
        return [''.join(window) for window in zip(*shifted)]
    except TypeError:
        # Show which value broke the analyzer, then let the error propagate.
        print(string)
        raise
# Module-level SQLite connection used by identify_anisong and by the
# benchmark code further down; the path is relative to the working directory.
conn = sqlite3.connect("anisong.db")
def identify_anisong(title, artist=None):
    """Fuzzy-match *title* against the ``title_en`` column of ``anisong``.

    Work in progress: the best match is computed but not used yet, so the
    function currently always returns ``None``. When *artist* is supplied
    no lookup is performed at all.

    Args:
        title: Song title to look up.
        artist: Optional artist name; handling is not implemented.
    """
    if artist is None:
        # fuzzy match title with song titles from database
        title_rows = conn.execute("select title_en from anisong")
        best_match = process.extractOne(title, title_rows, score_cutoff=85)
    # TODO: disabled follow-up that would resolve the match to its anime:
    #if best_match is None:
    #    return None
    #return conn.execute("select anime,type,start_ep,end_ep from anisong where title_en = ?",best_match[0]).fetchone()
# Benchmark 1: fuzzywuzzy matching with the candidates streamed straight
# from a fresh database cursor on every run (the query is part of the
# measured time).
print("Levenshtein distance database access")
runs = 10
total = 0
for i in range(1, runs + 1):
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can jump with wall-clock adjustments and may be coarse.
    start = time.perf_counter()
    matched_song = process.extractOne(test_song, conn.execute("select title_en from anisong"), score_cutoff=85)
    duration = time.perf_counter() - start
    total += duration
    print(f"Iteration {i}: {duration}")
print(f"Matched {matched_song} expected {expected_song}")
print(f"Avg duration: {total/runs}")
# Kept for the speedup comparison printed by the next benchmark.
time_lev_1 = total / runs
# Benchmark 2: same fuzzywuzzy matching, but against titles loaded into a
# Python list once, outside the timed loop.
print("Levenshtein distance python array")
songs = [row[0] for row in conn.execute("select title_en from anisong")]
runs = 10
total = 0
for i in range(1, runs + 1):
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can jump with wall-clock adjustments and may be coarse.
    start = time.perf_counter()
    matched_song = process.extractOne(test_song, songs, score_cutoff=85)
    duration = time.perf_counter() - start
    total += duration
    print(f"Iteration {i}: {duration}")
print(f"Matched {matched_song} expected {expected_song}")
print(f"Avg duration: {total/runs}")
time_lev_2 = total / runs
# Compare against the average of the cursor-fed benchmark (time_lev_1).
print(f"Time saved over Levenshtein distance database: {time_lev_1 - time_lev_2}s ({time_lev_1/time_lev_2}x speedup)")
# Benchmark 3: TF-IDF over character n-grams plus sparse cosine similarity.
print("TF-IDF matching")
# NOTE: from here on, cursors created on conn yield the bare first column
# instead of a row tuple.
conn.row_factory = lambda cursor, row: row[0]
songs = conn.execute("select title_en from anisong").fetchall()
# The vocabulary is fitted once on the song titles, outside the timed loop;
# only vectorizing the query and the top-1 similarity search are measured.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_songs = vectorizer.fit_transform(songs)
runs = 10
total = 0
for i in range(1, runs + 1):
    # perf_counter is the clock intended for measuring short intervals.
    start = time.perf_counter()
    tf_idf_matrix_test = vectorizer.transform([test_song])
    matches = awesome_cossim_topn(tf_idf_matrix_test, tf_idf_matrix_songs.transpose(), 1, 0)
    duration = time.perf_counter() - start
    # Printed after the timer stops so console output is not counted as
    # matching time; the output order is unchanged.
    print(matches)
    total += duration
    print(f"Iteration {i}: {duration}")
# Column indices of the non-zero similarities index into songs.
matched_columns = matches.nonzero()[1]
if matched_columns.size:
    song2 = songs[matched_columns[0]]
    print(f"Matched {song2} expected {expected_song} certainty {matches.data[0]}")
else:
    # No title shares an n-gram with the query: report it instead of
    # failing with an IndexError.
    song2 = None
    print(f"Matched {song2} expected {expected_song}")
print(f"Avg duration: {total/runs}")
time_tf = total / runs
#print(f"Time saved over Levenshtein distance: {time_lev_2 - time_tf}s ({time_lev_2/time_tf}x speedup)")