
Commit 36c2785

Removes embedding word2vec (#8)
* removes word2vec
* update embedders version
1 parent 09141d9 commit 36c2785


3 files changed: +11 -23 lines changed

app.py

Lines changed: 0 additions & 5 deletions
@@ -90,11 +90,6 @@ def recommendations(
             "tokenizers": ["all"],
             "applicability": {"attribute": True, "token": False},
         },
-        # TODO: w2v currently doesn't work yet, @Johannes needs to fix this
-        # {
-        #     "config_string": "word2vec",
-        #     "description": "SkipGram based word representations (token-granularity only)",
-        # },
     ]
 
     return recommends, 200
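The entries that remain in this recommendations list keep the shape visible in the context lines above. A hypothetical remaining entry might look like the sketch below; the config_string and description values are illustrative only and are not taken from the repository:

    {
        "config_string": "distilbert-base-uncased",  # illustrative value, not from this commit
        "description": "Transformer-based sentence embeddings",  # illustrative
        "tokenizers": ["all"],
        "applicability": {"attribute": True, "token": False},
    },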

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ certifi==2021.10.8
 charset-normalizer==2.0.12
 click==8.0.4
 cymem==2.0.6
-embedders==0.0.15
+embedders==0.0.16
 fastapi==0.78.0
 greenlet==1.1.2
 h11==0.13.0

util/embedders.py

Lines changed: 10 additions & 17 deletions
@@ -5,18 +5,19 @@
 )
 from embedders.classification.contextual import TransformerSentenceEmbedder
 from embedders.extraction.count_based import BagOfCharsTokenEmbedder
-from embedders.extraction.contextual import (
-    SkipGramTokenEmbedder,
-    TransformerTokenEmbedder,
-)
+from embedders.extraction.contextual import TransformerTokenEmbedder
 from embedders.classification.reduce import PCASentenceReducer
 from embedders.extraction.reduce import PCATokenReducer
 from embedders import Transformer
 
 from submodules.model.business_objects import record
 
+
 def get_embedder(
-    project_id: str, embedding_type: str, config_string: str, language_code: str,
+    project_id: str,
+    embedding_type: str,
+    config_string: str,
+    language_code: str,
 ) -> Transformer:
     if embedding_type == "classification":
         batch_size = 128
@@ -26,18 +27,16 @@ def get_embedder(
         elif config_string == "bag-of-words":
             embedder = BagOfWordsSentenceEmbedder(batch_size=batch_size)
         elif config_string == "tf-idf":
-            embedder = TfidfSentenceEmbedder(batch_size=batch_size),
-        elif config_string == "word2vec":
-            return None
+            embedder = TfidfSentenceEmbedder(batch_size=batch_size)
         else:
             embedder = TransformerSentenceEmbedder(
-            config_string=config_string, batch_size=batch_size
-        )
+                config_string=config_string, batch_size=batch_size
+            )
 
         if record.count(project_id) < n_components:
             return embedder
         else:
-            return PCASentenceReducer(embedder, n_components = n_components)
+            return PCASentenceReducer(embedder, n_components=n_components)
 
     else:  # extraction
         batch_size = 32
@@ -52,12 +51,6 @@ def get_embedder(
             return None
         if config_string == "tf-idf":
             return None
-        if config_string == "word2vec":
-            return SkipGramTokenEmbedder(
-                language_code=language_code,
-                precomputed_docs=True,
-                batch_size=batch_size,
-            )
         else:
             return PCATokenReducer(
                 TransformerTokenEmbedder(
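As a quick orientation for the reformatted signature, here is a minimal call sketch, assuming the get_embedder shown in this diff; the project id and language code values are placeholders, and "word2vec" is no longer an accepted config_string after this commit:

    from util.embedders import get_embedder

    # Placeholder arguments for illustration; "tf-idf" is one of the
    # classification config strings still handled in the branch above.
    embedder = get_embedder(
        project_id="my-project-id",       # placeholder
        embedding_type="classification",
        config_string="tf-idf",
        language_code="en",               # placeholder
    )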
