5
5
)
6
6
from embedders .classification .contextual import TransformerSentenceEmbedder
7
7
from embedders .extraction .count_based import BagOfCharsTokenEmbedder
8
- from embedders .extraction .contextual import (
9
- SkipGramTokenEmbedder ,
10
- TransformerTokenEmbedder ,
11
- )
8
+ from embedders .extraction .contextual import TransformerTokenEmbedder
12
9
from embedders .classification .reduce import PCASentenceReducer
13
10
from embedders .extraction .reduce import PCATokenReducer
14
11
from embedders import Transformer
15
12
16
13
from submodules .model .business_objects import record
17
14
15
+
18
16
def get_embedder (
19
- project_id : str , embedding_type : str , config_string : str , language_code : str ,
17
+ project_id : str ,
18
+ embedding_type : str ,
19
+ config_string : str ,
20
+ language_code : str ,
20
21
) -> Transformer :
21
22
if embedding_type == "classification" :
22
23
batch_size = 128
@@ -26,18 +27,16 @@ def get_embedder(
26
27
elif config_string == "bag-of-words" :
27
28
embedder = BagOfWordsSentenceEmbedder (batch_size = batch_size )
28
29
elif config_string == "tf-idf" :
29
- embedder = TfidfSentenceEmbedder (batch_size = batch_size ),
30
- elif config_string == "word2vec" :
31
- return None
30
+ embedder = TfidfSentenceEmbedder (batch_size = batch_size )
32
31
else :
33
32
embedder = TransformerSentenceEmbedder (
34
- config_string = config_string , batch_size = batch_size
35
- )
33
+ config_string = config_string , batch_size = batch_size
34
+ )
36
35
37
36
if record .count (project_id ) < n_components :
38
37
return embedder
39
38
else :
40
- return PCASentenceReducer (embedder , n_components = n_components )
39
+ return PCASentenceReducer (embedder , n_components = n_components )
41
40
42
41
else : # extraction
43
42
batch_size = 32
@@ -52,12 +51,6 @@ def get_embedder(
52
51
return None
53
52
if config_string == "tf-idf" :
54
53
return None
55
- if config_string == "word2vec" :
56
- return SkipGramTokenEmbedder (
57
- language_code = language_code ,
58
- precomputed_docs = True ,
59
- batch_size = batch_size ,
60
- )
61
54
else :
62
55
return PCATokenReducer (
63
56
TransformerTokenEmbedder (
0 commit comments