+ # Extract DeBERTa embeddings
import os
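- # bert4keras3 config entries the keras_nlp config does not expose:
- # DeBERTa v3 has no segment embeddings (type_vocab_size = 0) and uses GELU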
- config = {
-     "type_vocab_size": 0,
-     "hidden_act": "gelu"
- }
-
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
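# Keras 3 reads KERAS_BACKEND at import time, so set it before importing keras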
os.environ["KERAS_BACKEND"] = "torch"
- import keras
- import keras_nlp
- from bert4keras3.models import build_transformer_model
- from bert4keras3.snippets import sequence_padding
- from keras import ops
model_name = "Deberta_v3_base_multi"
-
- try:
-     os.makedirs(model_name)
- except:
-     pass
-
- import shutil
- import os
-
- # Source path of the SentencePiece vocabulary inside the keras_nlp preset
- source_file_path = './keras_nlp_weights/deberta_v3-keras-%s-v2/assets/tokenizer/vocabulary.spm' % model_name.lower()
-
- shutil.move(source_file_path, model_name + '/')
- model = keras_nlp.models.DebertaV3MaskedLM.from_preset('./keras_nlp_weights/deberta_v3-keras-%s-v2' % model_name.lower(), preprocessor=None)
- model.eval()
-
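- # model.layers[2] of the MaskedLM task model is the DebertaV3Backbone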
- backbone = model.layers[2]
- deberta_config = backbone.get_config()
-
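- # Map the keras_nlp config keys onto the names bert4keras3 expects in config.json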
- config["vocab_size"] = deberta_config['vocabulary_size']
- config["num_hidden_layers"] = deberta_config['num_layers']
- config["num_attention_heads"] = deberta_config['num_heads']
- config["hidden_size"] = deberta_config['hidden_dim']
- config["intermediate_size"] = deberta_config['intermediate_dim']
- config["attention_probs_dropout_prob"] = deberta_config['dropout']
- config["dropout_rate"] = deberta_config['dropout']
- config["max_position"] = deberta_config['max_sequence_length']
- config["bucket_size"] = deberta_config['bucket_size']
- import json
- with open(model_name + '/config.json', 'w') as f:
-     json.dump(config, f, indent=4, ensure_ascii=False)
-
+ from bert4keras3.tokenizers import SpTokenizer
+ import numpy as np
+ from bert4keras3.models import build_transformer_model
+ tokenizer = SpTokenizer(model_name + '/vocabulary.spm')
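+ # Rebuild the model with bert4keras3: keras_weights_path loads the converted
+ # checkpoint, and with_mlm=False returns the encoder's hidden states (the embeddings)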
mydeberta = build_transformer_model(
    config_path=model_name + '/config.json',
+     keras_weights_path=model_name + '/model.weights.h5',
    model='deberta',
    return_keras_model=True,
-     with_mlm='linear',
+     with_mlm=False,
)
-
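- # Copy the embedding weights from the keras_nlp backbone into the matching bert4keras3 layers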
- mydeberta.get_layer('Embedding-Token').set_weights(backbone.layers[1].get_weights())
- mydeberta.get_layer('Embedding-Norm').set_weights(backbone.layers[2].get_weights())
- mydeberta.get_layer('Embedding-Deberta-Position').set_weights(backbone.layers[5].get_weights())
- mydeberta.eval()
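- # Transfer each transformer block's attention, feed-forward, and layer-norm weights, matched by name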
- for index in range(config["num_hidden_layers"]):
-     layers = backbone.layers[6 + index]
-     attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
-     feed_forward_name = 'Transformer-%d-FeedForward' % index
-
-     mydeberta.get_layer(attention_name).set_weights(layers._self_attention_layer.get_weights())
-     mydeberta.get_layer(feed_forward_name).set_weights(layers._feedforward_intermediate_dense.get_weights() + layers._feedforward_output_dense.get_weights())
-     mydeberta.get_layer('%s-Norm' % attention_name).set_weights(layers._self_attention_layer_norm.get_weights())
-     mydeberta.get_layer('%s-Norm' % feed_forward_name).set_weights(layers._feedforward_layer_norm.get_weights())
-
- import numpy as np
- from bert4keras3.tokenizers import SpTokenizer
- tokenizer = SpTokenizer(model_name + '/vocabulary.spm')
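- # The final layer is the MLM output head; copy it from the keras_nlp task model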
- mydeberta.layers[-1].set_weights(model.layers[-1].get_weights())
-
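- # Sanity check: both models should produce identical MLM outputs, so the summed difference should be ~0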
- features = {
-     "token_ids": np.array([[1, 2, 3, 4, 3, 6, 7, 8, 0]] * 2),
-     "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 2),
-     "mask_positions": np.array([[2, 4]] * 2),
- }
- z1 = model.predict(features)
- z2 = mydeberta.predict([features["token_ids"], features["mask_positions"]])
- print(np.sum(z1 - z2))
- mydeberta.save_weights(model_name + '/model.weights.h5')
+ text = "Always get the best performance for your models. In our benchmarks, we found that JAX typically delivers the best training and inference performance on GPU, TPU, and CPU – but results vary from model to model, as non-XLA TensorFlow is occasionally faster on GPU. The ability to dynamically select the backend that will deliver the best performance for your model without having to change anything to your code means you're always guaranteed to train and serve with the highest achievable'"
+ x = np.reshape(tokenizer.encode(text)[0], [1, -1])
+ w = mydeberta.predict(x)
+ print(w)