
Commit 9119b94

Update convert_deberta.py
1 parent 23ee61d commit 9119b94


examples/convert_deberta.py

Lines changed: 11 additions & 75 deletions
@@ -1,83 +1,19 @@
+# Get DeBERTa embeddings
 import os
-config = {
-    "type_vocab_size": 0,
-    'hidden_act': 'gelu'
-}
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
 os.environ["KERAS_BACKEND"] = "torch"
-import keras
-import keras_nlp
-from bert4keras3.models import build_transformer_model
-from bert4keras3.snippets import sequence_padding
-from keras import ops
 model_name = "Deberta_v3_base_multi"
-
-try:
-    os.makedirs(model_name)
-except:
-    pass
-
-import shutil
-import os
-
-# Source file path
-source_file_path = './keras_nlp_weights/deberta_v3-keras-%s-v2/assets/tokenizer/vocabulary.spm' % model_name.lower()
-
-shutil.move(source_file_path, model_name + '/')
-model = keras_nlp.models.DebertaV3MaskedLM.from_preset('./keras_nlp_weights/deberta_v3-keras-%s-v2' % model_name.lower(), preprocessor=None)
-model.eval()
-
-backbone = model.layers[2]
-deberta_config = backbone.get_config()
-
-config["vocab_size"] = deberta_config['vocabulary_size']
-config["num_hidden_layers"] = deberta_config['num_layers']
-config["num_attention_heads"] = deberta_config['num_heads']
-config["hidden_size"] = deberta_config['hidden_dim']
-config["intermediate_size"] = deberta_config['intermediate_dim']
-config["attention_probs_dropout_prob"] = deberta_config['dropout']
-config["dropout_rate"] = deberta_config['dropout']
-config["max_position"] = deberta_config['max_sequence_length']
-config["bucket_size"] = deberta_config['bucket_size']
-import json
-with open(model_name + '/config.json', 'w') as f:
-    json.dump(config, f, indent=4, ensure_ascii=False)
-
+from bert4keras3.tokenizers import SpTokenizer
+import numpy as np
+from bert4keras3.models import build_transformer_model
+tokenizer = SpTokenizer(model_name + '/vocabulary.spm')
 mydeberta = build_transformer_model(
     config_path=model_name + '/config.json',
+    keras_weights_path=model_name + '/model.weights.h5',
     model='deberta',
     return_keras_model=True,
-    with_mlm='linear',
+    with_mlm=False,
 )
-
-mydeberta.get_layer('Embedding-Token').set_weights(backbone.layers[1].get_weights())
-mydeberta.get_layer('Embedding-Norm').set_weights(backbone.layers[2].get_weights())
-mydeberta.get_layer('Embedding-Deberta-Position').set_weights(backbone.layers[5].get_weights())
-mydeberta.eval()
-for index in range(config["num_hidden_layers"]):
-    layers = backbone.layers[6 + index]
-    attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
-    feed_forward_name = 'Transformer-%d-FeedForward' % index
-
-    mydeberta.get_layer(attention_name).set_weights(layers._self_attention_layer.get_weights())
-    mydeberta.get_layer(feed_forward_name).set_weights(layers._feedforward_intermediate_dense.get_weights() + layers._feedforward_output_dense.get_weights())
-    mydeberta.get_layer('%s-Norm' % attention_name).set_weights(layers._self_attention_layer_norm.get_weights())
-    mydeberta.get_layer('%s-Norm' % feed_forward_name).set_weights(layers._feedforward_layer_norm.get_weights())
-
-import numpy as np
-from bert4keras3.tokenizers import SpTokenizer
-tokenizer = SpTokenizer(model_name + '/vocabulary.spm')
-mydeberta.layers[-1].set_weights(model.layers[-1].get_weights())
-
-features = {
-    "token_ids": np.array([[1, 2, 3, 4, 3, 6, 7, 8, 0]] * 2),
-    "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 2),
-    "mask_positions": np.array([[2, 4]] * 2),
-}
-z1 = model.predict(features)
-z2 = mydeberta.predict([features["token_ids"],
-                        features["mask_positions"]])
-print(np.sum(z1 - z2))
-mydeberta.save_weights(model_name + '/model.weights.h5')
+text = "Always get the best performance for your models. In our benchmarks, we found that JAX typically delivers the best training and inference performance on GPU, TPU, and CPU – but results vary from model to model, as non-XLA TensorFlow is occasionally faster on GPU. The ability to dynamically select the backend that will deliver the best performance for your model without having to change anything to your code means you're always guaranteed to train and serve with the highest achievable'"
+x = np.reshape(tokenizer.encode(text)[0], [1, -1])
+w = mydeberta.predict(x)
+print(w)
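After this change, the script no longer performs the weight conversion itself: it assumes that config.json, model.weights.h5, and vocabulary.spm under Deberta_v3_base_multi/ were already produced by a previous run of the removed conversion code, and it simply loads them and prints per-token hidden states. If a single sentence-level embedding is wanted instead, one common follow-up is to mean-pool the token outputs; below is a minimal sketch under that assumption (the sample text and the pooling step are illustrative, not part of this commit):

import os
os.environ["KERAS_BACKEND"] = "torch"

import numpy as np
from bert4keras3.tokenizers import SpTokenizer
from bert4keras3.models import build_transformer_model

model_name = "Deberta_v3_base_multi"
tokenizer = SpTokenizer(model_name + '/vocabulary.spm')

# Load the already-converted model; keras_weights_path mirrors the updated script.
mydeberta = build_transformer_model(
    config_path=model_name + '/config.json',
    keras_weights_path=model_name + '/model.weights.h5',
    model='deberta',
    return_keras_model=True,
    with_mlm=False,
)

text = "DeBERTa embeddings via bert4keras3."
token_ids = np.reshape(tokenizer.encode(text)[0], [1, -1])
hidden = mydeberta.predict(token_ids)  # per-token hidden states, shape (1, seq_len, hidden_size)
sentence_vec = hidden.mean(axis=1)     # mean-pool over tokens (illustrative pooling choice)
print(sentence_vec.shape)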
