WoBERT
Reference: https://kexue.fm/archives/7758/comment-page-1#comments
import numpy as np
import jieba
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer

maxlen = 32

# BERT config paths
config_path = r'C:\Users\long3.xiang\Desktop\tcl\wobert\chinese_wobert_L-12_H-768_A-12\bert_config.json'
checkpoint_path = r'C:\Users\long3.xiang\Desktop\tcl\wobert\chinese_wobert_L-12_H-768_A-12\bert_model.ckpt'
dict_path = r'C:\Users\long3.xiang\Desktop\tcl\wobert\chinese_wobert_L-12_H-768_A-12\vocab.txt'

# Build the tokenizer (jieba pre-tokenization gives word-level tokens)
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.cut(s, HMM=False))

# Build the model and load the pretrained weights
model = build_transformer_model(config_path, checkpoint_path)
tokenizer.tokenize('语言模型')  # -> ['[CLS]', '语言', '模型', '[SEP]']
token_ids, segment_ids = tokenizer.encode('语言模型')
print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))  # hidden states, shape (1, 4, 768)

## Get the vector for the word '语言':
## it is row 1 of the output, right after [CLS]
kk = model.predict([np.array([token_ids]), np.array([segment_ids])])
kk[0][1]  # 768-d contextual embedding of '语言'
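To double-check which output row corresponds to which word, print the tokens next to their indices. A minimal sketch (the loop is illustrative, not from the original post):

## With the jieba pre-tokenizer, '语言模型' tokenizes to ['[CLS]', '语言', '模型', '[SEP]'],
## so row 1 is '语言' and row 2 is '模型'
tokens = tokenizer.tokenize('语言模型')
for i, token in enumerate(tokens):
    print(i, token, kk[0][i][:5])  # first 5 of the 768 dimensions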

RoFormer
Reference: https://github.com/ZhuiyiTechnology/roformer
Upgrade bert4keras to 0.10.4 (pip install bert4keras==0.10.4) and download the pretrained model.
import numpy as np
import jieba
import tensorflow as tf
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
# Basic parameters
maxlen = 512
batch_size = 64
epochs = 100000
# BERT config paths
config_path = r'***inese_roformer_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'***e_roformer_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'***e_roformer_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(
    dict_path,
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)
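The pre_tokenize hook is what makes this a word-level tokenizer: jieba splits the sentence into words first, and only words missing from the vocab fall back to finer pieces. A quick comparison sketch (char_tokenizer is illustrative, not part of the original script):

# Same sentence, with and without the jieba pre-tokenizer
char_tokenizer = Tokenizer(dict_path, do_lower_case=True)
print(tokenizer.tokenize('语言模型'))       # ['[CLS]', '语言', '模型', '[SEP]']
print(char_tokenizer.tokenize('语言模型'))  # ['[CLS]', '语', '言', '模', '型', '[SEP]']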
# Build RoFormer; with_mlm='linear' exposes the MLM head as raw
# per-token vocabulary logits (no softmax)
bert = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,  # load the pretrained weights so the prediction below is meaningful
    model='roformer',
    with_mlm='linear',
    ignore_invalid_weights=True,
    return_keras_model=False
)
model = bert.model  # with return_keras_model=False, bert is a wrapper; .model is the Keras model
tokenizer.tokenize('语言模型')  # -> ['[CLS]', '语言', '模型', '[SEP]']

token_ids, segment_ids = tokenizer.encode('语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))  # MLM logits, shape (1, num_tokens, vocab_size)
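Because the model is built with with_mlm='linear', the printed output is per-position vocabulary logits rather than hidden states, so it can fill in masked words directly. A minimal sketch, following the pattern of bert4keras's masked-language-model example (the masked position assumes the tokenization shown above):

# Mask position 2 ('模型') and recover it by argmax over the vocabulary
token_ids, segment_ids = tokenizer.encode('语言模型')
token_ids[2] = tokenizer._token_mask_id
logits = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(logits[2:3].argmax(axis=1)))  # should recover '模型'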
