Using the hanlp module apparently requires installing TensorFlow, NumPy, and a few other packages first.
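In practice a plain pip install should pull those dependencies in automatically (exact versions depend on your environment, so treat this as a sketch rather than a pinned setup):

pip install hanlp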
This post uses the hanlp module to perform Chinese and English word segmentation, named-entity recognition, and part-of-speech tagging.
Code
# coding: utf-8
import re

import hanlp
content_Chinese = "xxx,副教授、硕士生导师。主要研究方向为无线电监测与管理、智能信息处理、大数据开发与应用。负责、主研国家级、地厅级及企事业单位委托项目30余项。多次为企事业单位提供咨询服务和教育培训,配合相关部门多次参与无线电重大活动保障、电磁环境测试与监测工作。指导学生团队完成“创新基金”、“大学生创新创业”等项目,有着丰富的实战型项目开发、管理经验。"
content_English = "xxx, associate professor and master supervisor, School of computer and software engineering, Xihua University. The main research directions are radio monitoring and management, intelligent information processing, big data development and application. In charge of more than 30 projects entrusted by state, prefecture, enterprises and institutions. It has provided consulting services and education training for enterprises and institutions for many times, and cooperated with relevant departments to participate in radio major activities support, electromagnetic environment testing and monitoring work for many times. Guiding the student team to complete 'innovation fund', 'College Students innovation and entrepreneurship' and other projects, has rich experience in practical project development and management."
# Chinese word segmentation
def segmentation_Chinese(text):  # returns a list of tokens
    tokenizer = hanlp.load('CTB6_CONVSEG')  # load the pretrained CTB6_CONVSEG model for segmentation
    # strip punctuation from the input string
    punctuation = '!,;:?"\'、,;“”。'
    def removePunctuation(s):
        s = re.sub(r'[{}]+'.format(punctuation), ' ', s)
        return s.replace(' ', '').strip()  # Chinese text needs no word spaces, so drop them entirely
    content = removePunctuation(text)
    cut_result = tokenizer(content)
    print(cut_result)
    return cut_result
# English word segmentation
def segmentation_English(text):  # returns a list of tokens
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    # strip punctuation from the input string
    punctuation = '!,;:.?"\'、,;“”。'
    def removePunctuation(s):
        s = re.sub(r'[{}]+'.format(punctuation), ' ', s)
        return s.strip()
    content = removePunctuation(text)
    cut_result = tokenizer(content)
    # drop empty strings from the result list
    def not_empty(s):
        return s and s.strip()
    cut_result_trans = list(filter(not_empty, cut_result))
    print(cut_result_trans)  # list
    return cut_result_trans
# Chinese named-entity recognition
def NER_Chinese(text):
    recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)  # load the pretrained Chinese NER model
    # strip punctuation from the input string
    punctuation = '!,;:?"\'、,;“”。'
    def removePunctuation(s):
        s = re.sub(r'[{}]+'.format(punctuation), ' ', s)
        return s.replace(' ', '').strip()
    content = list(removePunctuation(text))  # the character-based NER model takes a list of characters
    result = recognizer(content)
    print(result)
# English named-entity recognition
def NER_English(text):
    recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_UNCASED_EN)  # load the pretrained English NER model
    content = segmentation_English(text)  # the NER model expects tokenized input
    result = recognizer(content)
    print(result)
# Chinese part-of-speech tagging
def partOfSpeechTagging_Chinese(text):  # returns a list of POS tags
    tagger = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN_FASTTEXT_ZH)
    result = tagger(segmentation_Chinese(text))  # tag the segmented tokens
    print(result)
    return result

# English part-of-speech tagging
def partOfSpeechTagging_English(text):  # returns a list of POS tags
    tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
    result = tagger(segmentation_English(text))
    print(result)
    return result
if __name__ == '__main__':
    # segmentation_Chinese(content_Chinese)
    # segmentation_English(content_English)
    # NER_Chinese(content_Chinese)
    # NER_English(content_English)
    # partOfSpeechTagging_Chinese(content_Chinese)
    partOfSpeechTagging_English(content_English)
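One caveat about the code above: every function calls hanlp.load() on each invocation, and loading a pretrained model is by far the slowest step. If you call these functions repeatedly, it is better to load each model once and reuse it. A minimal sketch of the idea, with made-up demo sentences for illustration:

# load once (e.g. at module level or program startup), then reuse across calls
tokenizer = hanlp.load('CTB6_CONVSEG')
print(tokenizer('今天天气真好'))    # first call after loading
print(tokenizer('大数据开发与应用'))  # reuses the already-loaded model, no reload cost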