HanLP 是一款功能非常强大的 NLP 工具包,可以用来解决以下各类 NLP 任务。
1. 中文分词
安装方式:pip install hanlp[full];可用的预训练分词模型列表见:https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html
# Chinese word segmentation demo: load several pretrained HanLP tokenizer
# models in turn, report the loaded component's class, and segment the same
# sample sentence with each one.
import hanlp

inputs = 'hanlp提供自然语言处理全功能API,及语料库管理,语料标注工具,在线模型训练,GPU资源调用等网络服务.支持多人异地异时协同标注,提供自然语言处理全生命周期服务.'

# Download: https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip
# Loads as: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
model = hanlp.load('model/coarse_electra_small_20220220_013548')
print(type(model))
results = model(inputs)

# Download: https://file.hankcs.com/hanlp/tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip
# Loads as: hanlp.components.tok_tf.NgramConvTokenizerTF
model = hanlp.load('model/ctb6_convseg_nowe_nocrf_20200110_004046')
print(type(model))
results = model(inputs)

# Download: http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip
# Loads as: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
model = hanlp.load('model/ctb9_tok_electra_base_20220426_111949')
print(type(model))
results = model(inputs)

# Download: http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip
# Loads as: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
model = hanlp.load('model/ctb9_tok_electra_base_crf_20220426_161255')
print(type(model))
# Calling this model raised: TypeError: 'int' object is not subscriptable,
# so the inference call is left disabled.
# results = model(inputs)

# Download: https://file.hankcs.com/hanlp/tok/ctb9_electra_small_20220215_205427.zip
# Loads as: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
model = hanlp.load('model/ctb9_electra_small_20220215_205427')
print(type(model))
results = model(inputs)
2. 命名实体识别
# Named-entity-recognition demo: run the same sample sentence through four
# pretrained HanLP NER models, each time both on the raw string and on the
# character list (manual tokenization).
import hanlp

inputs = '我要感谢洛杉矶市民议政论坛、亚洲协会南加中心、美中关系全国委员会、美中友协美西分会等友好团体的盛情款待。'

# Download: https://file.hankcs.com/hanlp/ner/msra_ner_albert_base_20211228_173323.zip
# Loads as: hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF
model = hanlp.load('model/msra_ner_albert_base_20211228_173323')
# If the built-in tokenization is inaccurate, tokenize yourself — i.e. pass list(str).
print(model(inputs))
print(model(list(inputs)))
print('-' * 50)

# Loads as: hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF
model = hanlp.load('model/msra_ner_electra_small_20220215_205503')
print(model(inputs))
print(model(list(inputs)))
print('-' * 50)

# Download: https://file.hankcs.com/hanlp/ner/ner_bert_base_msra_20211227_114712.zip
# Loads as: hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF
model = hanlp.load('model/ner_bert_base_msra_20211227_114712')
print(model(inputs))
print(model(list(inputs)))
print('-' * 50)

# English (CoNLL-03) model — on this Chinese input the output is abnormal: []
model = hanlp.load('model/ner_conll03_bert_base_cased_en_20211227_121443')
print(model(inputs))
print(model(list(inputs)))
print('-' * 50)

# Full list of NER models: https://hanlp.hankcs.com/docs/api/hanlp/pretrained/ner.html
3. 其他
# pyhanlp (Java bridge) demos: URL recognition, traditional/simplified
# conversion, text normalization, numeral/quantifier segmentation,
# automatic summarization and place-name recognition.
from pyhanlp import *

# --- URL recognition ---
url_recognizer = JClass('com.hankcs.hanlp.tokenizer.URLTokenizer')
result = url_recognizer.segment('你好啊 http://baidu.com,我是 https://mengbaolaing.cn 谁啊?')
print(result)
for data in result:
    # data.nature is a Nature object; convert with str() before comparing.
    # A URL must be followed by whitespace or punctuation, otherwise it is
    # not recognized.
    if str(data.nature) == 'xu':
        print(data.word)

# --- Taiwan traditional -> simplified ---
transfer = JClass('com.hankcs.hanlp.HanLP')
result = transfer.tw2s('hankcs在臺灣寫程式碼')
print(result)
# --- Simplified -> Taiwan traditional ---
result = transfer.s2tw('hankcs在台湾写代码')
print(result)
# Hong Kong traditional equivalents: s2hk / hk2s

# --- Text normalization ---
# Demonstrates the normalization config option (traditional -> simplified,
# full-width -> half-width, upper -> lower case).
# The option lives in hanlp.properties: enable with Normalization=true.
# After toggling it, delete the CustomDictionary.txt.bin cache, otherwise
# only dynamically inserted new words are affected.
normalizer = JClass('com.hankcs.hanlp.dictionary.other.CharTable')
text = normalizer.convert('2012年9月5日,中央电视台《艺术人生》节目主持人朱军先生在首都北京亲切会见了中国著名书法家高咏华先生')
print(text)

# --- Numeral and quantifier recognition ---
shuci = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
result = shuci.segment('十九元套餐包括什么')
print(result)

# --- Automatic summarization ---
content = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露,根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批,严格地进行水资源论证和取水许可的批准。"
# Second argument: how many sentences to extract.
result = HanLP.extractSummary(content, 3)
print(result)
# --- Key-phrase extraction ---
result = HanLP.extractPhrase(content, 3)
print(result)
result = HanLP.extractWords(content, 3)
print(result)

# --- Place-name recognition ---
a = HanLP.newSegment().enablePlaceRecognize(True)
print(a.seg('蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机'))