Text Classification with SVM, Naive Bayes, and ALBERT

The goal is to build a vector-based question-answering retrieval system: the questions and answers form a fixed dataset, the user's input question is encoded as a vector, the most similar question is matched in a vector database, and the answer paired with that question is returned.

When implementing this, we also want to classify the incoming question first. For example, in a legal QA retrieval system we want to determine whether the user's question is related to law at all; only if it is do we proceed with the rest of the pipeline.
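
To make this flow concrete, here is a minimal sketch of the gate-then-retrieve pipeline. The encode and classify callables are placeholders (any of the three classifiers trained below could play the classify role), and the cosine-similarity lookup over a precomputed matrix stands in for a real vector database.

import numpy as np

def answer_question(query, encode, classify, question_vecs, answers):
    # classify: placeholder returning 1 for in-domain questions
    if classify(query) != 1:
        return None  # out-of-domain question, skip retrieval

    # encode: placeholder mapping a sentence to a 1-D vector
    query_vec = encode(query)
    query_vec = query_vec / np.linalg.norm(query_vec)

    # question_vecs: (N, D) matrix of L2-normalized question vectors
    scores = question_vecs @ query_vec        # cosine similarities
    return answers[int(np.argmax(scores))]    # answer of the closest question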

The data used to train this binary classifier:

  1. Positive samples: the regular question corpus collected for the target domain
  2. Negative samples: open Baidu QA data plus QA data from some other industries, from which a diverse set of questions is selected
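
As a sketch of how such a dataset might be assembled: the column names title and label and the path data/intention.data match what the training code below expects, while the two question lists here are tiny placeholders (borrowed from the test examples further down).

from datasets import Dataset

# label 1 = in-domain (positive), label 0 = out-of-domain (negative)
positive_questions = ['嗓子起疮,这是什么原因导致的?', '我是怀孕了吗']
negative_questions = ['哈哈', '我滴妈呀,你真笨啊']

dataset = Dataset.from_dict({
    'title': positive_questions + negative_questions,
    'label': [1] * len(positive_questions) + [0] * len(negative_questions),
})
# split into the 'train'/'test' splits that load_from_disk returns below
dataset = dataset.train_test_split(test_size=0.25, seed=42)
dataset.save_to_disk('data/intention.data')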

There are plenty of ways to approach a binary classification problem like this; three methods are used here:

  1. Naive Bayes: based on word-count (term frequency) vectors
  2. Support vector machine: questions encoded as sentence vectors built from pretrained fastText word vectors
  3. ALBERT: a mini version of the BERT pretrained model

In terms of results: Naive Bayes works on high-dimensional sparse count vectors; despite the dimensionality, its speed is still acceptable, but its accuracy trails both ALBERT and the SVM. ALBERT, even as a tiny variant, is larger than the other two models, and training produces many checkpoints that we then have to choose among ourselves. The SVM + fastText combination performs very well, trains quickly, and yields a small model; it is the one I like best here.

1. The ALBERT-based Approach

Pretrained model: https://huggingface.co/clue/albert_chinese_tiny

The complete implementation code is below:

from transformers import AlbertForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
from datasets import load_from_disk
import torch.optim as optim
import torch.nn as nn
import glob
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


# compute device: use the GPU when available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def collate_function(batch_data, tokenizer):

    # gather titles and labels, tokenize with dynamic padding, and move everything to the device
    titles, labels = [], []
    for data in batch_data:
        titles.append(data['title'])
        labels.append(data['label'])

    title_tensor = tokenizer.batch_encode_plus(titles,
                                               add_special_tokens=True,
                                               padding='longest',
                                               return_tensors='pt')
    title_tensor = {key: value.to(device) for key, value in title_tensor.items()}
    label_tensor = torch.tensor(labels, device=device)
    return title_tensor, label_tensor


def train_albert():

    # https://huggingface.co/clue/albert_chinese_tiny
    estimator = AlbertForSequenceClassification.from_pretrained('pretrained/albert_chinese_tiny', num_labels=2).to(device)
    tokenizer = BertTokenizer.from_pretrained('pretrained/albert_chinese_tiny')
    traindata = load_from_disk('data/intention.data')['train']
    dataloader = DataLoader(traindata, batch_size=128, shuffle=True, collate_fn=lambda data: collate_function(data, tokenizer))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(estimator.parameters(), lr=1e-5)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=2, cooldown=2, verbose=True)

    for epoch in range(30):

        total_loss, total_size, total_corr = 0.0, 0, 0
        progress = tqdm(range(len(dataloader)))
        for title_tensor, label_tensor in dataloader:

            outputs = estimator(**title_tensor)
            loss = criterion(outputs.logits, label_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # predicted labels, running accuracy and loss
            y_pred = torch.argmax(outputs.logits, dim=-1)
            total_corr += (y_pred == label_tensor).sum().item()
            total_loss += loss.item() * len(label_tensor)
            total_size += len(label_tensor)

            # update the progress bar
            desc = '%2d. %6.1f %5d/%5d %.4f %.2E' % (epoch + 1, total_loss, total_corr, total_size, total_corr/total_size, scheduler.optimizer.param_groups[0]['lr'])
            progress.set_description(desc)
            progress.update()

        scheduler.step(total_loss)
        progress.close()

        if epoch > 5:
            model_save_path = 'finish/intention/albert/%0d_intention_albert_loss_%.4f' % (epoch + 1, total_loss)
            estimator.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)


@torch.no_grad()
def eval_model(model_name):

    estimator = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device).eval()
    tokenizer = BertTokenizer.from_pretrained(model_name)
    testdata = load_from_disk('data/intention.data')['test']
    # no shuffling needed for evaluation
    dataloader = DataLoader(testdata, batch_size=128, shuffle=False, collate_fn=lambda data: collate_function(data, tokenizer))

    model_name = model_name[model_name.rfind('/') + 1:]
    progress = tqdm(range(len(dataloader)), desc='%30s' % model_name)
    y_true, y_pred = [], []
    for inputs_tensor, labels_tensor in dataloader:
        outputs = estimator(**inputs_tensor)
        y_label = torch.argmax(outputs.logits, dim=-1)
        y_pred.extend(y_label.cpu().numpy().tolist())
        y_true.extend(labels_tensor.cpu().numpy().tolist())
        progress.update()
    progress.close()


    print('Accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, support = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)
    print('-' * 100)


def eval_albert():

    model_names = glob.glob('finish/intention/albert/*intention_albert*')
    for model_name in model_names:
        eval_model(model_name)

def predict(inputs):
    # point at one of the checkpoints saved by train_albert (epoch 9 scored best in the evaluation below)
    model_save_path = 'finish/intention/albert/9_intention_albert_loss_186.2181'
    tokenizer = BertTokenizer.from_pretrained(model_save_path)
    estimator = AlbertForSequenceClassification.from_pretrained(model_save_path, num_labels=2).eval()

    # tokenize and encode the input
    inputs = tokenizer.encode_plus(inputs,
                                   return_token_type_ids=False,
                                   return_attention_mask=False,
                                   return_tensors='pt')
    # run the model
    with torch.no_grad():
        outputs = estimator(**inputs)
        y_pred = torch.argmax(outputs.logits)

        if y_pred.item() == 1:
            print('\033[31mIn-domain question, needs handling\033[m')
        else:
            print('Out-of-domain question')

def test():
    # the inputs stay in Chinese since the model is a Chinese ALBERT
    predict('为什么抽完血后会出现头晕、四肢无力脸色发白冒汗等现象?')
    predict('我是怀孕了吗')
    predict('嗓子起疮,这是什么原因导致的?')
    predict('哈哈')
    predict('我滴妈呀,你真笨啊')
    predict('前天早上在医院的广场上玩篮球,一会来了几个病人,我们就一起玩了')
    predict('我是医院的病人,我发烧了,所以在这里住院')


if __name__ == '__main__':
    # train_albert()
    eval_albert()
    # test()

Evaluation results for each saved checkpoint:

7_intention_albert_loss_308.1043: 100%|█████████| 17/17 [00:02<00:00,  8.49it/s]
Accuracy: 0.9884504331087585
Precision: [0.98823529 0.98869476]
Recall: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
8_intention_albert_loss_231.9439: 100%|█████████| 17/17 [00:01<00:00, 10.74it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
9_intention_albert_loss_186.2181: 100%|█████████| 17/17 [00:01<00:00, 10.97it/s]
Accuracy: 0.9908565928777671
Precision: [0.996337   0.98478702]
Recall: [0.98640073 0.99589744]
F-score: [0.99134396 0.99031107]
----------------------------------------------------------------------------------------------------
10_intention_albert_loss_155.6420: 100%|████████| 17/17 [00:01<00:00, 10.81it/s]
Accuracy: 0.9846005774783445
Precision: [0.98026906 0.98961578]
Recall: [0.99093382 0.9774359 ]
F-score: [0.98557259 0.98348813]
----------------------------------------------------------------------------------------------------
11_intention_albert_loss_133.5007: 100%|████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9898941289701636
Precision: [0.99271403 0.98673469]
Recall: [0.98821396 0.99179487]
F-score: [0.99045888 0.98925831]
----------------------------------------------------------------------------------------------------
12_intention_albert_loss_92.2596: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
Accuracy: 0.987487969201155
Precision: [0.98910082 0.98567042]
Recall: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
13_intention_albert_loss_80.6847: 100%|█████████| 17/17 [00:01<00:00, 10.55it/s]
Accuracy: 0.987487969201155
Precision: [0.98910082 0.98567042]
Recall: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
14_intention_albert_loss_67.7850: 100%|█████████| 17/17 [00:01<00:00, 10.59it/s]
Accuracy: 0.9879692011549567
Precision: [0.98646209 0.98969072]
Recall: [0.99093382 0.98461538]
F-score: [0.9886929  0.98714653]
----------------------------------------------------------------------------------------------------
15_intention_albert_loss_61.4615: 100%|█████████| 17/17 [00:01<00:00, 10.62it/s]
Accuracy: 0.9879692011549567
Precision: [0.99       0.98568507]
Recall: [0.98730734 0.98871795]
F-score: [0.98865184 0.98719918]
----------------------------------------------------------------------------------------------------
16_intention_albert_loss_39.3099: 100%|█████████| 17/17 [00:01<00:00, 10.44it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
17_intention_albert_loss_28.6968: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9884504331087585
Precision: [0.99000908 0.98669396]
Recall: [0.98821396 0.98871795]
F-score: [0.98911071 0.98770492]
----------------------------------------------------------------------------------------------------
18_intention_albert_loss_24.5714: 100%|█████████| 17/17 [00:01<00:00, 10.63it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
19_intention_albert_loss_21.6295: 100%|█████████| 17/17 [00:01<00:00, 10.49it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
20_intention_albert_loss_19.8042: 100%|█████████| 17/17 [00:01<00:00, 10.60it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
21_intention_albert_loss_18.0605: 100%|█████████| 17/17 [00:01<00:00, 10.35it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
22_intention_albert_loss_16.5603: 100%|█████████| 17/17 [00:01<00:00, 10.61it/s]
Accuracy: 0.9884504331087585
Precision: [0.98823529 0.98869476]
Recall: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
23_intention_albert_loss_15.6354: 100%|█████████| 17/17 [00:01<00:00, 10.46it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
24_intention_albert_loss_14.7335: 100%|█████████| 17/17 [00:01<00:00, 10.76it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
25_intention_albert_loss_13.9919: 100%|█████████| 17/17 [00:01<00:00, 10.48it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
26_intention_albert_loss_13.3679: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
Accuracy: 0.9894128970163619
Precision: [0.99091735 0.9877175 ]
Recall: [0.98912058 0.98974359]
F-score: [0.99001815 0.98872951]
----------------------------------------------------------------------------------------------------
27_intention_albert_loss_12.8064: 100%|█████████| 17/17 [00:01<00:00, 10.68it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
28_intention_albert_loss_12.2769: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
29_intention_albert_loss_11.7247: 100%|█████████| 17/17 [00:01<00:00, 10.69it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
30_intention_albert_loss_11.1622: 100%|█████████| 17/17 [00:01<00:00, 10.39it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------

2. The Naive Bayes-based Approach

from sklearn.naive_bayes import MultinomialNB
from datasets import load_from_disk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import pickle
import jieba.posseg as psg
import jieba
jieba.setLogLevel(0)


def cut_word(sentence):

    # keep only content-word POS tags (nouns, verbs, adjectives) and drop stopwords
    allow = ['n', 'nr', 'ns', 'nt', 'nl', 'nz', 'nsf', 's'] + ['v', 'vd', 'vn', 'vx'] + ['a', 'ad', 'al', 'an']
    stopwords = [word.strip() for word in open('file/stopwords.txt', encoding='utf-8')]
    sentence_words = []
    sentence = psg.lcut(sentence)
    for word, pos in sentence:
        if pos not in allow:
            continue
        if word in stopwords:
            continue
        sentence_words.append(word)

    return ' '.join(sentence_words)


def train_vectorizer():

    questions = load_from_disk('data/intention.data')['train']
    questions = [cut_word(question) for question in questions['title']]
    tokenizer = CountVectorizer(max_features=21246)
    tokenizer.fit(questions)
    print('Number of features:', len(tokenizer.get_feature_names_out()))
    pickle.dump(tokenizer, open('finish/intention/bayes/vectorizer.pkl', 'wb'))


def train_bayes_model():

    vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
    questions = load_from_disk('data/intention.data')['train']
    inputs = [cut_word(title) for title in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    estimator = MultinomialNB()
    estimator.fit(inputs, labels)
    pickle.dump(estimator, open('finish/intention/bayes/bayes.pkl', 'wb'))


def eval_bayes_model():

    vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
    estimator = pickle.load(open('finish/intention/bayes/bayes.pkl', 'rb'))
    questions = load_from_disk('data/intention.data')['test']
    inputs = [cut_word(question) for question in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    ypreds = estimator.predict(inputs)

    precision, recall, f_score, support = precision_recall_fscore_support(labels, ypreds)
    print('Accuracy:', accuracy_score(labels, ypreds))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)

if __name__ == '__main__':
    train_vectorizer()
    train_bayes_model()
    eval_bayes_model()

The evaluation results:

Accuracy: 0.9701636188642926
Precision: [0.99430199 0.94536585]
Recall: [0.94922937 0.99384615]
F-score: [0.97124304 0.969     ]
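
The article only shows a predict helper for the ALBERT model; a minimal equivalent sketch for the Bayes pipeline might look like this. It reuses cut_word and the two pickled artifacts saved above, and assumes label 1 marks in-domain questions, as in the ALBERT predict helper.

import pickle

def predict_bayes(question):
    # load the fitted CountVectorizer and MultinomialNB saved by the training code above
    vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
    estimator = pickle.load(open('finish/intention/bayes/bayes.pkl', 'rb'))
    inputs = vectorizer.transform([cut_word(question)])
    return estimator.predict(inputs)[0]  # 1 = in-domain, 0 = out-of-domain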

3. The SVM-based Approach

import pickle

from sklearn.svm import SVC
from datasets import load_from_disk
import jieba
jieba.setLogLevel(0)
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import datasets
datasets.disable_progress_bar()


def train_svm():

    traindata = load_from_disk('data/intention.data')['train']
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')

    def collate_function(batch_data):
        titles = batch_data['title']
        labels = batch_data['label']
        model_inputs = []
        for title in titles:
            inputs = tokenizer.get_sentence_vector(' '.join(jieba.lcut(title)))
            model_inputs.append(inputs.tolist())
        return {'title': model_inputs, 'label': labels}

    # vectorize the dataset in batches
    traindata = traindata.map(collate_function, batched=True, batch_size=32)
    # train the support vector machine
    estimator = SVC()
    estimator.fit(traindata['title'], traindata['label'])
    # save the model
    pickle.dump(estimator, open('finish/intention/svm/svm.pkl', 'wb'))


def eval_svm():

    estimator = pickle.load(open('finish/intention/svm/svm.pkl', 'rb'))
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')

    traindata = load_from_disk('data/intention.data')
    def collate_function(batch_data):
        titles = batch_data['title']
        labels = batch_data['label']
        model_inputs = []
        for title in titles:
            inputs = tokenizer.get_sentence_vector(' '.join(jieba.lcut(title)))
            model_inputs.append(inputs.tolist())
        return {'title': model_inputs, 'label': labels}
    traindata = traindata.map(collate_function, batched=True, batch_size=32)

    # training-set metrics
    y_pred = estimator.predict(traindata['train']['title'])
    y_true = traindata['train']['label']
    print('Train accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, support = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)
    print('Support:', support)
    print('-' * 50)

    # test-set metrics
    y_pred = estimator.predict(traindata['test']['title'])
    y_true = traindata['test']['label']
    print('Test accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, support = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)
    print('Support:', support)

if __name__ == '__main__':
    train_svm()
    eval_svm()

Evaluation results:

Train accuracy: 0.9879503698401336
Precision: [0.99103551 0.98462111]
Recall: [0.98582371 0.99027067]
F-score: [0.98842274 0.98743781]
Support: [8747 8017]
--------------------------------------------------
Test accuracy: 0.9880725190839694
Precision: [0.98912551 0.98690176]
Recall: [0.98822997 0.98789713]
F-score: [0.98867754 0.98739919]
Support: [2209 1983]
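
Likewise, a minimal prediction sketch for the SVM model, encoding the question with the same fastText vectors used during training (again assuming label 1 marks in-domain questions):

import pickle
import jieba
import fasttext

def predict_svm(question):
    # encode exactly as during training: jieba tokens -> fastText sentence vector
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')
    estimator = pickle.load(open('finish/intention/svm/svm.pkl', 'rb'))
    vector = tokenizer.get_sentence_vector(' '.join(jieba.lcut(question)))
    return estimator.predict([vector.tolist()])[0]  # 1 = in-domain, 0 = out-of-domain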
