The goal is to build a vector-based question-answering retrieval system: the questions and answers form a fixed dataset; a user's question is encoded into a vector, matched against the most similar stored question in a vector database, and the answer attached to that question is returned.
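Concretely, the retrieval step is just a nearest-neighbor search over pre-encoded question vectors. Below is a minimal sketch of that step, assuming a hypothetical `encode` callable (any sentence encoder works, e.g. the fastText sentence vectors used later in this post) and a matrix of L2-normalized stored-question vectors; all names here are illustrative, not from the actual system:

```python
import numpy as np

def retrieve_answer(query, encode, question_vectors, answers):
    # `encode` is a hypothetical sentence encoder; `question_vectors` is an
    # (n_questions, dim) array with L2-normalized rows; `answers[i]` is the
    # answer paired with stored question i.
    q = np.asarray(encode(query), dtype=np.float32)
    q /= np.linalg.norm(q)            # normalize so dot product = cosine similarity
    scores = question_vectors @ q     # similarity to every stored question
    best = int(np.argmax(scores))     # index of the most similar question
    return answers[best], float(scores[best])
```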
On top of this, we want to classify each incoming question before retrieval. For example, in a legal QA system we first want to decide whether the user's question is related to law at all, and only run the retrieval step if it is.
The data used to train this binary classifier (a sketch of assembling it follows the list):
- Positive samples: the normal in-domain question corpus we already have
- Negative samples: questions filtered from open Baidu QA data and from QA data of other industries
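The scripts below all expect this data as a Hugging Face `DatasetDict` saved at `data/intention.data`, with `train`/`test` splits and `title`/`label` columns (label 1 for in-domain questions, 0 for everything else). A minimal sketch of building it, assuming the collected questions already sit in two hypothetical lists `positive_titles` and `negative_titles` (the 80/20 split and the seed are my choices, not from the original):

```python
from datasets import Dataset

# Hypothetical inputs: in-domain questions and out-of-domain questions
positive_titles = ['...']   # the in-domain question corpus
negative_titles = ['...']   # filtered Baidu QA / other-industry questions

dataset = Dataset.from_dict({
    'title': positive_titles + negative_titles,
    'label': [1] * len(positive_titles) + [0] * len(negative_titles),
})
# Produces a DatasetDict with 'train' and 'test' splits
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.save_to_disk('data/intention.data')
```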
There are many reasonable ways to attack a binary classification problem like this; here I try three:
- Naive Bayes: on word-frequency (bag-of-words) count vectors
- Support vector machine: on questions encoded as vectors with word2vec-style embeddings (fastText in the code below)
- Albert: a mini BERT-style pretrained model
In terms of results: Naive Bayes works on high-dimensional sparse vectors, and despite the feature dimensionality its speed is still acceptable, but it lags behind Albert and the SVM. Albert, even as a tiny variant, is still the largest of the three, and training leaves behind many checkpoints that we have to choose between ourselves. The SVM + word2vec combination performs very well, trains quickly, and yields a small model; it is the one I like best.
1. The Albert-based approach
Pretrained model: https://huggingface.co/clue/albert_chinese_tiny
The complete implementation:
```python
from transformers import AlbertForSequenceClassification
from transformers import BertTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_from_disk
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import torch.optim as optim
import torch.nn as nn
import glob
import torch

# compute device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def collate_function(batch_data, tokenizer):
    titles, labels = [], []
    for data in batch_data:
        titles.append(data['title'])
        labels.append(data['label'])
    title_tensor = tokenizer.batch_encode_plus(titles, add_special_tokens=True,
                                               padding='longest', return_tensors='pt')
    title_tensor = {key: value.to(device) for key, value in title_tensor.items()}
    label_tensor = torch.tensor(labels, device=device)
    return title_tensor, label_tensor


def train_albert():
    # https://huggingface.co/clue/albert_chinese_tiny
    estimator = AlbertForSequenceClassification.from_pretrained('pretrained/albert_chinese_tiny', num_labels=2).to(device)
    tokenizer = BertTokenizer.from_pretrained('pretrained/albert_chinese_tiny')
    traindata = load_from_disk('data/intention.data')['train']
    dataloader = DataLoader(traindata, batch_size=128, shuffle=True,
                            collate_fn=lambda data: collate_function(data, tokenizer))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(estimator.parameters(), lr=1e-5)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=2, cooldown=2, verbose=True)

    for epoch in range(30):
        total_loss, total_size, total_corr = 0.0, 0, 0
        progress = tqdm(range(len(dataloader)))
        for title_tensor, label_tensor in dataloader:
            outputs = estimator(**title_tensor)
            loss = criterion(outputs.logits, label_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # predicted labels
            y_pred = torch.argmax(outputs.logits, dim=-1)
            total_corr += (y_pred == label_tensor).sum().item()
            total_loss += loss.item() * len(label_tensor)
            total_size += len(label_tensor)
            # update the progress bar
            desc = '%2d. %6.1f %5d/%5d %.4f %.2E' % (epoch + 1, total_loss, total_corr, total_size,
                                                     total_corr / total_size,
                                                     scheduler.optimizer.param_groups[0]['lr'])
            progress.set_description(desc)
            progress.update()
        scheduler.step(total_loss)
        progress.close()

        # save one checkpoint per epoch once training has warmed up
        if epoch > 5:
            model_save_path = 'finish/intention/albert/%0d_intention_albert_loss_%.4f' % (epoch + 1, total_loss)
            estimator.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)


@torch.no_grad()
def eval_model(model_name):
    estimator = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device).eval()
    tokenizer = BertTokenizer.from_pretrained(model_name)
    traindata = load_from_disk('data/intention.data')['test']
    dataloader = DataLoader(traindata, batch_size=128, shuffle=True,
                            collate_fn=lambda data: collate_function(data, tokenizer))
    model_name = model_name[model_name.rfind('/') + 1:]
    progress = tqdm(range(len(dataloader)), desc='%30s' % model_name)
    y_true, y_pred = [], []
    for inputs_tensor, labels_tensor in dataloader:
        outputs = estimator(**inputs_tensor)
        y_label = torch.argmax(outputs.logits, dim=-1)
        y_pred.extend(y_label.cpu().numpy().tolist())
        y_true.extend(labels_tensor.cpu().numpy().tolist())
        progress.update()
    progress.close()

    print('Accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, true_sum = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)
    print('-' * 100)


def eval_albert():
    model_names = glob.glob('finish/intention/albert/*intention_albert*')
    for model_name in model_names:
        eval_model(model_name)


def predict(inputs):
    # load the epoch-9 checkpoint, which scored best in the evaluation below
    model_save_path = 'finish/intention/albert/9_intention_albert_loss_186.2181'
    tokenizer = BertTokenizer.from_pretrained(model_save_path)
    estimator = AlbertForSequenceClassification.from_pretrained(model_save_path, num_labels=2).eval()
    # encode the input
    inputs = tokenizer.encode_plus(inputs, return_token_type_ids=False,
                                   return_attention_mask=False, return_tensors='pt')
    # model prediction
    with torch.no_grad():
        outputs = estimator(**inputs)
    y_pred = torch.argmax(outputs.logits)
    if y_pred.item() == 1:
        print('\033[31mIn-domain question\033[m')
    else:
        print('Out-of-domain question')


def test():
    predict('为什么抽完血后会出现头晕、四肢无力脸色发白冒汗等现象?')
    predict('我是怀孕了吗')
    predict('嗓子起疮,这是什么原因导致的?')
    predict('哈哈')
    predict('我滴妈呀,你真笨啊')
    predict('前天早上在医院的广场上玩篮球,一会来了几个病人,我们就一起玩了')
    predict('我是医院的病人,我发烧了,所以在这里住院')


if __name__ == '__main__':
    # train_albert()
    eval_albert()
    # test()
```
Evaluation results for each saved checkpoint:
```
 7_intention_albert_loss_308.1043: 100%|█████████| 17/17 [00:02<00:00,  8.49it/s]
Accuracy: 0.9884504331087585
Precision: [0.98823529 0.98869476]
Recall: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
 8_intention_albert_loss_231.9439: 100%|█████████| 17/17 [00:01<00:00, 10.74it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
 9_intention_albert_loss_186.2181: 100%|█████████| 17/17 [00:01<00:00, 10.97it/s]
Accuracy: 0.9908565928777671
Precision: [0.996337   0.98478702]
Recall: [0.98640073 0.99589744]
F-score: [0.99134396 0.99031107]
----------------------------------------------------------------------------------------------------
10_intention_albert_loss_155.6420: 100%|████████| 17/17 [00:01<00:00, 10.81it/s]
Accuracy: 0.9846005774783445
Precision: [0.98026906 0.98961578]
Recall: [0.99093382 0.9774359 ]
F-score: [0.98557259 0.98348813]
----------------------------------------------------------------------------------------------------
11_intention_albert_loss_133.5007: 100%|████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9898941289701636
Precision: [0.99271403 0.98673469]
Recall: [0.98821396 0.99179487]
F-score: [0.99045888 0.98925831]
----------------------------------------------------------------------------------------------------
12_intention_albert_loss_92.2596: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
Accuracy: 0.987487969201155
Precision: [0.98910082 0.98567042]
Recall: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
13_intention_albert_loss_80.6847: 100%|█████████| 17/17 [00:01<00:00, 10.55it/s]
Accuracy: 0.987487969201155
Precision: [0.98910082 0.98567042]
Recall: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
14_intention_albert_loss_67.7850: 100%|█████████| 17/17 [00:01<00:00, 10.59it/s]
Accuracy: 0.9879692011549567
Precision: [0.98646209 0.98969072]
Recall: [0.99093382 0.98461538]
F-score: [0.9886929  0.98714653]
----------------------------------------------------------------------------------------------------
15_intention_albert_loss_61.4615: 100%|█████████| 17/17 [00:01<00:00, 10.62it/s]
Accuracy: 0.9879692011549567
Precision: [0.99       0.98568507]
Recall: [0.98730734 0.98871795]
F-score: [0.98865184 0.98719918]
----------------------------------------------------------------------------------------------------
16_intention_albert_loss_39.3099: 100%|█████████| 17/17 [00:01<00:00, 10.44it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
17_intention_albert_loss_28.6968: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9884504331087585
Precision: [0.99000908 0.98669396]
Recall: [0.98821396 0.98871795]
F-score: [0.98911071 0.98770492]
----------------------------------------------------------------------------------------------------
18_intention_albert_loss_24.5714: 100%|█████████| 17/17 [00:01<00:00, 10.63it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
19_intention_albert_loss_21.6295: 100%|█████████| 17/17 [00:01<00:00, 10.49it/s]
Accuracy: 0.9884504331087585
Precision: [0.99090082 0.98569969]
Recall: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
20_intention_albert_loss_19.8042: 100%|█████████| 17/17 [00:01<00:00, 10.60it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
21_intention_albert_loss_18.0605: 100%|█████████| 17/17 [00:01<00:00, 10.35it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
22_intention_albert_loss_16.5603: 100%|█████████| 17/17 [00:01<00:00, 10.61it/s]
Accuracy: 0.9884504331087585
Precision: [0.98823529 0.98869476]
Recall: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
23_intention_albert_loss_15.6354: 100%|█████████| 17/17 [00:01<00:00, 10.46it/s]
Accuracy: 0.9889316650625601
Precision: [0.99090909 0.98670757]
Recall: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
24_intention_albert_loss_14.7335: 100%|█████████| 17/17 [00:01<00:00, 10.76it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
25_intention_albert_loss_13.9919: 100%|█████████| 17/17 [00:01<00:00, 10.48it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
26_intention_albert_loss_13.3679: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
Accuracy: 0.9894128970163619
Precision: [0.99091735 0.9877175 ]
Recall: [0.98912058 0.98974359]
F-score: [0.99001815 0.98872951]
----------------------------------------------------------------------------------------------------
27_intention_albert_loss_12.8064: 100%|█████████| 17/17 [00:01<00:00, 10.68it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
28_intention_albert_loss_12.2769: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
29_intention_albert_loss_11.7247: 100%|█████████| 17/17 [00:01<00:00, 10.69it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
30_intention_albert_loss_11.1622: 100%|█████████| 17/17 [00:01<00:00, 10.39it/s]
Accuracy: 0.9894128970163619
Precision: [0.99181074 0.98672114]
Recall: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
```
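Among these, the epoch-9 checkpoint (9_intention_albert_loss_186.2181) achieves the highest test accuracy (0.9909), which is why `predict()` above loads it; the later checkpoints keep driving the training loss down without improving the test metrics.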
2. The Naive Bayes-based approach
```python
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from datasets import load_from_disk
import pickle
import jieba.posseg as psg
import jieba

jieba.setLogLevel(0)


def cut_word(sentence):
    # keep only nouns, verbs, and adjectives, and drop stopwords
    allow = ['n', 'nr', 'ns', 'nt', 'nl', 'nz', 'nsf', 's'] + \
            ['v', 'vd', 'vn', 'vx'] + \
            ['a', 'ad', 'al', 'an']
    stopwords = [word.strip() for word in open('file/stopwords.txt', encoding='utf-8')]
    sentence_words = []
    for word, pos in psg.lcut(sentence):
        if pos not in allow:
            continue
        if word in stopwords:
            continue
        sentence_words.append(word)
    return ' '.join(sentence_words)


def train_vectorizer():
    questions = load_from_disk('data/intention.data')['train']
    questions = [cut_word(question) for question in questions['title']]
    tokenizer = CountVectorizer(max_features=21246)
    tokenizer.fit(questions)
    print('Number of features:', len(tokenizer.get_feature_names_out()))
    pickle.dump(tokenizer, open('finish/intention/bayes/vectorizer.pkl', 'wb'))


def train_bayes_model():
    vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
    questions = load_from_disk('data/intention.data')['train']
    inputs = [cut_word(title) for title in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    estimator = MultinomialNB()
    estimator.fit(inputs, labels)
    pickle.dump(estimator, open('finish/intention/bayes/bayes.pkl', 'wb'))


def eval_bayes_model():
    vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
    estimator = pickle.load(open('finish/intention/bayes/bayes.pkl', 'rb'))
    questions = load_from_disk('data/intention.data')['test']
    inputs = [cut_word(question) for question in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    ypreds = estimator.predict(inputs)
    precision, recall, f_score, true_sum = precision_recall_fscore_support(labels, ypreds)
    print('Accuracy:', accuracy_score(labels, ypreds))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)


if __name__ == '__main__':
    train_vectorizer()
    train_bayes_model()
    eval_bayes_model()
```
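For completeness, a hedged sketch of scoring a new question with the saved artifacts (`predict_bayes` is my name, not from the original; it reuses `cut_word` and the pickle paths from the script above):

```python
import pickle

# load the fitted CountVectorizer and MultinomialNB model saved above
vectorizer = pickle.load(open('finish/intention/bayes/vectorizer.pkl', 'rb'))
estimator = pickle.load(open('finish/intention/bayes/bayes.pkl', 'rb'))

def predict_bayes(question):
    # same preprocessing as training: POS-filtered, stopword-free tokens
    features = vectorizer.transform([cut_word(question)])
    return int(estimator.predict(features)[0])  # 1 = in-domain, 0 = other

print(predict_bayes('嗓子起疮,这是什么原因导致的?'))
```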
Evaluation results:
```
Accuracy: 0.9701636188642926
Precision: [0.99430199 0.94536585]
Recall: [0.94922937 0.99384615]
F-score: [0.97124304 0.969     ]
```
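Accuracy sits roughly two points below the Albert and SVM results; the gap comes mainly from class-0 (out-of-domain) recall of about 0.949 and the matching class-1 precision of about 0.945, i.e. a noticeable share of out-of-domain questions slip through as in-domain.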
3. The SVM-based approach
```python
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from datasets import load_from_disk
import datasets
import fasttext
import jieba

jieba.setLogLevel(0)
datasets.disable_progress_bar()


def vectorize(batch_data, tokenizer):
    # encode each title: jieba tokens -> fastText sentence vector
    titles = batch_data['title']
    labels = batch_data['label']
    model_inputs = []
    for title in titles:
        inputs = tokenizer.get_sentence_vector(' '.join(jieba.lcut(title)))
        model_inputs.append(inputs.tolist())
    return {'title': model_inputs, 'label': labels}


def train_svm():
    traindata = load_from_disk('data/intention.data')['train']
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')
    # vectorize the data
    traindata = traindata.map(lambda batch: vectorize(batch, tokenizer), batched=True, batch_size=32)
    # train the support vector machine
    estimator = SVC()
    estimator.fit(traindata['title'], traindata['label'])
    # save the model
    pickle.dump(estimator, open('finish/intention/svm/svm.pkl', 'wb'))


def eval_svm():
    estimator = pickle.load(open('finish/intention/svm/svm.pkl', 'rb'))
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')
    traindata = load_from_disk('data/intention.data')
    traindata = traindata.map(lambda batch: vectorize(batch, tokenizer), batched=True, batch_size=32)

    # training-set metrics
    y_pred = estimator.predict(traindata['train']['title'])
    y_true = traindata['train']['label']
    print('Train accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, true_sum = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)
    print('-' * 50)

    # test-set metrics
    y_pred = estimator.predict(traindata['test']['title'])
    y_true = traindata['test']['label']
    print('Test accuracy:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, true_sum = precision_recall_fscore_support(y_true, y_pred)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F-score:', f_score)


if __name__ == '__main__':
    train_svm()
    eval_svm()
```
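And a hedged sketch of single-question inference with the saved SVM (`predict_svm` is my name, not from the original; the encoding must match training exactly: jieba tokens fed to fastText's `get_sentence_vector`):

```python
import pickle
import jieba
import fasttext

# load the trained SVM and the same fastText vectors used in training
estimator = pickle.load(open('finish/intention/svm/svm.pkl', 'rb'))
tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')

def predict_svm(question):
    # encode with the same jieba + fastText pipeline used in training
    vector = tokenizer.get_sentence_vector(' '.join(jieba.lcut(question)))
    return int(estimator.predict([vector.tolist()])[0])  # 1 = in-domain, 0 = other

print(predict_svm('我是怀孕了吗'))
```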
Evaluation results (each tuple below is the per-class precision, recall, F-score, and support returned by precision_recall_fscore_support):
```
Train set: 0.9879503698401336
(array([0.99103551, 0.98462111]), array([0.98582371, 0.99027067]), array([0.98842274, 0.98743781]), array([8747, 8017]))
Test set: 0.9880725190839694
(array([0.98912551, 0.98690176]), array([0.98822997, 0.98789713]), array([0.98867754, 0.98739919]), array([2209, 1983]))
```