When doing retrieval, we can use keywords to recall candidate results. Here are two simple methods:
- Recall based on an inverted index
- Recall based on TF-IDF
1. Recall Based on an Inverted Index
The idea is fairly simple. Suppose we have 10,000 documents: we segment them with the jieba tokenizer, remove stopwords, and build a mapping from each word to the documents that contain it. When a query sentence comes in, we extract its words, use them to quickly fetch the documents containing those keywords, rank the candidates by how many of the query keywords they contain, and return them.
Implementation code:
```python
import jieba
jieba.setLogLevel(0)
import random
import pandas as pd
import pickle
from collections import Counter
from data_select import select_questions
from data_select import select_and_show_question


# Build the inverted index over the questions
def build_inverted_index():
    questions = select_questions()
    inverted_index = {}
    stopwords = [word.strip() for word in open('file/stopwords.txt')]
    for qid, question in questions:
        words = [word for word in jieba.lcut(question) if word not in stopwords]
        if len(words) == 0:
            print('分词失败问题:', question)
            continue
        # Add each word -> question id entry to the index
        for word in words:
            if word not in inverted_index:
                inverted_index[word] = [qid]
            else:
                inverted_index[word].append(qid)
    pickle.dump(inverted_index, open('finish/keyword/inverted_index/inverted_index.pkl', 'wb'))


# Use the inverted index to return candidate questions containing the query keywords
def generate_candidate(query, inversed_index, topK):
    # Segment the input question and filter out stopwords
    query = jieba.lcut(query)
    stopwords = [word.strip() for word in open('file/stopwords.txt')]
    query_words = [word for word in query if word not in stopwords]
    print('输入分词:', query_words)
    # Candidate questions that contain at least one query keyword
    candidate_questions = []
    # Collect all questions associated with each keyword
    for word in query_words:
        try:
            candidate_questions.extend(inversed_index[word])
        except KeyError:
            pass
    # Keep the top-K questions containing the most query keywords
    candidate_questions = Counter(candidate_questions).most_common(topK)
    candidate_questions = [question for question, freq in candidate_questions]
    return candidate_questions


def test():
    # Load the inverted index
    inverted_index = pickle.load(open('finish/keyword/inverted_index/inverted_index.pkl', 'rb'))
    query_string = '宝宝的妈妈嗓子疼有点发烧孩子就是发烧'
    print('输入问题:', query_string)
    ids = generate_candidate(query_string, inverted_index, topK=10)
    print(ids)
    print('-' * 50)
    select_and_show_question(ids)


if __name__ == '__main__':
    build_inverted_index()
    test()
```
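To see the recall logic in isolation, here is a minimal self-contained sketch that skips the `data_select` helpers and the stopword file. The toy corpus and stopword set are made up for illustration; the ranking step is the same `Counter`-based counting of matched keywords as above.

```python
import jieba
from collections import Counter

# Toy corpus and stopwords, for demonstration only (made-up data)
toy_questions = {
    1: '宝宝发烧了怎么办',
    2: '孩子咳嗽吃什么药',
    3: '宝宝发烧咳嗽要不要去医院',
}
toy_stopwords = {'了', '的', '吗', '要不要', '怎么办'}

# Build the word -> question-id inverted index
inverted_index = {}
for qid, question in toy_questions.items():
    for word in jieba.lcut(question):
        if word not in toy_stopwords:
            inverted_index.setdefault(word, []).append(qid)

# Rank candidates by how many query keywords they share with the query
query_words = [w for w in jieba.lcut('宝宝发烧咳嗽') if w not in toy_stopwords]
hits = Counter(qid for word in query_words for qid in inverted_index.get(word, []))
print(hits.most_common(2))  # question 3 is expected to match the most keywords
```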
2. Recall Based on TF-IDF
This approach is just as simple. We first train a TfidfVectorizer model on the corpus, then convert all questions into TF-IDF vectors and store them in a vector database such as faiss or milvus. When a new query sentence comes in, we convert it into a TF-IDF vector with the trained model, compare it against the database using cosine similarity, and return the top K results as the recall candidates.
Training code:
```python
import jieba
jieba.setLogLevel(0)
import jieba.analyse as analyse
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import jieba.posseg as psg
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import torch
import torch.nn.functional as F
from data_select import select_all_questions
import re


def is_chinese_word(words):
    # Keep only tokens made up entirely of Chinese characters
    for word in words:
        if '\u4e00' <= word <= '\u9fff':
            continue
        else:
            return False
    return True


def cut_word(sentence):
    # p = ['n', 'nr', 'ns', 'nt', 'nl', 'nz', 'nsf', 's'] + ['v', 'vd', 'vn', 'vx'] + ['a', 'ad', 'al', 'an']
    # Coarse-grained segmentation
    # words_with_pos = psg.cut(sentence)
    # question_words = [word for word, pos in words_with_pos if pos in p]
    # Keyword extraction
    # question_words = analyse.tfidf(sentence, allowPOS=p, topK=30)
    # Search-engine mode: split out as many words as possible
    question_words = jieba.lcut_for_search(sentence)
    question_words = [word for word in question_words if is_chinese_word(word)]
    # words = analyse.textrank(sentence, allowPOS=allow_pos)
    # print('同义词增强:', [synonyms.nearby(word) for word in words])
    return ' '.join(question_words)


def train_tfidf():
    questions = select_all_questions()
    questions_words = [cut_word(question) for qid, question in questions]
    max_features = 81920
    stopwords = [word.strip() for word in open('file/stopwords.txt')]
    estimator = TfidfVectorizer(max_features=max_features, stop_words=stopwords, ngram_range=(1, 2))
    estimator.fit(questions_words)
    print('特征数量:', len(estimator.get_feature_names_out()))
    print('特征内容:', estimator.get_feature_names_out()[:50])
    pickle.dump(estimator, open('finish/keyword/tfidf/tfidf.pkl', 'wb'))


if __name__ == '__main__':
    train_tfidf()
```
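As a quick sanity check of what the fitted vectorizer produces, the sketch below uses a made-up three-sentence mini corpus in place of `select_all_questions()` and fits a TfidfVectorizer the same way (without the `max_features` / stopword-file settings). With sklearn's default `norm='l2'`, every transformed row already has unit length, which is what later lets a faiss inner-product index behave like cosine similarity.

```python
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Made-up mini corpus standing in for select_all_questions()
toy_corpus = ['宝宝发烧了怎么办', '孩子咳嗽吃什么药', '宝宝发烧咳嗽要不要去医院']
toy_corpus_words = [' '.join(jieba.lcut_for_search(q)) for q in toy_corpus]

# Same setup as train_tfidf(), minus max_features and the stopword file
estimator = TfidfVectorizer(ngram_range=(1, 2))
vectors = estimator.fit_transform(toy_corpus_words).toarray()

print('vocabulary size:', len(estimator.get_feature_names_out()))
# Default norm='l2': every row has unit length, so dot product == cosine similarity
print('row norms:', np.linalg.norm(vectors, axis=1))
```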
Query code:
```python
import faiss
import pickle
import numpy as np
import pandas as pd
import jieba
jieba.setLogLevel(0)
import jieba.analyse as analyse
from data_select import select_and_show_question
from data_select import select_and_show_solution
from data_select import select_questions
from keyword_tfidf_train import cut_word


# Convert every question to a TF-IDF vector and store it in a faiss index
def generate_tfidf_to_faiss():
    estimator = pickle.load(open('finish/keyword/tfidf/tfidf.pkl', 'rb'))
    questions = select_questions()
    questions_words = [(qid, cut_word(question)) for qid, question in questions]
    write_number = 0
    database = faiss.IndexIDMap(faiss.IndexFlatIP(81920))
    for qid, question in questions_words:
        try:
            # Some sentences yield an empty keyword list; skip them
            question = estimator.transform([question]).toarray().tolist()
            database.add_with_ids(np.array(question), [qid])
            write_number += 1
        except Exception as e:
            pass
    print('写入 TF-IDF 数量:', write_number)
    faiss.write_index(database, 'finish/keyword/tfidf/tfidf.faiss')


def test():
    estimator = pickle.load(open('finish/keyword/tfidf/tfidf.pkl', 'rb'))
    database = faiss.read_index('finish/keyword/tfidf/tfidf.faiss')
    # Example queries (commented lines are alternatives)
    # query_string = '宝宝的妈妈嗓子疼有点发烧孩子就是发烧'
    # query_string = '怀孕时乳房会有刺痛感吗'
    # query_string = '小孩发烧,吃点什么药啊?'
    # query_string = '染头发影响宝宝吃奶吗?'
    query_string = '吃点啥药能降血压啊?'
    print('输入问题:', query_string)
    query_words = [cut_word(query_string)]
    print('输入分词:', query_words)
    query_vector = estimator.transform(query_words).toarray()
    distances, ids = database.search(query_vector, 10)
    print(ids[0])
    print(distances[0].tolist())
    select_and_show_question(ids[0])
    print('-' * 100)
    select_and_show_solution(ids[0])


if __name__ == '__main__':
    # generate_tfidf_to_faiss()
    test()
```
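For reference, here is a minimal sketch of just the faiss calls used above (`IndexIDMap`, `IndexFlatIP`, `add_with_ids`, `search`), with small random unit vectors standing in for the real 81920-dimensional TF-IDF matrix. The dimension and ids are made up for illustration.

```python
import faiss
import numpy as np

dim = 8  # toy dimension; the real index above uses the TF-IDF feature count (81920)
rng = np.random.default_rng(0)

# Build an inner-product index keyed by our own question ids
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
vectors = rng.random((5, dim)).astype('float32')
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # unit length -> inner product == cosine
index.add_with_ids(vectors, np.arange(100, 105).astype('int64'))

# Query with one of the stored vectors: it should come back first with a score close to 1.0
scores, ids = index.search(vectors[:1], 3)
print(ids[0], scores[0])
```

Because `IndexFlatIP` scores by inner product, this only matches cosine similarity when the stored and query vectors are L2-normalized; the sklearn TF-IDF vectors above already satisfy that with the default `norm='l2'`.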