We build a knowledge graph of the 108 heroes of Water Margin (《水浒传》), and the questions handled by the QA system are designed around this knowledge graph. This step consists of two parts:
- Crawling the data
- Building the graph
1. Crawling the data
Part of the data is crawled from Baidu Baike, and the rest is constructed by hand.
import requests
from bs4 import BeautifulSoup
import json
import re

headers = {'User-Agent': open('header.txt').read().strip()}

# Parse the saved cookie string into a dict for requests.
cookies = {}
for item in open('cookie.txt').read().strip().split('; '):
    index = item.find('=')
    key = item[:index]
    val = item[index + 1:]
    cookies[key] = val


def get_base_info():
    """Fetch the heroes' basic information."""
    request_url = 'https://baike.baidu.com/item/%E4%B8%80%E7%99%BE%E5%8D%95%E5%85%AB%E5%B0%86/19408?fr=aladdin'
    response = requests.get(request_url, headers=headers, cookies=cookies)
    soup = BeautifulSoup(response.text, features='lxml')
    base_info = []
    # Table of the 36 Tiangang (天罡) heroes
    table = soup.find('table', attrs={'data-id': 'gntl9fqdtx'})
    for tr in table.find_all('tr')[1:]:
        rank, starname, nickname, realname, _, _, position = tr.get_text(' ').split()
        base_info.append(f'{realname} {starname} {rank} {nickname} {position}\n')
    # Table of the 72 Disha (地煞) heroes
    table = soup.find('table', attrs={'data-id': 'gntl9fr8r7'})
    for tr in table.find_all('tr')[1:]:
        rank, starname, nickname, realname, _, _, position = tr.get_text(' ').split()
        base_info.append(f'{realname} {starname} {rank} {nickname} {position}\n')
    open('data/spider_base.txt', 'w').writelines(base_info)


def get_lemma_id():
    """lemma_id is used to build each hero's Baike page URL."""
    lemma_ids = []
    for pn in range(1, 10):
        request_url = f'https://baike.baidu.com/starmap/api/getlemmalist?lemmaId=711115&nodeId=ab7fe1266a9e3bec11029341&pn={pn}&rn=50'
        response = requests.get(request_url, headers=headers)
        if response.status_code != 200:
            continue
        response = json.loads(response.text)
        name_list = response['data']['list']
        if not name_list:
            break
        for name in name_list:
            lemma_ids.append(f"{name['lemmaTitle']} {name['lemmaId']}\n")
    # Save the name/lemma_id pairs used to build the request URLs.
    open('data/spider_lemmaid.txt', 'w').writelines(lemma_ids)


def get_description():
    """Fetch each hero's description."""
    descriptions = []
    for line in open('data/spider_lemmaid.txt'):
        name, lemma_id = line.split()
        request_url = f'https://baike.baidu.com/item/{name}/{lemma_id}'
        response = requests.get(request_url, headers=headers, cookies=cookies)
        soup = BeautifulSoup(response.text, features='lxml')
        # Baidu's anti-crawling verification page.
        if soup.title.string.strip() == '百度百科-验证':
            print('Blocked by Baidu anti-crawling; failed to fetch the content.')
            return
        paragraph = soup.find_all('div', attrs={'class': 'para MARK_MODULE', 'label-module': 'para'})
        # Only keep the first introductory paragraph.
        paragraph = paragraph[0]
        # Strip the HTML tags, keeping only the text.
        description = paragraph.get_text()
        # Remove citation markers such as [1] or [1-5].
        description = re.sub(r'\[\d+\]', '', description)
        description = re.sub(r'\[\d+-\d+\]', '', description)
        description = ''.join(description.split())
        descriptions.append(f'{name} {description}\n')
    open('data/spider_desc.txt', 'w').writelines(descriptions)


if __name__ == '__main__':
    get_base_info()
    get_lemma_id()
    get_description()
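The hand-built part mentioned above consists of two files, ../spider/data/manual_result.txt and ../spider/data/manual_relation.txt, which the build script in step 2 reads; their layouts are only implied by the parsing code there. Below is a minimal sketch of the expected column structure, using illustrative placeholder lines rather than the real data:

# manual_result.txt: one outcome shared by one or more heroes per line --
# names joined by the Chinese enumeration comma '、', a space, then the outcome.
result_line = '宋江、卢俊义 被害'       # illustrative placeholder line
name_list, result = result_line.split()
print(name_list.split('、'), result)    # ['宋江', '卢俊义'] 被害

# manual_relation.txt: "<name1> <name2> <broad relation> <specific relation>";
# the broad relation is treated as bidirectional, the specific one as directional.
relation_line = '宋江 宋清 亲属 兄长'    # illustrative placeholder line
name1, name2, re1, re2 = relation_line.split()
print(name1, name2, re1, re2)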
2. Building the graph
This step actually involves two tasks: exporting the entities of the knowledge graph into custom dictionary files, which improves the accuracy of jieba and HanLP when doing word segmentation and entity extraction, and writing the heroes and their relationships into Neo4j.
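Before walking through the build script, here is a minimal sketch of how those dictionary files could be consumed. It assumes the script below has already written them under ../../nlu/userdict/ in the `word frequency tag` layout it uses, and that HanLP accepts the custom tags (hh, xx, ...) as dynamically created natures; the sample sentence is purely illustrative:

import jieba
from pyhanlp import HanLP, JClass

# jieba can load the generated file directly; it is already in jieba's
# "word freq tag" user-dictionary format.
jieba.load_userdict('../../nlu/userdict/name.txt')

# For HanLP, insert each entry into the custom dictionary as "tag frequency".
CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
for line in open('../../nlu/userdict/name.txt'):
    word, freq, tag = line.split()
    CustomDictionary.insert(word, f'{tag} {freq}')

# Hero names should now come out as single tokens in both segmenters.
print(list(jieba.cut('武松在景阳冈打虎')))
print(HanLP.segment('武松在景阳冈打虎'))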
from py2neo import Graph, Node, Relationship, Subgraph
import pickle
from pyhanlp import JClass

frequency = 99999
custom_dict_path = '../../nlu/userdict/'
part_of_speech = {'name': 'hh', 'star': 'xx', 'nick': 'wh', 'faction': 'px', 'relation': 'gx', 'position': 'gz'}
# Write the custom tag strings themselves into a special dictionary file (tag 'ts').
open(custom_dict_path + 'special.txt', 'w').writelines([f'{word} {frequency} ts\n' for word in part_of_speech.values()])


def process_hero():
    """Build the list of hero names."""
    hero_name = []
    for line in open('../spider/data/spider_lemmaid.txt'):
        name, _ = line.strip().split()
        name = name.strip()
        hero_name.append(name)
    pickle.dump(hero_name, open('data/hero_name.list', 'wb'))


def process_base():
    """Build the dictionary of basic hero information."""
    hero_base = {}
    real_words, star_words, nick_words, position_words = [], [], [], []  # collect entity words
    for line in open('../spider/data/spider_base.txt'):
        realname, starname, rank, nickname, position = line.strip().split()
        hero_base[realname] = {'ranking': int(rank), 'starname': starname, 'nickname': nickname, 'position': position}
        real_words.append(realname)
        star_words.append(starname)
        nick_words.append(nickname)
        position_words.append(position)
    pickle.dump(hero_base, open('data/hero_base.dict', 'wb'))
    real_words = [f'{word} {frequency} {part_of_speech["name"]}\n' for word in set(real_words)]
    star_words = [f'{word} {frequency} {part_of_speech["star"]}\n' for word in set(star_words)]
    nick_words = [f'{word} {frequency} {part_of_speech["nick"]}\n' for word in set(nick_words)]
    position_words = [f'{word} {frequency} {part_of_speech["position"]}\n' for word in set(position_words)]
    open(custom_dict_path + 'name.txt', 'w').writelines(real_words)
    open(custom_dict_path + 'star.txt', 'w').writelines(star_words)
    open(custom_dict_path + 'nick.txt', 'w').writelines(nick_words)
    open(custom_dict_path + 'position.txt', 'w').writelines(position_words)


def process_desc():
    """Build the dictionary of hero descriptions."""
    hero_desc = {}
    normalize = JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    for line in open('../spider/data/spider_desc.txt'):
        # The crawled description contains no whitespace, but split only on the
        # first space to be safe.
        realname, desc = line.strip().split(maxsplit=1)
        desc = ''.join(desc.split())
        desc = normalize.convert(desc)
        hero_desc[realname] = {'description': desc}
    pickle.dump(hero_desc, open('data/hero_desc.dict', 'wb'))


def process_result():
    """Build the dictionary of hero outcomes."""
    hero_result = {}
    for line in open('../spider/data/manual_result.txt'):
        name_list, result = line.strip().split()
        hero_list = name_list.split('、')
        for name in hero_list:
            name = name.strip()
            hero_result[name] = {'result': result}
    pickle.dump(hero_result, open('data/hero_result.dict', 'wb'))


def process_relation():
    """Add the relation words to the segmentation dictionaries."""
    relation_words = []
    for line in open('../spider/data/manual_relation.txt'):
        _, _, re1, re2 = line.strip().split()
        relation_words.extend([re1, re2])
    relation_words = [f'{word} {frequency} {part_of_speech["relation"]}\n' for word in set(relation_words)]
    open(custom_dict_path + 'relation.txt', 'w').writelines(relation_words)


def hero_to_neo4j():
    """Store everything into Neo4j."""
    graph = Graph('http://localhost:7474', name='neo4j', auth=('neo4j', 'jay332572'))
    try:
        graph.delete_all()
    except Exception:
        # Ignore failures when clearing a fresh or empty database.
        pass
    hero_base = pickle.load(open('data/hero_base.dict', 'rb'))
    hero_desc = pickle.load(open('data/hero_desc.dict', 'rb'))
    hero_name = pickle.load(open('data/hero_name.list', 'rb'))
    hero_result = pickle.load(open('data/hero_result.dict', 'rb'))
    # Fill in missing outcomes ('未知' means "unknown").
    for name in hero_name:
        if name not in hero_result:
            hero_result[name] = {'result': '未知'}
    # Create the nodes.
    nodes = {}
    for name in hero_name:
        node = Node('hero', name=name)
        node.update(hero_base[name])
        node.update(hero_desc[name])
        node.update(hero_result[name])
        nodes[name] = node
    # Create the relationships.
    relations = []
    for line in open('../spider/data/manual_relation.txt'):
        name1, name2, re1, re2 = line.strip().split()
        node1, node2 = nodes[name1], nodes[name2]
        # re1 is the broad (bidirectional) relation, re2 the specific (directional) one.
        relation = Relationship(node1, re2, node2, type=re1)
        relations.append(relation)
    # Write everything to the database in one batch.
    sub_graph = Subgraph(nodes.values(), relations)
    graph.create(sub_graph)


if __name__ == '__main__':
    process_hero()
    process_base()
    process_desc()
    process_result()
    process_relation()
    hero_to_neo4j()
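With the nodes and relationships stored, the QA layer can answer questions through simple Cypher queries. Below is a minimal sketch, assuming the same local connection settings used in hero_to_neo4j above; the queried name and the query shapes are illustrative, not part of the original code:

from py2neo import Graph

graph = Graph('http://localhost:7474', name='neo4j', auth=('neo4j', 'jay332572'))

# Look up one hero's basic attributes by name.
print(graph.run(
    'MATCH (h:hero {name: $name}) '
    'RETURN h.ranking AS ranking, h.nickname AS nickname, h.position AS position',
    name='宋江'
).data())

# Follow the hero's outgoing relationships: type(r) is the specific relation,
# while the broad category was stored as the relationship property "type".
print(graph.run(
    'MATCH (a:hero {name: $name})-[r]->(b:hero) '
    'RETURN type(r) AS relation, r.type AS category, b.name AS other',
    name='宋江'
).data())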