We build a knowledge graph of the 108 heroes of the Water Margin (《水浒传》), and the questions handled by the QA system are designed around this graph. This step consists of two parts:
- Crawling the data
- Building the graph
1. Crawling the data
Part of the data is crawled from Baidu Baike, and the rest is built by hand.
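The hand-built files themselves are not reproduced here, but their line layouts can be inferred from the parsing code in step 2 (process_result and process_relation); the sample lines below are illustrative only:
- data/manual_relation.txt: one relation per line, "hero1 hero2 coarse_relation fine_relation", e.g. 宋江 宋清 兄弟 哥哥
- data/manual_result.txt: a 、-separated group of hero names followed by their shared ending, e.g. 某甲、某乙 阵亡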
import requests
from bs4 import BeautifulSoup
import json
import re
headers = {'User-Agent': open('header.txt').read().strip()}
# Parse the raw Cookie string copied from the browser into a dict
cookies = {}
for item in open('cookie.txt').read().strip().split('; '):
    index = item.find('=')
    key = item[:index]
    val = item[index + 1:]
    cookies[key] = val
def get_base_info():
    """Fetch each hero's basic info from the Baidu Baike list page."""
    request_url = 'https://baike.baidu.com/item/%E4%B8%80%E7%99%BE%E5%8D%95%E5%85%AB%E5%B0%86/19408?fr=aladdin'
    # Reuse the module-level headers and cookies
    response = requests.get(request_url, headers=headers, cookies=cookies)
    soup = BeautifulSoup(response.text, features='lxml')
    base_info = []
    # Table of the 36 Tiangang (天罡) stars
    table = soup.find('table', attrs={'data-id': 'gntl9fqdtx'})
    for tr in table.find_all('tr')[1:]:
        rank, starname, nickname, realname, _, _, position = tr.get_text(' ').split()
        base_info.append(f'{realname} {starname} {rank} {nickname} {position}\n')
    # Table of the 72 Disha (地煞) stars
    table = soup.find('table', attrs={'data-id': 'gntl9fr8r7'})
    for tr in table.find_all('tr')[1:]:
        rank, starname, nickname, realname, _, _, position = tr.get_text(' ').split()
        base_info.append(f'{realname} {starname} {rank} {nickname} {position}\n')
    open('data/spider_base.txt', 'w').writelines(base_info)
def get_lemma_id():
    """Each lemma_id is used to build the corresponding hero's Baike page URL."""
    lemma_ids = []
    for pn in range(1, 10):
        request_url = f'https://baike.baidu.com/starmap/api/getlemmalist?lemmaId=711115&nodeId=ab7fe1266a9e3bec11029341&pn={pn}&rn=50'
        response = requests.get(request_url, headers=headers)
        if response.status_code != 200:
            continue
        response = json.loads(response.text)
        name_list = response['data']['list']
        if name_list == []:
            break
        for name in name_list:
            lemma_ids.append(f"{name['lemmaTitle']} {name['lemmaId']}\n")
    # Persist the "name lemma_id" pairs for the next step
    open('data/spider_lemmaid.txt', 'w').writelines(lemma_ids)
def get_description():
    """Fetch each hero's description from their individual Baike page."""
    descriptions = []
    for line in open('data/spider_lemmaid.txt'):
        name, lemma_id = line.split()
        request_url = f'https://baike.baidu.com/item/{name}/{lemma_id}'
        response = requests.get(request_url, headers=headers, cookies=cookies)
        soup = BeautifulSoup(response.text, features='lxml')
        # Baidu serves a verification page when it detects crawling
        if soup.title.string.strip() == '百度百科-验证':
            print('Blocked by Baidu anti-crawling; failed to fetch the content.')
            return
        paragraph = soup.find_all('div', attrs={'class': 'para MARK_MODULE', 'label-module': 'para'})
        # Keep only the first introductory paragraph
        paragraph = paragraph[0]
        # Strip the HTML tags, keeping only the text
        description = paragraph.get_text()
        # Remove citation markers such as [1] or [1-3]
        description = re.sub(r'\[\d+\]', '', description)
        description = re.sub(r'\[\d+-\d+\]', '', description)
        description = ''.join(description.split())
        descriptions.append(f'{name} {description}\n')
    open('data/spider_desc.txt', 'w').writelines(descriptions)
if __name__ == '__main__':
    get_base_info()
    get_lemma_id()
    get_description()
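Running the script produces three whitespace-separated files under data/ that step 2 reads back in; the layouts follow the f-strings above:
- spider_base.txt: realname starname rank nickname position
- spider_lemmaid.txt: name lemma_id
- spider_desc.txt: name description (the description has all internal whitespace removed)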
2. Building the graph
This step actually covers two tasks: one is exporting the knowledge-graph entities into standalone custom dictionary files, which improves the accuracy of jieba and HanLP when segmenting questions and extracting entities; the other is writing the nodes and relationships into Neo4j.
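The dictionary files are written one entry per line as "word frequency pos_tag", which is jieba's standard user-dictionary format, so the NLU module can load them directly. That module is outside this step; the following is only a minimal sketch, assuming the files end up under the nlu/userdict/ directory that custom_dict_path points at below:
import os
import jieba
import jieba.posseg as pseg

userdict_dir = 'nlu/userdict/'  # assumed path; the build script below writes to '../../nlu/userdict/'
for filename in os.listdir(userdict_dir):
    jieba.load_userdict(os.path.join(userdict_dir, filename))

# Hero names, nicknames, star names, etc. are now kept as single tokens carrying the custom tags
print(list(pseg.cut('豹子头林冲的结局是什么')))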
from py2neo import Graph, Node, Relationship, Subgraph
import pickle
from pyhanlp import JClass
# A large frequency makes the custom entries take precedence during segmentation
frequency = 99999
custom_dict_path = '../../nlu/userdict/'
# Custom part-of-speech tags for each entity type
part_of_speech = {'name': 'hh', 'star': 'xx', 'nick': 'wh', 'faction': 'px', 'relation': 'gx', 'position': 'gz'}
# Also record the custom tags themselves in a small dictionary of their own
open(custom_dict_path + 'special.txt', 'w').writelines([f'{word} {frequency} ts\n' for word in part_of_speech.values()])
def process_hero():
    """Build the list of hero names."""
    hero_name = []
    for line in open('../spider/data/spider_lemmaid.txt'):
        name, _ = line.strip().split()
        name = name.strip()
        hero_name.append(name)
    pickle.dump(hero_name, open('data/hero_name.list', 'wb'))
def process_base():
    """Build the dict of each hero's basic info and dump the entity dictionaries."""
    hero_base = {}
    real_words, star_words, nick_words, position_words = [], [], [], []  # collected entities
    for line in open('../spider/data/spider_base.txt'):
        realname, starname, rank, nickname, position = line.strip().split()
        hero_base[realname] = {'ranking': int(rank),
                               'starname': starname,
                               'nickname': nickname,
                               'position': position}
        real_words.append(realname)
        star_words.append(starname)
        nick_words.append(nickname)
        position_words.append(position)
    pickle.dump(hero_base, open('data/hero_base.dict', 'wb'))
    # Write each entity type as a "word frequency pos_tag" user dictionary
    real_words = [f'{word} {frequency} {part_of_speech["name"]}\n' for word in set(real_words)]
    star_words = [f'{word} {frequency} {part_of_speech["star"]}\n' for word in set(star_words)]
    nick_words = [f'{word} {frequency} {part_of_speech["nick"]}\n' for word in set(nick_words)]
    position_words = [f'{word} {frequency} {part_of_speech["position"]}\n' for word in set(position_words)]
    open(custom_dict_path + 'name.txt', 'w').writelines(real_words)
    open(custom_dict_path + 'star.txt', 'w').writelines(star_words)
    open(custom_dict_path + 'nick.txt', 'w').writelines(nick_words)
    open(custom_dict_path + 'position.txt', 'w').writelines(position_words)
def process_desc():
    """Build the dict of hero descriptions."""
    hero_desc = {}
    # HanLP's CharTable normalizes character variants (e.g. full-width to half-width)
    normalize = JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    for line in open('../spider/data/spider_desc.txt'):
        realname, desc = line.strip().split()
        desc = ''.join(desc.split())
        desc = normalize.convert(desc)
        hero_desc[realname] = {'description': desc}
    pickle.dump(hero_desc, open('data/hero_desc.dict', 'wb'))
def process_result():
    """Build the dict of hero endings from the hand-written file."""
    hero_result = {}
    # Each line: a 、-separated group of hero names followed by their shared ending
    for line in open('../spider/data/manual_result.txt'):
        name_list, result = line.strip().split()
        hero_list = name_list.split('、')
        for name in hero_list:
            name = name.strip()
            hero_result[name] = {'result': result}
    pickle.dump(hero_result, open('data/hero_result.dict', 'wb'))
def process_relation():
    """Add the relation words to the segmentation dictionary."""
    relation_words = []
    # Each line: hero1 hero2 coarse_relation fine_relation
    for line in open('../spider/data/manual_relation.txt'):
        _, _, re1, re2 = line.strip().split()
        relation_words.extend([re1, re2])
    relation_words = [f'{word} {frequency} {part_of_speech["relation"]}\n' for word in set(relation_words)]
    open(custom_dict_path + 'relation.txt', 'w').writelines(relation_words)
def hero_to_neo4j():
    """Write the heroes and their relations into Neo4j."""
    graph = Graph('http://localhost:7474', name='neo4j', auth=('neo4j', 'jay332572'))
    # Start from an empty database
    try:
        graph.delete_all()
    except Exception:
        pass
    hero_base = pickle.load(open('data/hero_base.dict', 'rb'))
    hero_desc = pickle.load(open('data/hero_desc.dict', 'rb'))
    hero_name = pickle.load(open('data/hero_name.list', 'rb'))
    hero_result = pickle.load(open('data/hero_result.dict', 'rb'))
    # Fill in a default ending for heroes missing from the manual file
    for name in hero_name:
        if name not in hero_result:
            hero_result[name] = {'result': '未知'}
    # Create the nodes
    nodes = {}
    for name in hero_name:
        node = Node('hero', name=name)
        node.update(hero_base[name])
        node.update(hero_desc[name])
        node.update(hero_result[name])
        nodes[name] = node
    # Create the relationships
    relations = []
    for line in open('../spider/data/manual_relation.txt'):
        name1, name2, re1, re2 = line.strip().split()
        node1, node2 = nodes[name1], nodes[name2]
        # re1 is the coarse, symmetric relation (stored as a property); re2 is the fine-grained, directed relation (the edge type)
        relation = Relationship(node1, re2, node2, type=re1)
        relations.append(relation)
    # Persist the whole subgraph
    sub_graph = Subgraph(nodes.values(), relations)
    graph.create(sub_graph)
if __name__ == '__main__':
    process_hero()
    process_base()
    process_desc()
    process_result()
    process_relation()
    hero_to_neo4j()
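After the import finishes, the graph can be spot-checked from py2neo or the Neo4j browser. The following is only a minimal sketch; the node label and property names follow hero_to_neo4j above, and the hero names in the queries are just examples:
from py2neo import Graph

graph = Graph('http://localhost:7474', name='neo4j', auth=('neo4j', 'jay332572'))
# Look up one hero's basic properties
print(graph.run("MATCH (h:hero {name: '林冲'}) RETURN h.nickname, h.ranking, h.result").data())
# List the relations a hero participates in: the fine-grained relation is the edge type,
# the coarse one was stored in the 'type' property
print(graph.run("MATCH (a:hero {name: '宋江'})-[r]-(b:hero) "
                "RETURN type(r) AS fine, r.type AS coarse, b.name AS other").data())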
