Dataset: 中文酒店评论分类语料.csv (a Chinese hotel-review sentiment corpus). The columns used below are label (0 = negative review, 1 = positive review) and review (the review text).
1. Data Preprocessing
from datasets import Dataset
import pandas as pd
import re
from pyhanlp import JClass
from symspellpy.symspellpy import SymSpell
import time
import datasets

datasets.disable_progress_bar()
def timer(func):
    def inner(*args, **kwargs):
        print('function [%s] starts running' % func.__name__)
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print('function [%s] executed for %.2f seconds' % (func.__name__, end - start))
        print('-' * 51)
        return result
    return inner
sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
sym_spell.load_dictionary("data/frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)
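# Note (illustrative): with max_dictionary_edit_distance=0, word_segmentation
# only inserts spaces into run-together English (symspellpy's own example:
# 'thequickbrownfox' -> 'the quick brown fox'); no spelling correction occurs.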
normalizer = JClass('com.hankcs.hanlp.dictionary.other.CharTable')
def clean_data(label, inputs):
    # Character normalization (e.g. full-width to half-width)
    inputs = normalizer.convert(inputs)
    # Re-segment run-together English words
    inputs = sym_spell.word_segmentation(inputs).corrected_string
    # Collapse runs of three or more identical characters into one
    inputs = re.sub(r'(.)\1{2,}', r'\1', inputs)
    # Strip all whitespace (Chinese text needs none)
    inputs = ''.join(inputs.split())
    return {'label': label, 'review': inputs}
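# Quick sanity check for the repeat-collapsing regex (illustrative only):
#   re.sub(r'(.)\1{2,}', r'\1', '太棒了!!!!!')  ->  '太棒了!'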
@timer
def preprocess():
    train_data = pd.read_csv('data/中文酒店评论分类语料.csv')
    train_data = train_data[['label', 'review']]
    train_data = train_data.dropna()
    # Convert the DataFrame to a datasets.Dataset
    # (preserve_index=False keeps the stale pandas index out of the dataset)
    train_data = Dataset.from_pandas(train_data, preserve_index=False)
    # Clean the review text
    train_data = train_data.map(clean_data, input_columns=['label', 'review'], num_proc=6)
    # Drop empty or very short reviews
    train_data = train_data.filter(lambda x: len(x['review']) > 5)
    # Split 80/20 into train and validation sets
    train_data = train_data.train_test_split(test_size=0.2)
    train_data['valid'] = train_data.pop('test')
    print(train_data)
    # Persist the processed dataset
    train_data.save_to_disk('data/senti-dataset')

if __name__ == '__main__':
    preprocess()
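To verify the preprocessing result, the saved DatasetDict can be reloaded; a minimal sketch (same path and splits as above):

from datasets import load_from_disk

senti = load_from_disk('data/senti-dataset')
print(senti)                  # DatasetDict with 'train' and 'valid' splits
print(senti['train'][0])      # one example: {'label': ..., 'review': ...}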
2. Model Training and Evaluation
import torch
import torch.nn as nn
from transformers import BertTokenizer
from datasets import load_from_disk
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
import datasets
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import time
from sklearn.metrics import classification_report
# datasets.disable_progress_bar()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class SentimentClassification(nn.Module):

    def __init__(self, vocab_size):
        super(SentimentClassification, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        self.bilstm = nn.LSTM(128, 256, num_layers=1, bidirectional=True, batch_first=True)
        self.output_logits = nn.Linear(256 * 2, 2)

    def forward(self, sorted_inputs, lengths=None):
        inputs_embedding = self.embedding(sorted_inputs)
        if lengths is not None:
            inputs_embedding = pack_padded_sequence(inputs_embedding, lengths, batch_first=True)
        output, (hn, cn) = self.bilstm(inputs_embedding)
        # Reshape hn from (2, batch_size, 256) to (batch_size, 512)
        hn = hn.transpose(0, 1)
        last_hidden_state = hn.reshape(hn.shape[0], -1)
        logits = self.output_logits(last_hidden_state)
        return logits, last_hidden_state
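# Shape sanity check (illustrative; 21128 is the bert-base-chinese vocab size):
#   model = SentimentClassification(vocab_size=21128)
#   logits, hidden = model(torch.randint(0, 21128, (4, 10)))
#   hn: (2, 4, 256) -> last_hidden_state: (4, 512) -> logits: (4, 2)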
def train():
    # Training split
    train_data = load_from_disk('data/senti-dataset')['train']
    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
    # Classification model
    model = SentimentClassification(tokenizer.vocab_size).to(device)
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)
    # Learning-rate scheduler: multiply lr by 0.9 every epoch
    scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
    def train_step(batch_labels, batch_inputs):
        # Tokenize WITHOUT padding so each sequence keeps its true length
        # (padding to 'longest' here would make every length equal and defeat
        # the packing below); sequences are padded manually after sorting
        param = {'return_token_type_ids': False,
                 'return_attention_mask': False}
        batch_inputs = tokenizer(batch_inputs, **param)['input_ids']
        # Record each sequence's true length
        input_length = []
        for inputs in batch_inputs:
            input_length.append(len(inputs))
        # Indices in descending length order (required by pack_padded_sequence)
        sorted_index = np.argsort(-np.array(input_length))
        # Reorder inputs and labels accordingly
        sorted_inputs = []
        sorted_labels = []
        length = []
        for index in sorted_index:
            sorted_inputs.append(torch.tensor(batch_inputs[index]))
            sorted_labels.append(batch_labels[index])
            length.append(input_length[index])
        # Pad the sorted sequences to the batch maximum
        sorted_inputs = pad_sequence(sorted_inputs, batch_first=True).to(device)
        sorted_labels = torch.tensor(sorted_labels, device=device)
        # Forward pass with packed sequences
        logits, last_hidden_state = model(sorted_inputs, length)
        # Compute loss
        loss = criterion(logits, sorted_labels)
        # Zero gradients
        optimizer.zero_grad()
        # Backpropagation
        loss.backward()
        # Update parameters
        optimizer.step()
        # Accumulate per-example loss for the epoch average
        nonlocal total_loss, total_iter
        total_loss += loss.item() * len(sorted_labels)
        total_iter += len(sorted_labels)
    epochs = 100
    best_loss = float('inf')
    for epoch in range(epochs):
        total_loss = 0.0
        total_iter = 0
        start = time.time()
        # map only streams batches here; train_step works by side effect, so
        # caching is disabled to make sure it actually runs every epoch
        train_data.map(train_step, input_columns=['label', 'review'], batched=True,
                       batch_size=32, load_from_cache_file=False, desc='Training')
        # Decay the learning rate once per epoch
        scheduler.step()
        loss = total_loss / total_iter
        print('loss: %.5f time: %.2fs' % (loss, time.time() - start))
        # Checkpoint and evaluate whenever the epoch loss improves
        if loss < best_loss:
            torch.save(model.state_dict(), 'model/sentimentclassification.pth')
            evaluate(epoch)
            best_loss = loss
def evaluate(epoch):
    # Validation split
    valid_data = load_from_disk('data/senti-dataset')['valid']
    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
    # Classification model, restored from the latest checkpoint
    model = SentimentClassification(tokenizer.vocab_size).to(device)
    model.load_state_dict(torch.load('model/sentimentclassification.pth'))
    model.eval()
    y_true, y_pred = [], []

    def predict(batch_labels, batch_inputs):
        # Same preparation as training: tokenize without padding, sort by
        # length, pad, and pass the true lengths for packing
        param = {'return_token_type_ids': False,
                 'return_attention_mask': False}
        batch_inputs = tokenizer(batch_inputs, **param)['input_ids']
        input_length = [len(inputs) for inputs in batch_inputs]
        sorted_index = np.argsort(-np.array(input_length))
        sorted_inputs = [torch.tensor(batch_inputs[index]) for index in sorted_index]
        sorted_labels = [batch_labels[index] for index in sorted_index]
        length = [input_length[index] for index in sorted_index]
        sorted_inputs = pad_sequence(sorted_inputs, batch_first=True).to(device)
        with torch.no_grad():
            logits, last_hidden_state = model(sorted_inputs, length)
        predictions = torch.argmax(logits, dim=-1).tolist()
        # Labels and predictions are extended in the same sorted order,
        # so the pairs stay aligned
        y_pred.extend(predictions)
        y_true.extend(sorted_labels)

    valid_data.map(predict, input_columns=['label', 'review'], batched=True,
                   batch_size=32, load_from_cache_file=False, desc='Evaluation')
    report_result = classification_report(y_true, y_pred, labels=[0, 1],
                                          target_names=['negative', 'positive'])
    print('---epoch %d---------------------------------------------' % epoch)
    print(report_result)
if __name__ == '__main__':
    train()
    # evaluate(10)
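For completeness, a minimal single-review inference sketch built from the pieces above (the sample sentence is made up; label 1 corresponds to a positive review, matching the classification_report setup):

tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
model = SentimentClassification(tokenizer.vocab_size).to(device)
model.load_state_dict(torch.load('model/sentimentclassification.pth'))
model.eval()
inputs = torch.tensor([tokenizer('房间干净,服务也很周到')['input_ids']], device=device)
with torch.no_grad():
    logits, _ = model(inputs)
print('positive' if logits.argmax(dim=-1).item() == 1 else 'negative')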
