1. Prepare the Data

This script preprocesses the data for Weibo sentiment analysis. The main steps are:

- Load data: read weibo_senti_100k.csv and drop rows with missing values.
- Analyze data: count the class distribution.
- Split the dataset: split into training and test sets at an 85:15 ratio, stratified so the classes stay balanced.
- Store data: save the corpus, training set, and test set with pickle for later use.

Create a file named 01-准备数据.py and add the following code:
import jieba
import logging
jieba.setLogLevel(logging.CRITICAL)
import os
import pickle
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split


def demo():
    # Load the data and drop rows with missing values
    data = pd.read_csv('weibo_senti_100k/weibo_senti_100k.csv')
    data = data.dropna()
    inputs = data['review'].to_numpy().tolist()
    labels = data['label'].to_numpy().tolist()
    print('Class distribution:', Counter(labels))

    # Split into training and test sets (85:15), stratified by label
    train_data, test_data = train_test_split(data, test_size=0.15, stratify=labels, random_state=42)
    train_data = train_data.to_numpy().tolist()
    test_data = test_data.to_numpy().tolist()
    print(train_data[:3])

    # Store the corpus and both splits for later steps
    os.makedirs('data', exist_ok=True)
    pickle.dump(inputs, open('data/corpus.pkl', 'wb'))
    pickle.dump(train_data, open('data/train.pkl', 'wb'))
    pickle.dump(test_data, open('data/test.pkl', 'wb'))


if __name__ == '__main__':
    demo()
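As an optional sanity check (not part of the original script), you can load the pickles back and confirm the split sizes and the stored row format, assuming the files were written to the data/ directory as above:

import pickle

corpus = pickle.load(open('data/corpus.pkl', 'rb'))
train_data = pickle.load(open('data/train.pkl', 'rb'))
test_data = pickle.load(open('data/test.pkl', 'rb'))

# Expect roughly an 85:15 split; each train/test row is a [label, review] pair
print(len(corpus), len(train_data), len(test_data))
print(train_data[0])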
2. Build the Vocabulary

This script builds the vocabulary for the text. The main steps are:

- Load data: read the corpus from corpus.pkl and the stop words from stopwords.txt.
- Count word frequencies: segment each text with jieba and accumulate word counts.
- Build the vocabulary: filter out low-frequency words and stop words, then generate the word_to_id and id_to_word mappings.
- Store data: save the vocabulary with pickle for later use.

Create a file named 02-构建词表.py and add the following code:
import jieba
import logging
jieba.setLogLevel(logging.CRITICAL)
import os
import pickle
from collections import Counter


def demo():
    # Load the corpus and the stop-word list
    corpus = pickle.load(open('data/corpus.pkl', 'rb'))
    stopwords = {word.strip() for word in open('stopwords.txt', encoding='utf-8')}

    # Segment each review with jieba and count word frequencies
    word_freq = Counter()
    for review in corpus:
        words = jieba.lcut(review)
        if len(words) == 0:
            continue
        word_freq.update(words)

    # Build the mappings, filtering stop words and words below the frequency threshold
    word_to_id = {'[PAD]': 0, '[UNK]': 1}
    id_to_word = {0: '[PAD]', 1: '[UNK]'}
    threshold = 1
    start_id = len(word_to_id)
    for word, freq in word_freq.items():
        if freq >= threshold and word not in stopwords:
            word_to_id[word] = start_id
            id_to_word[start_id] = word
            start_id += 1
    print('Vocabulary size:', len(id_to_word))

    # Store the vocabulary for the tokenizer
    os.makedirs('vocab', exist_ok=True)
    pickle.dump(word_to_id, open('vocab/word_to_id.pkl', 'wb'))
    pickle.dump(id_to_word, open('vocab/id_to_word.pkl', 'wb'))


if __name__ == '__main__':
    demo()
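As a quick illustration (not part of the original script), the saved mapping can be used to look up IDs for a jieba-segmented sentence; any word that was filtered out as a stop word or never appeared in the corpus falls back to the [UNK] id:

import jieba
import pickle

word_to_id = pickle.load(open('vocab/word_to_id.pkl', 'rb'))

# Segment a sample sentence and map each word to its id, using [UNK] (id 1) as the fallback
words = jieba.lcut('梦想有多大,舞台就有多大!')
ids = [word_to_id.get(word, word_to_id['[UNK]']) for word in words]
print(words)
print(ids)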
3. Tokenizer

This script implements a text tokenizer based on jieba word segmentation. Its main features are:

- Load the vocabulary: read word_to_id.pkl and id_to_word.pkl.
- Encode text: convert each text into a sequence of IDs and pad the batch to equal length.
- Report the vocabulary size, and save/load the tokenizer.
- Test example: encode two texts and print the result.

Create a file named tokenizer.py and add the following code:
import jieba
import logging
jieba.setLogLevel(logging.CRITICAL)
import pickle
import torch
from torch.nn.utils.rnn import pad_sequence


class Tokenizer:

    def __init__(self):
        self.word_to_id = pickle.load(open('vocab/word_to_id.pkl', 'rb'))
        self.id_to_word = pickle.load(open('vocab/id_to_word.pkl', 'rb'))
        self.unk = self.word_to_id['[UNK]']
        self.pad = self.word_to_id['[PAD]']

    def get_vocab_size(self):
        return len(self.word_to_id)

    def encode(self, texts):
        batch_ids, batch_len = [], []
        for text in texts:
            # Segment the text and map each word to its id
            ids = []
            words = jieba.lcut(text)
            for word in words:
                if word in self.word_to_id:
                    word_id = self.word_to_id[word]
                else:
                    # Unknown words map to [UNK]
                    word_id = self.unk
                ids.append(word_id)
            batch_ids.append(torch.tensor(ids))
            batch_len.append(len(ids))
        # Pad the batch to equal length with the [PAD] id
        batch_ids = pad_sequence(batch_ids, batch_first=True, padding_value=self.pad)
        batch_len = torch.tensor(batch_len)
        return batch_ids, batch_len

    def save(self, path):
        pickle.dump(self, open(path, 'wb'))

    @classmethod
    def load(cls, path):
        tokenizer = pickle.load(open(path, 'rb'))
        return tokenizer


def demo():
    tokenizer = Tokenizer()
    batch_ids, batch_len = tokenizer.encode(['梦想有多大,舞台就有多大![鼓掌]', '[花心][鼓掌]//@小懒猫Melody2011: [春暖花开]'])
    print(batch_ids)


if __name__ == '__main__':
    demo()
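The tensors returned by encode are what a downstream sentiment model would consume. As a minimal, hypothetical sketch (the classifier is not part of this section, and the layer sizes are made up for illustration), the padded IDs can go through an embedding layer and pack_padded_sequence so a recurrent layer skips the [PAD] positions:

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

from tokenizer import Tokenizer

tokenizer = Tokenizer()
batch_ids, batch_len = tokenizer.encode(['梦想有多大,舞台就有多大![鼓掌]', '[花心][鼓掌]'])

# Hypothetical layer sizes, for illustration only; padding_idx=0 matches the [PAD] id
embedding = nn.Embedding(tokenizer.get_vocab_size(), 128, padding_idx=0)
gru = nn.GRU(128, 256, batch_first=True)

# Pack the padded batch so the GRU ignores the padded positions; lengths stay on the CPU
embedded = embedding(batch_ids)
packed = pack_padded_sequence(embedded, batch_len, batch_first=True, enforce_sorted=False)
output, hidden = gru(packed)
print(hidden.shape)  # (1, batch_size, 256)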
