Overview
Contents
- I. Splitting the dataset without word segmentation
- 1. Splitting the dataset
- 2. Writing each split to txt files
- II. Splitting the dataset with word segmentation
- 1. Word segmentation
- 2. Complete code
The goal of this article is to prepare a custom dataset for use with the Chinese-Text-Classification-PyTorch project.
GitHub repository: Chinese-Text-Classification
Dataset: binary-labeled text data for sentiment analysis, where the review column holds the comment text and the label column is 1 (positive) or 0 (negative).
I. Splitting the dataset without word segmentation
Read the csv data file with pandas and split the dataset with sklearn's train_test_split function.
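For reference, loading the data could look like the short sketch below; the file name is a placeholder, and the review/label column names come from the dataset description above.
import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical path -- replace with the actual location of your CSV file
data = pd.read_csv('your_dataset.csv', encoding='utf-8-sig')
print(data.head())           # inspect the first few rows
X = data['review'].values    # comment text
y = data['label'].values     # 1 = positive, 0 = negative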
1. Splitting the dataset
Split the data into training, validation, and test sets in an 8:1:1 ratio: first hold out 20% of the samples, then split that 20% in half, which leaves 80% for training and 10% each for validation and testing:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)
2. Writing each split to txt files
Generate three files, train.txt, dev.txt, and test.txt, to match the data format the project expects: one sample per line, with the text and the label separated by a tab.
# Write each split to a txt file
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"
print(X_test)
print(y_test)
# Note: 'a+' appends, so re-running the script will duplicate lines; use 'w' to overwrite instead.
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
# The with statements close the files automatically, so no explicit f.close() is needed.
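As a quick sanity check (a minimal sketch; the paths follow the code above), you can read back the first line of each generated file and confirm it contains exactly one tab separating the text from the label:
# Hypothetical sanity check for the generated files
for path in [traindir, validdir, testdir]:
    with open(path, encoding='utf-8-sig') as f:
        first_line = f.readline().rstrip('\n')
    text, label = first_line.split('\t')
    print(path, '->', repr(text[:20]), '| label:', label)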
Complete code:
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split

data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv', encoding='utf-8-sig')
X = data['review'].values
y = data.label.values

# 8:1:1 split (train : validation : test)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

print("Training samples = ", len(y_train))
print("Positive training samples = ", len([w for w in y_train if w == 1]))
print("Negative training samples = ", len([w for w in y_train if w == 0]))
print("Validation samples = ", len(y_valid))
print("Positive validation samples = ", len([w for w in y_valid if w == 1]))
print("Negative validation samples = ", len([w for w in y_valid if w == 0]))
print("Test samples = ", len(y_test))
print("Positive test samples = ", len([w for w in y_test if w == 1]))
print("Negative test samples = ", len([w for w in y_test if w == 0]))

# Write each split to a txt file
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"
print(X_test)
print(y_test)
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
II. Splitting the dataset with word segmentation
1. Word segmentation
Note that after segmentation the joined string has to be split back into a list, one sample per line.
Example code:
# Use jieba to segment each split
def tokenizer(data):
    # Collect the text of every sample
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))
    comment = '\n'.join(text)
    # Clean the text: use a regular expression to strip digits, letters, punctuation and special symbols
    import re
    symbols = r'[0-9!%,。.,、~?()()?!“”::;"";……&\-_|.A.B.C*^]'
    comments = re.sub(symbols, '', comment)
    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    x_train = ' '.join([x for x in comments_list])  # join the tokens with spaces
    return x_train

# Segment each split
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)
# Split the strings back into lists, one sample per line
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')
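To illustrate what the segmentation step produces, here is a minimal sketch with a made-up sentence; jieba.lcut is simply the list-returning variant of jieba.cut.
import jieba

sample = '这部电影真的很好看'   # hypothetical review text
tokens = jieba.lcut(sample)     # accurate mode, returns a list of tokens
print(tokens)                   # the segmented tokens as a Python list
print(' '.join(tokens))         # space-separated string, like tokenizer() returns per line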
2. Complete code
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split

# Use jieba to segment each split
def tokenizer(data):
    # Collect the text of every sample
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))
    comment = '\n'.join(text)
    # Clean the text: use a regular expression to strip digits, letters, punctuation and special symbols
    import re
    symbols = r'[0-9!%,。.,、~?()()?!“”::;"";……&\-_|.A.B.C*^]'
    comments = re.sub(symbols, '', comment)
    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    x_train = ' '.join([x for x in comments_list])  # join the tokens with spaces
    return x_train

data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv', encoding='utf-8-sig')
X = data['review'].values
y = data.label.values

# Intended ratio roughly 5:3:2 (train : validation : test); with test_size=0.3 below the
# actual split is 50% / 35% / 15% -- use test_size=0.4 in the second split for an exact 5:3:2.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

print("Training samples = ", len(y_train))
print("Positive training samples = ", len([w for w in y_train if w == 1]))
print("Negative training samples = ", len([w for w in y_train if w == 0]))
print("Validation samples = ", len(y_valid))
print("Positive validation samples = ", len([w for w in y_valid if w == 1]))
print("Negative validation samples = ", len([w for w in y_valid if w == 0]))
print("Test samples = ", len(y_test))
print("Positive test samples = ", len([w for w in y_test if w == 1]))
print("Negative test samples = ", len([w for w in y_test if w == 0]))

# Write each split to a txt file
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"
print(X_test)
print(y_test)

# Segment each split
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)
# Split the strings back into lists, one sample per line
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')
print(X_test)
print(type(X_test))
print(len(X_test))

# Note: 'a+' appends, so re-running the script will duplicate lines; use 'w' to overwrite instead.
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
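One caveat about the approach above: tokenizer() joins all samples with '\n' and the result is split back on '\n' afterwards, so it silently assumes that no review contains a newline character. A per-sample variant avoids that assumption; the sketch below is not part of the original project, just an alternative under that assumption being violated.
import re
import jieba

def tokenize_per_sample(samples):
    # Hypothetical alternative: clean and segment each review independently
    symbols = r'[0-9!%,。.,、~?()()?!“”::;"";……&\-_|*^]'
    result = []
    for s in samples:
        cleaned = re.sub(symbols, '', str(s))
        result.append(' '.join(jieba.lcut(cleaned)))
    return result
It returns one space-separated string per review, so no join/split round trip is needed and the output order still lines up with the labels.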