概述
import xlwt
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.metrics import accuracy_score, mean_absolute_error
from collections import OrderedDict
from mord import LogisticAT
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
def summarize_dataset(data):
    """Return basic statistics for a dataset whose last column is the label.

    Args:
        data: 2-D array-like; every column except the last is treated as an
            attribute and the last column as the class label.

    Returns:
        A tuple ``(n_example, n_attrib, n_class, n_distrib)`` holding the
        number of rows, the number of attribute columns, the number of
        distinct labels, and a NumPy array with the count of each label
        (ordered by sorted label value).
    """
    data = np.asarray(data)
    X = data[:, :-1]
    y = data[:, -1]
    # A single np.unique call yields both the distinct labels and their counts.
    classes, n_distrib = np.unique(y, return_counts=True)
    return X.shape[0], X.shape[1], len(classes), n_distrib


if __name__ == '__main__':
    # NOTE(review): "D:OCdata" has no separator after the drive letter, so on
    # Windows it is resolved relative to the current directory of drive D.
    # Possibly a backslash was lost (r"D:\OCdata") -- confirm before changing.
    p = Path("D:OCdata")
    names = [
        "ARWU2020-5bin", "ARWU2020-10bin",
        "QSR2020-5bin", "QSR2020-10bin",
        "housing-5bin", "housing-10bin",
        "stock-5bin", "stock-10bin",
        "bank-5bin", "bank-10bin",
        "computer-5bin", "computer-10bin",
        "car", "ERA", "ESL", "LEV", "automobile",
    ]
    for name in names:
        # Each dataset is a header-less CSV named "<name>.csv" under p.
        path = p.joinpath(name + ".csv")
        data = np.array(pd.read_csv(path, header=None))
        n_example, n_attrib, n_class, n_distrib = summarize_dataset(data)
        print("数据集{}的样本个数为{},属性个数为{},类别个数为{},类别分布为{}".format(name,n_example,n_attrib,n_class,n_distrib))
数据集ARWU2020-5bin的样本个数为990,属性个数为6,类别个数为5,类别分布为[198 198 198 198 198]
数据集ARWU2020-10bin的样本个数为990,属性个数为6,类别个数为10,类别分布为[99 99 99 99 99 99 99 99 99 99]
数据集QSR2020-5bin的样本个数为495,属性个数为6,类别个数为5,类别分布为[ 99 101 100 96 99]
数据集QSR2020-10bin的样本个数为495,属性个数为6,类别个数为10,类别分布为[49 50 51 50 50 50 50 49 50 46]
数据集housing-5bin的样本个数为506,属性个数为13,类别个数为5,类别分布为[102 101 101 101 101]
数据集housing-10bin的样本个数为506,属性个数为13,类别个数为10,类别分布为[51 51 51 51 51 51 50 50 50 50]
数据集stock-5bin的样本个数为950,属性个数为9,类别个数为5,类别分布为[190 190 190 190 190]
数据集stock-10bin的样本个数为950,属性个数为9,类别个数为10,类别分布为[95 95 95 95 95 95 95 95 95 95]
数据集bank-5bin的样本个数为8192,属性个数为8,类别个数为5,类别分布为[1639 1639 1638 1638 1638]
数据集bank-10bin的样本个数为8192,属性个数为8,类别个数为10,类别分布为[820 820 819 819 819 819 819 819 819 819]
数据集computer-5bin的样本个数为8192,属性个数为12,类别个数为5,类别分布为[1639 1639 1638 1638 1638]
数据集computer-10bin的样本个数为8192,属性个数为12,类别个数为10,类别分布为[820 820 819 819 819 819 819 819 819 819]
数据集car的样本个数为1728,属性个数为21,类别个数为4,类别分布为[1210 384 69 65]
数据集ERA的样本个数为1000,属性个数为4,类别个数为9,类别分布为[ 92 142 181 172 158 118 88 31 18]
数据集ESL的样本个数为488,属性个数为4,类别个数为9,类别分布为[ 2 12 38 100 116 135 62 19 4]
数据集LEV的样本个数为1000,属性个数为4,类别个数为5,类别分布为[ 93 280 403 197 27]
数据集automobile的样本个数为205,属性个数为71,类别个数为6,类别分布为[ 3 22 67 54 32 27]
最后
以上就是魁梧红酒为你收集整理的《Python:查看数据集信息》的全部内容,希望本文能够帮你解决在使用 Python 查看数据集信息时遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复