Data cleaning methods:
Set a threshold and drop values outside it as outliers.
Train a random forest on the remaining samples to predict values for the dropped points, then fill those predictions back in (see the short sketch below).
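A minimal sketch of both steps, assuming synthetic data and hypothetical column names x1, x2 and y (an illustration only, not the code used later in this post):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.randn(200), 'x2': rng.randn(200)})
df['y'] = 2 * df['x1'] + df['x2'] + 0.1 * rng.randn(200)
df.loc[[0, 50, 100, 150], 'y'] = 100.0                    # inject a few obvious outliers

z = (df['y'] - df['y'].mean()) / df['y'].std()
outlier = z.abs() > 3                                      # threshold: |z-score| > 3

rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(df.loc[~outlier, ['x1', 'x2']], df.loc[~outlier, 'y'])
df.loc[outlier, 'y'] = rf.predict(df.loc[outlier, ['x1', 'x2']])  # fill predictions back in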
One-hot encoding (generally unnecessary for decision trees and random forests):
First split an attribute into its distinct categories.
Then turn each sample into a 0/1 vector in which a single 1 marks the category the sample belongs to.
This increases the number of features (see the pandas sketch below).
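A minimal illustration with pandas, using a hypothetical 'color' column: get_dummies expands the attribute into one 0/1 column per category.

import pandas as pd

df = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
one_hot = pd.get_dummies(df, columns=['color']).astype(int)
print(one_hot)   # columns color_blue, color_green, color_red; each row has a single 1

Tree-based models such as decision trees and random forests can split on the raw category codes directly, which is the reason given above for skipping the expansion with those models.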
Data cleaning code implementation:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def enum_row(row):
    # Print the raw 'state' value of each row.
    print(row['state'])


def find_state_code(row):
    # Fuzzy-match the raw state name against the canonical list (score >= 80).
    if row['state'] != 0:
        print(process.extractOne(row['state'], states, score_cutoff=80))


def capital(word):
    return word.capitalize()


def correct_state(row):
    # Replace a misspelled state name with its best fuzzy match, capitalized word by word.
    if row['state'] != 0:
        state = process.extractOne(row['state'], states, score_cutoff=80)
        if state:
            state_name = state[0]
            return ' '.join(map(capital, state_name.split(' ')))
    return row['state']


def fill_state_code(row):
    # Look up the two-letter code of the best-matching state name.
    if row['state'] != 0:
        state = process.extractOne(row['state'], states, score_cutoff=80)
        if state:
            state_name = state[0]
            return state_to_code[state_name]
    return ''


if __name__ == "__main__":
    pd.set_option('display.width', 200)
    data = pd.read_excel('sales.xlsx', sheet_name='sheet1', header=0)
    print('data.head() =\n', data.head())
    print('data.tail() =\n', data.tail())
    print('data.dtypes =\n', data.dtypes)
    print('data.columns =\n', data.columns)
    for c in data.columns:
        print(c, end=' ')
    print()
    data['total'] = data['Jan'] + data['Feb'] + data['Mar']
    print(data.head())
    print(data['Jan'].sum())
    print(data['Jan'].min())
    print(data['Jan'].max())
    print(data['Jan'].mean())

    print('=============')
    # Append a totals row.
    s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
    print(s1)
    s2 = pd.DataFrame(data=s1)
    print(s2)
    print(s2.T)
    print(s2.T.reindex(columns=data.columns))
    # Equivalently:
    s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
    s = s.reindex(columns=data.columns, fill_value=0)  # missing columns (e.g. 'state') become 0
    print(s)
    data = pd.concat([data, s], ignore_index=True)     # DataFrame.append was removed in pandas 2.0
    data = data.rename(index={15: 'Total'})
    print(data.tail())

    # Using apply
    print('============== apply ==========')
    data.apply(enum_row, axis=1)

    state_to_code = {
        "VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP",
        "GUAM": "GU", "KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS",
        "NORTH CAROLINA": "NC", "HAWAII": "HI", "NEW YORK": "NY", "CALIFORNIA": "CA",
        "ALABAMA": "AL", "IDAHO": "ID", "FEDERATED STATES OF MICRONESIA": "FM",
        "Armed Forces Americas": "AA", "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
        "Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT",
        "MONTANA": "MT", "MASSACHUSETTS": "MA", "PUERTO RICO": "PR",
        "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
        "NEW MEXICO": "NM", "MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW",
        "COLORADO": "CO", "Armed Forces Middle East": "AE", "NEW JERSEY": "NJ",
        "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
        "MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
        "MARSHALL ISLANDS": "MH", "WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC",
        "INDIANA": "IN", "NEVADA": "NV", "LOUISIANA": "LA",
        "NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ",
        "WISCONSIN": "WI", "NORTH DAKOTA": "ND", "Armed Forces Europe": "AE",
        "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY", "RHODE ISLAND": "RI",
        "DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX",
        "MAINE": "ME"}
    states = list(state_to_code.keys())
    print(fuzz.ratio('Python Package', 'PythonPackage'))
    print(process.extract('Mississippi', states))
    print(process.extract('Mississipi', states, limit=1))
    print(process.extractOne('Mississipi', states))
    data.apply(find_state_code, axis=1)

    print('Before Correct State:\n', data['state'])
    data['state'] = data.apply(correct_state, axis=1)
    print('After Correct State:\n', data['state'])
    data.insert(5, 'State Code', np.nan)
    data['State Code'] = data.apply(fill_state_code, axis=1)
    print(data)

    # group by
    print('============== group by ================')
    print(data.groupby('State Code'))
    print('All Columns:')
    print(data.groupby('State Code').sum())
    print('Short Columns:')
    print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())

    # Write the cleaned data to a file (.xlsx: pandas no longer writes legacy .xls).
    data.to_excel('sales_result.xlsx', sheet_name='Sheet1', index=False)
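One behavior the functions above rely on: with score_cutoff set, fuzzywuzzy's process.extractOne returns None when no candidate reaches the cutoff, which is why correct_state and fill_state_code check the result before using it. A small example with made-up inputs:

from fuzzywuzzy import process

choices = ['NEW YORK', 'NEW JERSEY', 'NEW MEXICO']
print(process.extractOne('New Yrok', choices, score_cutoff=80))   # ('NEW YORK', <score>) -- close enough to match
print(process.extractOne('Ontario', choices, score_cutoff=80))    # None -- nothing reaches the cutoff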
Principal component analysis (PCA) code implementation:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures


def extend(a, b):
    # Widen the range [a, b] by 5% on each side (used for plot limits).
    return 1.05*a - 0.05*b, 1.05*b - 0.05*a


if __name__ == '__main__':
    pd.set_option('display.width', 200)
    data = pd.read_csv('iris.data', header=None)
    columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
    data.rename(columns=dict(zip(np.arange(5), columns)), inplace=True)
    data['type'] = pd.Categorical(data['type']).codes
    print(data.head(5))
    x = data.loc[:, columns[:-1]]
    y = data['type']

    pca = PCA(n_components=2, whiten=True, random_state=0)
    x = pca.fit_transform(x)
    print('Variance of each component:', pca.explained_variance_)
    print('Proportion of variance explained:', pca.explained_variance_ratio_)
    print(x[:5])
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    mpl.rcParams['font.sans-serif'] = 'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
    plt.grid(ls=':')
    plt.xlabel('Component 1', fontsize=14)
    plt.ylabel('Component 2', fontsize=14)
    plt.title('PCA projection of the iris data', fontsize=18)
    # plt.savefig('1.png')
    plt.show()

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=True)),
        ('lr', LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5, fit_intercept=False))
    ])
    model.fit(x, y)
    print('Best C:', model.named_steps['lr'].C_)
    y_hat = model.predict(x)
    print('Training accuracy:', metrics.accuracy_score(y, y_hat))
    y_test_hat = model.predict(x_test)
    print('Test accuracy:', metrics.accuracy_score(y_test, y_test_hat))

    N, M = 500, 500                                          # number of grid samples along each axis
    x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())    # range of column 0
    x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())    # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                             # grid of sample points
    x_show = np.stack((x1.flat, x2.flat), axis=1)            # points to classify
    y_hat = model.predict(x_show)                            # predicted class for each grid point
    y_hat = y_hat.reshape(x1.shape)                          # reshape to match the grid
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)             # decision regions
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, edgecolors='k', cmap=cm_dark)  # training samples
    plt.xlabel('Component 1', fontsize=14)
    plt.ylabel('Component 2', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(ls=':')
    patchs = [mpatches.Patch(color='#77E0A0', label='Iris-setosa'),
              mpatches.Patch(color='#FF8080', label='Iris-versicolor'),
              mpatches.Patch(color='#A0A0FF', label='Iris-virginica')]
    plt.legend(handles=patchs, fancybox=True, framealpha=0.8, loc='lower right')
    plt.title('Logistic regression on the PCA-reduced iris data', fontsize=17)
    # plt.savefig('2.png')
    plt.show()
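A side note on choosing the number of components (a small sketch on stand-in data, not part of the script above): scikit-learn's PCA also accepts a fractional n_components and then keeps just enough components to reach that share of explained variance.

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(150, 4)          # stand-in for the 4 iris feature columns
pca95 = PCA(n_components=0.95, svd_solver='full')   # keep enough components for 95% of the variance
X_reduced = pca95.fit_transform(X)
print(pca95.n_components_, pca95.explained_variance_ratio_.sum())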