Feature Selection

Overview

The classes in the sklearn.feature_selection module can be used for feature selection and dimensionality reduction on sample sets, either to improve estimators' accuracy scores or to boost their performance on very high-dimensional datasets.
Removing features with low variance

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance does not meet a given threshold; by default it removes zero-variance features, i.e. features that have the same value in every sample.
For example, boolean features are Bernoulli random variables with variance p(1 - p), so the following threshold removes all features that are either zero or one in more than 80% of the samples:

VarianceThreshold(threshold=(.8 * (1 - .8)))
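As a runnable illustration of that threshold (the toy boolean dataset below is ours, added for demonstration; it is not from the original article):

# coding: utf-8
# Remove boolean features that are nearly constant across samples
from sklearn.feature_selection import VarianceThreshold

# Toy boolean dataset: the first column is 0 in 5 of 6 samples,
# so its variance (1/6 * 5/6 ~ 0.139) falls below .8 * (1 - .8) = 0.16
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_new = selector.fit_transform(X)
print(X_new.shape)  # (6, 2): the first column has been removed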
单变量特征选择
Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step for an estimator. scikit-learn exposes these selection routines as objects that implement the transform method:
SelectKBest: keeps the k highest-scoring features
SelectPercentile: keeps a user-specified top percentage of the features
SelectFpr/SelectFdr/SelectFwe: select based on the false positive rate, the false discovery rate, or the family-wise error rate
GenericUnivariateSelect: univariate selection with a configurable strategy
For instance, a chi-squared test can be used to keep only the 2 best features; note that fit_transform needs the target y, since chi2 is a supervised score:

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
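The following example script (adapted from the scikit-learn example gallery) appends 20 noisy, non-informative features to the iris data and compares the univariate scores against the weights an SVM assigns to each feature, with and without selection: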
# coding: utf-8
# Univariate Feature Selection
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Load iris and append 20 uniformly distributed noise features
X, y = load_iris(return_X_y=True)
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)

plt.figure(1)
plt.clf()
X_indices = np.arange(X.shape[-1])

# Univariate selection: score each feature with the ANOVA F-test
selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - 0.45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)')

# SVM trained on all features (no selection)
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print('Classification accuracy without selecting features: {:.3f}'
      .format(clf.score(X_test, y_test)))
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight')

# SVM trained after univariate feature selection
clf_selected = make_pipeline(
    SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf_selected.fit(X_train, y_train)
print('Classification accuracy after univariate feature selection: {:.3f}'
      .format(clf_selected.score(X_test, y_test)))
svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()
plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM weights after selection')

plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Recursive feature elimination

Given an external estimator that assigns weights to features, recursive feature elimination (RFE) selects features by recursively considering smaller and smaller sets of features.
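The following example (also adapted from the scikit-learn gallery) uses RFECV, which runs RFE inside a cross-validation loop, to find the optimal number of features on a synthetic classification task: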
# coding: utf-8
# Recursive feature elimination with cross-validation
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

# Synthetic task: only 3 informative features (plus 2 redundant) out of 25
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# A linear SVC exposes coef_, which RFECV uses to rank features
svc = SVC(kernel='linear')
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)
print("Optimal number of features: %d" % rfecv.n_features_)

# Plot cross-validation score vs. number of features selected.
# Note: grid_scores_ was removed in scikit-learn 1.2; on recent versions
# use rfecv.cv_results_['mean_test_score'] instead.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Feature selection using SelectFromModel

SelectFromModel is a meta-transformer that can be used alongside any estimator that exposes a coef_ or feature_importances_ attribute after fitting.
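In the following example, the coefficients of a fitted LassoCV model are used to select the two most informative features of the diabetes dataset: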
# coding: utf-8
# Feature selection using SelectFromModel and LassoCV
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
feature_names = diabetes.feature_names

# Fit a Lasso with cross-validated regularization strength and rank
# features by the absolute value of their coefficients
clf = LassoCV().fit(X, y)
importance = np.abs(clf.coef_)

# Set the threshold just above the third-highest coefficient,
# so exactly the two most important features are kept
idx_third = importance.argsort()[-3]
threshold = importance[idx_third] + 0.01
idx_features = (-importance).argsort()[:2]
name_features = np.array(feature_names)[idx_features]
print('Selected features: {}'.format(name_features))

sfm = SelectFromModel(clf, threshold=threshold)
sfm.fit(X, y)
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Scatter plot of the two selected features
plt.title(
    "Features from diabetes using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("First feature: {}".format(name_features[0]))
plt.ylabel("Second feature: {}".format(name_features[1]))
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
L1-based feature selection

Linear models penalized with the L1 norm produce sparse coefficient vectors, so they can be combined with SelectFromModel to discard the features whose coefficients are zero. Typical choices are linear_model.Lasso for regression and linear_model.LogisticRegression or svm.LinearSVC for classification, as sketched below.
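A minimal sketch using the iris data; C=0.01 and the resulting shapes follow the scikit-learn documentation's example. Note that LinearSVC with penalty='l1' requires dual=False:

# coding: utf-8
# L1-based feature selection with LinearSVC and SelectFromModel
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)

# An L1-penalized linear SVM drives some coefficients to exactly zero;
# penalty='l1' needs the primal formulation (dual=False)
lsvc = LinearSVC(C=0.01, penalty='l1', dual=False).fit(X, y)

# SelectFromModel keeps only the features with non-zero coefficients
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print(X.shape, '->', X_new.shape)  # e.g. (150, 4) -> (150, 3)

The smaller C is, the fewer features are selected, since a stronger penalty zeroes out more coefficients.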
Tree-based feature selection

Tree-based estimators such as ensemble.ExtraTreesClassifier compute impurity-based feature importances, which can in turn be used with SelectFromModel to discard irrelevant features, as in the sketch below.
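A minimal sketch, assuming the iris data and a forest of 50 trees:

# coding: utf-8
# Tree-based feature selection with ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)

# Fit a forest of randomized trees; feature_importances_ sums to 1
clf = ExtraTreesClassifier(n_estimators=50, random_state=0).fit(X, y)
print(clf.feature_importances_)

# Keep the features whose importance exceeds the mean (the default threshold)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print(X.shape, '->', X_new.shape)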
Feature selection as part of a pipeline

Feature selection is usually a preprocessing step performed before the actual learning. The recommended way to combine the two is a sklearn.pipeline.Pipeline (note that dual=False is required for penalty="l1", and the imports below are added to make the snippet self-contained):

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
    ('classification', RandomForestClassifier())])
clf.fit(X, y)