
Feature Selection


Recursive feature elimination

Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a specific attribute (such as coef_ or feature_importances_) or through a callable. Then, the least important features are pruned from the current set. The procedure is repeated recursively on the pruned set until the desired number of features to select is reached.
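A minimal, self-contained sketch of this with scikit-learn's RFE (the synthetic data and the choice of LogisticRegression as the base estimator are illustrative assumptions, not from the original):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# 100 samples, 10 features, of which 3 carry signal
X_demo, y_demo = make_classification(n_samples=100, n_features=10,
                                     n_informative=3, random_state=0)

# Rank features by the magnitude of coef_ and drop one per round
# until 3 features remain
rfe = RFE(estimator=LogisticRegression(max_iter=1000),
          n_features_to_select=3, step=1)
rfe.fit(X_demo, y_demo)

print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # 1 = selected; larger values = eliminated earlier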

Why StratifiedKFold is preferred over KFold:
https://www.kaggle.com/general/231173
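A quick illustration of the difference (synthetic labels, not from the original): with a 9:1 class imbalance, StratifiedKFold keeps the class ratio in every test fold, while plain KFold makes no such guarantee.

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

y_demo = np.array([0] * 90 + [1] * 10)   # 9:1 imbalance
X_demo = np.zeros((100, 1))              # features are irrelevant to the split

for name, cv in [("KFold", KFold(n_splits=5, shuffle=True, random_state=0)),
                 ("StratifiedKFold", StratifiedKFold(n_splits=5, shuffle=True, random_state=0))]:
    # number of positives landing in each test fold
    print(name, [int(y_demo[test].sum()) for _, test in cv.split(X_demo, y_demo)])
# StratifiedKFold prints [2, 2, 2, 2, 2]; KFold's counts vary fold to fold.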

Further reading:
https://www.yourdatateacher.com/2021/05/05/feature-selection-in-machine-learning-using-lasso-regression/
https://stackoverflow.com/questions/19018333/gridsearchcv-on-logisticregression-in-scikit-learn

L1-based feature selection
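
The L1 penalty drives many coefficients of a linear model exactly to zero, so fitting an L1-penalized estimator and keeping only the features with non-zero coefficients is itself a feature-selection step. scikit-learn's SelectFromModel wraps this pattern, which is what the script below does with a liblinear logistic regression.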

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, auc
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler


## Load the data
X = pd.read_csv("merged_abundance_table_species.0.1.txt", sep='\t', index_col=0).T  # transpose so rows are samples
Y = pd.read_csv("metadata.txt", sep='\t', index_col=0).values.ravel()               # labels as a 1-D array

## Preprocessing
feature = X.columns            # keep the column names before X becomes an ndarray
scaler = StandardScaler()
X = scaler.fit_transform(X)    # z-score each feature
Y = np.array(Y)

## Feature selection with an L1-penalized logistic regression
selector = SelectFromModel(
    estimator=LogisticRegression(C=1, penalty='l1', solver='liblinear')).fit(X, Y)

print(selector.estimator_.coef_)             # sparse coefficients; zeroed features are dropped
print(feature[selector.get_support()])       # names of the selected species
print(len(feature[selector.get_support()]))  # how many features survived
select_X = selector.transform(X)             # reduced feature matrix
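
## Optional sketch (not in the original): tune the L1 penalty strength C by
## cross-validation before selecting features, following the GridSearchCV
## link above; the grid values below are illustrative assumptions.
param_grid = {"C": [0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
)
grid.fit(X, Y)
print(grid.best_params_)
# Smaller C means a stronger L1 penalty and more zeroed coefficients, so the
# tuned C directly controls how many features SelectFromModel would keep.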

## One round of nested cross-validation
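# Nested CV: the inner loop (inside GridSearchCV) picks the hyperparameters,
# while the outer loop scores the tuned model on folds never seen during
# tuning, avoiding the optimistic bias of tuning and scoring on the same data.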
rfc = RandomForestClassifier()
p_grid = {"n_estimators": [20, 50], "max_depth": [5, 10, 20]}
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # hyperparameter tuning
clf = GridSearchCV(estimator=rfc, param_grid=p_grid, cv=inner_cv)


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # outer evaluation loop
# cv = KFold(n_splits=5, shuffle=True, random_state=1)          # non-stratified alternative
tprs = []                          # interpolated TPR curve of each fold
aucs = []                          # AUC of each fold
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid for interpolation
fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(select_X, Y)):
   clf.fit(select_X[train], Y[train])
   viz = RocCurveDisplay.from_estimator(
       clf,
       select_X[test],
       Y[test],
       name=f"ROC fold {fold}",
       alpha=0.3,
       lw=1,
       ax=ax,
   )
   interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
   interp_tpr[0] = 0.0
   tprs.append(interp_tpr)
   aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
   mean_fpr,
   mean_tpr,
   color="b",
   label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
   lw=2,
   alpha=0.8,
)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
   mean_fpr,
   tprs_lower,
   tprs_upper,
   color="grey",
   alpha=0.2,
   label=r"$\pm$ 1 std. dev.",
)
ax.set(
   xlim=[-0.05, 1.05],
   ylim=[-0.05, 1.05],
   xlabel="False Positive Rate",
   ylabel="True Positive Rate",
   title=f"Mean ROC curve with variability",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


## 30 rounds of nested cross-validation
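# Repeating the whole nested procedure with 30 different random fold splits
# shows how much the performance estimate varies with the choice of folds.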
rfc = RandomForestClassifier()
p_grid = {"n_estimators": [20,50],  'max_depth': [5,10,20]}
NUM_TRIALS = 30
nested_scores = np.zeros(NUM_TRIALS)
for i in range(NUM_TRIALS):
   inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
   outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
   clf = GridSearchCV(estimator=rfc, param_grid=p_grid, cv=inner_cv)
   nested_score = cross_val_score(clf, X=select_X, y=Y, cv=outer_cv)
   nested_scores[i] = nested_score.mean()
   
print(" nested_scores of {:6f} with std. dev. of {:6f}.".format(
       nested_scores.mean(), nested_scores.std()))


# Plot the nested CV score of each trial
plt.figure()
plt.subplot(211)
(nested_line,) = plt.plot(nested_scores, color="b")
plt.ylabel("score", fontsize="14")
plt.legend([nested_line], ["Nested CV"], bbox_to_anchor=(0, 0.4, 0.5, 0))
plt.title(
   "Nested Cross Validation",
   x=0.5,
   y=1.1,
   fontsize="15",
)
plt.show()