from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import datasets

# Load the iris dataset and separate the features from the class labels.
iris = datasets.load_iris()
features = iris.data
labels = iris.target

# Hold out 20% of the samples as a test set (80/20 split).
train_X, test_X, train_y, test_y = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Fit a logistic-regression classifier; the constructor exposes many
# tunable hyper-parameters, defaults are used here.
log_reg = LogisticRegression()
log_reg.fit(train_X, train_y)

# Predict on the held-out samples and compute mean accuracy.
predict = log_reg.predict(test_X)
accuracy = log_reg.score(test_X, test_y)
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# BUG FIX: the original used load_boston, which was deprecated in
# scikit-learn 1.0 and removed in 1.2, so this cell raised ImportError on
# any modern install. load_diabetes is a built-in (no download) regression
# dataset that keeps the demo runnable with identical structure.
X, y = load_diabetes(return_X_y=True)
print(X.shape)

# Fit an ordinary least-squares model and inspect what it learned.
model = LinearRegression()
model.fit(X, y)
print(model.predict(X[:4, :]))  # predictions for the first four samples
print(y[:4])                    # ground truth for comparison
print(model.coef_)              # per-feature weights
print(model.intercept_)         # bias term
print(model.get_params())       # constructor hyper-parameters
print(model.score(X, y))        # R^2 coefficient of determination
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

# Generate a noisy single-feature regression problem and visualise it.
X, y = make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
plt.scatter(X, y)
from sklearn import preprocessing
import numpy as np

# Standardise each column (zero mean, unit variance) and print the raw
# matrix next to the scaled one for comparison.
a = np.array(
    [[10, 2.7, 3.6],
     [-100, 5, 5],
     [120, 20, 40]],
    dtype=np.float64,
)
print(a)
print(preprocessing.scale(a))
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Build a 2-feature toy classification problem whose features sit on a
# large scale (scale=100), then compare SVM accuracy with and without
# feature standardisation.
X, y = make_classification(
    n_samples=300, n_features=2, n_redundant=0, n_informative=2,
    random_state=22, n_clusters_per_class=1, scale=100)
print(X.shape)
plt.scatter(X[:, 0], X[:, 1], c=y)

# With scaling.
X_ = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

## no scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Evaluate a 5-nearest-neighbour classifier on a single train/test split.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation: a more stable accuracy estimate than one split.
X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=5)
score = cross_val_score(knn, X, y, cv=5, scoring="accuracy")
print(score)        # per-fold accuracies
print(score.mean()) # overall estimate
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Sweep K from 1 to 30 and plot each K's mean cross-validated accuracy.
X, y = load_iris(return_X_y=True)
k_range = range(1, 31)
k_score = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=5, scoring="accuracy"
    ).mean()
    for k in k_range
]
plt.plot(k_range, k_score)
plt.xlabel("Value of K for KNN")
plt.ylabel("Cross-Validated Accuracy")
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Same K sweep as above, but scored by mean squared error instead of
# accuracy (lower is better).
X, y = load_iris(return_X_y=True)
k_range = range(1, 31)
k_score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # neg_mean_squared_error is negated so that "higher is better" for
    # sklearn's scorers; flip the sign back to get an ordinary loss.
    loss = -cross_val_score(knn, X, y, cv=10, scoring="neg_mean_squared_error")
    k_score.append(loss.mean())
plt.plot(k_range, k_score)
plt.xlabel("Value of K for KNN")
# BUG FIX: the label previously said "Cross-Validated Accuracy" (copied
# from the accuracy cell), but the plotted quantity is the MSE loss.
plt.ylabel("Cross-Validated MSE")
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Learning curve for an SVM with a small gamma: record the train and
# cross-validation loss at 10%, 25%, 50%, 75% and 100% of the training set.
X, y = load_digits(return_X_y=True)
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001), X, y, cv=10,
    scoring="neg_mean_squared_error",
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
# Scores are negated MSE; flip the sign to plot ordinary losses.
train_loss_mean = -np.mean(train_loss, axis=1)
print(train_loss_mean)
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(train_sizes, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(train_sizes, test_loss_mean, "o-", color="g", label="Cross-validation")
plt.legend(loc="best")

# BUG FIX: the two notes below were bare (non-comment) text lines in the
# source; one even contained a comma, so both raised NameError at runtime.
# They are explanatory notes and belong in comments (translated):
# With a large gamma the model overfits — the test-set loss cannot decrease.
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.1), X, y, cv=10,
    scoring="neg_mean_squared_error",
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
# When there is no overfitting, the error can still be reduced.
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Validation curve: sweep the SVM's gamma over a log-spaced grid and plot
# the training vs cross-validation loss at each value.
X, y = load_digits(return_X_y=True)
param_range = np.logspace(-6, -2.3, 5)
train_loss, test_loss = validation_curve(
    SVC(), X, y,
    param_name="gamma",
    param_range=param_range, cv=10,
    scoring="neg_mean_squared_error")
# Scores come back negated; flip the sign to plot ordinary losses.
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(param_range, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(param_range, test_loss_mean, "o-", color="g", label="Cross-validation")
plt.legend(loc="best")
from sklearn import svm
from sklearn import datasets
import pickle
import joblib

# Train a classifier, then demonstrate two ways of persisting it to disk
# and loading it back.
X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC()
clf.fit(X, y)

# 1) Standard-library pickle round-trip.
with open("clf.pickle", "wb") as f:
    pickle.dump(clf, f)
with open("clf.pickle", "rb") as f:
    clf2 = pickle.load(f)
print(clf2.predict(X[0:1]))

# 2) The joblib equivalent (dump/load a fitted estimator).
joblib.dump(clf, "clf.pkl")
clf3 = joblib.load("clf.pkl")
print(clf3.predict(X[0:1]))