K折交叉验证
k折交叉验证是一种划分数据集的方式,特别适合数据量较少的场景。
先把原始数据平均划分为k份,每一轮取其中1份作为测试集,其余k-1份作为训练集,共进行k轮,
最后对k轮的结果取平均,得到模型的平均性能。
下面以MNIST数据集为例:
```python
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
# Load MNIST. Keras returns it as a training split and a test split
# (60000 and 10000 samples respectively, per the original note).
minst = keras.datasets.mnist.load_data()
(x_train, y_train), (x_test, y_test) = minst

# Merge both splits into a single pool so the folds can be cut from all of
# the data. Images and labels are joined along the sample axis in the same
# order, which keeps every label aligned with its image.
x_data = np.concatenate((x_train, x_test), axis=0)
y_label = np.concatenate((y_train, y_test), axis=0)

# Sanity check of the merge: show one sample that came from the old test
# split and print its label. The original author reports that both the
# image and the printed label are the digit 3.
plt.imshow(x_data[65000])
print(y_label[65000])
# Cross-validation settings: k folds, each model trained for num_epochs epochs.
k = 5
num_epochs = 5

# Number of samples per fold. The original note gives x_data the shape
# (70000, 28, 28), i.e. 14000 samples per fold when k is 5.
slid_step = x_data.shape[0] // k

# astype returns a new array instead of converting in place, so the result
# must be bound back to the name for the conversion to take effect.
x_data = x_data.astype("float32")
y_label = y_label.astype("int32")

# One entry per fold: the History.history dict returned by model.fit
# (per-epoch training and validation loss/accuracy).
historys = []

# NOTE(review): pixel values are fed to the network unscaled, as in the
# original; consider dividing by 255 if the training loss looks unstable.
for i in range(k):
    fold_start = i * slid_step
    fold_stop = fold_start + slid_step

    # Fold i is held out for validation.
    x_test = x_data[fold_start:fold_stop]
    y_test = y_label[fold_start:fold_stop]

    # Train on everything outside the held-out fold. Slicing from fold_start
    # to the end would put the validation samples into the training set and
    # drop all earlier folds. Samples left over when the data does not divide
    # evenly by k fall after the last fold and are always used for training.
    x_train = np.concatenate((x_data[:fold_start], x_data[fold_stop:]), axis=0)
    y_train = np.concatenate((y_label[:fold_start], y_label[fold_stop:]), axis=0)

    # Build a fresh model for every fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Flatten(),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(10, activation="softmax"),
    ])
    model.compile(
        optimizer="rmsprop",
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=["sparse_categorical_accuracy"],
    )
    history = model.fit(
        x_train,
        y_train,
        epochs=num_epochs,
        batch_size=128,
        validation_data=(x_test, y_test),
    )
    historys.append(history.history)
```

然后算出各项指标的平均值:

```python
# Plot every fold's training-loss curve side by side and collect the mean of
# each curve over its epochs.
loss_mean = []
num_folds = len(historys)  # derived from the data instead of a hard-coded 5
for i, fold_history in enumerate(historys):
    # Position 0 of the history dict is the series this script reports as the
    # training loss.
    curve = list(fold_history.values())[0]
    plt.subplot(1, num_folds, i + 1)
    # The x axis follows the actual number of recorded epochs.
    plt.plot(range(1, len(curve) + 1), curve)
    plt.title("loss")
    # Stored as a plain float so the printed list shows bare numbers.
    loss_mean.append(float(np.mean(curve)))
plt.show()
print(loss_mean)
print("均值loss:", sum(loss_mean) / len(loss_mean))
```

输出:

```text
[0.9816258639553614, 1.2614621218195983, 1.634634684441948, 2.221515530428221, 3.0848385429355782]
均值loss: 1.8368153487161414
```

```python
# Plot every fold's training-accuracy curve side by side and collect the mean
# of each curve over its epochs.
acc_mean = []
num_folds = len(historys)  # derived from the data instead of a hard-coded 5
for i, fold_history in enumerate(historys):
    # Position 1 of the history dict is the series this script reports as the
    # training accuracy.
    curve = list(fold_history.values())[1]
    plt.subplot(1, num_folds, i + 1)
    # The x axis follows the actual number of recorded epochs.
    plt.plot(range(1, len(curve) + 1), curve)
    plt.title("acc")
    # Stored as a plain float so the printed list shows bare numbers.
    acc_mean.append(float(np.mean(curve)))
plt.show()
print(acc_mean)
print("平均值acc:", sum(acc_mean) / len(acc_mean))
```

输出:

```text
[0.9320542335510253, 0.9282821655273438, 0.9242952346801758, 0.9239570617675781, 0.9144857406616211]
平均值acc: 0.9246148872375489
```

```python
# Plot every fold's validation-loss curve side by side and collect the mean of
# each curve over its epochs.
val_loss = []
num_folds = len(historys)  # derived from the data instead of a hard-coded 5
for i, fold_history in enumerate(historys):
    # Position 2 of the history dict is the series this script reports as the
    # validation loss.
    curve = list(fold_history.values())[2]
    plt.subplot(1, num_folds, i + 1)
    # The x axis follows the actual number of recorded epochs.
    plt.plot(range(1, len(curve) + 1), curve)
    plt.title("val_loss")
    # Stored as a plain float so the printed list shows bare numbers.
    val_loss.append(float(np.mean(curve)))
plt.show()
print(val_loss)
print("平均值val_loss:", sum(val_loss) / len(val_loss))
```

输出:

```text
[0.32521084280269486, 0.40465637274256777, 0.49552535546081405, 0.9327450055751416, 1.1576837492657068]
平均值val_loss: 0.663164265169385
```

```python
# Plot every fold's validation-accuracy curve side by side and collect the
# mean of each curve over its epochs.
val_acc = []
num_folds = len(historys)  # derived from the data instead of a hard-coded 5
for i, fold_history in enumerate(historys):
    # Position 3 of the history dict is the series this script reports as the
    # validation accuracy.
    curve = list(fold_history.values())[3]
    plt.subplot(1, num_folds, i + 1)
    # The x axis follows the actual number of recorded epochs.
    plt.plot(range(1, len(curve) + 1), curve)
    plt.title("val_acc")
    # Stored as a plain float so the printed list shows bare numbers.
    val_acc.append(float(np.mean(curve)))
plt.show()
print(val_acc)
print("平均值val_acc:", sum(val_acc) / len(val_acc))
```

输出:

```text
[0.9402713775634766, 0.9435285568237305, 0.9429286003112793, 0.9443714141845703, 0.9401143074035645]
平均值val_acc: 0.9422428512573242
```
结合每个模型在测试集上的loss和acc表现,选择最接近均值的模型。