学习笔记【机器学习重点与实战】——7 支持向量机实战

程序员文章站 2022-05-22 09:43:51

...

1 sklearn支持向量机分类

通过scipy中的multivariate_normal创建四分类随机多元正态分布样本集，使用sklearn中的svm.SVC对样本分类，实现代码如下：

import numpy as np
from sklearn import svm
from scipy import stats
from sklearn.metrics import accuracy_score
import matplotlib as mpl
import matplotlib.pyplot as plt


def extend(a, b, r=0.01):
    """Widen the interval [a, b] on both sides by the fraction ``r`` of its length.

    Returns a ``(low, high)`` tuple where ``low = a*(1+r) - b*r`` and
    ``high = -a*r + b*(1+r)``, i.e. each end is pushed outwards by
    ``r * (b - a)``.
    """
    scale = 1 + r
    low = a * scale - b * r
    high = -a * r + b * scale
    return low, high


if __name__ == "__main__":
    # Build a four-class sample set.
    np.random.seed(100)         # fixed seed so the same samples are drawn on every run
    N = 200                     # samples per class
    x = np.empty((4*N, 2))      # uninitialised (4*N, 2) array; every row is filled in the loop below
    means = [(-1, 1), (1, 1), (1, -1), (-1, -1)]                # mean of each class distribution
    sigmas = [np.eye(2), 2*np.eye(2), np.diag((1,2)), np.array(((3, 2), (2, 3)))]   # covariance matrix of each class
    for i in range(4):
        mn = stats.multivariate_normal(means[i], sigmas[i]*0.1) # multivariate normal from the mean and the scaled covariance
        x[i*N:(i+1)*N, :] = mn.rvs(N)                           # draw N samples and store them as this class's features
    a = np.array((0,1,2,3)).reshape((-1, 1))
    y = np.tile(a, N).flatten() # labels: N zeros, then N ones, N twos, N threes

    # Support vector classifier: RBF kernel, penalty C=1, kernel coefficient gamma=1,
    # one-vs-one shaped decision function.
    # a. One-vs-rest (1-v-r SVMs): each class in turn is trained against all remaining samples,
    #    so k classes yield k SVMs; an unknown sample is assigned to the class with the largest
    #    decision value.
    # b. One-vs-one (1-v-1 SVMs): one SVM is trained for every pair of classes, so k classes
    #    need k(k-1)/2 SVMs; an unknown sample is assigned to the class that collects the most votes.
    #    LIBSVM implements multi-class classification this way.
    clf = svm.SVC(C=1, kernel='rbf', gamma=1, decision_function_shape='ovo')
    clf.fit(x, y)                           # train the SVC
    y_hat = clf.predict(x)                  # predict on the training set
    acc = accuracy_score(y, y_hat)          # training-set accuracy
    np.set_printoptions(suppress=True)
    print('预测正确的样本个数：%d，正确率：%.2f%%' % (round(acc*4*N), 100*acc))
    print('支撑向量数目：', clf.n_support_)
    print(clf.decision_function(x))         # pairwise decision-function values for every sample

    # Plot the decision regions.
    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1_min, x1_max = extend(x1_min, x1_max)
    x2_min, x2_max = extend(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    x_test = np.stack((x1.flat, x2.flat), axis=1)
    y_test = clf.predict(x_test)
    y_test = y_test.reshape(x1.shape)
    cm_light = mpl.colors.ListedColormap(['#FF8080', '#80FF80', '#8080FF', '#F0F080'])
    cm_dark = mpl.colors.ListedColormap(['r', 'g', 'b', 'y'])
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_test, cmap=cm_light)
    plt.contour(x1, x2, y_test, levels=(0,1,2), colors='k', linestyles='--')
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm_dark, edgecolors='k', alpha=0.7)
    plt.xlabel('$X_1$', fontsize=11)
    plt.ylabel('$X_2$', fontsize=11)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    # Pass the on/off flag positionally: its keyword was renamed from `b` to
    # `visible` in Matplotlib 3.5, so the positional form works on every version.
    plt.grid(True)
    plt.tight_layout(pad=2.5)
    plt.title('SVM多分类方法：One/One or One/Rest', fontsize=14)
    plt.show()

输出结果如下：

预测正确的样本个数：792，正确率：99.00%
支撑向量数目： [14 29 22 23]
[[ 1.50432191  1.2820055   1.71149331  0.38199111 -0.55217345 -0.41357172]
 [ 1.6535127   1.3253407   1.75495585  0.57391031 -0.08517941 -0.32364579]
 [ 1.23434954  1.14042349  1.36757566  0.23062588 -0.19802485 -0.16228541]
 ...
 [-0.03906226 -0.04228719 -1.47537791  0.13425707 -1.1147555  -1.37442757]
 [-0.21845916 -0.12649473 -1.1387974   0.16443125 -1.17337707 -1.19144718]
 [ 0.02383455 -0.63753932 -1.56095266 -0.49305719 -1.30025723 -1.00020888]]

输出图形如下：

学习笔记【机器学习重点与实战】——7 支持向量机实战

2 SVM不同核函数、参数分类对比

使用sklearn中的svm.SVC对样本使用不同核函数、参数进行分类对比，实现代码如下：

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt


if __name__ == "__main__":
    # Read the tab-separated data with pandas (no header row).
    data = pd.read_csv('bipartition.txt', sep='\t', header=None)
    # Columns 0 and 1 are the features, column 2 is the label.
    x, y = data[[0, 1]], data[2]
    print("训练样本个数：%d，特征数：%d \n" % x.shape)

    # Classifier settings to compare: (kernel, C) for linear, (kernel, C, gamma) for RBF.
    clf_param = (('linear', 0.1), ('linear', 0.5), ('linear', 1), ('linear', 2),
                ('rbf', 1, 0.1), ('rbf', 1, 1), ('rbf', 1, 10), ('rbf', 1, 100),
                ('rbf', 5, 0.1), ('rbf', 5, 1), ('rbf', 5, 10), ('rbf', 5, 100))
    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FFA0A0'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(13, 8), facecolor='w')

    # Train and plot one classifier per parameter tuple.
    for i, param in enumerate(clf_param):
        # Create the SVC from the kernel name and C.
        clf = svm.SVC(C=param[1], kernel=param[0])
        # The RBF tuples carry a third element: gamma.
        if param[0] == 'rbf':
            clf.gamma = param[2]
            # Raw string: `\g` is not a valid escape sequence, so a plain literal
            # triggers an invalid-escape warning; the runtime text is unchanged.
            title = r'高斯核，C=%.1f，$\gamma$ =%.1f' % (param[1], param[2])
        else:
            title = '线性核，C=%.1f' % param[1]

        clf.fit(x, y)                   # train the SVC
        y_hat = clf.predict(x)          # predict on the training set

        print(title)
        acc = accuracy_score(y, y_hat)
        title += '\n准确率：' + str(acc) + ',支撑向量的数目:' +  str(clf.n_support_)
        print('准确率：', acc)
        print('支撑向量数目：%s \n' % str(clf.n_support_))

        # Plot this classifier in its own cell of the 3x4 grid.
        plt.subplot(3, 4, i+1)
        grid_hat = clf.predict(grid_test)                                   # predicted class for every grid point
        grid_hat = grid_hat.reshape(x1.shape)                               # back to the shape of the mesh
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        plt.scatter(x[0], x[1], c=y, edgecolors='k', s=40, cmap=cm_dark)    # the samples
        # Circle the support vectors.
        plt.scatter(x.loc[clf.support_, 0], x.loc[clf.support_, 1], edgecolors='k', facecolors='none', s=100, marker='o')
        z = clf.decision_function(grid_test)
        z = z.reshape(x1.shape)
        # Contours of the decision function at 0 (boundary), +-0.5 and +-1 (margins).
        plt.contour(x1, x2, z, colors=list('kbrbk'), linestyles=['--', '--', '-', '--', '--'],
                    linewidths=[1, 0.5, 1.5, 0.5, 1], levels=[-1, -0.5, 0, 0.5, 1])
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(title, fontsize=12)
    plt.suptitle('SVM不同核函数、参数分类对比', fontsize=16)
    # `pad` must be given by keyword; newer Matplotlib rejects it positionally.
    plt.tight_layout(pad=1.4)
    plt.subplots_adjust(top=0.9)
    plt.show()

输出结果如下：

训练样本个数：100，特征数：2

线性核，C=0.1
准确率： 0.95
支撑向量数目：[25 25] 

线性核，C=0.5
准确率： 0.95
支撑向量数目：[15 15] 

线性核，C=1.0
准确率： 0.95
支撑向量数目：[12 12] 

线性核，C=2.0
准确率： 0.95
支撑向量数目：[10 10] 

高斯核，C=1.0，$\gamma$ =0.1
准确率： 0.93
支撑向量数目：[20 20] 

高斯核，C=1.0，$\gamma$ =1.0
准确率： 0.95
支撑向量数目：[11 12] 

高斯核，C=1.0，$\gamma$ =10.0
准确率： 0.95
支撑向量数目：[17 27] 

高斯核，C=1.0，$\gamma$ =100.0
准确率： 0.99
支撑向量数目：[38 47] 

高斯核，C=5.0，$\gamma$ =0.1
准确率： 0.95
支撑向量数目：[12 12] 

高斯核，C=5.0，$\gamma$ =1.0
准确率： 0.95
支撑向量数目：[9 9] 

高斯核，C=5.0，$\gamma$ =10.0
准确率： 0.97
支撑向量数目：[15 25] 

高斯核，C=5.0，$\gamma$ =100.0
准确率： 0.99
支撑向量数目：[35 46]

输出图形如下：

学习笔记【机器学习重点与实战】——7 支持向量机实战

惩罚参数C是损失函数系数，γ是高斯核函数系数。由输出图形，及原理可得出：

C越大，训练精度越大，有可能过拟合，过渡带宽度越来越小
γ是高斯核函数中的 $\frac{1}{2 σ^{2}}$ ，相当于精度，γ越大，越贴合训练集，会过拟合，现象是边界不光滑
γ足够小，高斯核会退化成线性核
SVM需要调参，才能有好的结果。

对于中小规模数据，可以直接使用SVC；对于大规模数据则不建议直接使用，可将数据分块后再训练。

3 SVC对不平衡数据的处理

创建正例为10、反例为990的不平衡样本集，使用sklearn中的svm.SVC，分别使用线性核与高斯核，对正反例赋予不同权重值进行样本分类，实现代码如下：

import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning
import warnings


if __name__ == "__main__":
    # When a classifier predicts no positive sample at all, precision and F1 are
    # ill-defined and scikit-learn emits UndefinedMetricWarning; silence it here
    # because that situation is one of the cases this demo deliberately shows.
    warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

    np.random.seed(0)   # fixed seed so the same samples are drawn on every run

    c1 = 990        # number of negative samples
    c2 = 10         # number of positive samples
    N = c1 + c2     # total number of samples
    x_c1 = 3*np.random.randn(c1, 2)             # 2-D coordinates of the negatives
    x_c2 = 0.5*np.random.randn(c2, 2) + (4, 4)  # 2-D coordinates of the positives, centred on (4, 4)
    x = np.vstack((x_c1, x_c2))                 # negatives first, then positives
    y = np.ones(N)                              # label everything +1 ...
    y[:c1] = -1                                 # ... then mark the first c1 rows as -1

    # Marker sizes: positives are drawn larger than negatives.
    s = np.ones(N) * 30
    s[:c1] = 10

    # Weight applied to the positive class by each of the four classifiers.
    weight = [1,30,1,30]
    # Linear-kernel and RBF-kernel SVCs, each with and without extra weight on the positives.
    clfs = [svm.SVC(C=1, kernel='linear', class_weight={-1:1, 1:weight[0]}),
           svm.SVC(C=1, kernel='linear', class_weight={-1:1, 1:weight[1]}),
           svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1:1, 1:weight[2]}),
           svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1:1, 1:weight[3]})]
    titles = [('Linear, Weight=%d' % weight[0]), ('Linear, Weight=%d' % weight[1]), ('RBF, Weight=%d' % weight[2]),
              ('RBF, Weight=%d' % weight[3])]

    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)  # grid points used to paint the decision regions

    cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r'])
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')
    # Train, score and plot every classifier.
    for i, clf in enumerate(clfs):
        clf.fit(x, y)               # train the SVC
        y_hat = clf.predict(x)      # predict on the training set

        # Print the performance metrics.
        # NOTE(review): the Chinese glosses for accuracy and precision look swapped
        # relative to common usage; kept as-is so the printed output is unchanged.
        print(i+1, '次：')
        print('accuracy（精确度）：\t', accuracy_score(y, y_hat))
        print('precision（准确率）：\t', precision_score(y, y_hat, pos_label=1))
        print('recall（召回率）：\t\t', recall_score(y, y_hat, pos_label=1))
        print('F1-score（F1度量）：\t', f1_score(y, y_hat, pos_label=1))
        print()

        # Plot this classifier in its own cell of the 2x2 grid.
        plt.subplot(2, 2, i+1)
        grid_hat = clf.predict(grid_test)           # predicted class for every grid point
        grid_hat = grid_hat.reshape(x1.shape)       # back to the shape of the mesh
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=s, cmap=cm_dark)      # the samples
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(titles[i])
        # On/off flag passed positionally: its keyword was renamed from `b` to
        # `visible` in Matplotlib 3.5, so the positional form works on every version.
        plt.grid(True, ls=':')
    plt.suptitle('SVC对不平衡数据的处理', fontsize=18)
    # `pad` must be given by keyword; newer Matplotlib rejects it positionally.
    plt.tight_layout(pad=1.5)
    plt.subplots_adjust(top=0.92)
    plt.show()

输出结果如下：

1 次：
accuracy（精确度）：   0.99
precision（准确率）：  0.0
recall（召回率）：     0.0
F1-score（F1度量）：  0.0

2 次：
accuracy（精确度）：   0.941
precision（准确率）：  0.14492753623188406
recall（召回率）：     1.0
F1-score（F1度量）：  0.25316455696202533

3 次：
accuracy（精确度）：   0.994
precision（准确率）：  0.7
recall（召回率）：     0.7
F1-score（F1度量）：  0.7

4 次：
accuracy（精确度）：   0.994
precision（准确率）：  0.625
recall（召回率）：     1.0
F1-score（F1度量）：  0.7692307692307693

输出图形如下：

学习笔记【机器学习重点与实战】——7 支持向量机实战

由训练结果可知：

对于非线性样本，高斯核明显优于线性核；
若不对不平衡数据进行处理，召回率、F1度量值会较低。

4 SVC对手写数字的识别

使用sklearn中的GridSearchCV（超参数自动搜索模块），寻找svm.SVC对样本分类的最优C和gamma参数组合，实现代码如下：

import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import GridSearchCV
from time import time


if __name__ == "__main__":
    print('Load Training File Start...')
    # Builtin float/int replace the np.float/np.int aliases, which were removed
    # in NumPy 1.24; they denote the same dtypes.
    data = np.loadtxt('optdigits.tra', dtype=float, delimiter=',')
    x, y = np.split(data, (-1, ), axis=1)   # last column is the label, the rest are features
    print("训练样本个数：%d，特征数：%d \n" % x.shape)
    images = x.reshape(-1, 8, 8)            # view every sample as an 8x8 array
    y = y.ravel().astype(int)               # flatten to 1-D and convert to integers

    print('Load Test Data Start...')
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1, ), axis=1)
    # NOTE(review): the label below says "training samples" although it reports the
    # test-set shape; kept as-is so the printed output is unchanged.
    print("训练样本个数：%d，特征数：%d \n" % x_test.shape)
    images_test = x_test.reshape(-1, 8, 8)  # view every sample as an 8x8 array
    y_test = y_test.ravel().astype(int)     # flatten to 1-D and convert to integers
    print('Load Data OK...')

    # Candidate values of C and gamma for the SVC.
    params = {'C':np.logspace(0, 3, 7), 'gamma':np.logspace(-5, 0, 11)}
    # GridSearchCV tries every parameter combination and picks the best one by
    # 3-fold cross-validation.
    model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
    print('Start Learning...')
    t0 = time()
    model.fit(x, y)     # grid search + training
    t1 = time()
    t = t1 - t0         # elapsed time in seconds
    print('训练+CV耗时：%d分钟%.3f秒' % (int(t/60), t - 60*int(t/60)))
    print('最优参数：\t', model.best_params_)
    print('Learning is OK...')
    print('训练集准确率：', accuracy_score(y, model.predict(x)))
    y_hat = model.predict(x_test)
    print('测试集准确率：', accuracy_score(y_test, y_hat))

    # Collect the misclassified test samples.
    err_images = images_test[y_test != y_hat]
    err_y_hat = y_hat[y_test != y_hat]
    err_y = y_test[y_test != y_hat]
    print('错分样本的预测值：', err_y_hat)        # predicted labels of the misclassified samples
    print('错分样本的实际值：', err_y)            # true labels of the misclassified samples

    # Show up to the first 12 misclassified images in a 3x4 grid.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images):
        if index >= 12:
            break
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('错分为：%i，真实值：%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()

输出结果如下：

Load Training File Start...
训练样本个数：3823，特征数：64 

Load Test Data Start...
训练样本个数：1797，特征数：64 

Load Data OK...
Start Learning...
训练+CV耗时：6分钟25.230秒
最优参数：    {'C': 10.0, 'gamma': 0.001}
Learning is OK...
训练集准确率： 1.0
测试集准确率： 0.9827490261547023
[0 1 2 ... 8 9 8]
[0 1 2 ... 8 9 8]
错分样本的预测值： [9 1 1 1 1 9 5 9 9 9 9 9 9 8 1 0 1 3 8 9 9 3 5 9 1 7 3 5 8 5 1]
错分样本的实际值： [5 2 2 2 8 7 7 5 7 7 7 7 7 1 8 6 8 9 9 3 8 8 8 7 8 3 9 9 3 3 8]

输出图形如下：

学习笔记【机器学习重点与实战】——7 支持向量机实战

5 sklearn支持向量机回归

创建回归样本集，使用sklearn中的svm.SVR，分别使用高斯核、线性核、多项式核，进行样本回归，实现代码如下：

import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt


if __name__ == "__main__":
    N = 50                              # number of samples
    np.random.seed(0)                   # fixed seed so the same samples are drawn on every run
    x = np.sort(np.random.uniform(0, 6, N), axis=0)
    y = 2*np.sin(x) + 0.1*np.random.randn(N)    # targets: 2*sin(x) plus Gaussian noise
    x = x.reshape(-1, 1)                # column vector, one feature per sample

    # Fit SVR models with three different kernels.
    print('SVR - RBF')
    svr_rbf = svm.SVR(kernel='rbf', gamma=0.2, C=100)
    svr_rbf.fit(x, y)
    print('SVR - Linear')
    svr_linear = svm.SVR(kernel='linear', C=100)
    svr_linear.fit(x, y)
    print('SVR - Polynomial')
    svr_poly = svm.SVR(kernel='poly', degree=3, C=100)
    svr_poly.fit(x, y)
    print('Fit OK.')

    # Build an evenly spaced test set (running 10% past the largest x) and
    # predict it with each regressor.
    x_test = np.linspace(x.min(), 1.1*x.max(), 100).reshape(-1, 1)
    y_rbf = svr_rbf.predict(x_test)
    y_linear = svr_linear.predict(x_test)
    y_poly = svr_poly.predict(x_test)

    # Plot the three fitted curves together with the training samples.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(7, 6), facecolor='w')
    plt.plot(x_test, y_rbf, 'r-', linewidth=2, label='RBF Kernel')
    plt.plot(x_test, y_linear, 'g-', linewidth=2, label='Linear Kernel')
    plt.plot(x_test, y_poly, 'b-', linewidth=2, label='Polynomial Kernel')
    plt.plot(x, y, 'mo', markersize=6, markeredgecolor='k')
    # Mark the support vectors of the RBF-kernel SVR.
    plt.scatter(x[svr_rbf.support_], y[svr_rbf.support_], s=200, c='r', marker='*', edgecolors='k', label='RBF Support Vectors', zorder=10)
    plt.legend(loc='lower left', fontsize=12)
    plt.title('使用不同核函数的SVR进行回归', fontsize=15)
    plt.xlabel('X')
    plt.ylabel('Y')
    # On/off flag passed positionally: its keyword was renamed from `b` to
    # `visible` in Matplotlib 3.5, so the positional form works on every version.
    plt.grid(True, ls=':')
    # `pad` must be given by keyword; newer Matplotlib rejects it positionally.
    plt.tight_layout(pad=2)
    plt.show()

输出图形如下：

学习笔记【机器学习重点与实战】——7 支持向量机实战

6 参考

机器学习升级版视频 - 邹博
《统计学习方法》第7章支持向量机

===========文档信息============
学习笔记由博主整理编辑，供非商用学习交流用
如本文涉及侵权，请随时留言博主，必妥善处置
版权声明：非商用*转载-保持署名-注明出处
署名(BY) ：dkjkls（dkj卡洛斯）
文章出处：http://blog.csdn.net/dkjkls

相关标签：机器学习 SVM SVC SVR 支持向量机

上一篇： [MAVEN]web工程的调试

下一篇：一款很好的画图工具 dia

学习笔记【机器学习重点与实战】——7 支持向量机实战

1 sklearn支持向量机分类

2 SVM不同核函数、参数分类对比

3 SVC对不平衡数据的处理

4 SVC对手写数字的识别

5 sklearn支持向量机回归

6 参考

学习笔记【机器学习重点与实战】——10 聚类算法实现与实战

学习笔记【机器学习重点与实战】——4 集成学习-Bagging

python机器学习理论与实战（五）支持向量机

python机器学习理论与实战（六）支持向量机