
[ML] K-Nearest Neighbors (2)


Previous post:

K-Nearest Neighbors (1)

Distance Function

The distance between two points $\pmb{x}$ and $\pmb{y}$ is measured with the $L_p$ distance:

$$L_p(\pmb{x}, \pmb{y}) = \left( \sum_{i=1}^{n} |x_i - y_i|^p \right)^{\frac{1}{p}}$$

With $p = 1$ this is the Manhattan distance, with $p = 2$ the Euclidean distance, and as $p \to \infty$ it approaches the Chebyshev distance $\max_i |x_i - y_i|$.

import math

def L(x, y, p=2):
    """Compute the L_p distance between two equal-length points x and y."""
    if x and y and len(x) == len(y) and p >= 1:
        ans = 0
        for i in range(len(x)):
            ans += math.pow(abs(x[i] - y[i]), p)
        return math.pow(ans, 1 / p)
    else:
        return -1  # invalid input

print(L([1, 2], [3, 7]))  # Euclidean (p=2) distance: sqrt(29) ≈ 5.3852
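As a quick check on the formula, a minimal sketch below evaluates the same L function for several values of p on one pair of points:

x, y = [1, 2], [3, 7]
for p in (1, 2, 3, 4):
    print('p={}: {:.4f}'.format(p, L(x, y, p)))
# p=1 gives the Manhattan distance (7.0), p=2 the Euclidean distance (~5.3852);
# as p grows, the value approaches the Chebyshev distance max|x_i - y_i| = 5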

KNN

Use the KNN algorithm to classify samples from the iris dataset.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

# print(df[:100])

# use the first 100 samples (classes 0 and 1) with the first two features
data = np.array(df.iloc[:100, [0, 1, -1]])
X, y = data[:, :-1], data[:, -1]  # extract features and labels
# split into training and test sets at an 8:2 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# print(X_train.shape)
# print(X_test.shape)

class KNN:
    def __init__(self, X_train, y_train, k_neighbors=3, p=2):
        self.k = k_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    # predict the label of a sample point
    def predict(self, X):
        # maintain a list of size k holding the k training points nearest to X
        knn_pts = []
        # scan the first k points, storing (distance, label) pairs
        for i in range(self.k):
            d = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_pts.append((d, self.y_train[i]))

        # scan points k..n, updating the list
        for i in range(self.k, len(self.X_train)):
            max_index = knn_pts.index(max(knn_pts, key=lambda x: x[0]))
            d = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_pts[max_index][0] > d:
                knn_pts[max_index] = (d, self.y_train[i])

        # majority vote among the k nearest neighbors
        knn = [pt[-1] for pt in knn_pts]
        pairs = Counter(knn)
        ans = sorted(pairs.items(), key=lambda x: x[1])[-1][0]
        return ans

    def score(self, X_test, y_test):
        cnt = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                cnt += 1
        return cnt / len(X_test)

def demo_1():
    clf = KNN(X_train, y_train, k_neighbors=5)
    ans = clf.score(X_test, y_test)
    print('the correct ratio is {}'.format(ans))

    test_pt = [6.0, 3.0]
    # plot the two classes and the test point
    plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0', color='blue', alpha=0.8)
    plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1', color='yellow', alpha=0.8)
    plt.plot(test_pt[0], test_pt[1], 'rx', label='test point')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.legend()
    plt.show()

demo_1()
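Besides scoring, the same classifier can label the plotted test point directly; a minimal sketch, using the point [6.0, 3.0] drawn in demo_1:

clf = KNN(X_train, y_train, k_neighbors=5)
print(clf.predict(np.array([6.0, 3.0])))  # should print 1.0, since the point
# falls inside the region occupied by class-1 (yellow) samples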

The classification result is plotted below.

[Figure: scatter plot of the two classes and the test point]

You can also directly call the corresponding class from the scikit-learn package:

from sklearn.neighbors import KNeighborsClassifier as KNN
clf=KNN()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

Parameter notes (a usage sketch follows the list):
n_neighbors: number of nearest neighbors
p: the p of the L_p distance (p=2 gives the Euclidean distance)
algorithm: nearest-neighbor search algorithm, one of auto, ball_tree, kd_tree, brute
weights: how the neighbors are weighted ('uniform' or 'distance')
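For example, a minimal sketch constructing the classifier with these parameters set explicitly (the values here are illustrative, not tuned):

clf = KNN(n_neighbors=5, p=2, algorithm='kd_tree', weights='uniform')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))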

KD Tree

The KD tree extends the binary search tree to $m$-dimensional vectors: at each level of the tree, one feature is chosen to partition the data. The implementation has two main parts, building the tree and querying it. For a balanced KD tree, a query takes $\mathcal{O}(m \log n)$ time, but when $m \gg 1$ the complexity degrades toward $\mathcal{O}(mn)$.

Code

from math import sqrt
from collections import namedtuple

class KDNode(object):
    def __init__(self, sample, split, left, right):
        self.sample = sample  # the k-dimensional sample point
        self.split = split    # index of the dimension split at this node
        self.left = left
        self.right = right

class KDTree(object):
    def __init__(self, data):
        k = len(data[0])

        def makeNode(split, data_set):
            if not data_set:
                return None
            data_set.sort(key=lambda x: x[split])
            # split at the median point
            med_pos = len(data_set) // 2
            med = data_set[med_pos]
            # next split dimension (cycle through the dimensions)
            ne_split = (split + 1) % k

            return KDNode(med, split,
                          makeNode(ne_split, data_set[:med_pos]),    # build the left subtree recursively
                          makeNode(ne_split, data_set[med_pos+1:]))  # build the right subtree recursively

        self.root = makeNode(0, data)  # build the kd tree starting from dimension 0

# query result: the nearest point found, its distance, and how many nodes were visited
Result = namedtuple('Result', 'nearest_point nearest_dist nodes_visited')

# preorder traversal of the kd tree
def preorder(root):
    print(root.sample)
    if root.left: preorder(root.left)
    if root.right: preorder(root.right)


# nearest-neighbor query on the kd tree
def find(tree, point):
    k = len(point)  # dimensionality
    def travel(kd_node, target, max_dist):
        if not kd_node:
            return Result([0] * k, float('inf'), 0)
        nodes_vis = 1
        d = kd_node.split       # split dimension at this node
        pivot = kd_node.sample

        if target[d] <= pivot[d]:  # target lies in the left half-space
            near_node = kd_node.left
            ne_node = kd_node.right
        else:                      # target lies in the right half-space
            near_node = kd_node.right
            ne_node = kd_node.left

        t = travel(near_node, target, max_dist)  # recurse into the region containing the target
        nearest = t.nearest_point  # current nearest point
        dist = t.nearest_dist      # current nearest distance
        nodes_vis += t.nodes_visited

        if dist < max_dist:
            max_dist = dist
        cur_dist = abs(pivot[d] - target[d])  # distance from the target to the splitting hyperplane
        # if the hypersphere does not intersect the hyperplane, return directly
        if max_dist < cur_dist:
            return Result(nearest, dist, nodes_vis)

        # check the pivot point itself
        cur_dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(pivot, target)))
        if cur_dist < dist:
            nearest = pivot
            dist = cur_dist
            max_dist = dist

        # check the other side of the hyperplane
        t2 = travel(ne_node, target, max_dist)
        nodes_vis += t2.nodes_visited
        if t2.nearest_dist < dist:
            nearest = t2.nearest_point
            dist = t2.nearest_dist
        return Result(nearest, dist, nodes_vis)

    return travel(tree.root, point, float('inf'))


def demo():
    # classic textbook example: six 2-dimensional points
    data = [[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]
    kd = KDTree(data)
    preorder(kd.root)  # prints [7, 2], [5, 4], [2, 3], [4, 7], [9, 6], [8, 1]
    res = find(kd, [3, 4.5])  # nearest point is [2, 3] at distance ~1.803
    print(res)

demo()

from random import random

def demo2():
    # generate one random k-dimensional vector
    def random_vec(k):
        return [random() for _ in range(k)]

    # generate n random k-dimensional vectors
    def random_pts(k, n):
        return [random_vec(k) for _ in range(n)]

    n_samples = 1005
    samples = random_pts(3, n_samples)
    kd = KDTree(samples)
    res = find(kd, [0.1, 0.5, 0.8])
    print(res)

demo2()
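To sanity-check the kd-tree query, the sketch below compares its answer against a brute-force linear scan over the same kind of random data (demo3 and its contents are my own illustration, reusing KDTree, find, sqrt, and random from above):

def demo3():
    samples = [[random() for _ in range(3)] for _ in range(1000)]
    target = [0.1, 0.5, 0.8]

    kd = KDTree(samples)
    res = find(kd, target)

    # brute-force scan: distance from the target to every sample
    bf_dist, bf_point = min(
        (sqrt(sum((p - t) ** 2 for p, t in zip(s, target))), s)
        for s in samples
    )
    # the two answers should agree, while the kd tree visits only a fraction of the nodes
    print(res.nearest_point == bf_point, res.nearest_dist, bf_dist)

demo3()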

Ball-Tree

Paper download link: ball tree and kd tree

The ball tree data structure builds radius-based neighborhoods: the ball centered at sample $x_i$ is written $N_R(x_i)$, where $R$ is the radius, and the tree is constructed by nesting smaller balls inside larger ones. An important constraint is that each sample belongs to a single ball, which keeps the computational complexity at $\mathcal{O}(m \log n)$; whether a point belongs to a ball is determined by measuring the distance between the point and the ball's center.
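scikit-learn also ships a BallTree implementation; a minimal sketch of building and querying one (the data and query point here are illustrative):

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.default_rng(0)
X = rng.random((1000, 3))  # 1000 random 3-dimensional samples

tree = BallTree(X, leaf_size=40)                # build the ball tree
dist, ind = tree.query([[0.1, 0.5, 0.8]], k=3)  # 3 nearest neighbors of the query point
print(dist)  # distances to the 3 nearest neighbors
print(ind)   # their indices in X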