
k-Nearest Neighbors (k-NN) Algorithm
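k-NN classifies a query point by a majority vote among the k training samples closest to it, usually under Euclidean distance. All the work happens at query time, since "training" amounts to storing the data; the KD-tree in the second half of this article exists to speed that query up.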


k-NN implementation

import numpy as np


# Training data and the corresponding class labels
def create_dataset():
    group = np.array([[1.0, 2.0], [1.2, 0.1],
                      [0.1, 1.4], [0.3, 3.5]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# Classify a query point with brute-force k-NN
def classify(point, dataSet, labels, k):
    dataSize = dataSet.shape[0]
    # Tile the query point to the shape of dataSet and take the difference
    diff = np.tile(point, (dataSize, 1)) - dataSet
    sqdiff = diff ** 2
    # Sum the squared differences along each row to get squared distances
    squareDist = sqdiff.sum(axis=1)
    dist = squareDist ** 0.5

    # Sort the distances
    # argsort() sorts in ascending order and returns the indices
    sortedDistIndex = np.argsort(dist)

    classCount = {}
    for i in range(k):
        # The distances are already sorted, so walk sortedDistIndex directly
        voteLabel = labels[sortedDistIndex[i]]
        # Count how often each class appears among the k nearest samples;
        # get() returns 0 if the label is not yet in classCount
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # Pick the class that occurs most often among the k neighbors
    maxCount = 0
    for key, value in classCount.items():
        if value > maxCount:
            maxCount = value
            classes = key

    return classes


data, labels = create_dataset()
test_point = [1.0, 2.0]
print(classify(test_point, data, labels, 2))
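With k=2 this prints 'A': the query point coincides with the first training sample, and the resulting 1-1 vote between A and B is resolved in favor of the label seen first (the closer neighbor), because Python dicts preserve insertion order. The same classifier can be written more compactly in NumPy; a minimal sketch (classify_np and its majority-vote tie-breaking are illustrative additions, not part of the original code):

def classify_np(point, dataSet, labels, k):
    # Euclidean distance from the query point to every training sample
    dists = np.linalg.norm(dataSet - np.asarray(point), axis=1)
    # Labels of the k nearest samples, then a simple majority vote
    votes = [labels[i] for i in np.argsort(dists)[:k]]
    return max(set(votes), key=votes.count)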

KD-tree implementation

from math import sqrt
from ast import literal_eval


# Build a KD-tree: each node stores its point, depth, splitting feature,
# and left/right subtrees. `feature` is the number of dimensions.
def createTree(dataSet, layer=0, feature=2):
    length = len(dataSet)
    dataSetCopy = dataSet[:]
    # Cycle through the features by depth and sort on the splitting feature
    featureNum = layer % feature
    dataSetCopy.sort(key=lambda x: x[featureNum])
    layer += 1
    if length == 0:
        return None
    elif length == 1:
        return {'Value': dataSet[0], 'Layer': layer, 'feature': featureNum,
                'Left': None, 'Right': None}
    else:
        # Split on the median point; samples below it go to the left
        # subtree, samples above it to the right subtree
        midNum = length // 2
        dataSetLeft = dataSetCopy[:midNum]
        dataSetRight = dataSetCopy[midNum + 1:]
        return {'Value': dataSetCopy[midNum], 'Layer': layer, 'feature': featureNum,
                'Left': createTree(dataSetLeft, layer),
                'Right': createTree(dataSetRight, layer)}
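For the training set used below, [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)], this builds the classic example tree: the root (7, 2) splits on x; its left child (5, 4) splits on y and has leaves (2, 3) and (4, 7); its right child (9, 6) splits on y and has the single left leaf (8, 1).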

# Euclidean distance between two points
def calDistance(sourcePoint, targetPoint):
    total = 0.0
    for i in range(len(targetPoint)):
        total += (sourcePoint[i] - targetPoint[i]) ** 2
    return sqrt(total)


# Descend from the root to a leaf, at each node following the side of the
# splitting plane that contains the target, and record the visited nodes
def dfs(kdTree, target, tracklist=None):
    tracklistCopy = tracklist[:] if tracklist else []
    if not kdTree:
        return None, tracklistCopy
    elif not kdTree['Left']:
        # In a tree built by createTree, a node without a left child is a leaf
        tracklistCopy.append(kdTree['Value'])
        return kdTree['Value'], tracklistCopy
    else:
        pointValue = kdTree['Value']
        feature = kdTree['feature']
        tracklistCopy.append(pointValue)
        if target[feature] <= pointValue[feature]:
            return dfs(kdTree['Left'], target, tracklistCopy)
        else:
            return dfs(kdTree['Right'], target, tracklistCopy)
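The path recorded by dfs seeds the nearest-neighbor search: kdTreeSearch below walks back up this path, and at each parent decides whether the subtree on the far side of the splitting plane still needs to be examined.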


# Locate the node holding a given point value (used to recover a node, and
# hence its subtrees, from a value stored on the track list)
def findPoint(Tree, value):
    if Tree is None:
        return None
    if Tree['Value'] == value:
        return Tree
    return findPoint(Tree['Left'], value) or findPoint(Tree['Right'], value)


# KD-tree nearest-neighbor search: backtrack along the track list produced
# by dfs(), pruning sibling subtrees that cannot contain a closer point.
# Note it reads the module-level kdTree built below.
def kdTreeSearch(tracklist, target, usedPoint=None, minDistance=float('inf'), minDistancePoint=None):
    tracklistCopy = tracklist[:]
    usedPointCopy = usedPoint[:] if usedPoint else []

    if not minDistancePoint:
        minDistancePoint = tracklistCopy[-1]

    if len(tracklistCopy) == 1:
        return minDistancePoint
    else:
        point = findPoint(kdTree, tracklistCopy[-1])

        # Update the current best if the last visited node is closer
        if calDistance(point['Value'], target) < minDistance:
            minDistance = calDistance(point['Value'], target)
            minDistancePoint = point['Value']
        fatherPoint = findPoint(kdTree, tracklistCopy[-2])
        fatherPointval = fatherPoint['Value']
        fatherPointfea = fatherPoint['feature']

        # The parent node is also a candidate
        if calDistance(fatherPoint['Value'], target) < minDistance:
            minDistance = calDistance(fatherPoint['Value'], target)
            minDistancePoint = fatherPoint['Value']

        # The sibling subtree on the other side of the parent's splitting plane
        if point == fatherPoint['Left']:
            anotherPoint = fatherPoint['Right']
        else:
            anotherPoint = fatherPoint['Left']

        if (anotherPoint is None or anotherPoint['Value'] in usedPointCopy or
                abs(fatherPointval[fatherPointfea] - target[fatherPointfea]) > minDistance):
            # The sibling subtree is empty, already visited, or lies entirely
            # farther away than the current best: prune it and backtrack
            visited = tracklistCopy.pop()
            usedPointCopy.append(visited)
            return kdTreeSearch(tracklistCopy, target, usedPointCopy, minDistance, minDistancePoint)
        else:
            # Otherwise descend into the sibling subtree and keep searching
            visited = tracklistCopy.pop()
            usedPointCopy.append(visited)
            subvalue, subtrackList = dfs(anotherPoint, target)
            tracklistCopy.extend(subtrackList)
            return kdTreeSearch(tracklistCopy, target, usedPointCopy, minDistance, minDistancePoint)
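The pruning test is the heart of the search: abs(fatherPointval[fatherPointfea] - target[fatherPointfea]) is the distance from the target to the parent's splitting plane, and if that already exceeds the best distance found so far, no point on the other side of the plane can be closer, so the entire sibling subtree is skipped.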


trainingSet = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]
kdTree = createTree(trainingSet)
target = literal_eval(input('Input target point, e.g. (3, 4.5): '))
value, trackList = dfs(kdTree, target)
nnPoint = kdTreeSearch(trackList, target)
print(nnPoint)
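For a reasonably balanced tree this search examines O(log n) nodes on average, versus the O(n) distance computations of the brute-force classifier above, although the advantage degrades as the number of dimensions grows.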

k-NN example: predicting rental prices
https://blog.csdn.net/Arwen_H/article/details/81978432
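The script below loads an Airbnb-style listings.csv, keeps eight of its columns, and fits scikit-learn's KNeighborsRegressor to predict a listing's price from the other seven features.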

import pandas as pd  # pandas for data handling
from sklearn.neighbors import KNeighborsRegressor  # k-NN regression model
from sklearn.metrics import mean_squared_error  # mean squared error metric

dc_listings = pd.read_csv(r'C:\Users\宋益东\Downloads\listings.csv')  # load the data; adjust the path to your own file
features = ['accommodates','bedrooms','bathrooms','beds','price','minimum_nights','maximum_nights','number_of_reviews']  # the dataset is large, so keep only these 8 columns
dc_listings = dc_listings[features]  # select the needed columns, overwriting the original frame
dc_listings.head()  # take a first look at the data
dc_listings['price'] = dc_listings.price.str.replace(r'\$|,', '', regex=True).astype(float)  # strip '$' and ',' and convert price to float
dc_listings = dc_listings.dropna()  # drop rows with missing values
normalized_listings = dc_listings.copy()  # work on a copy of the data

norm_train_df = normalized_listings.iloc[0:2792]  # training set: the first 2792 samples
norm_test_df = normalized_listings.iloc[2792:]  # test set: the remaining 879 samples
cols = ['accommodates','bedrooms','bathrooms','beds','minimum_nights','maximum_nights','number_of_reviews']  # feature columns (everything except price)
knn = KNeighborsRegressor(n_neighbors=10)  # set k to 10 by hand; everything else stays at its default
knn.fit(norm_train_df[cols], norm_train_df['price'])  # X: training features, y: the target price
predictions = knn.predict(norm_test_df[cols])  # predict prices for the test set
mse = mean_squared_error(norm_test_df['price'], predictions)
rmse = mse ** 0.5
print(rmse)  # validation error; tune the neighbor count based on this figure
print(knn.predict(pd.DataFrame([[1, 3, 3, 3, 1, 30, 0]], columns=cols)))  # describe your own listing and predict its price
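One caveat: despite the name normalized_listings, the script never actually normalizes the features, so the distance computation is dominated by wide-range columns such as maximum_nights. A minimal sketch of adding standardization with scikit-learn's StandardScaler, fitted on the training split only (how you would wire it into the script above is an assumption):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_X = scaler.fit_transform(norm_train_df[cols])  # learn mean/std from the training set only
test_X = scaler.transform(norm_test_df[cols])        # reuse the training statistics on the test set
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(train_X, norm_train_df['price'])
print(mean_squared_error(norm_test_df['price'], knn.predict(test_X)) ** 0.5)  # RMSE on standardized features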