@zsh-o
2018-08-14T10:03:21.000000Z
《统计学习方法》
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
epsilon = 1e-5
import pydot
from IPython.display import Image, display
def viewPydot(pdot):
    ## render a pydot graph inline in the notebook
    img = Image(pdot.create_png())  ## renamed from `plt` to avoid shadowing pyplot
    display(img)
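The kNN sketch below scores training points with the Minkowski ($L_p$) distance

$$L_p(x_i, x_j) = \left( \sum_{l=1}^{n} \left| x_i^{(l)} - x_j^{(l)} \right|^p \right)^{1/p}$$

where $p=1$ gives the Manhattan distance, $p=2$ the Euclidean distance, and $p=\infty$ the Chebyshev distance $\max_l |x_i^{(l)} - x_j^{(l)}|$; the prediction is a majority vote over the $k$ nearest samples.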
def kNN(x, X, Y, k, p):  ## labels in Y take values 0..class_N-1
    m, n = X.shape
    class_N = np.max(Y) + 1
    diffs = np.abs(np.tile(x, (m, 1)) - X)
    if np.isinf(p):
        ## L_inf (Chebyshev) distance: maximum coordinate difference
        diss = np.max(diffs, axis=1)
    else:
        ## L_p (Minkowski) distance
        diss = np.power(np.sum(np.power(diffs, p), axis=1), 1. / p)
    args = np.argsort(diss)[:k]  ## indices of the k nearest samples
    countY = np.zeros(class_N)
    for index in args:
        countY[int(Y[index])] += 1  ## vote by class label, not by sample index
    return np.argmax(countY)
X = np.array([
    [5, 1],
    [4, 4],
])
Y = np.array([
    [0],
    [1],
])
for i in range(1, 6):
    print("%d -> %d" % (i, kNN(x=np.array([1, 1]), X=X, Y=Y, k=1, p=i)))
1 -> 0
2 -> 0
3 -> 1
4 -> 1
5 -> 1
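This matches example 3.1 of 《统计学习方法》: the $L_p$ distance from $x=(1,1)$ to $(5,1)$ is $4$ for every $p$ (only one coordinate differs), while the distance to $(4,4)$ is $(3^p+3^p)^{1/p} = 3 \cdot 2^{1/p}$, i.e. $6, 4.24, 3.78, 3.57, \ldots$ for $p=1,2,3,4$; it drops below $4$ at $p=3$, which is exactly where the predicted class flips from $0$ to $1$.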
《西瓜书》 gives a performance analysis of the nearest-neighbor rule; here "nearest neighbor" means 1-NN, i.e. the class of the single closest training sample is used as the class of the input.

In the large-sample limit (i.i.d. samples, with the sample size going to infinity), the generalization error of the nearest-neighbor rule is at most twice the error rate of the Bayes optimal classifier (the classifier with all probability relations known).

Given a test sample $x$ whose nearest neighbor is $z$, the nearest-neighbor classifier errs exactly when the label of $z$ differs from that of $x$, so

$$P(err) = 1 - \sum_{c} P(c \mid x) P(c \mid z)$$

Since the samples are i.i.d. and unlimited in number, for any $x$ and any arbitrarily small positive $\delta$, a training sample can always be found within distance $\delta$ of $x$; in plain terms, with infinitely many i.i.d. samples, a sample and its nearest neighbor become arbitrarily close, so $P(c \mid z) \simeq P(c \mid x)$. Letting $c^* = \arg\max_{c} P(c \mid x)$ denote the prediction of the Bayes optimal classifier,

$$P(err) = 1 - \sum_{c} P(c \mid x) P(c \mid z) \simeq 1 - \sum_{c} P^2(c \mid x) \le 1 - P^2(c^* \mid x) = \bigl(1 + P(c^* \mid x)\bigr)\bigl(1 - P(c^* \mid x)\bigr) \le 2\bigl(1 - P(c^* \mid x)\bigr)$$
There is a further inequality for the nearest-neighbor rule, from the exercises in 《西瓜书》, which refines the bound above. Let $R$ denote the expectation of the error rate taken over the whole training set of $n$ samples and $R^*$ the Bayes error; rearranging the derivation above more carefully (the classical Cover–Hart form, for $c$ classes) gives

$$R^* \le R \le R^* \left( 2 - \frac{c}{c-1} R^* \right)$$
A = np.array([
    [2, 3],
    [5, 4],
    [9, 6],
    [4, 7],
    [8, 1],
    [7, 2]
])
def D(X):
    ## per-dimension variance, used to pick the splitting axis
    return np.mean(np.power(X - np.mean(X, axis=0), 2), axis=0)
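As a quick check, the per-dimension variances of the sample data come out to roughly $(5.81, 4.47)$, so the first split is on dimension 0:

D(A)
array([5.80555556, 4.47222222])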
class Node(object):
    def __init__(self, p=None, dim=None, lc=None, rc=None):
        self.p = p        ## the point stored at this node
        self.dim = dim    ## the dimension this node splits on
        self.lc = lc      ## left child: points below p along dim
        self.rc = rc      ## right child: points above p along dim
        self.visited = False
def kd_Tree(X):
    if len(X) == 0:
        return None
    N = X.shape[0]
    ## pick the splitting dimension: the one with the largest variance
    Ds = D(X)
    dim = np.argmax(Ds)
    ## sort along that dimension and store the median point
    ## (the upper median for even N) at this node
    X = np.array(sorted(X, key=lambda x: x[dim]))
    root = Node()
    root.p = X[int(N / 2)]
    root.dim = dim
    root.lc = kd_Tree(X[:int(N / 2)])
    root.rc = kd_Tree(X[int(N / 2) + 1:])
    return root
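For the sample data A this reproduces the kd-tree of example 3.2 in 《统计学习方法》: the root is (7,2) splitting on dimension 0; its left child is (5,4) splitting on dimension 1, with leaves (2,3) and (4,7); its right child is (9,6) splitting on dimension 1, with the single leaf (8,1). (The book cycles through the dimensions in order, whereas this code picks the max-variance dimension; for this data the two choices happen to coincide.)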
The search is also straightforward: first descend the tree as in binary search to find the best candidate leaf, then backtrack upward; at each parent, check whether the unvisited subtree on the other side could contain a point closer than the current candidate. The test is whether the distance from the target point to the splitting axis is less than the current minimum distance; if so, that subtree is searched and the candidate is updated.
def kd_search(root, x):
    min_dis = np.inf
    min_node = None
    def _search(t):  ## recursive descent with backtracking
        nonlocal min_dis, min_node
        if t is None:
            return
        ## compare this node's point with the best candidate so far
        dis = np.sqrt(np.sum(np.power(t.p - x, 2)))
        if dis < min_dis:
            min_dis = dis
            min_node = t.p
        ## descend the near side first (the binary-search path), then the
        ## far side, but only if the splitting axis lies closer than the
        ## current minimum distance
        if x[t.dim] < t.p[t.dim]:
            near, far = t.lc, t.rc
        else:
            near, far = t.rc, t.lc
        _search(near)
        if np.abs(x[t.dim] - t.p[t.dim]) < min_dis:
            _search(far)
        t.visited = True
    _search(root)
    return min_node, min_dis
tree = kd_Tree(A)
kd_search(tree, np.array([4,3]))
(array([5, 4]), 1.4142135623730951)
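To validate the kd-tree search, compare it against a brute-force linear scan over all points; a minimal sketch (the helper name brute_nn is mine, not from the book):

def brute_nn(X, x):
    ## linear scan: compute every distance and take the minimum
    diss = np.sqrt(np.sum(np.power(X - x, 2), axis=1))
    i = np.argmin(diss)
    return X[i], diss[i]

brute_nn(A, np.array([4, 3]))
(array([5, 4]), 1.4142135623730951)

Both agree on the nearest neighbor (5,4) at distance $\sqrt{2}$.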
dot = pydot.Dot()
level = 1

def create_dot(p):
    global level
    ## each node label is "level # split-dim, point"
    p_name = "%d # %d, %s" % (level, p.dim, str(p.p))
    dot.add_node(pydot.Node(name=p_name))
    if p.lc is not None:
        level = level + 1
        c = p.lc
        c_name = "%d # %d, %s" % (level, c.dim, str(c.p))
        dot.add_edge(pydot.Edge(dst=c_name, src=p_name))
        create_dot(c)
        level = level - 1
    if p.rc is not None:
        level = level + 1
        c = p.rc
        c_name = "%d # %d, %s" % (level, c.dim, str(c.p))
        dot.add_edge(pydot.Edge(dst=c_name, src=p_name))
        create_dot(c)
        level = level - 1

create_dot(tree)
viewPydot(dot)