

对于knn来说,有两个hyperparameters(超参数:choices about the algorithm that we set rather than learn. Very problem-dependent, must try them all out and see what works best.),其一是怎么选取distance metric,其二是怎么选取k。

这儿说两种distance metric,一种是Manhattan metric, 也叫L1,另一种是Euclidean metric, 即L2。

这两种距离各有适用的场景,目前自己也在学习阶段,并不是特别清楚,但老师说,L1距离依赖于坐标系的选取(coordinate dependent):旋转坐标轴会改变L1距离,而L2距离不受影响。







                3、计算test instance与training set的distance


                5、k个邻居进行投票(这儿只讨论binary knn)


from sklearn.datasets import load_iris
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
from operator import itemgetter
import numpy as np
import math

# 1) given two data points, calculate the euclidean distance between them
def get_distance(data1, data2):
    """Return the Euclidean (L2) distance between two points.

    The points are walked in lockstep with ``zip``, so if they differ in
    length the extra trailing coordinates of the longer one are ignored.
    """
    squared_total = 0
    for coord_a, coord_b in zip(data1, data2):
        delta = coord_a - coord_b
        squared_total += delta ** 2
    return math.sqrt(squared_total)

# 2) given a training set and a test instance, use get_distance to calculate all pairwise distances and return the k nearest training instances
def get_neighbours(training_set, test_instance, k):
    """Return the k training instances nearest to ``test_instance``.

    Each training instance is paired with its distance to the test
    instance, the pairs are ordered by ascending distance (ties keep
    their original order), and the first k instances are returned
    without their distances.
    """
    scored = []
    for candidate in training_set:
        # each entry is (training_instance, distance_to_test_instance)
        scored.append(_get_tuple_distance(candidate, test_instance))
    # order by the distance stored at index 1 of each pair
    scored.sort(key=lambda pair: pair[1])
    # drop the distances, keeping only the training instances
    ranked_instances = [pair[0] for pair in scored]
    return ranked_instances[:k]

def _get_tuple_distance(training_instance, test_instance):
    """Pair a training instance with its distance to the test instance.

    Index 0 of ``training_instance`` is used as its coordinates. Returns
    the tuple ``(training_instance, distance)``.
    """
    coordinates = training_instance[0]
    distance = get_distance(test_instance, coordinates)
    return training_instance, distance

def get_majority_vote(neighbours):
    """Return the class that occurs most often among ``neighbours``.

    The class of each neighbour is read from index 1. When several
    classes share the highest count, the one encountered first wins.
    Raises ``IndexError`` if ``neighbours`` is empty.
    """
    tally = Counter()
    for neighbour in neighbours:
        # index 1 is the class
        tally[neighbour[1]] += 1
    winner = tally.most_common(1)[0]
    return winner[0]

def main():
    """Classify the iris test split with k-nearest-neighbours and report results.

    Loads the iris dataset, holds out 40% of it as a test set, predicts
    each test instance by majority vote among its k nearest training
    instances, and prints the overall accuracy plus a per-class
    classification report.
    """
    # load the data and create the training and test sets
    # random_state=1 is just a seed so that the train/test split is reproducible
    iris = load_iris()
    # sklearn.cross_validation was removed in scikit-learn 0.20, so use the
    # train_test_split already imported from sklearn.model_selection.
    # NOTE: the module-level `from sklearn import cross_validation` import
    # fails on scikit-learn >= 0.20 and should be removed.
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=1)
    # reformat train/test datasets for convenience: [(features, label), ...]
    # Plain lists are used because recent NumPy releases refuse to build an
    # ndarray from these ragged (feature-vector, scalar) pairs.
    train = list(zip(X_train, y_train))
    test = list(zip(X_test, y_test))
    # e.g. the entries look like (sample pasted in the original notes):
    #   (array([5.8, 2.8, 5.1, 2.4]), 2),
    #   (array([6. , 2.2, 4. , 1. ]), 1),
    #   (array([5.5, 4.2, 1.4, 0.2]), 0), ...

    # generate predictions
    predictions = []
    # let's arbitrarily set k equal to 5, meaning that the 5 nearest
    # neighbours vote on the class of each new instance
    k = 5
    # for each instance in the test set, get nearest neighbours and majority vote on predicted class
    for x, (features, actual_label) in enumerate(test):
        print('Classifying test instance number ',str(x),":")
        neighbours = get_neighbours(training_set=train, test_instance=features, k=k)
        majority_vote = get_majority_vote(neighbours)
        # record the prediction so the metrics below have something to score
        predictions.append(majority_vote)
        print('Predicted label=',str(majority_vote),', Actual label=',str(actual_label))
    # summarize performance of the classification
    print('The overall accuracy of the model is: ',accuracy_score(y_test, predictions))
    report = classification_report(y_test, predictions, target_names = iris.target_names)
    print('A detailed classification report: \n\n',report)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

