From bf9e2b0400698ba7f623f14fe1ed00f58e3b2e31 Mon Sep 17 00:00:00 2001 From: Parth Paradkar Date: Sat, 5 Oct 2019 12:25:32 +0530 Subject: [PATCH 1/3] Pure implementation of KNN added --- machine_learning/k_nearest_neighbours.py | 41 ++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 machine_learning/k_nearest_neighbours.py diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py new file mode 100644 index 000000000000..307afdedbab6 --- /dev/null +++ b/machine_learning/k_nearest_neighbours.py @@ -0,0 +1,41 @@ +import numpy as np +from collections import Counter +from sklearn import datasets +from sklearn.model_selection import train_test_split + +data = datasets.load_iris() + +# print(data) + +X = np.array(data['data']) +y = np.array(data['target']) +classes = data['target_names'] + +X_train, X_test, y_train, y_test = train_test_split(X, y) + +def euclidean_distance(a, b): + """ + Gives the euclidean distance between two points + >>> euclidean_distance([0, 0], [3, 4]) + 5.0 + >>> euclidean_distance([1, 2, 3], [1, 8, 11]) + 10.0 + """ + return np.linalg.norm(np.array(a) - np.array(b)) + +def classifier(train_data, train_target, classes, point, k=5): + """ + Classifies the point using the KNN algorithm + k closest points are found (ranked in ascending order of euclidean distance) + """ + data = zip(train_data, train_target) + distances = [] + for data_point in data: + distance = euclidean_distance(data_point[0], point) + distances.append((distance, data_point[1])) + votes = [i[1] for i in sorted(distances)[:k]] + result = Counter(votes).most_common(1)[0][0] + return classes[result] + +if __name__ == "__main__": + print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4])) From 0ae6ccd9a8dc9932cc438017ce510ad118e1097a Mon Sep 17 00:00:00 2001 From: Parth Paradkar Date: Sat, 5 Oct 2019 18:42:54 +0530 Subject: [PATCH 2/3] Comments and test case added --- machine_learning/k_nearest_neighbours.py | 6 
+++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 307afdedbab6..7c80d07e7bc2 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -29,13 +29,17 @@ def classifier(train_data, train_target, classes, point, k=5): k closest points are found (ranked in ascending order of euclidean distance) """ data = zip(train_data, train_target) + # List of distances of all points from the point to be classified distances = [] for data_point in data: distance = euclidean_distance(data_point[0], point) distances.append((distance, data_point[1])) + # Choosing 'k' points with the least distances. votes = [i[1] for i in sorted(distances)[:k]] + # Most common class occuring among them is chosen to be the class into which the point is classified result = Counter(votes).most_common(1)[0][0] return classes[result] + if __name__ == "__main__": - print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4])) + print(classifier([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]], [0, 0, 0, 0, 1, 1, 1], ['A', 'B'], [1.2, 1.2])) From 42b37adac88d0814b6823797540f32deff95a4ae Mon Sep 17 00:00:00 2001 From: Parth Paradkar Date: Sat, 5 Oct 2019 19:49:59 +0530 Subject: [PATCH 3/3] doctest added --- machine_learning/k_nearest_neighbours.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 7c80d07e7bc2..83d8399fe9b6 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -5,8 +5,6 @@ data = datasets.load_iris() -# print(data) - X = np.array(data['data']) y = np.array(data['target']) classes = data['target_names'] @@ -27,6 +25,17 @@ def classifier(train_data, train_target, classes, point, k=5): """ Classifies the point using the KNN algorithm k closest points are 
found (ranked in ascending order of euclidean distance) + Params: + :train_data: Set of points that are classified into two or more classes + :train_target: List of classes in the order of train_data points + :classes: Labels of the classes + :point: The data point that needs to be classified + + >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] + >>> y_train = [0, 0, 0, 0, 1, 1, 1] + >>> classes = ['A','B']; point = [1.2,1.2] + >>> classifier(X_train, y_train, classes,point) + 'A' """ data = zip(train_data, train_target) # List of distances of all points from the point to be classified @@ -36,10 +45,11 @@ def classifier(train_data, train_target, classes, point, k=5): distances.append((distance, data_point[1])) # Choosing 'k' points with the least distances. votes = [i[1] for i in sorted(distances)[:k]] - # Most common class occuring among them is chosen to be the class into which the point is classified + # Most commonly occurring class among them + # is the class into which the point is classified result = Counter(votes).most_common(1)[0][0] return classes[result] if __name__ == "__main__": - print(classifier([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]], [0, 0, 0, 0, 1, 1, 1], ['A', 'B'], [1.2, 1.2])) + print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4])) \ No newline at end of file