
Commit 0d01a4a

Added one of the most important machine learning algorithms
K-means clustering is implemented using TensorFlow, Google's widely used and rapidly growing machine learning library.
1 parent 3770551 commit 0d01a4a

1 file changed (+141, -0 lines)
@@ -0,0 +1,141 @@
import tensorflow as tf
from random import choice, shuffle
from numpy import array


def TFKMeansCluster(vectors, noofclusters):
    """
    K-Means Clustering using TensorFlow.
    'vectors' should be a n*k 2-D NumPy array, where n is the number
    of vectors of dimensionality k.
    'noofclusters' should be an integer.
    """

    noofclusters = int(noofclusters)
    assert noofclusters < len(vectors)

    #Find out the dimensionality
    dim = len(vectors[0])

    #Will help select random centroids from among the available vectors
    vector_indices = list(range(len(vectors)))
    shuffle(vector_indices)

    #GRAPH OF COMPUTATION
    #We initialize a new graph and set it as the default during each run
    #of this algorithm. This ensures that as this function is called
    #multiple times, the default graph doesn't keep getting crowded with
    #unused ops and Variables from previous function calls.

    graph = tf.Graph()

    with graph.as_default():

        #SESSION OF COMPUTATION

        sess = tf.Session()

        ##CONSTRUCTING THE ELEMENTS OF COMPUTATION

        ##First lets ensure we have a Variable vector for each centroid,
        ##initialized to one of the vectors from the available data points
        centroids = [tf.Variable((vectors[vector_indices[i]]))
                     for i in range(noofclusters)]
        ##These nodes will assign the centroid Variables the appropriate
        ##values
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = []
        for centroid in centroids:
            cent_assigns.append(tf.assign(centroid, centroid_value))

        ##Variables for cluster assignments of individual vectors (initialized
        ##to 0 at first)
        assignments = [tf.Variable(0) for i in range(len(vectors))]
        ##These nodes will assign an assignment Variable the appropriate
        ##value
        assignment_value = tf.placeholder("int32")
        cluster_assigns = []
        for assignment in assignments:
            cluster_assigns.append(tf.assign(assignment,
                                             assignment_value))

        ##Now lets construct the node that will compute the mean
        #The placeholder for the input
        mean_input = tf.placeholder("float", [None, dim])
        #The Node/op takes the input and computes a mean along the 0th
        #dimension, i.e. the list of input vectors
        mean_op = tf.reduce_mean(mean_input, 0)

        ##Node for computing Euclidean distances
        #Placeholders for input
        v1 = tf.placeholder("float", [dim])
        v2 = tf.placeholder("float", [dim])
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(
            v1, v2), 2)))

        ##This node will figure out which cluster to assign a vector to,
        ##based on Euclidean distances of the vector from the centroids.
        #Placeholder for input
        centroid_distances = tf.placeholder("float", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)

        ##INITIALIZING STATE VARIABLES

        ##This will help initialization of all Variables defined with respect
        ##to the graph. The Variable-initializer should be defined after
        ##all the Variables have been constructed, so that each of them
        ##will be included in the initialization.
        init_op = tf.initialize_all_variables()

        #Initialize all variables
        sess.run(init_op)

        ##CLUSTERING ITERATIONS

        #Now perform the Expectation-Maximization steps of K-Means clustering
        #iterations. To keep things simple, we will only do a set number of
        #iterations, instead of using a Stopping Criterion.
        noofiterations = 100
        for iteration_n in range(noofiterations):

            ##EXPECTATION STEP
            ##Based on the centroid locations till last iteration, compute
            ##the _expected_ centroid assignments.
            #Iterate over each vector
            for vector_n in range(len(vectors)):
                vect = vectors[vector_n]
                #Compute Euclidean distance between this vector and each
                #centroid. Remember that this list cannot be named
                #'centroid_distances', since that is the input to the
                #cluster assignment node.
                distances = [sess.run(euclid_dist, feed_dict={
                    v1: vect, v2: sess.run(centroid)})
                    for centroid in centroids]
                #Now use the cluster assignment node, with the distances
                #as the input
                assignment = sess.run(cluster_assignment, feed_dict={
                    centroid_distances: distances})
                #Now assign the value to the appropriate state variable
                sess.run(cluster_assigns[vector_n], feed_dict={
                    assignment_value: assignment})

            ##MAXIMIZATION STEP
            #Based on the expected state computed from the Expectation Step,
            #compute the locations of the centroids so as to maximize the
            #overall objective of minimizing within-cluster Sum-of-Squares
            for cluster_n in range(noofclusters):
                #Collect all the vectors assigned to this cluster
                assigned_vects = [vectors[i] for i in range(len(vectors))
                                  if sess.run(assignments[i]) == cluster_n]
                #Compute new centroid location
                new_location = sess.run(mean_op, feed_dict={
                    mean_input: array(assigned_vects)})
                #Assign value to appropriate variable
                sess.run(cent_assigns[cluster_n], feed_dict={
                    centroid_value: new_location})

        #Return centroids and assignments
        centroids = sess.run(centroids)
        assignments = sess.run(assignments)
        return centroids, assignments

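Usage note: the committed function targets the pre-1.0 TensorFlow API (tf.Session, tf.placeholder, tf.sub, tf.initialize_all_variables), so it needs an old TensorFlow release to run as written. Below is a minimal, hypothetical usage sketch that is not part of the committed file; the data size (50 points in 2-D) and the cluster count of 3 are illustrative assumptions.

#Hypothetical usage sketch (not part of the committed file); assumes an old
#TensorFlow release where the API used above is still available.
import numpy as np

#50 random 2-D points, grouped into 3 clusters
data = np.random.rand(50, 2)
centroids, assignments = TFKMeansCluster(data, 3)

print("Centroids:")
print(centroids)
print("Cluster of each of the first 10 points:", assignments[:10])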
0 commit comments
