Modify thecodefrom cluster.py on eClass to implement bisecting k-means , which i

ID: 3603544 • Letter: M

Question

Modify thecodefrom cluster.py on eClass to implement bisecting k-means, which is an extension of the basic k-means algorithm. Start with one cluster containing all data points, and then repeatedly split the largest cluster into two (k-means with k = 2) until you have the desired number of clusters. Use the program visual.py to see the results

import math
import random

def euclidD(point1, point2):
sum = 0
for index in range(len(point1)):
diff = (point1[index]-point2[index]) ** 2
sum = sum + diff

return math.sqrt(sum)

def manhattanD(point1, point2):
sum = 0
for i in range(len(point1)):
diff = abs(point1[i]-point2[i])
sum = sum + diff

return sum

def createCentroids(k, datadict):
centroids=[]
centroidCount = 0
centroidKeys = []
random.seed(68)

keys = list(datadict.keys())

while centroidCount < k:
rkey = random.randint(0,len(datadict))
if rkey not in centroidKeys:
centroids.append(datadict[keys[rkey]])
centroidKeys.append(rkey)
centroidCount = centroidCount + 1

return centroids

def showCentroids(centroids):
print("CENTROIDS", end = " ")
for cent in centroids:
#print("%4.1f" % (cent[0]), end = " ")
print(cent, end = " ")
print()

def showClusters(clusters, datadict):
for c in clusters:
print ("CLUSTER", end = " ")
for key in c:
print(datadict[key], end=" ")
print()

def assignPointsToClusters(centroids,datadict):
clusters = []
k = len(centroids)
for i in range(k):
clusters.append([])
for akey in datadict:
distances = []
for clusterIndex in range(k):
dist = euclidD(datadict[akey],centroids[clusterIndex])
distances.append(dist)

mindist = min(distances)
index = distances.index(mindist)
clusters[index].append(akey)
return clusters

def updateCentroids(clusters, centroids,datadict):
dimensions = len(datadict[list(datadict.keys())[0]])
k = len(centroids)
for clusterIndex in range(k):
sums = [0]*dimensions
for akey in clusters[clusterIndex]:
datapoints = datadict[akey]
for ind in range(len(datapoints)):
sums[ind] = sums[ind] + datapoints[ind]
for ind in range(len(sums)):
clusterLen = len(clusters[clusterIndex])
if clusterLen != 0:
sums[ind] = sums[ind]/clusterLen
centroids[clusterIndex] = sums
return centroids

def createClusters(centroids, datadict, repeats):
k = len(centroids)
doAgain = True
apass = 0
while doAgain and apass < repeats:
clusters = []
prevClusters = clusters

clusters = assignPointsToClusters(centroids,datadict)

if clusters == prevClusters:
doAgain = False

centroids = updateCentroids(clusters, centroids, datadict)

apass = apass + 1

return clusters

def readFile(filename):
datafile = open(filename, "r")
datadict = {}

key = 0
for aline in datafile:
key = key + 1
items = aline.split()
datadict[key] = [int(k) for k in items]

return datadict

def main():
datadict = readFile("scores.txt")
numClusters = 5 # number of clusters
dataCentroids = createCentroids(numClusters, datadict)
maxPass = 10
dataClusters = createClusters(dataCentroids, datadict, maxPass)

if __name__ == "__main__":
main()

Explanation / Answer