python - Kmeans clustering with heatmaps -
just wondering how go using k means clustering data set? restricted using packages or modules. https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
this data set training one
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/cancer.csv
have been trying solve while, tried couple things none of them seemed have worked. no code required if give me general thought process solve i'd grateful.
this current way of thinking. i'm trying put data heatmap current thought process first randomly choose centers. create list of lists each point distance each center. find index of minimum distance each point each center. create data frame of same size data set , fill each index each element index of center point closest to. recompute center taking mean of points same center index repeat process multiple times until index data frame not change. create new data frame , add points have same center point close in frame. create heatmap.
this did not seem work though. wondering, on right track or off, , if on right track parts need change in order fix issue. if not please point me on right track.
here code at
import matplotlib.pyplot plt import numpy np import pandas pd import math import random #%matplotlib inline def truncate(f, n): return math.floor(f * 10 ** n) / 10 ** n def choosecenter(data, centers): length = data.shape cent = [] while len(cent) < centers : x = random.randrange(0,length[0]) y = random.randrange(0,length[1]) if data.iloc[x][y] not in cent: d = truncate(data.iloc[x][y],2) cent.append(d) return cent def distance(val, center): return math.sqrt((val- center)**2) def getdistances(centers, data): length = data.shape dist = [] in range(length[0]): j in range(length[1]): y = [] k in range(len(centers)): val = distance(data.iloc[i][j], centers[k]) y.append(truncate(val,3)) dist.append(y) return dist def findclosest(data, dist): close = data.copy() length = close.shape indexes = [] in range(len(dist)): pt = min(dist[i]) idx = dist[i].index(pt) indexes.append(idx) #print(indexes) length = data.shape n = np.array(indexes) n = pd.dataframe(np.reshape(n, (length[0],length[1]))) #reshape data frame same shape data #keep running find closest until there no change #try heatmap on this? #this should cluster it, make sure test #might need tweaking return n # in range(length[0]): # j in range(length[1]): # print('dist[i]', dist[j]) # pt = min(dist[j]) # print(pt) # idx = dist[j].index(pt) # close.iloc[i][j] = int(idx) #return close def computenewcenter(data, close): d = dict() in range(len(close)): j in range(len(close[0])): d[close.iloc[i][j]] = [] in range(len(data)): j in range(len(data[0])): if close.iloc[i][j] in d: d[close.iloc[i][j]].append(data.iloc[i][j]) newcenters = [] key, value in d.items(): m = np.mean(value) newcenters.append(truncate(m, 3)) return newcenters # lst = [[] * numcenters] # in range(len(close)): # j in range(len(close[0])): # if close.iloc[i][j] def main(): data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=none)) data = data.t #print(data) df = pd.dataframe(data[1:], columns=data[0], dtype=float).t df = df.iloc[::-1] # print(df) # print(df.iloc[1][9]) # print(df) # print(df.iloc[0][1]) # heatmap = plt.pcolor(df, cmap=plt.cm.bwr) # plt.colorbar(heatmap) c = choosecenter(df, 3) print(c) #print(len(c)) dist = getdistances(c, df) #print(dist) y = findclosest(df, dist) # q = [] # in range(len(c)): # q.append([]) # #print(q) j = computenewcenter(df, y) #print(j) length = df.shape oldframe = pd.dataframe(np.ndarray((length[0],length[1]))) oldframe = oldframe.fillna(0) ct=0 while y.equals(oldframe) == false: ct+=1 oldframe = y.copy() c = computenewcenter(df, oldframe) #print(c) dist = getdistances(c, df) #print(dist) y = findclosest(df, dist) #print(y) #plt.pcolor(df, cmap=plt.cm.bwr) l = [] in range(len(y)): j in range(len(y[0])): if y.iloc[i][j] == 1: l.append(df.iloc[i][j]) in range(len(y)): j in range(len(y[0])): if y.iloc[i][j] == 2: l.append(df.iloc[i][j]) in range(len(y)): j in range(len(y[0])): if y.iloc[i][j] == 0: l.append(df.iloc[i][j]) l = np.ndarray((length[0],length[1])) l = pd.dataframe(l) print(l) hm = plt.pcolor(l, cmap=plt.cm.bwr) plt.colorbar(hm) # print(y) # print(c) # print(ct) #plt.pcolor(y, cmap=plt.cm.bwr) if __name__ == '__main__': main()
thank reading
Comments
Post a Comment