python - Kmeans clustering with heatmaps -


just wondering how go using k means clustering data set? restricted using packages or modules. https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv

this data set training one

https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/cancer.csv

have been trying solve while, tried couple things none of them seemed have worked. no code required if give me general thought process solve i'd grateful.

this current way of thinking. i'm trying put data heatmap current thought process first randomly choose centers. create list of lists each point distance each center. find index of minimum distance each point each center. create data frame of same size data set , fill each index each element index of center point closest to. recompute center taking mean of points same center index repeat process multiple times until index data frame not change. create new data frame , add points have same center point close in frame. create heatmap.

this did not seem work though. wondering, on right track or off, , if on right track parts need change in order fix issue. if not please point me on right track.

here code at

import matplotlib.pyplot plt import numpy np import pandas pd import math import random #%matplotlib inline  def truncate(f, n):     return math.floor(f * 10 ** n) / 10 ** n  def choosecenter(data, centers):     length = data.shape     cent = []     while len(cent) < centers :         x = random.randrange(0,length[0])         y = random.randrange(0,length[1])         if data.iloc[x][y] not in cent:             d = truncate(data.iloc[x][y],2)             cent.append(d)     return cent   def distance(val, center):     return math.sqrt((val- center)**2)   def getdistances(centers, data):     length = data.shape     dist = []     in range(length[0]):         j in range(length[1]):             y = []             k in range(len(centers)):                 val = distance(data.iloc[i][j], centers[k])                  y.append(truncate(val,3))             dist.append(y)     return dist  def findclosest(data, dist):     close = data.copy()     length = close.shape     indexes = []     in range(len(dist)):         pt = min(dist[i])         idx = dist[i].index(pt)         indexes.append(idx)     #print(indexes)     length = data.shape     n = np.array(indexes)     n = pd.dataframe(np.reshape(n, (length[0],length[1])))     #reshape data frame same shape data     #keep running find closest until there no change     #try heatmap on this?     #this should cluster it, make sure test     #might need tweaking      return n #    in range(length[0]): #        j in range(length[1]): #            print('dist[i]', dist[j]) #            pt = min(dist[j]) #            print(pt) #            idx = dist[j].index(pt) #            close.iloc[i][j] = int(idx)     #return close  def computenewcenter(data, close):     d = dict()     in range(len(close)):         j in range(len(close[0])):             d[close.iloc[i][j]] = []      in range(len(data)):         j in range(len(data[0])):             if close.iloc[i][j] in d:                 d[close.iloc[i][j]].append(data.iloc[i][j])     newcenters = []      key, value in d.items():         m = np.mean(value)         newcenters.append(truncate(m, 3))      return newcenters #    lst = [[] * numcenters] #    in range(len(close)): #        j in range(len(close[0])): #            if close.iloc[i][j]   def main():     data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv',  header=none))     data = data.t     #print(data)     df = pd.dataframe(data[1:], columns=data[0], dtype=float).t     df = df.iloc[::-1] #    print(df) #    print(df.iloc[1][9]) #    print(df) #    print(df.iloc[0][1]) #    heatmap = plt.pcolor(df, cmap=plt.cm.bwr) #    plt.colorbar(heatmap)     c = choosecenter(df, 3)     print(c)     #print(len(c))     dist = getdistances(c, df)     #print(dist)     y = findclosest(df, dist) #    q = [] #    in range(len(c)): #        q.append([]) #    #print(q)     j = computenewcenter(df, y)     #print(j)     length = df.shape     oldframe = pd.dataframe(np.ndarray((length[0],length[1])))     oldframe = oldframe.fillna(0)     ct=0     while y.equals(oldframe) == false:         ct+=1         oldframe = y.copy()         c = computenewcenter(df, oldframe)         #print(c)         dist = getdistances(c, df)         #print(dist)         y = findclosest(df, dist)         #print(y)     #plt.pcolor(df, cmap=plt.cm.bwr)      l = []     in range(len(y)):         j in range(len(y[0])):             if y.iloc[i][j] == 1:                 l.append(df.iloc[i][j])      in range(len(y)):         j in range(len(y[0])):             if y.iloc[i][j] == 2:                 l.append(df.iloc[i][j])     in range(len(y)):         j in range(len(y[0])):             if y.iloc[i][j] == 0:                  l.append(df.iloc[i][j])       l = np.ndarray((length[0],length[1]))     l = pd.dataframe(l)     print(l)     hm = plt.pcolor(l, cmap=plt.cm.bwr)     plt.colorbar(hm)     #    print(y) #    print(c) #    print(ct)     #plt.pcolor(y, cmap=plt.cm.bwr)   if __name__ == '__main__':     main() 

thank reading


Comments

Popular posts from this blog

ZeroMQ on Windows, with Qt Creator -

unity3d - Unity SceneManager.LoadScene quits application -

python - Error while using APScheduler: 'NoneType' object has no attribute 'now' -