[MATLAB] k-means演算法
k-means又稱c-means Clustering,k表示群集的數量,演算法如下
- 給定一資料集S,選擇k個點當群集中心(群心)。
- 計算每一資料與各群心距離,資料歸類在與之最短距離的群心那群。
- 歸類好的資料再算出一個新的群心(通常用平均)。
- 看新的群心與舊的群心位置是否接近或者固定。
- 重複2~4直到4成立,則分群完畢。
虛擬碼為:
Input : 群數k, 群心Centers of clusters(kC)
Output:分群結果team
Subroutine:算距離distFunc, 算新的群心re_center, 分群clustering
Input : 群數k, 群心Centers of clusters(kC)
Output:分群結果team
Subroutine:算距離distFunc, 算新的群心re_center, 分群clustering
main k_means(Database, k, kC, distFunc, re_center, k_clustering)
While 1
分群結果 team = clustering(Database, kC, distFunc)
重新找群心 NewkC = re_center(Database, team, k)
If 新的群心NewkC和舊群心kC差距可接受 Then //差距可接受代表分類完成
kC = NewkC //更新群心
break;
Else //還沒找到滿意的群心就繼續找
kC = NewkC //更新群心
End If
End While 1
return team, kC //回傳資料群集編號和群心
End k_means
sub re_center(Database, team, k)
For each Cluster of k
新群心NewkC = 屬於第team(Cluster)組的資料取算術平均數為新的群心
End For Cluster
return NewkC
End re_center
sub clustering(Database, kC)
tempDist = ? //距離暫存器,?要比所有可能的距離都大
For each P of Database
For each C of kC
距離dist = distFunc(C, P)
If dist < tempDist Then //紀錄哪個群心最近
tempDist = dist
FLAG = C
End If
End For C
將tempDist 變回 ?
team(P) = FLAG //P點的分類完成
End For P
return team
End clustering
MATLAB code:
function [team kx ky]= k_means(x, y, kx, ky, seed_num)
while 1
team = k_Clustering( x, y, kx, ky);% 分群
[nkx, nky] = k_re_center(x, y, team, seed_num); %更新新的群心
if ( sum(kx == nkx) == seed_num ) && ( sum(ky == nky) == seed_num) %新的群心是否跟舊的一樣
kx = nkx;
ky = nky;
break; %一樣的話就跳出
else %不一樣就繼續
kx = nkx;
ky = nky;
end
end
function team = k_Clustering(x, y, kx, ky)
% This is for K-Clustering
% (x,y) are dataset and (kx, ky) are Cluster-center
mid_dis = 9999999999; %距離的暫存器
for i = 1 : length(x); %總共要判斷length(x)個資料
for j = 1 : length(kx); %總共有length(kx)個群集中心
dist = k_distFunc( [x(i) y(i)], [kx(j) ky(j)]); %計算第i個資料和第j個群集中心的距離
if dist < mid_dis %判斷距離哪個群集中心較近
mid_dis = dist; %更新距離的暫存器
FLAG = j; %紀錄現在距離哪個群集中心最近
end
end
mid_dis = 9999999999; %距離的暫存器變回初始值
team(i,1) = FLAG; %第i個資料屬於第FLAG個群集
end
function [kx, ky] = k_re_center( x, y, team, seed_num)
% re-find clustered data for new cluster center
for i = 1 : seed_num
kx(i) = sum(x(team == i )) / sum(team == i);
ky(i) = sum(y(team == i )) / sum(team == i);
end
function D = k_distFunc(P1, P2)
% This is for finding Distance Between 2D Points
% Input can be an vector
D = norm(P1 - P2);
留言
張貼留言