C語言中K-means算法實現(xiàn)代碼
K-means算法是很典型的基于距離的聚類算法,采用距離作為相似性的評價指標(biāo),即認為兩個對象的距離越近,其相似度就越大。該算法認為簇是由距離靠近的對象組成的,因此把得到緊湊且獨立的簇作為最終目標(biāo)。
算法過程如下:
1)從N個樣本隨機選取K個樣本作為質(zhì)心
2)對剩余的每個樣本測量其到每個質(zhì)心的距離,并把它歸到最近的質(zhì)心的類
3)重新計算已經(jīng)得到的各個類的質(zhì)心
4)迭代2~3步直至新的質(zhì)心與原質(zhì)心相等或小于指定閾值,算法結(jié)束
#include<stdio.h> #include<stdlib.h> #include<string.h> #include<time.h> #include<math.h> #define DIMENSIOM 2 //目前只是處理2維的數(shù)據(jù) #define MAX_ROUND_TIME 100 //最大的聚類次數(shù) typedef struct Item{ int dimension_1; //用于存放第一維的數(shù)據(jù) int dimension_2; //用于存放第二維的數(shù)據(jù) int clusterID; //用于存放該item的cluster center是誰 }Item; Item* data; typedef struct ClusterCenter{ double dimension_1; double dimension_2; int clusterID; }ClusterCenter; ClusterCenter* cluster_center_new; int isContinue; int* cluster_center; //記錄center double* distanceFromCenter; //記錄一個“點”到所有center的距離 int data_size; char filename[200]; int cluster_count; void initial(); void readDataFromFile(); void initial_cluster(); void calculateDistance_ToOneCenter(int itemID, int centerID, int count); void calculateDistance_ToAllCenter(int itemID); void partition_forOneItem(int itemID); void partition_forAllItem_OneCluster(int round); void calculate_clusterCenter(int round); void K_means(); void writeClusterDataToFile(int round); void writeClusterCenterToFile(int round); void compareNew_OldClusterCenter(double* new_X_Y); void test_1(); int main(int argc, char* argv[]){ if( argc != 4 ) { printf("This application need other parameter to run:" "\n\t\tthe first is the size of data set," "\n\t\tthe second is the file name that contain data" "\n\t\tthe third indicate the cluster_count" "\n"); exit(0); } srand((unsigned)time(NULL)); data_size = atoi(argv[1]); strcat(filename, argv[2]); cluster_count = atoi(argv[3]); initial(); readDataFromFile(); initial_cluster(); //test_1(); //partition_forAllItem_OneCluster(); //calculate_clusterCenter(); K_means(); return 0; } /* * 對涉及到的二維動態(tài)數(shù)組根據(jù)main函數(shù)中傳入的參數(shù)分配空間 * */ void initial(){ data = (Item*)malloc(sizeof(struct Item) * (data_size + 1)); if( !data ) { printf("malloc error:data!"); exit(0); } cluster_center = (int*)malloc(sizeof(int) * (cluster_count + 1)); if( !cluster_center ) { printf("malloc error:cluster_center!\n"); exit(0); } distanceFromCenter = (double*)malloc(sizeof(double) * (cluster_count + 1)); if( !distanceFromCenter ) { printf("malloc error: distanceFromCenter!\n"); exit(0); } cluster_center_new = (ClusterCenter*)malloc(sizeof(struct ClusterCenter) * (cluster_count + 1)); if( !cluster_center_new ) { printf("malloc cluster center new error!\n"); exit(0); } } /* * 從文件中讀入x和y數(shù)據(jù) * */ void readDataFromFile(){ FILE* fread; if( NULL == (fread = fopen(filename, "r"))) { printf("open file(%s) error!\n", filename); exit(0); } int row; for( row = 1; row <= data_size; row++ ) { if( 2 != fscanf(fread, "%d %d ", &data[row].dimension_1, &data[row].dimension_2)) { printf("fscanf error: %d\n", row); } data[row].clusterID = 0; } } /* * 根據(jù)從主函數(shù)中傳入的@cluster_count(聚類的個數(shù))來隨機的選擇@cluster_count個 * 初始的聚類的起點 * */ void initial_cluster(){ //輔助產(chǎn)生不重復(fù)的數(shù) int* auxiliary; int i; auxiliary = (int*)malloc(sizeof(int) * (data_size + 1)); if( !auxiliary ) { printf("malloc error: auxiliary"); exit(0); } for( i = 1; i <= data_size; i++ ) { auxiliary[i] = i; } //產(chǎn)生初始化的cluster_count個聚類 int length = data_size; int random; for( i = 1; i <= cluster_count; i++ ) { random = rand()%length + 1; //printf("%d \n", auxiliary[random]); //data[auxiliary[random]].clusterID = auxiliary[random]; cluster_center[i] = auxiliary[random]; auxiliary[random] = auxiliary[length--]; } for( i = 1; i <= cluster_count; i++ ) { cluster_center_new[i].dimension_1 = data[cluster_center[i]].dimension_1; cluster_center_new[i].dimension_2 = data[cluster_center[i]].dimension_2; cluster_center_new[i].clusterID = i; data[cluster_center[i]].clusterID = i; } } /* * 計算一個點(還沒有劃分到cluster center的點)到一個cluster center的distance * @itemID: 不屬于任何cluster中的點 * @centerID: center的ID * @count: 表明在計算的是itemID到第幾個@center的distance,并且指明了結(jié)果放在distanceFromCenter的第幾號元素 * */ void calculateDistance_ToOneCenter(int itemID,int centerID){ distanceFromCenter[centerID] = sqrt( (data[itemID].dimension_1-cluster_center_new[centerID].dimension_1)*(double)(data[itemID].dimension_1-cluster_center_new[centerID].dimension_1) + (double)(data[itemID].dimension_2-cluster_center_new[centerID].dimension_2) * (data[itemID].dimension_2-cluster_center_new[centerID].dimension_2) ); } /* * 計算一個點(還沒有劃分到cluster center的點)到每個cluster center的distance * */ void calculateDistance_ToAllCenter(int itemID){ int i; for( i = 1; i <= cluster_count; i++ ) { calculateDistance_ToOneCenter(itemID, i); } } void test_1() { calculateDistance_ToAllCenter(3); int i; for( i = 1; i <= cluster_count; i++ ) { printf("%f ", distanceFromCenter[i]); } } /* * 在得到任一的點(不屬于任一cluster的)到每一個cluster center的distance之后,決定它屬于哪一個cluster center,即取距離最小的 * 函數(shù)功能:得到一個item所屬的cluster center * */ void partition_forOneItem(int itemID){ //操作對象是 distanceFromCenter和cluster_center int i; int min_index = 1; double min_value = distanceFromCenter[1]; for( i = 2; i <= cluster_count; i++ ) { if( distanceFromCenter[i] < min_value ) { min_value = distanceFromCenter[i]; min_index = i; } } data[itemID].clusterID = cluster_center_new[min_index].clusterID; } /* * 得到所有的item所屬于的cluster center , 在一輪的聚類中 * */ void partition_forAllItem_OneCluster(int round){ //changed!!!!!!!!!!!!!!!!!!!!!!!! int i; for( i = 1; i <= data_size; i++ ) { if( data[i].clusterID != 0 ) continue; else { calculateDistance_ToAllCenter(i); //計算i到所有center的distance partition_forOneItem(i); //根據(jù)distance對i進行partition } } //把聚類得到的數(shù)據(jù)寫入到文件中 writeClusterDataToFile(round); } /* * 將聚類得到的數(shù)據(jù)寫入到文件中,每一個類寫入一個文件中 * @round: 表明在進行第幾輪的cluster,該參數(shù)的另一個作用是指定了文件名字中的第一個項. * */ void writeClusterDataToFile(int round){ int i; char filename[200]; FILE** file; file = (FILE**)malloc(sizeof(FILE*) * (cluster_count + 1)); if( !file ) { printf("malloc file error!\n"); exit(0); } for( i = 1; i <= cluster_count; i++ ) { sprintf(filename, ".//ClusterProcess//round%d_cluster%d.data", round, i); if( NULL == (file[i] = fopen(filename, "w"))) { printf("file open(%s) error!", filename); exit(0); } } for( i = 1; i <= data_size; i++ ) { //sprintf(filename, ".//ClusterProcess//round%d_cluster%d.data", round, data[i].clusterID); fprintf(file[data[i].clusterID], "%d\t%d\n", data[i].dimension_1, data[i].dimension_2); } for( i = 1; i <= cluster_count; i++ ) { //sprintf(filename, ".//ClusterProcess//round%d_cluster%d.data", round, i); fclose(file[i]); } } /* * 重新計算新的cluster center * */ void calculate_clusterCenter(int round){ //changed!!!!!!!!!!!!!!!!!!!!!! int i; double* new_X_Y; /* 用來計算和保存新的cluster center的值,同樣的,0號元素不用。1,2號元素分別用來 存放第一個聚類的所有的項的x和y的累加和。3,4號元素分別用來存放第二個聚類的所有 的項的x和y的累加和...... */ new_X_Y = (double*)malloc(sizeof(double) * (2 * cluster_count + 1)); if( !new_X_Y ) { printf("malloc error: new_X_Y!\n"); exit(0); } //初始化為0 for( i = 1; i <= 2*cluster_count; i++ ) new_X_Y[i] = 0.0; //用來統(tǒng)計屬于各個cluster的item的個數(shù) int* counter; counter = (int*)malloc(sizeof(int) * (cluster_count + 1)); if( !counter ) { printf("malloc error: counter\n"); exit(0); } //初始化為0 for( i = 1; i <= cluster_count; i++ ) counter[i] = 0; for( i = 1; i <= data_size; i++ ) { new_X_Y[data[i].clusterID * 2 - 1] += data[i].dimension_1; new_X_Y[data[i].clusterID * 2] += data[i].dimension_2; counter[data[i].clusterID]++; } for( i = 1; i <= cluster_count; i++ ) { new_X_Y[2 * i - 1] = new_X_Y[2 * i - 1] / (double)(counter[i]); new_X_Y[2 * i] = new_X_Y[2 * i] / (double)(counter[i]); } //要將cluster center的值保存在文件中,后續(xù)作圖 writeClusterCenterToFile(round); /* * 在這里比較一下新的和舊的cluster center值的差別。如果是相等的,則停止K-means算法。 * */ compareNew_OldClusterCenter(new_X_Y); //將新的cluster center的值放入cluster_center_new for( i = 1; i <= cluster_count; i++ ) { cluster_center_new[i].dimension_1 = new_X_Y[2 * i - 1]; cluster_center_new[i].dimension_2 = new_X_Y[2 * i]; cluster_center_new[i].clusterID = i; } free(new_X_Y); free(counter); //在重新計算了新的cluster center之后,意味著我們要重新來為每一個Item進行聚類,所以data中用于表示聚類ID的clusterID //要都重新置為0。 for( i = 1; i <= data_size; i++ ) { data[i].clusterID = 0; } } /* * 將得到的新的cluster_count個cluster center的值保存在文件中。以便于觀察聚類的過程。 * */ void writeClusterCenterToFile(int round){ FILE* file; int i; char filename[200]; sprintf(filename, ".//ClusterProcess//round%d_clusterCenter.data", round); if( NULL == (file = fopen(filename, "w"))) { printf("open file(%s) error!\n", filename); exit(0); } for( i = 1; i <= cluster_count; i++ ) { fprintf(file, "%f\t%f\n", cluster_center_new[i].dimension_1, cluster_center_new[i].dimension_2); } for( i = 1; i <= cluster_count; i++ ) { fclose(file); } } /* * 比較新舊的cluster center的差異 * */ void compareNew_OldClusterCenter(double* new_X_Y){ int i; isContinue = 0; //等于0表示的是不要繼續(xù) for( i = 1; i <= cluster_count; i++ ) { if( new_X_Y[2 * i - 1] != cluster_center_new[i].dimension_1 || new_X_Y[2 * i] != cluster_center_new[i].dimension_2) { isContinue = 1; //要繼續(xù) break; } } } /************************************************************************************************ * K-means算法 * ***********************************************************************************************/ void K_means(){ int times_cluster; for( times_cluster = 1; times_cluster <= MAX_ROUND_TIME; times_cluster++ ) { printf("\n times : %d \n", times_cluster); partition_forAllItem_OneCluster(times_cluster); calculate_clusterCenter(times_cluster); if( 0 == isContinue ) { break; //printf("\n\nthe application can stop!\n\n"); } } }
以上就是本文的全部內(nèi)容,希望對大家的學(xué)習(xí)有所幫助,也希望大家多多支持我們。
您可能感興趣的文章
- 04-02c語言函數(shù)調(diào)用后清空內(nèi)存 c語言調(diào)用函數(shù)刪除字符
- 04-02c語言的正則匹配函數(shù) c語言正則表達式函數(shù)庫
- 04-02func函數(shù)+在C語言 func函數(shù)在c語言中
- 04-02c語言中對數(shù)函數(shù)的表達式 c語言中對數(shù)怎么表達
- 04-02c語言用函數(shù)寫分段 用c語言表示分段函數(shù)
- 04-02c語言編寫函數(shù)冒泡排序 c語言冒泡排序法函數(shù)
- 04-02c語言沒有round函數(shù) round c語言
- 04-02c語言分段函數(shù)怎么求 用c語言求分段函數(shù)
- 04-02C語言中怎么打出三角函數(shù) c語言中怎么打出三角函數(shù)的值
- 04-02c語言調(diào)用函數(shù)求fibo C語言調(diào)用函數(shù)求階乘


閱讀排行
本欄相關(guān)
- 04-02c語言函數(shù)調(diào)用后清空內(nèi)存 c語言調(diào)用
- 04-02func函數(shù)+在C語言 func函數(shù)在c語言中
- 04-02c語言的正則匹配函數(shù) c語言正則表達
- 04-02c語言用函數(shù)寫分段 用c語言表示分段
- 04-02c語言中對數(shù)函數(shù)的表達式 c語言中對
- 04-02c語言編寫函數(shù)冒泡排序 c語言冒泡排
- 04-02c語言沒有round函數(shù) round c語言
- 04-02c語言分段函數(shù)怎么求 用c語言求分段
- 04-02C語言中怎么打出三角函數(shù) c語言中怎
- 04-02c語言調(diào)用函數(shù)求fibo C語言調(diào)用函數(shù)求
隨機閱讀
- 08-05織夢dedecms什么時候用欄目交叉功能?
- 01-10使用C語言求解撲克牌的順子及n個骰子
- 01-10delphi制作wav文件的方法
- 01-10C#中split用法實例總結(jié)
- 01-11ajax實現(xiàn)頁面的局部加載
- 01-10SublimeText編譯C開發(fā)環(huán)境設(shè)置
- 08-05dedecms(織夢)副欄目數(shù)量限制代碼修改
- 04-02jquery與jsp,用jquery
- 08-05DEDE織夢data目錄下的sessions文件夾有什
- 01-11Mac OSX 打開原生自帶讀寫NTFS功能(圖文