First fkmeans commit

This commit is contained in:
BlackLight 2010-11-18 19:22:00 +01:00
commit 77f676e4d2
4 changed files with 688 additions and 0 deletions

3
Makefile Normal file
View file

@ -0,0 +1,3 @@
all:
gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm

445
kmeans.c Normal file
View file

@ -0,0 +1,445 @@
/*
* =====================================================================================
*
* Filename: kmeans.c
*
* Description: k-means clusterization algorithm implementation in C
*
* Version: 1.0
* Created: 12/11/2010 10:43:28
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#include "kmeans.h"
#include <alloca.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/**
* \brief Initialize the centers of the clusters taking the K most distant elements in the dataset
* \param km k-means object
*/
static void
__kmeans_init_centers ( kmeans_t *km )
{
int i, j, k, l,
index_found = 0,
max_index = 0,
assigned_centers = 0,
*assigned_centers_indexes = NULL;
double dist = 0.0,
max_dist = 0.0;
for ( i=0; i < km->dataset_size; i++ )
{
dist = 0.0;
for ( j=0; j < km->dataset_dim; j++ )
{
dist += ( km->dataset[i][j] ) * ( km->dataset[i][j] );
}
if ( dist > max_dist )
{
max_dist = dist;
max_index = i;
}
}
for ( i=0; i < km->dataset_dim; i++ )
{
km->centers[0][i] = km->dataset[max_index][i];
}
if ( !( assigned_centers_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
{
return;
}
assigned_centers_indexes[ assigned_centers - 1 ] = max_index;
for ( i=1; i < km->k; i++ )
{
max_dist = 0.0;
max_index = 0;
for ( j=0; j < km->dataset_size; j++ )
{
index_found = 0;
for ( k=0; k < assigned_centers && !index_found; k++ )
{
if ( assigned_centers_indexes[k] == j )
{
index_found = 1;
}
}
if ( index_found )
continue;
dist = 0.0;
for ( k=0; k < assigned_centers; k++ )
{
for ( l=0; l < km->dataset_dim; l++ )
{
dist += ( km->dataset[j][l] - km->centers[k][l] ) * ( km->dataset[j][l] - km->centers[k][l] );
}
}
if ( dist > max_dist )
{
max_dist = dist;
max_index = j;
}
}
for ( j=0; j < km->dataset_dim; j++ )
{
km->centers[i][j] = km->dataset[max_index][j];
}
if ( !( assigned_centers_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
{
return;
}
assigned_centers_indexes[ assigned_centers - 1 ] = max_index;
}
free ( assigned_centers_indexes );
} /* ----- end of function kmeans_init_centers ----- */
/**
* \brief Create a new k-means object
* \param dataset Dataset to be clustered
* \param dataset_size Number of elements in the dataset
* \param dataset_dim Dimension of each element of the dataset
* \param K Number of clusters
* \return Reference to the newly created k-means object, if successfull, NULL otherwise
*/
kmeans_t*
kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K )
{
int i, j;
kmeans_t *km = NULL;
if ( !( km = (kmeans_t*) malloc ( sizeof ( kmeans_t ))))
{
return NULL;
}
if ( !( km->dataset = (double**) calloc ( dataset_size, sizeof ( double* ))))
{
return NULL;
}
for ( i=0; i < dataset_size; i++ )
{
if ( !( km->dataset[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
{
return NULL;
}
for ( j=0; j < dataset_dim; j++ )
{
km->dataset[i][j] = dataset[i][j];
}
}
km->dataset_size = dataset_size;
km->dataset_dim = dataset_dim;
km->k = K;
if ( !( km->clusters = (double***) calloc ( K, sizeof ( double** ))))
{
return NULL;
}
if ( !( km->cluster_sizes = (int*) calloc ( K, sizeof ( int* ))))
{
return NULL;
}
if ( !( km->centers = (double**) calloc ( K, sizeof ( double* ))))
{
return NULL;
}
for ( i=0; i < K; i++ )
{
if ( !( km->centers[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
{
return NULL;
}
}
__kmeans_init_centers ( km );
return km;
} /* ----- end of function kmeans_new ----- */
/**
* \brief Function that performs a single step for k-means algorithm
* \param km k-means object
* \return 0 if no changes were performed by this step, 1 otherwise, -1 in case of error
*/
static int
__kmeans_step ( kmeans_t *km )
{
int i, j, k,
best_center = 0;
double dist = 0.0,
min_dist = DBL_MAX,
**old_centers = NULL;
if ( km->clusters[0] )
{
for ( i=0; i < km->k; i++ )
{
for ( j=0; j < km->cluster_sizes[i]; j++ )
{
free ( km->clusters[i][j] );
km->clusters[i][j] = NULL;
}
free ( km->clusters[i] );
km->clusters[i] = NULL;
km->cluster_sizes[i] = 0;
}
}
if ( !( old_centers = (double**) alloca ( km->k * sizeof ( double* ))))
{
return -1;
}
for ( i=0; i < km->k; i++ )
{
if ( !( old_centers[i] = (double*) alloca ( km->dataset_dim * sizeof ( double ))))
{
return -1;
}
for ( j=0; j < km->dataset_dim; j++ )
{
old_centers[i][j] = km->centers[i][j];
}
}
for ( i=0; i < km->dataset_size; i++ )
{
min_dist = DBL_MAX;
best_center = 0;
for ( j=0; j < km->k; j++ )
{
dist = 0.0;
for ( k=0; k < km->dataset_dim; k++ )
{
dist += ( km->dataset[i][k] - km->centers[j][k] ) * ( km->dataset[i][k] - km->centers[j][k] );
}
if ( dist < min_dist )
{
min_dist = dist;
best_center = j;
}
}
if ( !( km->clusters[best_center] = (double**) realloc ( km->clusters[best_center], (++(km->cluster_sizes[best_center])) * sizeof ( double* ))))
{
return -1;
}
if ( !( km->clusters [best_center] [km->cluster_sizes[best_center]-1] = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
{
return -1;
}
for ( j=0; j < km->dataset_dim; j++ )
{
km->clusters [best_center] [km->cluster_sizes[best_center]-1] [j] = km->dataset[i][j];
}
}
for ( i=0; i < km->k; i++ )
{
for ( j=0; j < km->dataset_dim; j++ )
{
km->centers[i][j] = 0.0;
for ( k=0; k < km->cluster_sizes[i]; k++ )
{
km->centers[i][j] += km->clusters[i][k][j];
}
if ( km->cluster_sizes[i] != 0 )
{
km->centers[i][j] /= (double) km->cluster_sizes[i];
}
}
}
for ( i=0; i < km->k; i++ )
{
for ( j=0; j < km->dataset_dim; j++ )
{
if ( km->centers[i][j] != old_centers[i][j] )
{
return 1;
}
}
}
return 0;
} /* ----- end of function __kmeans_step ----- */
/**
* \brief Perform the k-means algorithm over a k-means object
* \param km k-means object
*/
void
kmeans ( kmeans_t *km )
{
while ( __kmeans_step ( km ) != 0 );
} /* ----- end of function kmeans ----- */
/**
* \brief Compute the heuristic coefficient associated to the current number of clusters through Schwarz's criterion
* \param km k-means object
* \return Real value expressing how well that number of clusters models the dataset
*/
static double
__kmeans_heuristic_coefficient ( kmeans_t *km )
{
int i, j, k;
double distorsion = 0.0;
for ( i=0; i < km->k; i++ )
{
for ( j=0; j < km->cluster_sizes[i]; j++ )
{
for ( k=0; k < km->dataset_dim; k++ )
{
distorsion += ( km->centers[i][k] - km->clusters[i][j][k] ) * ( km->centers[i][k] - km->clusters[i][j][k] );
}
}
}
return distorsion + km->k * log ( km->dataset_size );
} /* ----- end of function __kmeans_heuristic_coefficient ----- */
/**
* \brief Remove a k-means object
* \param km k-means object to be deallocaed
*/
void
kmeans_free ( kmeans_t *km )
{
int i, j;
for ( i=0; i < km->k; i++ )
{
for ( j=0; j < km->cluster_sizes[i]; j++ )
{
free ( km->clusters[i][j] );
km->clusters[i][j] = NULL;
}
free ( km->clusters[i] );
km->clusters[i] = NULL;
}
free ( km->clusters );
km->clusters = NULL;
free ( km->cluster_sizes );
km->cluster_sizes = NULL;
for ( i=0; i < km->k; i++ )
{
free ( km->centers[i] );
km->centers[i] = NULL;
}
free ( km->centers );
km->centers = NULL;
for ( i=0; i < km->dataset_size; i++ )
{
free ( km->dataset[i] );
km->dataset[i] = NULL;
}
free ( km->dataset );
km->dataset = NULL;
free ( km );
km = NULL;
} /* ----- end of function kmeans_free ----- */
/**
* \brief Perform a k-means clustering over a dataset automatically choosing the best value of k using Schwarz's criterion
* \param dataset Dataset to be clustered
* \param dataset_size Number of elements in the dataset
* \param dataset_dim Dimension of each element of the dataset
* \return Reference to the newly created k-means object, if successfull, NULL otherwise
*/
kmeans_t*
kmeans_auto ( double **dataset, int dataset_size, int dataset_dim )
{
int i;
double heuristic = 0.0,
best_heuristic = DBL_MAX;
kmeans_t *km = NULL,
*best_km = NULL;
for ( i=1; i <= dataset_size; i++ )
{
if ( !( km = kmeans_new ( dataset, dataset_size, dataset_dim, i )))
return NULL;
kmeans ( km );
heuristic = __kmeans_heuristic_coefficient ( km );
if ( heuristic < best_heuristic )
{
if ( best_km )
{
kmeans_free ( best_km );
}
best_km = km;
best_heuristic = heuristic;
} else {
kmeans_free ( km );
}
}
return best_km;
} /* ----- end of function kmeans_auto ----- */

39
kmeans.h Normal file
View file

@ -0,0 +1,39 @@
/*
* =====================================================================================
*
* Filename: kmeans.h
*
* Description: Header file for C k-means implementation
*
* Version: 1.0
* Created: 12/11/2010 10:43:55
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#ifndef __KMEANS_H
#define __KMEANS_H
typedef struct __kmeans_t {
double **dataset;
int dataset_size;
int dataset_dim;
int k;
int *cluster_sizes;
double ***clusters;
double **centers;
} kmeans_t;
kmeans_t* kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K );
kmeans_t* kmeans_auto ( double **dataset, int dataset_size, int dataset_dim );
void kmeans ( kmeans_t *km );
void kmeans_free ( kmeans_t *km );
#endif

201
test.c Normal file
View file

@ -0,0 +1,201 @@
/*
* =====================================================================================
*
* Filename: main.c
*
* Description: Test implementation for k-means library
*
* Version: 1.0
* Created: 17/11/2010 16:01:08
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#include "kmeans.h"
#include <alloca.h>
#include <float.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define DATASET_SIZE 6
#define MIN_VAL 0
#define MAX_VAL 5
static const char colors[][10] = {
"\033[0m", "\033[91m", "\033[92m", "\033[93m", "\033[94m", "\033[95m", "\033[96m", "\033[97m",
"\033[98m", "\033[99m", "\033[100m", "\033[101m", "\033[102m", "\033[103m", "\033[104m", "\033[105m", "\033[107m",
"\033[1m", "\033[01;91m", "\033[01;92m", "\033[01;93m", "\033[01;94m", "\033[01;95m", "\033[01;96m", "\033[01;97m",
"\033[01;98m", "\033[01;99m", "\033[01;100m", "\033[01;101m", "\033[01;102m", "\033[01;103m", "\033[01;104m", "\033[01;105m", "\033[01;107m"
};
/**
* \brief Give a bidimensional representations of a dataset (by dimension 2) and its clusters detected by the k-means algorithm
* \param km k-means object
*/
void
dataset_print ( kmeans_t *km )
{
int i, j, k, l, found;
double min_x = DBL_MAX,
min_y = DBL_MAX,
max_x = DBL_MIN,
max_y = DBL_MIN;
/* If the dataset is not bidimensional, exit */
if ( km->dataset_dim != 2 )
{
return;
}
for ( i=0; i < km->dataset_size; i++ )
{
if ( km->dataset[i][0] < min_x )
min_x = km->dataset[i][0];
if ( km->dataset[i][1] < min_y )
min_y = km->dataset[i][1];
if ( km->dataset[i][0] > max_x )
max_x = km->dataset[i][0];
if ( km->dataset[i][1] > max_y )
max_y = km->dataset[i][1];
}
printf ( "+-" );
for ( i = (int) min_y; i <= (int) max_y; i++ )
{
printf ( "--" );
}
printf ( "+\n" );
for ( i = (int) min_x; i <= (int) max_x; i++ )
{
printf ( "| " );
for ( j = (int) min_y; j <= (int) max_y; j++ )
{
found = 0;
for ( k=0; k < km->k && !found; k++ )
{
for ( l=0; l < km->cluster_sizes[k] && !found; l++ )
{
if ( (int) km->clusters[k][l][0] == i &&
(int) km->clusters[k][l][1] == j )
{
found = 1;
}
}
}
if ( found )
{
printf ( "%s* %s", ( k < sizeof ( colors )) ? colors[k] : colors[1], colors[0] );
} else {
printf ( " " );
}
}
printf ( "|\n" );
}
printf ( "+-" );
for ( i = (int) min_y; i < (int) max_y + 1; i++ )
{
printf ( "--" );
}
printf ( "+\n\n" );
} /* ----- end of function dataset_print ----- */
/**
* \brief Main function for the program
*/
int
main ( int argc, char *argv[] )
{
int i, j, min_val, max_val, dataset_size;
double **dataset;
kmeans_t *km = NULL;
switch ( argc )
{
case 1:
min_val = MIN_VAL;
max_val = MAX_VAL;
dataset_size = DATASET_SIZE;
break;
case 2:
min_val = MIN_VAL;
max_val = MAX_VAL;
dataset_size = atoi ( argv[1] );
break;
case 3:
min_val = MIN_VAL;
max_val = atoi ( argv[2] );
dataset_size = atoi ( argv[1] );
break;
default:
min_val = atoi ( argv[3] );
max_val = atoi ( argv[2] );
dataset_size = atoi ( argv[1] );
break;
}
srand ( time ( NULL ));
if ( !( dataset = (double**) alloca ( dataset_size * sizeof ( double* ))))
return 1;
for ( i=0; i < dataset_size; i++ )
{
if ( !( dataset[i] = (double*) alloca ( 2 * sizeof ( double ))))
return 1;
for ( j=0; j < 2; j++ )
{
dataset[i][j] = ( rand() % ( max_val - min_val )) + min_val;
}
}
if ( !( km = kmeans_auto ( dataset, dataset_size, 2 )))
{
return 1;
}
dataset_print ( km );
for ( i=0; i < km->k; i++ )
{
printf ( "Cluster %d: [ ", i );
for ( j=0; j < km->cluster_sizes[i]; j++ )
{
printf ( "(%d, %d), ", (int) km->clusters[i][j][0], (int) km->clusters[i][j][1] );
}
printf ( "]\n" );
}
kmeans_free ( km );
return EXIT_SUCCESS;
} /* ---------- end of function main ---------- */