k-means clustering for SOM output

2024-11-27 22:25:12 +01:00 · 2010-11-20 16:47:57 +01:00 · 2010-11-20 16:47:57 +01:00 · d41753a8a4
commit d41753a8a4
parent ec196b3968
12 changed files with 2650 additions and 54 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,10 +25,12 @@ bayesian.c \
 cluster.c \
 correlation.c \
 db.c \
 fkmeans/kmeans.c \
 fsom/fsom.c \
 modules.c \
 mysql.c \
 neural.c \
 neural_cluster.c \
 outdb.c \
 postgresql.c \
 regex.c \
--- a/Makefile.in
+++ b/Makefile.in
@ -84,8 +84,10 @@ am_libsf_ai_preproc_la_OBJECTS = libsf_ai_preproc_la-alert_history.lo \
 	libsf_ai_preproc_la-cencode.lo libsf_ai_preproc_la-bayesian.lo \
 	libsf_ai_preproc_la-cluster.lo \
 	libsf_ai_preproc_la-correlation.lo libsf_ai_preproc_la-db.lo \
-	libsf_ai_preproc_la-fsom.lo libsf_ai_preproc_la-modules.lo \
+	libsf_ai_preproc_la-kmeans.lo libsf_ai_preproc_la-fsom.lo \
-	libsf_ai_preproc_la-mysql.lo libsf_ai_preproc_la-neural.lo \
+	libsf_ai_preproc_la-modules.lo libsf_ai_preproc_la-mysql.lo \
 	libsf_ai_preproc_la-neural.lo \
 	libsf_ai_preproc_la-neural_cluster.lo \
 	libsf_ai_preproc_la-outdb.lo libsf_ai_preproc_la-postgresql.lo \
 	libsf_ai_preproc_la-regex.lo libsf_ai_preproc_la-spp_ai.lo \
 	libsf_ai_preproc_la-stream.lo libsf_ai_preproc_la-webserv.lo
@ -267,10 +269,12 @@ bayesian.c \
 cluster.c \
 correlation.c \
 db.c \
 fkmeans/kmeans.c \
 fsom/fsom.c \
 modules.c \
 mysql.c \
 neural.c \
 neural_cluster.c \
 outdb.c \
 postgresql.c \
 regex.c \
@ -416,6 +420,9 @@ libsf_ai_preproc_la-correlation.lo: correlation.c
 libsf_ai_preproc_la-db.lo: db.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-db.lo `test -f 'db.c' || echo '$(srcdir)/'`db.c
 libsf_ai_preproc_la-kmeans.lo: fkmeans/kmeans.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-kmeans.lo `test -f 'fkmeans/kmeans.c' || echo '$(srcdir)/'`fkmeans/kmeans.c
 libsf_ai_preproc_la-fsom.lo: fsom/fsom.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-fsom.lo `test -f 'fsom/fsom.c' || echo '$(srcdir)/'`fsom/fsom.c
@ -428,6 +435,9 @@ libsf_ai_preproc_la-mysql.lo: mysql.c
 libsf_ai_preproc_la-neural.lo: neural.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural.lo `test -f 'neural.c' || echo '$(srcdir)/'`neural.c
 libsf_ai_preproc_la-neural_cluster.lo: neural_cluster.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural_cluster.lo `test -f 'neural_cluster.c' || echo '$(srcdir)/'`neural_cluster.c
 libsf_ai_preproc_la-outdb.lo: outdb.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-outdb.lo `test -f 'outdb.c' || echo '$(srcdir)/'`outdb.c
--- a/fkmeans/Doxyfile
+++ b/fkmeans/Doxyfile
--- a/fkmeans/Makefile
+++ b/fkmeans/Makefile
@ -0,0 +1,3 @@
 # Build the kmeans-test sample program from test.c and kmeans.c, linking the math library
 all:
 	gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm
--- a/fkmeans/README
+++ b/fkmeans/README
@ -0,0 +1,88 @@
 fkmeans is a tiny C library that allows you to perform k-means clustering
 algorithm over arbitrary sets of n-dimensional data. All you need to do is:
 - Include the file kmeans.h in your sources;
 - Consider your data set as a vector of vectors of double items (double**),
  where each vector is an n-dimensional item of your data set;
 - If you want to run the k-means algorithm over your data and you already
  know the number k of clusters it contains, or an estimate of it, you can
  use code like this (in this example the data set is 3-dimensional, i.e. it
  contains N vectors of size 3, and we know it contains n_clus clusters):
    kmeans_t *km;
    double **dataset;
    ...
    km = kmeans_new ( dataset, N, 3, n_clus );
    kmeans ( km );
    ...
    kmeans_free ( km );
  If you don't already know the number of clusters contained in your data set,
  you can use the function kmeans_auto(), which automatically attempts to find
  the best one using Schwarz's criterion. Be careful: this operation can be very
  slow, especially on data sets with many elements. The example above would
  simply become something like:
    kmeans_t *km;
    double **dataset;
    ...
    km = kmeans_auto ( dataset, N, 3 );
    ...
    kmeans_free ( km );
 - Once the clustering has been performed, the clusters of data can be simply
  accessed from your kmeans_t* structure, as they are held as a double*** field
  named "clusters". Each vector in this structure represents a cluster, whose
  size is specified in the field cluster_sizes[i] of the structure. Each cluster
  contains the items that form it, each of which is an n-dimensional vector. The
  number of clusters is specified in the field "k" of the structure, the
  number of dimensions of each element is specified in the field "dataset_dim"
  and the number of elements in the original data set is specified in the field
  "dataset_size". So, for example:
    for ( i=0; i < km->k; i++ )
    {
 	    printf ( "cluster %d: [ ", i );
 	    for ( j=0; j < km->cluster_sizes[i]; j++ )
 	    {
 		    printf ( "(" );
 		    for ( k=0; k < km->dataset_dim; k++ )
 		    {
 			    printf ( "%f, ", km->clusters[i][j][k] );
 		    }
 		    printf ( "), ");
 		}
 	    printf ( "]\n" );
 	}
  The library however already comes with a sample implementation, contained in
  "test.c", and typing "make" this example will be built. This example takes 0,
  1, 2 or 3 command-line arguments, in format
  $ ./kmeans-test [num_elements] [min_value] [max_value]
  and randomly generates a 2-dimensional data set containing num_elements, whose
  coordinates are between min_value and max_value. The clustering is then
  performed and the results are shown on stdout, with the clusters coloured in
  different ways;
 - After you write your source, remember to include the file "kmeans.c",
  containing the implementation of the library, in the list of your source
  files;
 - That's all. Include "kmeans.h", write your code using
  kmeans_new()+kmeans()+kmeans_free() or kmeans_auto()+kmeans_free(), explore
  your clusters, remember to include "kmeans.c" in the list of your source
  files, and you're ready for k-means clustering.
 Author: Fabio "BlackLight" Manganiello,
        <blacklight@autistici.org>,
        http://0x00.ath.cx
--- a/fkmeans/kmeans.c
+++ b/fkmeans/kmeans.c
@ -0,0 +1,445 @@
 /*
 * =====================================================================================
 *
 *       Filename:  kmeans.c
 *
 *    Description:  k-means clusterization algorithm implementation in C
 *
 *        Version:  1.0
 *        Created:  12/11/2010 10:43:28
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
 *        Licence:  GNU GPL v.3
 *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
 *
 * =====================================================================================
 */
 #include	"kmeans.h"
 #include	<alloca.h>
 #include	<float.h>
 #include	<limits.h>
 #include	<math.h>
 #include	<stdio.h>
 #include	<stdlib.h>
 /**
 * \brief  Initialize the centers of the clusters taking the K most distant elements in the dataset
 *
 * The first center is the dataset element with the largest squared euclidean
 * norm. Each following center is the not yet chosen element that maximizes the
 * sum of its squared distances from all the centers picked so far; when no
 * candidate has a strictly positive score, element 0 is used.
 * The function returns silently, leaving the remaining centers untouched, when
 * the object holds no clusters or no elements, or when the bookkeeping buffer
 * cannot be allocated.
 *
 * \param  km 	k-means object whose dataset, dataset_size, dataset_dim, k and centers fields are already set
 */
 static void
 __kmeans_init_centers ( kmeans_t *km )
 {
 	int i, j, l,
 	    n_centers = 0,
 	    max_index = 0;
 	double dist = 0.0,
 		  diff = 0.0,
 		  max_dist = 0.0;
 	unsigned char *is_center = NULL;
 	/* Nothing to initialize: avoids touching centers[0] / dataset[0] on empty objects */
 	if ( !km || km->k <= 0 || km->dataset_size <= 0 )
 	{
 		return;
 	}
 	/* First center: the element farthest from the origin */
 	for ( i=0; i < km->dataset_size; i++ )
 	{
 		dist = 0.0;
 		for ( j=0; j < km->dataset_dim; j++ )
 		{
 			dist += ( km->dataset[i][j] ) * ( km->dataset[i][j] );
 		}
 		if ( dist > max_dist )
 		{
 			max_dist = dist;
 			max_index = i;
 		}
 	}
 	for ( i=0; i < km->dataset_dim; i++ )
 	{
 		km->centers[0][i] = km->dataset[max_index][i];
 	}
 	/* One flag per dataset element, allocated once: no buffer can be lost
 	 * by a failing realloc and the "already chosen" test is O(1) */
 	if ( !( is_center = (unsigned char*) calloc ( km->dataset_size, sizeof ( unsigned char ))))
 	{
 		return;
 	}
 	is_center[max_index] = 1;
 	for ( n_centers=1; n_centers < km->k; n_centers++ )
 	{
 		max_dist = 0.0;
 		max_index = 0;
 		for ( i=0; i < km->dataset_size; i++ )
 		{
 			if ( is_center[i] )
 				continue;
 			/* Score: sum of the squared distances from every center chosen so far */
 			dist = 0.0;
 			for ( j=0; j < n_centers; j++ )
 			{
 				for ( l=0; l < km->dataset_dim; l++ )
 				{
 					diff = km->dataset[i][l] - km->centers[j][l];
 					dist += diff * diff;
 				}
 			}
 			if ( dist > max_dist )
 			{
 				max_dist = dist;
 				max_index = i;
 			}
 		}
 		for ( l=0; l < km->dataset_dim; l++ )
 		{
 			km->centers[n_centers][l] = km->dataset[max_index][l];
 		}
 		is_center[max_index] = 1;
 	}
 	free ( is_center );
 }		/* -----  end of function kmeans_init_centers  ----- */
 /**
 * \brief  Create a new k-means object
 *
 * The dataset is deep-copied into the object, the per-cluster bookkeeping is
 * allocated zeroed and the initial centers are chosen through
 * __kmeans_init_centers(). No clustering step is performed here.
 * On any failure everything allocated so far is released before returning.
 *
 * \param  dataset 		Dataset to be clustered (dataset_size vectors of dataset_dim doubles)
 * \param  dataset_size 	Number of elements in the dataset, must be > 0
 * \param  dataset_dim 	Dimension of each element of the dataset, must be > 0
 * \param  K 			Number of clusters, must be > 0
 * \return Reference to the newly created k-means object, if successful, NULL on invalid arguments or allocation failure
 */
 kmeans_t*
 kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K )
 {
 	int i, j;
 	kmeans_t *km = NULL;
 	if ( !dataset || dataset_size <= 0 || dataset_dim <= 0 || K <= 0 )
 	{
 		return NULL;
 	}
 	/* Zeroed allocation: every member is NULL/0 until set, which keeps the cleanup path safe */
 	if ( !( km = (kmeans_t*) calloc ( 1, sizeof ( kmeans_t ))))
 	{
 		return NULL;
 	}
 	if ( !( km->dataset = (double**) calloc ( dataset_size, sizeof ( double* ))))
 	{
 		goto fail;
 	}
 	for ( i=0; i < dataset_size; i++ )
 	{
 		if ( !( km->dataset[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
 		{
 			goto fail;
 		}
 		for ( j=0; j < dataset_dim; j++ )
 		{
 			km->dataset[i][j] = dataset[i][j];
 		}
 	}
 	km->dataset_size = dataset_size;
 	km->dataset_dim = dataset_dim;
 	km->k = K;
 	if ( !( km->clusters = (double***) calloc ( K, sizeof ( double** ))))
 	{
 		goto fail;
 	}
 	/* cluster_sizes is an array of int, not of int* */
 	if ( !( km->cluster_sizes = (int*) calloc ( K, sizeof ( int ))))
 	{
 		goto fail;
 	}
 	if ( !( km->centers = (double**) calloc ( K, sizeof ( double* ))))
 	{
 		goto fail;
 	}
 	for ( i=0; i < K; i++ )
 	{
 		if ( !( km->centers[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
 		{
 			goto fail;
 		}
 	}
 	__kmeans_init_centers ( km );
 	return km;
 fail:
 	/* Rows that were never allocated are still NULL, and free(NULL) is a no-op */
 	if ( km->centers )
 	{
 		for ( i=0; i < K; i++ )
 		{
 			free ( km->centers[i] );
 		}
 		free ( km->centers );
 	}
 	free ( km->cluster_sizes );
 	free ( km->clusters );
 	if ( km->dataset )
 	{
 		for ( i=0; i < dataset_size; i++ )
 		{
 			free ( km->dataset[i] );
 		}
 		free ( km->dataset );
 	}
 	free ( km );
 	return NULL;
 }		/* -----  end of function kmeans_new  ----- */
 /**
 * \brief  Function that performs a single step for k-means algorithm
 *
 * The step (1) discards the clusters built by the previous step, (2) assigns a
 * copy of every dataset element to the cluster whose center is nearest in
 * squared euclidean distance (ties go to the lowest cluster index) and
 * (3) recomputes every center as the mean of its cluster. The center of a
 * cluster that received no element is reset to the origin.
 * If an allocation fails the clusters built so far stay consistent with
 * cluster_sizes, so the object can still be stepped again or freed.
 *
 * \param  km 	k-means object
 * \return 0 if no changes were performed by this step, 1 otherwise, -1 in case of error
 */
 static int
 __kmeans_step ( kmeans_t *km )
 {
 	int i, j, k,
 	    best_center = 0,
 	    changed = 0;
 	double dist = 0.0,
 		  diff = 0.0,
 		  mean = 0.0,
 		  min_dist = DBL_MAX,
 		  *item = NULL,
 		  **grown = NULL;
 	/* Always reset every cluster: gating this on clusters[0] alone would leave
 	 * the other clusters untouched (and growing) whenever cluster 0 is empty */
 	for ( i=0; i < km->k; i++ )
 	{
 		for ( j=0; j < km->cluster_sizes[i]; j++ )
 		{
 			free ( km->clusters[i][j] );
 		}
 		free ( km->clusters[i] );
 		km->clusters[i] = NULL;
 		km->cluster_sizes[i] = 0;
 	}
 	/* Assignment: every element joins the cluster of its nearest center */
 	for ( i=0; i < km->dataset_size; i++ )
 	{
 		min_dist = DBL_MAX;
 		best_center = 0;
 		for ( j=0; j < km->k; j++ )
 		{
 			dist = 0.0;
 			for ( k=0; k < km->dataset_dim; k++ )
 			{
 				diff = km->dataset[i][k] - km->centers[j][k];
 				dist += diff * diff;
 			}
 			if ( dist < min_dist )
 			{
 				min_dist = dist;
 				best_center = j;
 			}
 		}
 		/* Grow through a temporary so the cluster is not lost if realloc fails */
 		if ( !( grown = (double**) realloc ( km->clusters[best_center],
 				(size_t) ( km->cluster_sizes[best_center] + 1 ) * sizeof ( double* ))))
 		{
 			return -1;
 		}
 		km->clusters[best_center] = grown;
 		if ( !( item = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
 		{
 			return -1;
 		}
 		for ( j=0; j < km->dataset_dim; j++ )
 		{
 			item[j] = km->dataset[i][j];
 		}
 		/* The size is bumped only once the new slot holds a valid element */
 		km->clusters[best_center][ km->cluster_sizes[best_center]++ ] = item;
 	}
 	/* Update: recompute each center coordinate and record whether it moved,
 	 * which removes the need for a copy of the old centers */
 	for ( i=0; i < km->k; i++ )
 	{
 		for ( j=0; j < km->dataset_dim; j++ )
 		{
 			mean = 0.0;
 			for ( k=0; k < km->cluster_sizes[i]; k++ )
 			{
 				mean += km->clusters[i][k][j];
 			}
 			if ( km->cluster_sizes[i] != 0 )
 			{
 				mean /= (double) km->cluster_sizes[i];
 			}
 			if ( mean != km->centers[i][j] )
 			{
 				changed = 1;
 			}
 			km->centers[i][j] = mean;
 		}
 	}
 	return changed;
 }		/* -----  end of function __kmeans_step  ----- */
 /**
 * \brief  Perform the k-means algorithm over a k-means object
 *
 * Steps are repeated while they keep reporting a change. The loop also stops
 * when a step reports an error (-1) instead of retrying it forever; in that
 * case the clusters stored in the object may be incomplete.
 *
 * \param  km 	k-means object; a NULL pointer is ignored
 */
 void
 kmeans ( kmeans_t *km )
 {
 	if ( !km )
 	{
 		return;
 	}
 	while ( __kmeans_step ( km ) > 0 );
 }		/* -----  end of function kmeans  ----- */
 /**
 * \brief  Score the current clustering: total distortion plus a penalty growing with the number of clusters
 *
 * The distortion is the sum, over every clustered element, of the squared
 * euclidean distance between the element and the center of its cluster. The
 * penalty added to it is k * log(dataset_size). Lower values are better.
 *
 * \param  km 	k-means object on which the clustering has already been run
 * \return The heuristic coefficient for the current number of clusters
 */
 static double
 __kmeans_heuristic_coefficient ( kmeans_t *km )
 {
 	int cluster, member, axis;
 	double sum_sq = 0.0;
 	for ( cluster=0; cluster < km->k; cluster++ )
 	{
 		const double *center = km->centers[cluster];
 		for ( member=0; member < km->cluster_sizes[cluster]; member++ )
 		{
 			const double *item = km->clusters[cluster][member];
 			for ( axis=0; axis < km->dataset_dim; axis++ )
 			{
 				const double delta = center[axis] - item[axis];
 				sum_sq += delta * delta;
 			}
 		}
 	}
 	return sum_sq + km->k * log ( km->dataset_size );
 }		/* -----  end of function __kmeans_heuristic_coefficient  ----- */
 /**
 * \brief  Remove a k-means object
 *
 * Releases the clusters, the cluster sizes, the centers, the private copy of
 * the dataset and finally the object itself. A NULL object and NULL members
 * are tolerated. The caller's pointer is dangling afterwards and must not be
 * used again.
 *
 * \param  km 	k-means object to be deallocated, may be NULL
 */
 void
 kmeans_free ( kmeans_t *km )
 {
 	int i, j;
 	if ( !km )
 	{
 		return;
 	}
 	if ( km->clusters )
 	{
 		for ( i=0; i < km->k; i++ )
 		{
 			/* cluster_sizes tells how many elements each cluster owns */
 			if ( km->clusters[i] && km->cluster_sizes )
 			{
 				for ( j=0; j < km->cluster_sizes[i]; j++ )
 				{
 					free ( km->clusters[i][j] );
 				}
 			}
 			free ( km->clusters[i] );
 		}
 		free ( km->clusters );
 		km->clusters = NULL;
 	}
 	free ( km->cluster_sizes );
 	km->cluster_sizes = NULL;
 	if ( km->centers )
 	{
 		for ( i=0; i < km->k; i++ )
 		{
 			free ( km->centers[i] );
 		}
 		free ( km->centers );
 		km->centers = NULL;
 	}
 	if ( km->dataset )
 	{
 		for ( i=0; i < km->dataset_size; i++ )
 		{
 			free ( km->dataset[i] );
 		}
 		free ( km->dataset );
 		km->dataset = NULL;
 	}
 	free ( km );
 }		/* -----  end of function kmeans_free  ----- */
 /**
 * \brief  Perform a k-means clustering over a dataset automatically choosing the best value of k using Schwarz's criterion
 *
 * Every k from 1 to dataset_size is tried: a k-means object is created and
 * clustered, and the one with the lowest heuristic coefficient is kept while
 * the others are freed. The returned object must be released with
 * kmeans_free().
 *
 * \param  dataset 		Dataset to be clustered
 * \param  dataset_size 	Number of elements in the dataset
 * \param  dataset_dim 	Dimension of each element of the dataset
 * \return Reference to the newly created k-means object, if successful, NULL otherwise (also when dataset_size is less than 1)
 */
 kmeans_t*
 kmeans_auto ( double **dataset, int dataset_size, int dataset_dim )
 {
 	int i;
 	double heuristic = 0.0,
 		  best_heuristic = DBL_MAX;
 	kmeans_t *km = NULL,
 		    *best_km = NULL;
 	for ( i=1; i <= dataset_size; i++ )
 	{
 		if ( !( km = kmeans_new ( dataset, dataset_size, dataset_dim, i )))
 		{
 			/* Do not leak the best candidate found so far */
 			if ( best_km )
 			{
 				kmeans_free ( best_km );
 			}
 			return NULL;
 		}
 		kmeans ( km );
 		heuristic = __kmeans_heuristic_coefficient ( km );
 		if ( heuristic < best_heuristic )
 		{
 			if ( best_km )
 			{
 				kmeans_free ( best_km );
 			}
 			best_km = km;
 			best_heuristic = heuristic;
 		} else {
 			kmeans_free ( km );
 		}
 	}
 	return best_km;
 }		/* -----  end of function kmeans_auto  ----- */
--- a/fkmeans/kmeans.h
+++ b/fkmeans/kmeans.h
@ -0,0 +1,52 @@
 /*
 * =====================================================================================
 *
 *       Filename:  kmeans.h
 *
 *    Description:  Header file for C k-means implementation
 *
 *        Version:  1.0
 *        Created:  12/11/2010 10:43:55
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
 *        Licence:  GNU GPL v.3
 *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
 *
 * =====================================================================================
 */
 #ifndef 	__KMEANS_H
 #define 	__KMEANS_H
 /**
 * State of a k-means clustering: the data set being clustered, the number of
 * clusters and, once the algorithm has run, the resulting clusters and centers
 */
 typedef struct __kmeans_t  {
 	/** Input data set: dataset_size vectors of dataset_dim doubles (kmeans_new() stores its own copy) */
 	double **dataset;
 	/** Number of elements in the data set */
 	int dataset_size;
 	/** Dimension of each element of the data set */
 	int dataset_dim;
 	/** Number of clusters */
 	int k;
 	/** Vector of k items containing the number of elements in each cluster */
 	int *cluster_sizes;
 	/** Clusters: clusters[i][j] is the j-th element (a vector of dataset_dim doubles) of the i-th cluster */
 	double ***clusters;
 	/** Coordinates of the centers of the clusters: k vectors of dataset_dim doubles */
 	double **centers;
 } kmeans_t;
 /** Create a k-means object holding a copy of the dataset, with K initial centers; returns NULL on failure. Release it with kmeans_free() */
 kmeans_t* kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K );
 /** Cluster the dataset trying every number of clusters from 1 to dataset_size and return the best scoring object, or NULL on failure. Release it with kmeans_free() */
 kmeans_t* kmeans_auto ( double **dataset, int dataset_size, int dataset_dim );
 /** Run the k-means algorithm on an object created by kmeans_new(), filling its clusters, cluster_sizes and centers fields */
 void kmeans ( kmeans_t *km );
 /** Release a k-means object together with its clusters, centers and dataset copy */
 void kmeans_free ( kmeans_t *km );
 #endif
--- a/mysql.c
+++ b/mysql.c
@ -48,18 +48,26 @@ __mysql_do_init ( MYSQL **__DB, BOOL is_out )
 		return (void*) *__DB;
 	if ( !( *__DB = (MYSQL*) malloc ( sizeof ( MYSQL ))))
 	{
 		return NULL;
 	}
 	if ( !( mysql_init ( *__DB )))
 	{
 		return NULL;
 	}
 	if ( is_out )
 	{
 		if ( !mysql_real_connect ( *__DB, config->outdbhost, config->outdbuser, config->outdbpass, NULL, 0, NULL, 0 ))
 		{
 			return NULL;
 		}
 		if ( mysql_select_db ( *__DB, config->outdbname ))
 		{
 			return NULL;
 		}
 	} else {
 		if ( !mysql_real_connect ( *__DB, config->dbhost, config->dbuser, config->dbpass, NULL, 0, NULL, 0 ))
 			return NULL;
--- a/neural.c
+++ b/neural.c
@ -37,21 +37,22 @@
 /** Enumeration for the input fields of the SOM neural network */
 enum  { som_src_ip, som_dst_ip, som_src_port, som_dst_port, som_time, som_gid, som_sid, som_rev, SOM_NUM_ITEMS };
-typedef struct  {
+PRIVATE time_t latest_serialization_time         = ( time_t ) 0;
-	unsigned int  gid;
+PRIVATE som_network_t *net                       = NULL;
-	unsigned int  sid;
+PRIVATE AI_alerts_per_neuron *alerts_per_neuron = NULL;
 	unsigned int  rev;
 	uint32_t      src_ip_addr;
 	uint32_t      dst_ip_addr;
 	uint16_t      src_port;
 	uint16_t      dst_port;
 	time_t        timestamp;
 } AI_som_alert_tuple;
 PRIVATE time_t latest_serialization_time  = ( time_t ) 0;
 PRIVATE som_network_t *net                = NULL;
 PRIVATE pthread_mutex_t neural_mutex;
 /**
 * \brief  Get the hash table containing the alerts associated to each output neuron
 *
 * The pointer returned is the module-level table itself, not a copy.
 *
 * \return The hash table (NULL while no entry has been stored yet)
 */
 AI_alerts_per_neuron*
 AI_get_alerts_per_neuron ( void )
 {
 	return alerts_per_neuron;
 }		/* -----  end of function AI_get_alerts_per_neuron  ----- */
 /**
 * \brief  Get the current weight of the neural correlation index using a hyperbolic tangent function with a parameter expressed in function of the current number of alerts in the database
 * \return The weight of the correlation index ( 0 <= weight < 1 )
@ -126,6 +127,11 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 		  x2 = 0,
 		  y2 = 0;
 	int i;
 	BOOL is_found = false;
 	AI_alerts_per_neuron *found = NULL;
 	AI_alerts_per_neuron_key key;
 	if ( !( input1 = (double*) alloca ( SOM_NUM_ITEMS * sizeof ( double ))))
 	{
 		AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
@ -136,24 +142,128 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 		AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 	}
 	pthread_mutex_lock ( &neural_mutex );
 	if ( !net )
 	{
 		pthread_mutex_unlock ( &neural_mutex );
 		return 0.0;
 	}
 	__AI_alert_to_som_data ( alert1, &input1 );
 	__AI_alert_to_som_data ( alert2, &input2 );
 	pthread_mutex_lock ( &neural_mutex );
 	som_set_inputs ( net, input1 );
 	som_get_best_neuron_coordinates ( net, &x1, &y1 );
 	__AI_alert_to_som_data ( alert2, &input2 );
 	som_set_inputs ( net, input2 );
 	som_get_best_neuron_coordinates ( net, &x2, &y2 );
 	pthread_mutex_unlock ( &neural_mutex );
 	/* Check if there are already entries in the hash table for these two neurons, otherwise
 	 * it creates them and append these two alerts */
 	key.x = x1;
 	key.y = y1;
 	HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
 	if ( !found )
 	{
 		if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
 		{
 			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 		}
 		found->key = key;
 		found->n_alerts = 1;
 		if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
 		{
 			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 		}
 		found->alerts[0] = alert1;
 		HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
 	} else {
 		is_found = false;
 		for ( i=0; i < found->n_alerts && !is_found; i++ )
 		{
 			if (
 				alert1.gid == found->alerts[i].gid &&
 				alert1.sid == found->alerts[i].sid &&
 				alert1.rev == found->alerts[i].rev &&
 				alert1.src_ip_addr == found->alerts[i].src_ip_addr &&
 				alert1.dst_ip_addr == found->alerts[i].dst_ip_addr &&
 				alert1.src_port == found->alerts[i].src_port &&
 				alert1.dst_port == found->alerts[i].dst_port )
 			{
 				is_found = true;
 			}
 		}
 		if ( !is_found )
 		{
 			if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
 							(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
 			{
 				AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 			}
 			found->alerts[ found->n_alerts - 1 ] = alert1;
 		}
 	}
 	key.x = x2;
 	key.y = y2;
 	HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
 	if ( !found )
 	{
 		if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
 		{
 			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 		}
 		found->key = key;
 		found->n_alerts = 1;
 		if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
 		{
 			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 		}
 		found->alerts[0] = alert2;
 		HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
 	} else {
 		is_found = false;
 		for ( i=0; i < found->n_alerts && !is_found; i++ )
 		{
 			if (
 				alert2.gid == found->alerts[i].gid &&
 				alert2.sid == found->alerts[i].sid &&
 				alert2.rev == found->alerts[i].rev &&
 				alert2.src_ip_addr == found->alerts[i].src_ip_addr &&
 				alert2.dst_ip_addr == found->alerts[i].dst_ip_addr &&
 				alert2.src_port == found->alerts[i].src_port &&
 				alert2.dst_port == found->alerts[i].dst_port )
 			{
 				is_found = true;
 			}
 		}
 		if ( !is_found )
 		{
 			if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
 				(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
 			{
 				AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 			}
 			found->alerts[ found->n_alerts - 1 ] = alert2;
 		}
 	}
 	/* Return the normalized euclidean distance in [0,1] (the normalization is made considering that the maximum distance
 	 * between two points on the output neurons matrix is the distance between the upper-left and bottom-right points) */
 	return sqrt ((double) ( (x2-x1)*(x2-x1) + (y2-y1)*(y2-y1) )) /
@ -170,9 +280,7 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 double
 AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert *b )
 {
-	size_t                 i = 0;
+	AI_som_alert_tuple t1, t2;
 	unsigned long long int time_sum = 0;
 	AI_som_alert_tuple     t1, t2;
 	t1.gid = a->gid;
 	t1.sid = a->sid;
@ -181,18 +289,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
 	t1.dst_ip_addr = ntohl ( a->ip_dst_addr );
 	t1.src_port = ntohs ( a->tcp_src_port );
 	t1.dst_port = ntohs ( a->tcp_dst_port );
-	time_sum = (unsigned long long int) a->timestamp;
+	t1.timestamp = a->timestamp;
 	/* The timestamp of this alert is computed like the average timestamp of the grouped alerts */
 	for ( i=1; i < a->grouped_alerts_count; i++ )
 	{
 		if ( a->grouped_alerts[i-1] )
 		{
 			time_sum += (unsigned long long int) a->grouped_alerts[i-1]->timestamp;
 		}
 	}
 	t1.timestamp = (time_t) ( time_sum / a->grouped_alerts_count );
 	t2.gid = b->gid;
 	t2.sid = b->sid;
@ -201,17 +298,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
 	t2.dst_ip_addr = ntohl ( b->ip_dst_addr );
 	t2.src_port = ntohs ( b->tcp_src_port );
 	t2.dst_port = ntohs ( b->tcp_dst_port );
-	time_sum = (unsigned long long int) b->timestamp;
+	t2.timestamp = b->timestamp;
 	for ( i=1; i < b->grouped_alerts_count; i++ )
 	{
 		if ( b->grouped_alerts[i-1] )
 		{
 			time_sum += (unsigned long long int) b->grouped_alerts[i-1]->timestamp;
 		}
 	}
 	t2.timestamp = (time_t) ( time_sum / b->grouped_alerts_count );
 	return __AI_som_alert_distance ( t1, t2 );
 }		/* -----  end of function AI_alert_neural_som_correlation  ----- */
@ -338,8 +425,9 @@ __AI_som_train ()
 void*
 AI_neural_thread ( void *arg )
 {
 	BOOL do_train = false;
 	struct stat st;
 	BOOL do_train = false;
 	pthread_t neural_clustering_thread;
 	pthread_mutex_init ( &neural_mutex, NULL );
@ -353,6 +441,14 @@ AI_neural_thread ( void *arg )
 		AI_fatal_err ( "AIPreproc: neural network thread launched but netfile option was not specified", __FILE__, __LINE__ );
 	}
 	if ( config->neuralClusteringInterval != 0 )
 	{
 		if ( pthread_create ( &neural_clustering_thread, NULL, AI_neural_clustering_thread, NULL ) != 0 )
 		{
 			AI_fatal_err ( "Failed to create the manual correlations parsing thread", __FILE__, __LINE__ );
 		}
 	}
 	while ( 1 )
 	{
 		if ( stat ( config->netfile, &st ) < 0 )
--- a/neural_cluster.c
+++ b/neural_cluster.c
@ -0,0 +1,194 @@
 /*
 * =====================================================================================
 *
 *       Filename:  neural_cluster.c
 *
 *    Description:  Perform the clusterization over the output layer of the SOM neural
 *                  network, in order to attempt to find the alerts belonging to the
 *                  same attack scenario. The clusterization is operated through k-means
 *                  using Schwarz criterion in order to find the optimal number of
 *                  clusters, the implementation is in fkmeans/
 *
 *        Version:  0.1
 *        Created:  19/11/2010 18:37:35
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
 *        Licence:  GNU GPL v.3
 *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
 *
 * =====================================================================================
 */
 #include	"spp_ai.h"
 /** \defgroup neural_cluster Module for clustering the alerts associated to the
 * neural network output layer in order to find alerts belonging to the same scenario
 * @{ */
 #include	"fkmeans/kmeans.h"
 #include	<stdio.h>
 #include	<stdlib.h>
 #include	<unistd.h>
 /**
 * \brief  Print the clusters associated to the SOM output to an XML log file
 * \param  km 				k-means object
 * \param  alerts_per_neuron 	Hash table containing the alerts associated to each neuron
 */
 PRIVATE void
 __AI_neural_clusters_to_xml ( kmeans_t *km, AI_alerts_per_neuron *alerts_per_neuron )
 {
 	int i, j, k, l, are_equal;
 	FILE *fp = NULL;
 	uint32_t src_addr = 0,
 		    dst_addr = 0;
 	char src_ip[INET_ADDRSTRLEN] = { 0 },
 		dst_ip[INET_ADDRSTRLEN] = { 0 };
 	AI_alerts_per_neuron_key key;
 	AI_alerts_per_neuron *alert_iterator = NULL;
 	if ( !( fp = fopen ( config->neural_clusters_log, "w" )))
 	{
 		AI_fatal_err ( "Unable to write on the neural clusters XML log file", __FILE__, __LINE__ );
 	}
 	fprintf ( fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"
 		"<clusters>\n" );
 	for ( i=0; i < km->k; i++ )
 	{
 		fprintf ( fp, "\t<cluster id=\"%d\">\n", i );
 		for ( j=0; j < km->cluster_sizes[i]; j++ )
 		{
 			key.x = km->clusters[i][j][0];
 			key.y = km->clusters[i][j][1];
 			HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), alert_iterator );
 			if ( alert_iterator )
 			{
 				for ( k=0; k < alert_iterator->n_alerts; k++ )
 				{
 					are_equal = 0;
 					for ( l=0; l < alert_iterator->n_alerts && !are_equal; l++ )
 					{
 						if ( k != l )
 						{
 							if (
 								alert_iterator->alerts[k].gid == alert_iterator->alerts[l].gid &&
 								alert_iterator->alerts[k].sid == alert_iterator->alerts[l].sid &&
 								alert_iterator->alerts[k].rev == alert_iterator->alerts[l].rev &&
 								alert_iterator->alerts[k].src_ip_addr == alert_iterator->alerts[l].src_ip_addr &&
 								alert_iterator->alerts[k].dst_ip_addr == alert_iterator->alerts[l].dst_ip_addr &&
 								alert_iterator->alerts[k].src_port == alert_iterator->alerts[l].src_port &&
 								alert_iterator->alerts[k].dst_port == alert_iterator->alerts[l].dst_port &&
 								alert_iterator->alerts[k].timestamp == alert_iterator->alerts[l].timestamp )
 							{
 								are_equal = 1;
 							}
 						}
 					}
 					if ( !are_equal )
 					{
 						src_addr = htonl ( alert_iterator->alerts[k].src_ip_addr );
 						dst_addr = htonl ( alert_iterator->alerts[k].dst_ip_addr );
 						inet_ntop ( AF_INET, &src_addr, src_ip, INET_ADDRSTRLEN );
 						inet_ntop ( AF_INET, &dst_addr, dst_ip, INET_ADDRSTRLEN );
 						fprintf ( fp, "\t\t<alert gid=\"%d\" sid=\"%d\" rev=\"%d\" src_ip=\"%s\" src_port=\"%d\" "
 							"dst_ip=\"%s\" dst_port=\"%d\" timestamp=\"%lu\" xcoord=\"%d\" ycoord=\"%d\"/>\n",
 							alert_iterator->alerts[k].gid,
 							alert_iterator->alerts[k].sid,
 							alert_iterator->alerts[k].rev,
 							src_ip, alert_iterator->alerts[k].src_port,
 							dst_ip, alert_iterator->alerts[k].dst_port,
 							alert_iterator->alerts[k].timestamp,
 							alert_iterator->key.x, alert_iterator->key.y );
 					}
 				}
 			}
 		}
 		fprintf ( fp, "\t</cluster>\n" );
 	}
 	fprintf ( fp, "</clusters>\n" );
 	fclose ( fp );
 }		/* -----  end of function __AI_neural_clusters_to_xml  ----- */
 /**
 * \brief  Thread that performs the k-means clustering over the output layer of
 * the SOM neural network
 */
 void*
 AI_neural_clustering_thread ( void *arg )
 {
 	AI_alerts_per_neuron *alerts_per_neuron = NULL,
 					 *alert_iterator    = NULL;
 	kmeans_t *km = NULL;
 	double **dataset = NULL;
 	int i, dataset_size = 0;
 	while ( 1 )
 	{
 		dataset = NULL;
 		dataset_size = 0;
 		alerts_per_neuron = AI_get_alerts_per_neuron();
 		for ( alert_iterator = alerts_per_neuron; alert_iterator; alert_iterator = (AI_alerts_per_neuron*) alert_iterator->hh.next )
 		{
 			if ( alert_iterator->n_alerts > 0 )
 			{
 				if ( !( dataset = (double**) realloc ( dataset, (++dataset_size) * sizeof ( double* ))))
 				{
 					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 				}
 				if ( !( dataset[dataset_size-1] = (double*) calloc ( 2, sizeof ( double ))))
 				{
 					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 				}
 				dataset[dataset_size-1][0] = (double) alert_iterator->key.x;
 				dataset[dataset_size-1][1] = (double) alert_iterator->key.y;
 			}
 		}
 		if ( dataset && dataset_size != 0 )
 		{
 			if ( !( km = kmeans_auto ( dataset, dataset_size, 2 )))
 			{
 				AI_fatal_err ( "Unable to initialize the k-means clustering object", __FILE__, __LINE__ );
 			}
 			__AI_neural_clusters_to_xml ( km, alerts_per_neuron );
 			kmeans_free ( km );
 			for ( i=0; i < dataset_size; i++ )
 			{
 				free ( dataset[i] );
 			}
 			free ( dataset );
 		}
 		sleep ( config->neuralClusteringInterval );
 	}
 	pthread_exit ((void*) 0);
 	return (void*) 0;
 }		/* -----  end of function AI_neural_clustering_thread  ----- */
 /** @} */
--- a/spp_ai.c
+++ b/spp_ai.c
@ -172,14 +172,14 @@ static AI_config * AI_parse(char *args)
 {
 	char *arg;
 	char *match;
-	char alertfile[1024]          = { 0 },
+	char alertfile[1024]           = { 0 },
-		alert_history_file[1024] = { 0 },
+		alert_history_file[1024]  = { 0 },
-		clusterfile[1024]        = { 0 },
+		clusterfile[1024]         = { 0 },
-		corr_alerts_dir[1024]    = { 0 },
+		corr_alerts_dir[1024]     = { 0 },
-		corr_modules_dir[1024]   = { 0 },
+		corr_modules_dir[1024]    = { 0 },
-		corr_rules_dir[1024]     = { 0 },
+		corr_rules_dir[1024]      = { 0 },
-		webserv_dir[1024]        = { 0 },
+		webserv_dir[1024]         = { 0 },
-		webserv_banner[1024]     = { 0 };
+		webserv_banner[1024]      = { 0 };
 	char **matches       = NULL;
 	int  nmatches        = 0;
@ -217,6 +217,7 @@ static AI_config * AI_parse(char *args)
 			     correlation_graph_interval           = 0,
 			     database_parsing_interval            = 0,
 				manual_correlations_parsing_interval = 0,
 				neural_clustering_interval           = 0,
 				neural_network_training_interval     = 0,
 				neural_train_steps                   = 0,
 				output_neurons_per_side              = 0,
@ -526,6 +527,27 @@ static AI_config * AI_parse(char *args)
 	config->neuralNetworkTrainingInterval = neural_network_training_interval;
 	_dpd.logMsg( "    Neural network training interval: %u\n", config->neuralNetworkTrainingInterval );
 	/* Parsing the neural_clustering_interval option */
 	if (( arg = (char*) strcasestr( args, "neural_clustering_interval" ) ))
 	{
 		for ( arg += strlen("neural_clustering_interval");
 				*arg && (*arg < '0' || *arg > '9');
 				arg++ );
 		if ( !(*arg) )
 		{
 			AI_fatal_err ( "neural_clustering_interval option used but "
 				"no value specified", __FILE__, __LINE__ );
 		}
 		neural_clustering_interval = strtoul ( arg, NULL, 10 );
 	} else {
 		neural_clustering_interval = DEFAULT_NEURAL_CLUSTERING_INTERVAL;
 	}
 	config->neuralClusteringInterval = neural_clustering_interval;
 	_dpd.logMsg( "    Neural network clustering interval: %u\n", config->neuralClusteringInterval );
 	/* Parsing the output_neurons_per_side option */
 	if (( arg = (char*) strcasestr( args, "output_neurons_per_side" ) ))
 	{
@ -796,6 +818,9 @@ static AI_config * AI_parse(char *args)
 	_dpd.logMsg("    webserv_dir: %s\n", config->webserv_dir);
 	snprintf ( config->neural_clusters_log, sizeof ( config->neural_clusters_log ), "%s/neural_clusters.xml", config->webserv_dir );
 	_dpd.logMsg("    neural_clusters_log: %s\n", config->neural_clusters_log);
 	/* Parsing the corr_modules_dir option */
 	if (( arg = (char*) strcasestr( args, "corr_modules_dir" ) ))
 	{
--- a/spp_ai.h
+++ b/spp_ai.h
@ -81,6 +81,11 @@
 * alert correlations and the next one (this value should usually be high) */
 #define 	DEFAULT_NEURAL_NETWORK_TRAINING_INTERVAL 	43200
 /** Default interval in seconds between an execution of the thread that attempts to cluster
 * the output layer of the neural network searching for alerts belonging to the same
 * attack scenario and the next one */
 #define 	DEFAULT_NEURAL_CLUSTERING_INTERVAL 		1200
 /** Default interval of validity in seconds for an entry in the cache of correlated alerts */
 #define 	DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 	600
@ -193,6 +198,11 @@ typedef struct
 	/** Interval in seconds between an invocation of the thread for parsing XML manual correlations and the next one */
 	unsigned long  manualCorrelationsParsingInterval;
 	/** Interval in seconds between an execution of the thread that attempts to cluster
 	 * the output layer of the neural network searching for alerts belonging to the same
 	 * attack scenario and the next one */
 	unsigned long  neuralClusteringInterval;
 	/** Interval in seconds for which an entry in the cache of correlated alerts is valid */
 	unsigned long  bayesianCorrelationCacheValidity;
@ -256,6 +266,9 @@ typedef struct
 	/** File keeping the serialized neural network used for the alert correlation */
 	char          netfile[1024];
 	/** File containing the likely clusters computed over the output layer of the neural network */
 	char          neural_clusters_log[1024];
 	/** Database name, if database logging is used */
 	char          dbname[256];
@ -451,6 +464,34 @@ typedef struct  {
 	UT_hash_handle            hh;
 } AI_alert_correlation;
 /*****************************************************************/
 /** Expresses an alert as a numerical tuple manageable by a neural network */
 typedef struct  {
 	/* NOTE(review): gid, sid and rev look like the rule identifiers of the alert
 	 * (generator ID, signature ID, revision) - confirm against the code filling the tuple */
 	unsigned int  gid;
 	unsigned int  sid;
 	unsigned int  rev;
 	/** Source address of the alert as a 32-bit value (byte order not visible here - TODO confirm) */
 	uint32_t      src_ip_addr;
 	/** Destination address of the alert as a 32-bit value (byte order not visible here - TODO confirm) */
 	uint32_t      dst_ip_addr;
 	/** Source port of the alert as a 16-bit value */
 	uint16_t      src_port;
 	/** Destination port of the alert as a 16-bit value */
 	uint16_t      dst_port;
 	/** Timestamp of the alert; it is written out by the neural clusters XML dump */
 	time_t        timestamp;
 } AI_som_alert_tuple;
 /*****************************************************************/
 /** Key for the AI_alerts_per_neuron hash table */
 typedef struct  {
 	/** First coordinate of the point; used by the clustering thread as component 0 of the k-means vector */
 	int x;
 	/** Second coordinate of the point; used by the clustering thread as component 1 of the k-means vector */
 	int y;
 } AI_alerts_per_neuron_key;
 /*****************************************************************/
 /** Struct that holds, for each point of the output layer, the list of associated alerts
 * for easily performing the clustering algorithm */
 typedef struct  {
 	/** Coordinates (x,y) of the point, acting as key of the hash table entry */
 	AI_alerts_per_neuron_key  key;
 	/** Array of the alerts associated to the point, indexed from 0
 	 * (presumably holding n_alerts elements - TODO confirm against the code filling it) */
 	AI_som_alert_tuple        *alerts;
 	/** Number of alerts for the point; the clustering thread skips the entries where it is not > 0 */
 	int                       n_alerts;
 	/** Hash handle making the struct usable as a hash table entry; hh.next is followed to walk the entries */
 	UT_hash_handle            hh;
 } AI_alerts_per_neuron;
 /*****************************************************************/
 /** Enumeration for describing the table in the output database */
 enum  { ALERTS_TABLE, IPV4_HEADERS_TABLE, TCP_HEADERS_TABLE, PACKET_STREAMS_TABLE, CLUSTERED_ALERTS_TABLE, CORRELATED_ALERTS_TABLE, N_TABLES };
@ -513,6 +554,8 @@ void                   AI_outdb_mutex_initialize ();
 void*                  AI_store_alert_to_db_thread ( void* );
 void*                  AI_store_cluster_to_db_thread ( void* );
 void*                  AI_store_correlation_to_db_thread ( void* );
 void*                  AI_neural_clustering_thread ( void* );
 AI_alerts_per_neuron*  AI_get_alerts_per_neuron ();
 double(**AI_get_corr_functions ( size_t* ))(const AI_snort_alert*, const AI_snort_alert*);
 double(**AI_get_corr_weights ( size_t* ))();
		`@ -0,0 +1,3 @@`
							`all:`
							`gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm`