k-means clustering for SOM output

2025-01-27 08:20:28 +01:00 · 2010-11-20 16:47:57 +01:00 · 2010-11-20 16:47:57 +01:00 · d41753a8a4
commit d41753a8a4
parent ec196b3968
12 changed files with 2650 additions and 54 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,10 +25,12 @@ bayesian.c \
 cluster.c \
 correlation.c \
 db.c \
+fkmeans/kmeans.c \
 fsom/fsom.c \
 modules.c \
 mysql.c \
 neural.c \
+neural_cluster.c \
 outdb.c \
 postgresql.c \
 regex.c \
--- a/Makefile.in
+++ b/Makefile.in
@ -84,8 +84,10 @@ am_libsf_ai_preproc_la_OBJECTS = libsf_ai_preproc_la-alert_history.lo \
 	libsf_ai_preproc_la-cencode.lo libsf_ai_preproc_la-bayesian.lo \
 	libsf_ai_preproc_la-cluster.lo \
 	libsf_ai_preproc_la-correlation.lo libsf_ai_preproc_la-db.lo \
-	libsf_ai_preproc_la-fsom.lo libsf_ai_preproc_la-modules.lo \
-	libsf_ai_preproc_la-mysql.lo libsf_ai_preproc_la-neural.lo \
+	libsf_ai_preproc_la-kmeans.lo libsf_ai_preproc_la-fsom.lo \
+	libsf_ai_preproc_la-modules.lo libsf_ai_preproc_la-mysql.lo \
+	libsf_ai_preproc_la-neural.lo \
+	libsf_ai_preproc_la-neural_cluster.lo \
 	libsf_ai_preproc_la-outdb.lo libsf_ai_preproc_la-postgresql.lo \
 	libsf_ai_preproc_la-regex.lo libsf_ai_preproc_la-spp_ai.lo \
 	libsf_ai_preproc_la-stream.lo libsf_ai_preproc_la-webserv.lo
@ -267,10 +269,12 @@ bayesian.c \
 cluster.c \
 correlation.c \
 db.c \
+fkmeans/kmeans.c \
 fsom/fsom.c \
 modules.c \
 mysql.c \
 neural.c \
+neural_cluster.c \
 outdb.c \
 postgresql.c \
 regex.c \
@ -416,6 +420,9 @@ libsf_ai_preproc_la-correlation.lo: correlation.c
 libsf_ai_preproc_la-db.lo: db.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-db.lo `test -f 'db.c' || echo '$(srcdir)/'`db.c

+libsf_ai_preproc_la-kmeans.lo: fkmeans/kmeans.c
+	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-kmeans.lo `test -f 'fkmeans/kmeans.c' || echo '$(srcdir)/'`fkmeans/kmeans.c
+
 libsf_ai_preproc_la-fsom.lo: fsom/fsom.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-fsom.lo `test -f 'fsom/fsom.c' || echo '$(srcdir)/'`fsom/fsom.c

@ -428,6 +435,9 @@ libsf_ai_preproc_la-mysql.lo: mysql.c
 libsf_ai_preproc_la-neural.lo: neural.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural.lo `test -f 'neural.c' || echo '$(srcdir)/'`neural.c

+libsf_ai_preproc_la-neural_cluster.lo: neural_cluster.c
+	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural_cluster.lo `test -f 'neural_cluster.c' || echo '$(srcdir)/'`neural_cluster.c
+
 libsf_ai_preproc_la-outdb.lo: outdb.c
 	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-outdb.lo `test -f 'outdb.c' || echo '$(srcdir)/'`outdb.c

--- a/fkmeans/Doxyfile
+++ b/fkmeans/Doxyfile
--- a/fkmeans/Makefile
+++ b/fkmeans/Makefile
@ -0,0 +1,3 @@
+# Build the stand-alone demo program (kmeans-test) from test.c and kmeans.c
+all:
+	gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm
+
--- a/fkmeans/README
+++ b/fkmeans/README
@ -0,0 +1,88 @@
+fkmeans is a tiny C library that allows you to perform k-means clustering
+algorithm over arbitrary sets of n-dimensional data. All you need to do is:
+
+- Include the file kmeans.h in your sources;
+
+- Consider your data set as a vector of vectors of double items (double**),
+  where each vector is an n-dimensional item of your data set;
+
+- If you want to perform the k-means algorithm over your data and you already
+  know the number k of clusters there contained, or its estimate, you want to
+  execute some code like this (in this example, the data set is 3-dimensional,
+  i.e. it contains N vectors whose size is 3, and we know it contains n_clus
+  clusters):
+
+    kmeans_t *km;
+    double **dataset;
+    ...
+    km = kmeans_new ( dataset, N, 3, n_clus );
+    kmeans ( km );
+    ...
+    kmeans_free ( km );
+
+  If you don't already know the number of clusters contained in your data set,
+  you can use the function kmeans_auto() to automatically attempt to find
+  the best one using Schwarz's criterion. Be careful, this operation can be very
+  slow, especially when executed on data sets with many elements. The example
+  above would simply become something like:
+
+    kmeans_t *km;
+    double **dataset;
+    ...
+    km = kmeans_auto ( dataset, N, 3 );
+    ...
+    kmeans_free ( km );
+
+- Once the clustering has been performed, the clusters of data can be simply
+  accessed from your kmeans_t* structure, as they are held as a double*** field
+  named "clusters". Each vector in this structure represents a cluster, whose
+  size is specified in the field cluster_sizes[i] of the structure. Each cluster
+  contains the items that form it, each of which is an n-dimensional vector. The
+  number of clusters is specified in the field "k" of the structure, the
+  number of dimensions of each element is specified in the field "dataset_dim"
+  and the number of elements in the original data set is specified in the field
+  "dataset_size". So, for example:
+
+    for ( i=0; i < km->k; i++ )
+    {
+	    printf ( "cluster %d: [ ", i );
+
+	    for ( j=0; j < km->cluster_sizes[i]; j++ )
+	    {
+		    printf ( "(" );
+
+		    for ( k=0; k < km->dataset_dim; k++ )
+		    {
+			    printf ( "%f, ", km->clusters[i][j][k] );
+		    }
+
+		    printf ( "), ");
+		}
+
+	    printf ( "]\n" );
+	}
+
+  The library however already comes with a sample implementation, contained in
+  "test.c", and typing "make" this example will be built. This example takes 0,
+  1, 2 or 3 command-line arguments, in format
+
+  $ ./kmeans-test [num_elements] [min_value] [max_value]
+
+  and randomly generates a 2-dimensional data set containing num_elements, whose
+  coordinates are between min_value and max_value. The clustering is then
+  performed and the results are shown on stdout, with the clusters coloured in
+  different ways;
+
+- After you write your source, remember to include the file "kmeans.c",
+  containing the implementation of the library, in the list of your sources
+  files;
+
+- That's all. Include "kmeans.h", write your code using
+  kmeans_new()+kmeans()+kmeans_free() or kmeans_auto()+kmeans_free(), explore
+  your clusters, remember to include "kmeans.c" in the list of your source
+  files, and you're ready for k-means clustering.
+
+Author: Fabio "BlackLight" Manganiello,
+        <blacklight@autistici.org>,
+        http://0x00.ath.cx
+
--- a/fkmeans/kmeans.c
+++ b/fkmeans/kmeans.c
@ -0,0 +1,445 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  kmeans.c
+ *
+ *    Description:  k-means clusterization algorithm implementation in C
+ *
+ *        Version:  1.0
+ *        Created:  12/11/2010 10:43:28
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
+ *        Licence:  GNU GPL v.3
+ *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
+ *
+ * =====================================================================================
+ */
+
+#include	"kmeans.h"
+
+#include	<alloca.h>
+#include	<float.h>
+#include	<limits.h>
+#include	<math.h>
+#include	<stdio.h>
+#include	<stdlib.h>
+
+/**
+ * \brief  Initialize the centers of the clusters taking the K most distant elements in the dataset
+ *
+ * The first center is the element with the largest squared norm. Every
+ * following center is the not-yet-chosen element whose summed squared distance
+ * from all the centers chosen so far is the largest. If the bookkeeping array
+ * cannot be allocated the function returns silently after setting the first
+ * center; the remaining centers keep the values they had on entry.
+ *
+ * \param  km 	k-means object (dataset, centers, k, dataset_size and dataset_dim must already be set)
+ */
+
+static void
+__kmeans_init_centers ( kmeans_t *km )
+{
+	int i, j, k, l,
+	    index_found = 0,
+	    max_index = 0,
+	    assigned_centers = 0,
+	    *assigned_centers_indexes = NULL;
+
+	double dist = 0.0,
+		  max_dist = 0.0;
+
+	/* Without at least one cluster and one element there is nothing to pick */
+	if ( km->k <= 0 || km->dataset_size <= 0 )
+	{
+		return;
+	}
+
+	/* First center: the element with the largest squared norm */
+	for ( i=0; i < km->dataset_size; i++ )
+	{
+		dist = 0.0;
+
+		for ( j=0; j < km->dataset_dim; j++ )
+		{
+			dist += ( km->dataset[i][j] ) * ( km->dataset[i][j] );
+		}
+
+		if ( dist > max_dist )
+		{
+			max_dist = dist;
+			max_index = i;
+		}
+	}
+
+	for ( i=0; i < km->dataset_dim; i++ )
+	{
+		km->centers[0][i] = km->dataset[max_index][i];
+	}
+
+	/* Exactly k indexes are recorded, so the array is allocated once instead of
+	 * being grown with realloc() (whose failure used to leak the old buffer) */
+	if ( !( assigned_centers_indexes = (int*) malloc ( (size_t) km->k * sizeof ( int ))))
+	{
+		return;
+	}
+
+	assigned_centers_indexes[ assigned_centers++ ] = max_index;
+
+	for ( i=1; i < km->k; i++ )
+	{
+		max_dist = 0.0;
+		max_index = 0;
+
+		for ( j=0; j < km->dataset_size; j++ )
+		{
+			/* Skip the elements already used as a center */
+			index_found = 0;
+
+			for ( k=0; k < assigned_centers && !index_found; k++ )
+			{
+				if ( assigned_centers_indexes[k] == j )
+				{
+					index_found = 1;
+				}
+			}
+
+			if ( index_found )
+				continue;
+
+			/* Summed squared distance from every center chosen so far */
+			dist = 0.0;
+
+			for ( k=0; k < assigned_centers; k++ )
+			{
+				for ( l=0; l < km->dataset_dim; l++ )
+				{
+					dist += ( km->dataset[j][l] - km->centers[k][l] ) * ( km->dataset[j][l] - km->centers[k][l] );
+				}
+			}
+
+			if ( dist > max_dist )
+			{
+				max_dist = dist;
+				max_index = j;
+			}
+		}
+
+		/* If no candidate had a positive distance, max_index is still 0 and
+		 * element 0 is reused as the center */
+		for ( j=0; j < km->dataset_dim; j++ )
+		{
+			km->centers[i][j] = km->dataset[max_index][j];
+		}
+
+		assigned_centers_indexes[ assigned_centers++ ] = max_index;
+	}
+
+	free ( assigned_centers_indexes );
+}		/* -----  end of function kmeans_init_centers  ----- */
+
+/**
+ * \brief  Create a new k-means object
+ *
+ * The object stores its own copy of the dataset, so the caller keeps ownership
+ * of the vectors passed in. The initial centers are chosen before returning.
+ * The returned object must be released with kmeans_free().
+ *
+ * \param  dataset 		Dataset to be clustered
+ * \param  dataset_size 	Number of elements in the dataset (must be > 0)
+ * \param  dataset_dim 	Dimension of each element of the dataset (must be > 0)
+ * \param  K 			Number of clusters (must be > 0)
+ * \return Reference to the newly created k-means object if successful, NULL on invalid arguments or allocation failure
+ */
+
+kmeans_t*
+kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K )
+{
+	int i, j;
+	kmeans_t *km = NULL;
+
+	if ( !dataset || dataset_size <= 0 || dataset_dim <= 0 || K <= 0 )
+	{
+		return NULL;
+	}
+
+	/* Zero-filled so that every pointer field starts as NULL and the failure
+	 * path below can release whatever has been allocated so far */
+	if ( !( km = (kmeans_t*) calloc ( 1, sizeof ( kmeans_t ))))
+	{
+		return NULL;
+	}
+
+	km->dataset_size = dataset_size;
+	km->dataset_dim = dataset_dim;
+	km->k = K;
+
+	if ( !( km->dataset = (double**) calloc ( dataset_size, sizeof ( double* ))))
+	{
+		goto fail;
+	}
+
+	for ( i=0; i < dataset_size; i++ )
+	{
+		if ( !( km->dataset[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
+		{
+			goto fail;
+		}
+
+		for ( j=0; j < dataset_dim; j++ )
+		{
+			km->dataset[i][j] = dataset[i][j];
+		}
+	}
+
+	if ( !( km->clusters = (double***) calloc ( K, sizeof ( double** ))))
+	{
+		goto fail;
+	}
+
+	/* cluster_sizes is an array of int, not of int* */
+	if ( !( km->cluster_sizes = (int*) calloc ( K, sizeof ( int ))))
+	{
+		goto fail;
+	}
+
+	if ( !( km->centers = (double**) calloc ( K, sizeof ( double* ))))
+	{
+		goto fail;
+	}
+
+	for ( i=0; i < K; i++ )
+	{
+		if ( !( km->centers[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
+		{
+			goto fail;
+		}
+	}
+
+	__kmeans_init_centers ( km );
+	return km;
+
+fail:
+	/* Release the partially built object; entries never reached are still NULL */
+	if ( km->centers )
+	{
+		for ( i=0; i < K; i++ )
+		{
+			free ( km->centers[i] );
+		}
+
+		free ( km->centers );
+	}
+
+	free ( km->cluster_sizes );
+	free ( km->clusters );
+
+	if ( km->dataset )
+	{
+		for ( i=0; i < dataset_size; i++ )
+		{
+			free ( km->dataset[i] );
+		}
+
+		free ( km->dataset );
+	}
+
+	free ( km );
+	return NULL;
+}		/* -----  end of function kmeans_new  ----- */
+
+/**
+ * \brief  Function that performs a single step for k-means algorithm
+ *
+ * The step discards the clusters built by the previous step, assigns every
+ * element of the dataset to its nearest center (ties go to the lowest cluster
+ * index), and recomputes each center as the mean of its cluster. A cluster
+ * that received no element gets an all-zero center.
+ *
+ * \param  km 	k-means object
+ * \return 0 if no changes were performed by this step, 1 otherwise, -1 in case of error
+ */
+
+static int
+__kmeans_step ( kmeans_t *km )
+{
+	int i, j, k,
+	    best_center = 0,
+	    changed = 0;
+
+	size_t n_coords = 0;
+
+	double dist = 0.0,
+		  min_dist = DBL_MAX,
+		  *old_centers = NULL,
+		  *item = NULL,
+		  **grown = NULL;
+
+	/* Always drop the previous assignment. Testing only clusters[0] would skip
+	 * the reset whenever cluster 0 happened to be empty, leaving stale items
+	 * (and sizes) in the other clusters. free(NULL) is a no-op. */
+	for ( i=0; i < km->k; i++ )
+	{
+		for ( j=0; j < km->cluster_sizes[i]; j++ )
+		{
+			free ( km->clusters[i][j] );
+		}
+
+		free ( km->clusters[i] );
+		km->clusters[i] = NULL;
+		km->cluster_sizes[i] = 0;
+	}
+
+	/* Snapshot of the current centers, kept on the heap as one flat k * dim
+	 * array so that a large k cannot exhaust the stack */
+	n_coords = (size_t) km->k * (size_t) km->dataset_dim;
+
+	if ( !( old_centers = (double*) malloc (( n_coords ? n_coords : 1 ) * sizeof ( double ))))
+	{
+		return -1;
+	}
+
+	for ( i=0; i < km->k; i++ )
+	{
+		for ( j=0; j < km->dataset_dim; j++ )
+		{
+			old_centers[ (size_t) i * km->dataset_dim + j ] = km->centers[i][j];
+		}
+	}
+
+	/* Assign each element to the cluster of its nearest center */
+	for ( i=0; i < km->dataset_size; i++ )
+	{
+		min_dist = DBL_MAX;
+		best_center = 0;
+
+		for ( j=0; j < km->k; j++ )
+		{
+			dist = 0.0;
+
+			for ( k=0; k < km->dataset_dim; k++ )
+			{
+				dist += ( km->dataset[i][k] - km->centers[j][k] ) * ( km->dataset[i][k] - km->centers[j][k] );
+			}
+
+			if ( dist < min_dist )
+			{
+				min_dist = dist;
+				best_center = j;
+			}
+		}
+
+		if ( !( item = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
+		{
+			free ( old_centers );
+			return -1;
+		}
+
+		/* Grow through a temporary so the cluster stays valid (and its size
+		 * consistent) if realloc() fails */
+		if ( !( grown = (double**) realloc ( km->clusters[best_center],
+						( (size_t) km->cluster_sizes[best_center] + 1 ) * sizeof ( double* ))))
+		{
+			free ( item );
+			free ( old_centers );
+			return -1;
+		}
+
+		km->clusters[best_center] = grown;
+
+		for ( j=0; j < km->dataset_dim; j++ )
+		{
+			item[j] = km->dataset[i][j];
+		}
+
+		grown[ km->cluster_sizes[best_center]++ ] = item;
+	}
+
+	/* Move each center to the mean of the elements assigned to it */
+	for ( i=0; i < km->k; i++ )
+	{
+		for ( j=0; j < km->dataset_dim; j++ )
+		{
+			km->centers[i][j] = 0.0;
+
+			for ( k=0; k < km->cluster_sizes[i]; k++ )
+			{
+				km->centers[i][j] += km->clusters[i][k][j];
+			}
+
+			if ( km->cluster_sizes[i] != 0 )
+			{
+				km->centers[i][j] /= (double) km->cluster_sizes[i];
+			}
+		}
+	}
+
+	/* The step changed something if any coordinate of any center moved */
+	for ( i=0; i < km->k && !changed; i++ )
+	{
+		for ( j=0; j < km->dataset_dim && !changed; j++ )
+		{
+			if ( km->centers[i][j] != old_centers[ (size_t) i * km->dataset_dim + j ] )
+			{
+				changed = 1;
+			}
+		}
+	}
+
+	free ( old_centers );
+	return changed;
+}		/* -----  end of function __kmeans_step  ----- */
+
+/**
+ * \brief  Perform the k-means algorithm over a k-means object
+ *
+ * Steps are repeated until one of them leaves every center unchanged. The
+ * iteration also stops if a step reports an error (-1); looping on "!= 0"
+ * would retry a failing step forever.
+ *
+ * \param  km 	k-means object
+ */
+
+void
+kmeans ( kmeans_t *km )
+{
+	while ( __kmeans_step ( km ) > 0 );
+}		/* -----  end of function kmeans  ----- */
+
+/**
+ * \brief  Score the current clustering: total squared distance of every clustered item from its center, plus k * log(dataset_size) as a penalty on the number of clusters (Schwarz's criterion)
+ * \param  km 	k-means object, already clustered
+ * \return Real value expressing how well that number of clusters models the dataset
+ */
+
+static double
+__kmeans_heuristic_coefficient ( kmeans_t *km )
+{
+	int cluster, item, axis;
+	double delta = 0.0,
+		  sum_sq = 0.0;
+
+	for ( cluster=0; cluster < km->k; cluster++ )
+	{
+		const double *center = km->centers[cluster];
+
+		for ( item=0; item < km->cluster_sizes[cluster]; item++ )
+		{
+			const double *point = km->clusters[cluster][item];
+
+			for ( axis=0; axis < km->dataset_dim; axis++ )
+			{
+				delta = center[axis] - point[axis];
+				sum_sq += delta * delta;
+			}
+		}
+	}
+
+	return sum_sq + km->k * log ( km->dataset_size );
+}		/* -----  end of function __kmeans_heuristic_coefficient  ----- */
+
+/**
+ * \brief  Remove a k-means object
+ *
+ * Releases the clusters, the cluster sizes, the centers, the private copy of
+ * the dataset and the object itself. A NULL object, or an object whose arrays
+ * are still NULL, is accepted. The caller's pointer is not modified and must
+ * not be used afterwards.
+ *
+ * \param  km 	k-means object to be deallocated
+ */
+
+void
+kmeans_free ( kmeans_t *km )
+{
+	int i, j;
+
+	if ( !km )
+	{
+		return;
+	}
+
+	if ( km->clusters )
+	{
+		for ( i=0; i < km->k; i++ )
+		{
+			/* An empty cluster has no item array to walk */
+			if ( km->clusters[i] && km->cluster_sizes )
+			{
+				for ( j=0; j < km->cluster_sizes[i]; j++ )
+				{
+					free ( km->clusters[i][j] );
+				}
+			}
+
+			free ( km->clusters[i] );
+		}
+
+		free ( km->clusters );
+	}
+
+	free ( km->cluster_sizes );
+
+	if ( km->centers )
+	{
+		for ( i=0; i < km->k; i++ )
+		{
+			free ( km->centers[i] );
+		}
+
+		free ( km->centers );
+	}
+
+	if ( km->dataset )
+	{
+		for ( i=0; i < km->dataset_size; i++ )
+		{
+			free ( km->dataset[i] );
+		}
+
+		free ( km->dataset );
+	}
+
+	free ( km );
+}		/* -----  end of function kmeans_free  ----- */
+
+/**
+ * \brief  Perform a k-means clustering over a dataset automatically choosing the best value of k using Schwarz's criterion
+ *
+ * Every k from 1 to dataset_size is tried: a fresh k-means object is built and
+ * clustered, and the one with the lowest heuristic coefficient is kept (the
+ * first one wins on ties). The cost therefore grows quickly with dataset_size.
+ * The returned object must be released with kmeans_free().
+ *
+ * \param  dataset 		Dataset to be clustered
+ * \param  dataset_size 	Number of elements in the dataset
+ * \param  dataset_dim 	Dimension of each element of the dataset
+ * \return Reference to the newly created k-means object if successful, NULL otherwise (including when dataset_size is less than 1)
+ */
+
+kmeans_t*
+kmeans_auto ( double **dataset, int dataset_size, int dataset_dim )
+{
+	int i;
+
+	double heuristic = 0.0,
+		  best_heuristic = DBL_MAX;
+
+	kmeans_t *km = NULL,
+		    *best_km = NULL;
+
+	for ( i=1; i <= dataset_size; i++ )
+	{
+		if ( !( km = kmeans_new ( dataset, dataset_size, dataset_dim, i )))
+		{
+			/* Do not leak the best candidate found so far */
+			if ( best_km )
+			{
+				kmeans_free ( best_km );
+			}
+
+			return NULL;
+		}
+
+		kmeans ( km );
+		heuristic = __kmeans_heuristic_coefficient ( km );
+
+		if ( heuristic < best_heuristic )
+		{
+			if ( best_km )
+			{
+				kmeans_free ( best_km );
+			}
+
+			best_km = km;
+			best_heuristic = heuristic;
+		} else {
+			kmeans_free ( km );
+		}
+	}
+	
+	return best_km;
+}		/* -----  end of function kmeans_auto  ----- */
+
--- a/fkmeans/kmeans.h
+++ b/fkmeans/kmeans.h
@ -0,0 +1,52 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  kmeans.h
+ *
+ *    Description:  Header file for C k-means implementation
+ *
+ *        Version:  1.0
+ *        Created:  12/11/2010 10:43:55
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
+ *        Licence:  GNU GPL v.3
+ *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
+ *
+ * =====================================================================================
+ */
+
+#ifndef 	__KMEANS_H
+#define 	__KMEANS_H
+
+/** State of a k-means run: the data being clustered, the number of clusters and the clusters found */
+typedef struct __kmeans_t  {
+	/** Input data set: dataset_size vectors of dataset_dim doubles (kmeans_new() stores its own copy) */
+	double **dataset;
+
+	/** Number of elements in the data set */
+	int dataset_size;
+
+	/** Dimension of each element of the data set */
+	int dataset_dim;
+
+	/** Number of clusters */
+	int k;
+
+	/** Vector containing the number of elements in each cluster (k entries) */
+	int *cluster_sizes;
+
+	/** Clusters: clusters[i][j] is the j-th element (dataset_dim doubles) of cluster i, with j < cluster_sizes[i] */
+	double ***clusters;
+
+	/** Coordinates of the centers of the clusters: k vectors of dataset_dim doubles */
+	double **centers;
+} kmeans_t;
+
+/* Create a k-means object over a copy of dataset and choose the K initial centers; NULL on allocation failure */
+kmeans_t* kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K );
+/* Cluster dataset for every k in 1..dataset_size and return the object with the lowest heuristic coefficient */
+kmeans_t* kmeans_auto ( double **dataset, int dataset_size, int dataset_dim );
+/* Iterate the k-means step on km until the centers stop changing */
+void kmeans ( kmeans_t *km );
+/* Release km and everything it owns */
+void kmeans_free ( kmeans_t *km );
+
+#endif
+
--- a/mysql.c
+++ b/mysql.c
@ -48,18 +48,26 @@ __mysql_do_init ( MYSQL **__DB, BOOL is_out )
 		return (void*) *__DB;

 	if ( !( *__DB = (MYSQL*) malloc ( sizeof ( MYSQL ))))
+	{
 		return NULL;
+	}

 	if ( !( mysql_init ( *__DB )))
+	{
 		return NULL;
+	}

 	if ( is_out )
 	{
 		if ( !mysql_real_connect ( *__DB, config->outdbhost, config->outdbuser, config->outdbpass, NULL, 0, NULL, 0 ))
+		{
 			return NULL;
+		}

 		if ( mysql_select_db ( *__DB, config->outdbname ))
+		{
 			return NULL;
+		}
 	} else {
 		if ( !mysql_real_connect ( *__DB, config->dbhost, config->dbuser, config->dbpass, NULL, 0, NULL, 0 ))
 			return NULL;
--- a/neural.c
+++ b/neural.c
@ -37,21 +37,22 @@
 /** Enumeration for the input fields of the SOM neural network */
 enum  { som_src_ip, som_dst_ip, som_src_port, som_dst_port, som_time, som_gid, som_sid, som_rev, SOM_NUM_ITEMS };

-typedef struct  {
-	unsigned int  gid;
-	unsigned int  sid;
-	unsigned int  rev;
-	uint32_t      src_ip_addr;
-	uint32_t      dst_ip_addr;
-	uint16_t      src_port;
-	uint16_t      dst_port;
-	time_t        timestamp;
-} AI_som_alert_tuple;
-
-PRIVATE time_t latest_serialization_time  = ( time_t ) 0;
-PRIVATE som_network_t *net                = NULL;
+PRIVATE time_t latest_serialization_time         = ( time_t ) 0;
+PRIVATE som_network_t *net                       = NULL;
+PRIVATE AI_alerts_per_neuron *alerts_per_neuron = NULL;
 PRIVATE pthread_mutex_t neural_mutex;

+/**
+ * \brief  Get the hash table containing the alerts associated to each output neuron
+ *
+ * The pointer is the file-private alerts_per_neuron variable, returned as is
+ * (no copy is made). It is NULL until the table receives its first entry.
+ * NOTE(review): no lock is taken here - confirm how callers in other threads
+ * are expected to synchronize with the code that updates the table.
+ *
+ * \return The hash table
+ */
+
+AI_alerts_per_neuron*
+AI_get_alerts_per_neuron ()
+{
+	return alerts_per_neuron;
+}		/* -----  end of function AI_get_alerts_per_neuron  ----- */
+
 /**
 * \brief  Get the current weight of the neural correlation index using a hyperbolic tangent function with a parameter expressed in function of the current number of alerts in the database
 * \return The weight of the correlation index ( 0 <= weight < 1 )
@ -126,6 +127,11 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 		  x2 = 0,
 		  y2 = 0;
 	
+	int i;
+	BOOL is_found = false;
+	AI_alerts_per_neuron *found = NULL;
+	AI_alerts_per_neuron_key key;
+
 	if ( !( input1 = (double*) alloca ( SOM_NUM_ITEMS * sizeof ( double ))))
 	{
 		AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
@ -136,24 +142,128 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 		AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
 	}

-	pthread_mutex_lock ( &neural_mutex );
-
 	if ( !net )
 	{
-		pthread_mutex_unlock ( &neural_mutex );
 		return 0.0;
 	}

 	__AI_alert_to_som_data ( alert1, &input1 );
+	__AI_alert_to_som_data ( alert2, &input2 );
+
+	pthread_mutex_lock ( &neural_mutex );
+
 	som_set_inputs ( net, input1 );
 	som_get_best_neuron_coordinates ( net, &x1, &y1 );

-	__AI_alert_to_som_data ( alert2, &input2 );
 	som_set_inputs ( net, input2 );
 	som_get_best_neuron_coordinates ( net, &x2, &y2 );

 	pthread_mutex_unlock ( &neural_mutex );

+	/* Check if there are already entries in the hash table for these two neurons, otherwise
+	 * it creates them and append these two alerts */
+	key.x = x1;
+	key.y = y1;
+	HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
+
+	if ( !found )
+	{
+		if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
+		{
+			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+		}
+
+		found->key = key;
+		found->n_alerts = 1;
+
+		if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
+		{
+			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+		}
+
+		found->alerts[0] = alert1;
+		HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
+	} else {
+		is_found = false;
+
+		for ( i=0; i < found->n_alerts && !is_found; i++ )
+		{
+			if (
+				alert1.gid == found->alerts[i].gid &&
+				alert1.sid == found->alerts[i].sid &&
+				alert1.rev == found->alerts[i].rev &&
+				alert1.src_ip_addr == found->alerts[i].src_ip_addr &&
+				alert1.dst_ip_addr == found->alerts[i].dst_ip_addr &&
+				alert1.src_port == found->alerts[i].src_port &&
+				alert1.dst_port == found->alerts[i].dst_port )
+			{
+				is_found = true;
+			}
+		}
+
+		if ( !is_found )
+		{
+			if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
+							(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
+			{
+				AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+			}
+
+			found->alerts[ found->n_alerts - 1 ] = alert1;
+		}
+	}
+
+	key.x = x2;
+	key.y = y2;
+	HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
+
+	if ( !found )
+	{
+		if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
+		{
+			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+		}
+
+		found->key = key;
+		found->n_alerts = 1;
+
+		if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
+		{
+			AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+		}
+
+		found->alerts[0] = alert2;
+		HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
+	} else {
+		is_found = false;
+
+		for ( i=0; i < found->n_alerts && !is_found; i++ )
+		{
+			if (
+				alert2.gid == found->alerts[i].gid &&
+				alert2.sid == found->alerts[i].sid &&
+				alert2.rev == found->alerts[i].rev &&
+				alert2.src_ip_addr == found->alerts[i].src_ip_addr &&
+				alert2.dst_ip_addr == found->alerts[i].dst_ip_addr &&
+				alert2.src_port == found->alerts[i].src_port &&
+				alert2.dst_port == found->alerts[i].dst_port )
+			{
+				is_found = true;
+			}
+		}
+
+		if ( !is_found )
+		{
+			if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
+				(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
+			{
+				AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+			}
+
+			found->alerts[ found->n_alerts - 1 ] = alert2;
+		}
+	}
+
 	/* Return the normalized euclidean distance in [0,1] (the normalization is made considering that the maximum distance
 	 * between two points on the output neurons matrix is the distance between the upper-left and bottom-right points) */
 	return sqrt ((double) ( (x2-x1)*(x2-x1) + (y2-y1)*(y2-y1) )) /
@ -170,9 +280,7 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
 double
 AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert *b )
 {
-	size_t                 i = 0;
-	unsigned long long int time_sum = 0;
-	AI_som_alert_tuple     t1, t2;
+	AI_som_alert_tuple t1, t2;

 	t1.gid = a->gid;
 	t1.sid = a->sid;
@ -181,18 +289,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
 	t1.dst_ip_addr = ntohl ( a->ip_dst_addr );
 	t1.src_port = ntohs ( a->tcp_src_port );
 	t1.dst_port = ntohs ( a->tcp_dst_port );
-	time_sum = (unsigned long long int) a->timestamp;
-	
-	/* The timestamp of this alert is computed like the average timestamp of the grouped alerts */
-	for ( i=1; i < a->grouped_alerts_count; i++ )
-	{
-		if ( a->grouped_alerts[i-1] )
-		{
-			time_sum += (unsigned long long int) a->grouped_alerts[i-1]->timestamp;
-		}
-	}
-
-	t1.timestamp = (time_t) ( time_sum / a->grouped_alerts_count );
+	t1.timestamp = a->timestamp;

 	t2.gid = b->gid;
 	t2.sid = b->sid;
@ -201,17 +298,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
 	t2.dst_ip_addr = ntohl ( b->ip_dst_addr );
 	t2.src_port = ntohs ( b->tcp_src_port );
 	t2.dst_port = ntohs ( b->tcp_dst_port );
-	time_sum = (unsigned long long int) b->timestamp;
-	
-	for ( i=1; i < b->grouped_alerts_count; i++ )
-	{
-		if ( b->grouped_alerts[i-1] )
-		{
-			time_sum += (unsigned long long int) b->grouped_alerts[i-1]->timestamp;
-		}
-	}
-
-	t2.timestamp = (time_t) ( time_sum / b->grouped_alerts_count );
+	t2.timestamp = b->timestamp;
 	return __AI_som_alert_distance ( t1, t2 );
 }		/* -----  end of function AI_alert_neural_som_correlation  ----- */

@ -338,8 +425,9 @@ __AI_som_train ()
 void*
 AI_neural_thread ( void *arg )
 {
-	BOOL do_train = false;
 	struct stat st;
+	BOOL do_train = false;
+	pthread_t neural_clustering_thread;

 	pthread_mutex_init ( &neural_mutex, NULL );

@ -353,6 +441,14 @@ AI_neural_thread ( void *arg )
 		AI_fatal_err ( "AIPreproc: neural network thread launched but netfile option was not specified", __FILE__, __LINE__ );
 	}

+	if ( config->neuralClusteringInterval != 0 )
+	{
+		if ( pthread_create ( &neural_clustering_thread, NULL, AI_neural_clustering_thread, NULL ) != 0 )
+		{
+			AI_fatal_err ( "Failed to create the manual correlations parsing thread", __FILE__, __LINE__ );
+		}
+	}
+
 	while ( 1 )
 	{
 		if ( stat ( config->netfile, &st ) < 0 )
--- a/neural_cluster.c
+++ b/neural_cluster.c
@ -0,0 +1,194 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  neural_cluster.c
+ *
+ *    Description:  Perform the clusterization over the output layer of the SOM neural
+ *                  network, in order to attempt to find the alerts belonging to the
+ *                  same attack scenario. The clusterization is operated through k-means
+ *                  using Schwarz criterion in order to find the optimal number of
+ *                  clusters, the implementation is in fkmeans/
+ *
+ *        Version:  0.1
+ *        Created:  19/11/2010 18:37:35
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
+ *        Licence:  GNU GPL v.3
+ *        Company:  DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
+ *
+ * =====================================================================================
+ */
+
+#include	"spp_ai.h"
+
+/** \defgroup neural_cluster Module for clustering the alerts associated to the
+ * neural network output layer in order to find alerts belonging to the same scenario
+ * @{ */
+
+#include	"fkmeans/kmeans.h"
+
+#include	<stdio.h>
+#include	<stdlib.h>
+#include	<unistd.h>
+
+/**
+ * \brief  Print the clusters associated to the SOM output to an XML log file
+ *
+ * The file named by config->neural_clusters_log is rewritten from scratch.
+ * Each element of a cluster is a pair of neuron coordinates; the alerts stored
+ * for that neuron in the hash table are written as <alert/> elements inside
+ * the <cluster> element. An alert identical (all fields, timestamp included)
+ * to one stored earlier for the same neuron is written only once.
+ *
+ * \param  km 				k-means object
+ * \param  alerts_per_neuron 	Hash table containing the alerts associated to each neuron
+ */
+
+PRIVATE void
+__AI_neural_clusters_to_xml ( kmeans_t *km, AI_alerts_per_neuron *alerts_per_neuron )
+{
+	int i, j, k, l, are_equal;
+	FILE *fp = NULL;
+
+	uint32_t src_addr = 0,
+		    dst_addr = 0;
+
+	char src_ip[INET_ADDRSTRLEN] = { 0 },
+		dst_ip[INET_ADDRSTRLEN] = { 0 };
+
+	AI_alerts_per_neuron_key key;
+	AI_alerts_per_neuron *alert_iterator = NULL;
+
+	if ( !( fp = fopen ( config->neural_clusters_log, "w" )))
+	{
+		AI_fatal_err ( "Unable to write on the neural clusters XML log file", __FILE__, __LINE__ );
+
+		/* Never write through a NULL stream, should the call above return */
+		return;
+	}
+
+	fprintf ( fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"
+		"<clusters>\n" );
+
+	for ( i=0; i < km->k; i++ )
+	{
+		fprintf ( fp, "\t<cluster id=\"%d\">\n", i );
+
+		for ( j=0; j < km->cluster_sizes[i]; j++ )
+		{
+			/* Coordinates 0 and 1 of a clustered element are the x and y of a neuron */
+			key.x = km->clusters[i][j][0];
+			key.y = km->clusters[i][j][1];
+			HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), alert_iterator );
+
+			if ( alert_iterator )
+			{
+				for ( k=0; k < alert_iterator->n_alerts; k++ )
+				{
+					are_equal = 0;
+
+					/* Compare only with the alerts that come before this one:
+					 * comparing with every other index would drop both copies
+					 * of a duplicate instead of keeping the first */
+					for ( l=0; l < k && !are_equal; l++ )
+					{
+						if (
+							alert_iterator->alerts[k].gid == alert_iterator->alerts[l].gid &&
+							alert_iterator->alerts[k].sid == alert_iterator->alerts[l].sid &&
+							alert_iterator->alerts[k].rev == alert_iterator->alerts[l].rev &&
+							alert_iterator->alerts[k].src_ip_addr == alert_iterator->alerts[l].src_ip_addr &&
+							alert_iterator->alerts[k].dst_ip_addr == alert_iterator->alerts[l].dst_ip_addr &&
+							alert_iterator->alerts[k].src_port == alert_iterator->alerts[l].src_port &&
+							alert_iterator->alerts[k].dst_port == alert_iterator->alerts[l].dst_port &&
+							alert_iterator->alerts[k].timestamp == alert_iterator->alerts[l].timestamp )
+						{
+							are_equal = 1;
+						}
+					}
+
+					if ( !are_equal )
+					{
+						/* The addresses are converted with htonl() before being formatted by inet_ntop() */
+						src_addr = htonl ( alert_iterator->alerts[k].src_ip_addr );
+						dst_addr = htonl ( alert_iterator->alerts[k].dst_ip_addr );
+						inet_ntop ( AF_INET, &src_addr, src_ip, INET_ADDRSTRLEN );
+						inet_ntop ( AF_INET, &dst_addr, dst_ip, INET_ADDRSTRLEN );
+
+						/* Explicit casts keep each argument in line with its conversion specifier */
+						fprintf ( fp, "\t\t<alert gid=\"%u\" sid=\"%u\" rev=\"%u\" src_ip=\"%s\" src_port=\"%d\" "
+							"dst_ip=\"%s\" dst_port=\"%d\" timestamp=\"%lu\" xcoord=\"%d\" ycoord=\"%d\"/>\n",
+							(unsigned int) alert_iterator->alerts[k].gid,
+							(unsigned int) alert_iterator->alerts[k].sid,
+							(unsigned int) alert_iterator->alerts[k].rev,
+							src_ip, (int) alert_iterator->alerts[k].src_port,
+							dst_ip, (int) alert_iterator->alerts[k].dst_port,
+							(unsigned long) alert_iterator->alerts[k].timestamp,
+							(int) alert_iterator->key.x, (int) alert_iterator->key.y );
+					}
+				}
+			}
+		}
+
+		fprintf ( fp, "\t</cluster>\n" );
+	}
+
+	fprintf ( fp, "</clusters>\n" );
+	fclose ( fp );
+}		/* -----  end of function __AI_neural_clusters_to_xml  ----- */
+
+/**
+ * \brief  Thread that periodically clusters the neurons of the SOM output layer
+ * that have alerts associated to them, using k-means, and logs the clusters to XML
+ *
+ * At every round the coordinates of each neuron holding at least one alert
+ * become a 2-dimensional point; the points are clustered with kmeans_auto()
+ * and the result is written by __AI_neural_clusters_to_xml(). The thread then
+ * sleeps for config->neuralClusteringInterval seconds.
+ *
+ * \param  arg 	Unused
+ */
+
+void*
+AI_neural_clustering_thread ( void *arg )
+{
+	AI_alerts_per_neuron *table = NULL,
+					 *entry = NULL;
+
+	kmeans_t *clustering = NULL;
+	double **points = NULL;
+	int idx, n_points = 0;
+
+	for ( ;; )
+	{
+		points = NULL;
+		n_points = 0;
+		table = AI_get_alerts_per_neuron();
+		entry = table;
+
+		/* Collect one (x, y) point per neuron that has alerts */
+		while ( entry )
+		{
+			if ( entry->n_alerts > 0 )
+			{
+				n_points++;
+				points = (double**) realloc ( points, n_points * sizeof ( double* ));
+
+				if ( !points )
+				{
+					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+				}
+
+				points[n_points-1] = (double*) calloc ( 2, sizeof ( double ));
+
+				if ( !points[n_points-1] )
+				{
+					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
+				}
+
+				points[n_points-1][0] = (double) entry->key.x;
+				points[n_points-1][1] = (double) entry->key.y;
+			}
+
+			entry = (AI_alerts_per_neuron*) entry->hh.next;
+		}
+
+		if ( points && n_points != 0 )
+		{
+			clustering = kmeans_auto ( points, n_points, 2 );
+
+			if ( !clustering )
+			{
+				AI_fatal_err ( "Unable to initialize the k-means clustering object", __FILE__, __LINE__ );
+			}
+
+			__AI_neural_clusters_to_xml ( clustering, table );
+			kmeans_free ( clustering );
+
+			/* The k-means object worked on its own copy, so the points are released here */
+			for ( idx=0; idx < n_points; idx++ )
+			{
+				free ( points[idx] );
+			}
+
+			free ( points );
+		}
+
+		sleep ( config->neuralClusteringInterval );
+	}
+
+	pthread_exit ((void*) 0);
+	return (void*) 0;
+}		/* -----  end of function AI_neural_clustering_thread  ----- */
+
+/** @} */
+
--- a/spp_ai.c
+++ b/spp_ai.c
@ -172,14 +172,14 @@ static AI_config * AI_parse(char *args)
 {
 	char *arg;
 	char *match;
-	char alertfile[1024]          = { 0 },
-		alert_history_file[1024] = { 0 },
-		clusterfile[1024]        = { 0 },
-		corr_alerts_dir[1024]    = { 0 },
-		corr_modules_dir[1024]   = { 0 },
-		corr_rules_dir[1024]     = { 0 },
-		webserv_dir[1024]        = { 0 },
-		webserv_banner[1024]     = { 0 };
+	char alertfile[1024]           = { 0 },
+		alert_history_file[1024]  = { 0 },
+		clusterfile[1024]         = { 0 },
+		corr_alerts_dir[1024]     = { 0 },
+		corr_modules_dir[1024]    = { 0 },
+		corr_rules_dir[1024]      = { 0 },
+		webserv_dir[1024]         = { 0 },
+		webserv_banner[1024]      = { 0 };

 	char **matches       = NULL;
 	int  nmatches        = 0;
@ -217,6 +217,7 @@ static AI_config * AI_parse(char *args)
 			     correlation_graph_interval           = 0,
 			     database_parsing_interval            = 0,
 				manual_correlations_parsing_interval = 0,
+				neural_clustering_interval           = 0,
 				neural_network_training_interval     = 0,
 				neural_train_steps                   = 0,
 				output_neurons_per_side              = 0,
@ -526,6 +527,27 @@ static AI_config * AI_parse(char *args)
 	config->neuralNetworkTrainingInterval = neural_network_training_interval;
 	_dpd.logMsg( "    Neural network training interval: %u\n", config->neuralNetworkTrainingInterval );

+	/* Parsing the neural_clustering_interval option */
+	if (( arg = (char*) strcasestr( args, "neural_clustering_interval" ) ))
+	{
+		for ( arg += strlen("neural_clustering_interval");
+				*arg && (*arg < '0' || *arg > '9');
+				arg++ );
+
+		if ( !(*arg) )
+		{
+			AI_fatal_err ( "neural_clustering_interval option used but "
+				"no value specified", __FILE__, __LINE__ );
+		}
+
+		neural_clustering_interval = strtoul ( arg, NULL, 10 );
+	} else {
+		neural_clustering_interval = DEFAULT_NEURAL_CLUSTERING_INTERVAL;
+	}
+
+	config->neuralClusteringInterval = neural_clustering_interval;
+	_dpd.logMsg( "    Neural network clustering interval: %u\n", config->neuralClusteringInterval );
+
 	/* Parsing the output_neurons_per_side option */
 	if (( arg = (char*) strcasestr( args, "output_neurons_per_side" ) ))
 	{
@ -796,6 +818,9 @@ static AI_config * AI_parse(char *args)

 	_dpd.logMsg("    webserv_dir: %s\n", config->webserv_dir);

+	snprintf ( config->neural_clusters_log, sizeof ( config->neural_clusters_log ), "%s/neural_clusters.xml", config->webserv_dir );
+	_dpd.logMsg("    neural_clusters_log: %s\n", config->neural_clusters_log);
+
 	/* Parsing the corr_modules_dir option */
 	if (( arg = (char*) strcasestr( args, "corr_modules_dir" ) ))
 	{
--- a/spp_ai.h
+++ b/spp_ai.h
@ -81,6 +81,11 @@
 * alert correlations and the next one (this value should usually be high) */
 #define 	DEFAULT_NEURAL_NETWORK_TRAINING_INTERVAL 	43200

+/** Default interval in seconds between an execution of the thread that attempts to cluster
+ * the output layer of the neural network searching for alerts belonging to the same
+ * attack scenario and the next one (1200 seconds = 20 minutes). Used when the
+ * neural_clustering_interval option is not found in the preprocessor arguments */
+#define 	DEFAULT_NEURAL_CLUSTERING_INTERVAL 		1200
+
 /** Default interval of validity in seconds for an entry in the cache of correlated alerts */
 #define 	DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 	600

@ -193,6 +198,11 @@ typedef struct
 	/** Interval in seconds between an invocation of the thread for parsing XML manual correlations and the next one */
 	unsigned long  manualCorrelationsParsingInterval;

+	/** Interval in seconds between an execution of the thread that attempts to cluster
+	 * the output layer of the neural network searching for alerts belonging to the same
+	 * attack scenario and the next one */
+	unsigned long  neuralClusteringInterval;
+
 	/** Interval in seconds for which an entry in the cache of correlated alerts is valid */
 	unsigned long  bayesianCorrelationCacheValidity;

@ -256,6 +266,9 @@ typedef struct
 	/** File keeping the serialized neural network used for the alert correlation */
 	char          netfile[1024];

+	/** File containing the likely clusters computed over the output layer of the neural network */
+	char          neural_clusters_log[1024];
+
 	/** Database name, if database logging is used */
 	char          dbname[256];

@ -451,6 +464,34 @@ typedef struct  {
 	UT_hash_handle            hh;
 } AI_alert_correlation;
 /*****************************************************************/
+/** Expresses an alert as a numerical tuple manageable by a neural network */
+typedef struct  {
+	unsigned int  gid;           /**< presumably the generator ID of the rule that raised the alert - TODO confirm against the code filling the tuple */
+	unsigned int  sid;           /**< presumably the signature ID of that rule - TODO confirm */
+	unsigned int  rev;           /**< presumably the revision of that rule - TODO confirm */
+	uint32_t      src_ip_addr;   /**< Source IPv4 address as a 32-bit integer (byte order not visible here - verify) */
+	uint32_t      dst_ip_addr;   /**< Destination IPv4 address as a 32-bit integer (byte order not visible here - verify) */
+	uint16_t      src_port;      /**< Source port (byte order not visible here - verify) */
+	uint16_t      dst_port;      /**< Destination port (byte order not visible here - verify) */
+	time_t        timestamp;     /**< Time associated to the alert */
+} AI_som_alert_tuple;
+/*****************************************************************/
+/** Key for the AI_alerts_per_neuron hash table */
+typedef struct  {
+	int x;   /**< First coordinate of the key; the clustering thread uses it as component 0 of the k-means data point */
+	int y;   /**< Second coordinate of the key; the clustering thread uses it as component 1 of the k-means data point */
+} AI_alerts_per_neuron_key;
+/*****************************************************************/
+/** Struct that holds, for each point of the output layer, the list of associated alerts
+ * for easily performing the clustering algorithm */
+typedef struct  {
+	AI_alerts_per_neuron_key  key;        /**< (x,y) pair identifying this entry in the hash table */
+	AI_som_alert_tuple        *alerts;    /**< Alerts associated to this point (presumably n_alerts elements - TODO confirm against the code filling the table) */
+	int                       n_alerts;   /**< Number of associated alerts; the clustering thread skips entries where this is not > 0 */
+	UT_hash_handle            hh;         /**< Hash table handle; the clustering thread walks the entries through hh.next */
+} AI_alerts_per_neuron;
+/*****************************************************************/
+

 /** Enumeration for describing the table in the output database */
 enum  { ALERTS_TABLE, IPV4_HEADERS_TABLE, TCP_HEADERS_TABLE, PACKET_STREAMS_TABLE, CLUSTERED_ALERTS_TABLE, CORRELATED_ALERTS_TABLE, N_TABLES };
@ -513,6 +554,8 @@ void                   AI_outdb_mutex_initialize ();
 void*                  AI_store_alert_to_db_thread ( void* );
 void*                  AI_store_cluster_to_db_thread ( void* );
 void*                  AI_store_correlation_to_db_thread ( void* );
+void*                  AI_neural_clustering_thread ( void* );   /* thread routine: k-means clustering over the SOM output layer */
+AI_alerts_per_neuron*  AI_get_alerts_per_neuron ();             /* first entry of the alerts-per-neuron table, walked through hh.next by the clustering thread */

 double(**AI_get_corr_functions ( size_t* ))(const AI_snort_alert*, const AI_snort_alert*);
 double(**AI_get_corr_weights ( size_t* ))();