k-means clustering for SOM output

This commit is contained in:
BlackLight 2010-11-20 16:47:57 +01:00
parent ec196b3968
commit d41753a8a4
12 changed files with 2650 additions and 54 deletions

View file

@ -25,10 +25,12 @@ bayesian.c \
cluster.c \
correlation.c \
db.c \
fkmeans/kmeans.c \
fsom/fsom.c \
modules.c \
mysql.c \
neural.c \
neural_cluster.c \
outdb.c \
postgresql.c \
regex.c \

View file

@ -84,8 +84,10 @@ am_libsf_ai_preproc_la_OBJECTS = libsf_ai_preproc_la-alert_history.lo \
libsf_ai_preproc_la-cencode.lo libsf_ai_preproc_la-bayesian.lo \
libsf_ai_preproc_la-cluster.lo \
libsf_ai_preproc_la-correlation.lo libsf_ai_preproc_la-db.lo \
libsf_ai_preproc_la-fsom.lo libsf_ai_preproc_la-modules.lo \
libsf_ai_preproc_la-mysql.lo libsf_ai_preproc_la-neural.lo \
libsf_ai_preproc_la-kmeans.lo libsf_ai_preproc_la-fsom.lo \
libsf_ai_preproc_la-modules.lo libsf_ai_preproc_la-mysql.lo \
libsf_ai_preproc_la-neural.lo \
libsf_ai_preproc_la-neural_cluster.lo \
libsf_ai_preproc_la-outdb.lo libsf_ai_preproc_la-postgresql.lo \
libsf_ai_preproc_la-regex.lo libsf_ai_preproc_la-spp_ai.lo \
libsf_ai_preproc_la-stream.lo libsf_ai_preproc_la-webserv.lo
@ -267,10 +269,12 @@ bayesian.c \
cluster.c \
correlation.c \
db.c \
fkmeans/kmeans.c \
fsom/fsom.c \
modules.c \
mysql.c \
neural.c \
neural_cluster.c \
outdb.c \
postgresql.c \
regex.c \
@ -416,6 +420,9 @@ libsf_ai_preproc_la-correlation.lo: correlation.c
libsf_ai_preproc_la-db.lo: db.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-db.lo `test -f 'db.c' || echo '$(srcdir)/'`db.c
libsf_ai_preproc_la-kmeans.lo: fkmeans/kmeans.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-kmeans.lo `test -f 'fkmeans/kmeans.c' || echo '$(srcdir)/'`fkmeans/kmeans.c
libsf_ai_preproc_la-fsom.lo: fsom/fsom.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-fsom.lo `test -f 'fsom/fsom.c' || echo '$(srcdir)/'`fsom/fsom.c
@ -428,6 +435,9 @@ libsf_ai_preproc_la-mysql.lo: mysql.c
libsf_ai_preproc_la-neural.lo: neural.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural.lo `test -f 'neural.c' || echo '$(srcdir)/'`neural.c
libsf_ai_preproc_la-neural_cluster.lo: neural_cluster.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural_cluster.lo `test -f 'neural_cluster.c' || echo '$(srcdir)/'`neural_cluster.c
libsf_ai_preproc_la-outdb.lo: outdb.c
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-outdb.lo `test -f 'outdb.c' || echo '$(srcdir)/'`outdb.c

1630
fkmeans/Doxyfile Normal file

File diff suppressed because it is too large Load diff

3
fkmeans/Makefile Normal file
View file

@ -0,0 +1,3 @@
all:
gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm

88
fkmeans/README Normal file
View file

@ -0,0 +1,88 @@
fkmeans is a tiny C library that allows you to perform k-means clustering
algorithm over arbitrary sets of n-dimensional data. All you need to do is:
- Include the file kmeans.h in your sources;
- Consider your data set as a vector of vectors of double items (double**),
where each vector is an n-dimensional item of your data set;
- If you want to perform the k-means algorithm over your data and you already
know the number k of clusters there contained, or its estimate, you want to
execute some code like this (in this example, the data set is 3-dimensional,
i.e. it contains N vectors whose size is 3, and we know it contains n_clus
clusters):
kmeans_t *km;
double **dataset;
...
km = kmeans_new ( dataset, N, 3, n_clus );
kmeans ( km );
...
kmeans_free ( km );
If you don't already know the number of clusters contained in your data set,
you can use the function kmeans_auto() for automatically attempting to find
the best one using Schwarz's criterion. Be careful, this operation can be very
slow, especially if executed on data sets having many elements. The example
above would simply become something like:
kmeans_t *km;
double **dataset;
...
km = kmeans_auto ( dataset, N, 3 );
...
kmeans_free ( km );
- Once the clustering has been performed, the clusters of data can be simply
accessed from your kmeans_t* structure, as they are held as a double*** field
named "clusters". Each vector in this structure represents a cluster, whose
size is specified in the field cluster_sizes[i] of the structure. Each cluster
contains the items that form it, each of them an n-dimensional vector. The
number of clusters is specified in the field "k" of the structure, the
number of dimensions of each element is specified in the field "dataset_dim"
and the number of elements in the original data set is specified in the field
"dataset_size". So, for example:
for ( i=0; i < km->k; i++ )
{
printf ( "cluster %d: [ ", i );
for ( j=0; j < km->cluster_sizes[i]; j++ )
{
printf ( "(" );
for ( k=0; k < km->dataset_dim; k++ )
{
printf ( "%f, ", km->clusters[i][j][k] );
}
printf ( "), ");
}
printf ( "]\n" );
}
The library however already comes with a sample implementation, contained in
"test.c", and typing "make" this example will be built. This example takes 0,
1, 2 or 3 command-line arguments, in format
$ ./kmeans-test [num_elements] [min_value] [max_value]
and randomly generates a 2-dimensional data set containing num_elements, whose
coordinates are between min_value and max_value. The clustering is then
performed and the results are shown on stdout, with the clusters coloured in
different ways;
- After you write your source, remember to include the file "kmeans.c",
containing the implementation of the library, in the list of your sources
files;
- That's all. Include "kmeans.h", write your code using
kmeans_new()+kmeans()+kmeans_free() or kmeans_auto()+kmeans_free(), explore
your clusters, remember to include "kmeans.c" in the list of your source
files, and you're ready for k-means clustering.
Author: Fabio "BlackLight" Manganiello,
<blacklight@autistici.org>,
http://0x00.ath.cx

445
fkmeans/kmeans.c Normal file
View file

@ -0,0 +1,445 @@
/*
* =====================================================================================
*
* Filename: kmeans.c
*
* Description: k-means clusterization algorithm implementation in C
*
* Version: 1.0
* Created: 12/11/2010 10:43:28
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#include "kmeans.h"
#include <alloca.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * \brief Initialize the centers of the clusters taking the K most distant elements in the dataset
 * \param km k-means object (centers[] must already be allocated, as done by kmeans_new())
 */
static void
__kmeans_init_centers ( kmeans_t *km )
{
	int i, j, k, l,
	    index_found = 0,
	    max_index = 0,
	    assigned_centers = 0,
	    *assigned_centers_indexes = NULL,
	    *tmp_indexes = NULL;

	double dist = 0.0,
	       max_dist = 0.0;

	/* The first center is the dataset element farthest from the origin */
	for ( i=0; i < km->dataset_size; i++ )
	{
		dist = 0.0;

		for ( j=0; j < km->dataset_dim; j++ )
		{
			dist += ( km->dataset[i][j] ) * ( km->dataset[i][j] );
		}

		if ( dist > max_dist )
		{
			max_dist = dist;
			max_index = i;
		}
	}

	for ( i=0; i < km->dataset_dim; i++ )
	{
		km->centers[0][i] = km->dataset[max_index][i];
	}

	/* Grow the list of chosen indexes through a temporary pointer, so the
	 * existing buffer is not leaked if realloc() fails (the original
	 * overwrote the pointer directly) */
	if ( !( tmp_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
	{
		free ( assigned_centers_indexes );
		return;
	}

	assigned_centers_indexes = tmp_indexes;
	assigned_centers_indexes[ assigned_centers - 1 ] = max_index;

	/* Each subsequent center is the element maximizing the summed squared
	 * distance from all the centers chosen so far */
	for ( i=1; i < km->k; i++ )
	{
		max_dist = 0.0;
		max_index = 0;

		for ( j=0; j < km->dataset_size; j++ )
		{
			/* Skip elements that were already picked as centers */
			index_found = 0;

			for ( k=0; k < assigned_centers && !index_found; k++ )
			{
				if ( assigned_centers_indexes[k] == j )
				{
					index_found = 1;
				}
			}

			if ( index_found )
				continue;

			dist = 0.0;

			for ( k=0; k < assigned_centers; k++ )
			{
				for ( l=0; l < km->dataset_dim; l++ )
				{
					dist += ( km->dataset[j][l] - km->centers[k][l] ) * ( km->dataset[j][l] - km->centers[k][l] );
				}
			}

			if ( dist > max_dist )
			{
				max_dist = dist;
				max_index = j;
			}
		}

		for ( j=0; j < km->dataset_dim; j++ )
		{
			km->centers[i][j] = km->dataset[max_index][j];
		}

		if ( !( tmp_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
		{
			free ( assigned_centers_indexes );
			return;
		}

		assigned_centers_indexes = tmp_indexes;
		assigned_centers_indexes[ assigned_centers - 1 ] = max_index;
	}

	free ( assigned_centers_indexes );
} /* ----- end of function kmeans_init_centers ----- */
/**
 * \brief Create a new k-means object
 * \param dataset Dataset to be clustered (a deep copy is taken, the caller keeps ownership of its buffer)
 * \param dataset_size Number of elements in the dataset
 * \param dataset_dim Dimension of each element of the dataset
 * \param K Number of clusters
 * \return Reference to the newly created k-means object if successful, NULL otherwise
 *         (on failure, everything allocated so far is released — the original leaked it)
 */
kmeans_t*
kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K )
{
	int i, j;
	kmeans_t *km = NULL;

	/* calloc() so every pointer member starts out NULL: this makes the
	 * error path below safe no matter how far the initialization got */
	if ( !( km = (kmeans_t*) calloc ( 1, sizeof ( kmeans_t ))))
	{
		return NULL;
	}

	if ( !( km->dataset = (double**) calloc ( dataset_size, sizeof ( double* ))))
	{
		goto error;
	}

	for ( i=0; i < dataset_size; i++ )
	{
		if ( !( km->dataset[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
		{
			goto error;
		}

		for ( j=0; j < dataset_dim; j++ )
		{
			km->dataset[i][j] = dataset[i][j];
		}
	}

	km->dataset_size = dataset_size;
	km->dataset_dim  = dataset_dim;
	km->k = K;

	if ( !( km->clusters = (double***) calloc ( K, sizeof ( double** ))))
	{
		goto error;
	}

	/* cluster_sizes holds ints — the original allocated sizeof ( int* )
	 * per element here, which merely over-allocated but was wrong */
	if ( !( km->cluster_sizes = (int*) calloc ( K, sizeof ( int ))))
	{
		goto error;
	}

	if ( !( km->centers = (double**) calloc ( K, sizeof ( double* ))))
	{
		goto error;
	}

	for ( i=0; i < K; i++ )
	{
		if ( !( km->centers[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
		{
			goto error;
		}
	}

	__kmeans_init_centers ( km );
	return km;

error:
	/* Unallocated rows are NULL thanks to calloc(), and free(NULL) is a no-op */
	if ( km->dataset )
	{
		for ( i=0; i < dataset_size; i++ )
		{
			free ( km->dataset[i] );
		}

		free ( km->dataset );
	}

	if ( km->centers )
	{
		for ( i=0; i < K; i++ )
		{
			free ( km->centers[i] );
		}

		free ( km->centers );
	}

	free ( km->clusters );
	free ( km->cluster_sizes );
	free ( km );
	return NULL;
} /* ----- end of function kmeans_new ----- */
/**
 * \brief Function that performs a single step for k-means algorithm
 * \param km k-means object
 * \return 0 if no changes were performed by this step, 1 otherwise, -1 in case of error
 */
static int
__kmeans_step ( kmeans_t *km )
{
	int i, j, k,
	    ret = 0,
	    best_center = 0;

	double dist = 0.0,
	       min_dist = DBL_MAX,
	       **old_centers = NULL,
	       **tmp_cluster = NULL;

	/* Drop the clusters computed by the previous step, if any */
	if ( km->clusters[0] )
	{
		for ( i=0; i < km->k; i++ )
		{
			for ( j=0; j < km->cluster_sizes[i]; j++ )
			{
				free ( km->clusters[i][j] );
				km->clusters[i][j] = NULL;
			}

			free ( km->clusters[i] );
			km->clusters[i] = NULL;
			km->cluster_sizes[i] = 0;
		}
	}

	/* Save a copy of the current centers on the heap. The original used
	 * alloca() and checked its result, but alloca() cannot report failure
	 * (stack exhaustion is undefined behaviour) and k * dataset_dim doubles
	 * may be arbitrarily large */
	if ( !( old_centers = (double**) calloc ( km->k, sizeof ( double* ))))
	{
		return -1;
	}

	for ( i=0; i < km->k; i++ )
	{
		if ( !( old_centers[i] = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
		{
			ret = -1;
			goto cleanup;
		}

		for ( j=0; j < km->dataset_dim; j++ )
		{
			old_centers[i][j] = km->centers[i][j];
		}
	}

	/* Assign each dataset element to the cluster with the nearest center
	 * (squared euclidean distance — the sqrt is not needed for comparison) */
	for ( i=0; i < km->dataset_size; i++ )
	{
		min_dist = DBL_MAX;
		best_center = 0;

		for ( j=0; j < km->k; j++ )
		{
			dist = 0.0;

			for ( k=0; k < km->dataset_dim; k++ )
			{
				dist += ( km->dataset[i][k] - km->centers[j][k] ) * ( km->dataset[i][k] - km->centers[j][k] );
			}

			if ( dist < min_dist )
			{
				min_dist = dist;
				best_center = j;
			}
		}

		/* Grow the cluster through a temporary pointer so the existing
		 * rows are not leaked if realloc() fails */
		if ( !( tmp_cluster = (double**) realloc ( km->clusters[best_center], (km->cluster_sizes[best_center] + 1) * sizeof ( double* ))))
		{
			ret = -1;
			goto cleanup;
		}

		km->clusters[best_center] = tmp_cluster;
		km->cluster_sizes[best_center]++;

		if ( !( km->clusters [best_center] [km->cluster_sizes[best_center]-1] = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
		{
			ret = -1;
			goto cleanup;
		}

		for ( j=0; j < km->dataset_dim; j++ )
		{
			km->clusters [best_center] [km->cluster_sizes[best_center]-1] [j] = km->dataset[i][j];
		}
	}

	/* Move every center to the mean of the elements assigned to it */
	for ( i=0; i < km->k; i++ )
	{
		for ( j=0; j < km->dataset_dim; j++ )
		{
			km->centers[i][j] = 0.0;

			for ( k=0; k < km->cluster_sizes[i]; k++ )
			{
				km->centers[i][j] += km->clusters[i][k][j];
			}

			if ( km->cluster_sizes[i] != 0 )
			{
				km->centers[i][j] /= (double) km->cluster_sizes[i];
			}
		}
	}

	/* ret = 1 if any center moved, 0 if the algorithm converged */
	for ( i=0; i < km->k && ret == 0; i++ )
	{
		for ( j=0; j < km->dataset_dim && ret == 0; j++ )
		{
			if ( km->centers[i][j] != old_centers[i][j] )
			{
				ret = 1;
			}
		}
	}

cleanup:
	for ( i=0; i < km->k; i++ )
	{
		free ( old_centers[i] );
	}

	free ( old_centers );
	return ret;
} /* ----- end of function __kmeans_step ----- */
/**
 * \brief Perform the k-means algorithm over a k-means object
 * \param km k-means object
 */
void
kmeans ( kmeans_t *km )
{
	/* Iterate until convergence (step returns 0). The original looped
	 * while the step returned != 0, which spun forever when the step
	 * failed with -1 (allocation error) */
	while ( __kmeans_step ( km ) > 0 );
} /* ----- end of function kmeans ----- */
/**
 * \brief Compute the heuristic coefficient associated to the current number of clusters through Schwarz's criterion
 * \param km k-means object (must already be clustered)
 * \return Real value expressing how well that number of clusters models the dataset (lower is better)
 */
static double
__kmeans_heuristic_coefficient ( kmeans_t *km )
{
	int c, e, d;
	double delta = 0.0,
	       sum_sq_err = 0.0;

	/* Distortion: summed squared distance of every element from the
	 * center of the cluster it belongs to */
	for ( c=0; c < km->k; c++ )
	{
		for ( e=0; e < km->cluster_sizes[c]; e++ )
		{
			for ( d=0; d < km->dataset_dim; d++ )
			{
				delta = km->centers[c][d] - km->clusters[c][e][d];
				sum_sq_err += delta * delta;
			}
		}
	}

	/* Schwarz's criterion: distortion plus a penalty growing with k */
	return sum_sq_err + km->k * log ( km->dataset_size );
} /* ----- end of function __kmeans_heuristic_coefficient ----- */
/**
 * \brief Remove a k-means object
 * \param km k-means object to be deallocated (NULL is accepted and ignored,
 *        mirroring free(NULL) semantics — the original crashed on NULL)
 */
void
kmeans_free ( kmeans_t *km )
{
	int i, j;

	if ( !km )
	{
		return;
	}

	for ( i=0; i < km->k; i++ )
	{
		for ( j=0; j < km->cluster_sizes[i]; j++ )
		{
			free ( km->clusters[i][j] );
			km->clusters[i][j] = NULL;
		}

		free ( km->clusters[i] );
		km->clusters[i] = NULL;
	}

	free ( km->clusters );
	km->clusters = NULL;

	free ( km->cluster_sizes );
	km->cluster_sizes = NULL;

	for ( i=0; i < km->k; i++ )
	{
		free ( km->centers[i] );
		km->centers[i] = NULL;
	}

	free ( km->centers );
	km->centers = NULL;

	for ( i=0; i < km->dataset_size; i++ )
	{
		free ( km->dataset[i] );
		km->dataset[i] = NULL;
	}

	free ( km->dataset );
	km->dataset = NULL;

	/* The original also set km = NULL after free(), but that only nulled
	 * the local copy of the pointer, so it has been dropped */
	free ( km );
} /* ----- end of function kmeans_free ----- */
/**
 * \brief Perform a k-means clustering over a dataset automatically choosing the best value of k using Schwarz's criterion
 * \param dataset Dataset to be clustered
 * \param dataset_size Number of elements in the dataset
 * \param dataset_dim Dimension of each element of the dataset
 * \return Reference to the newly created k-means object if successful, NULL otherwise
 */
kmeans_t*
kmeans_auto ( double **dataset, int dataset_size, int dataset_dim )
{
	int i;

	double heuristic = 0.0,
	       best_heuristic = DBL_MAX;

	kmeans_t *km = NULL,
	         *best_km = NULL;

	/* Try every candidate number of clusters and keep the model that
	 * minimizes Schwarz's criterion (can be very slow on large datasets) */
	for ( i=1; i <= dataset_size; i++ )
	{
		if ( !( km = kmeans_new ( dataset, dataset_size, dataset_dim, i )))
		{
			/* Don't leak the best model found so far — the original
			 * returned NULL while still holding it */
			if ( best_km )
			{
				kmeans_free ( best_km );
			}

			return NULL;
		}

		kmeans ( km );
		heuristic = __kmeans_heuristic_coefficient ( km );

		if ( heuristic < best_heuristic )
		{
			if ( best_km )
			{
				kmeans_free ( best_km );
			}

			best_km = km;
			best_heuristic = heuristic;
		} else {
			kmeans_free ( km );
		}
	}

	return best_km;
} /* ----- end of function kmeans_auto ----- */

52
fkmeans/kmeans.h Normal file
View file

@ -0,0 +1,52 @@
/*
* =====================================================================================
*
* Filename: kmeans.h
*
* Description: Header file for C k-means implementation
*
* Version: 1.0
* Created: 12/11/2010 10:43:55
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#ifndef KMEANS_H
#define KMEANS_H
/* (guard renamed from __KMEANS_H: identifiers beginning with a double
 * underscore are reserved for the C implementation, C99 7.1.3) */

/** Holds the whole state of a k-means clustering run */
typedef struct __kmeans_t {
	/** Input data set (deep copy owned by the object) */
	double **dataset;

	/** Number of elements in the data set */
	int dataset_size;

	/** Dimension of each element of the data set */
	int dataset_dim;

	/** Number of clusters */
	int k;

	/** Vector containing the number of elements in each cluster */
	int *cluster_sizes;

	/** Clusters: clusters[i] holds cluster_sizes[i] items, each one a
	 * dataset_dim-dimensional vector copied from the data set */
	double ***clusters;

	/** Coordinates of the centers of the clusters */
	double **centers;
} kmeans_t;

kmeans_t* kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K );
kmeans_t* kmeans_auto ( double **dataset, int dataset_size, int dataset_dim );
void kmeans ( kmeans_t *km );
void kmeans_free ( kmeans_t *km );

#endif

View file

@ -48,18 +48,26 @@ __mysql_do_init ( MYSQL **__DB, BOOL is_out )
return (void*) *__DB;
if ( !( *__DB = (MYSQL*) malloc ( sizeof ( MYSQL ))))
{
return NULL;
}
if ( !( mysql_init ( *__DB )))
{
return NULL;
}
if ( is_out )
{
if ( !mysql_real_connect ( *__DB, config->outdbhost, config->outdbuser, config->outdbpass, NULL, 0, NULL, 0 ))
{
return NULL;
}
if ( mysql_select_db ( *__DB, config->outdbname ))
{
return NULL;
}
} else {
if ( !mysql_real_connect ( *__DB, config->dbhost, config->dbuser, config->dbpass, NULL, 0, NULL, 0 ))
return NULL;

178
neural.c
View file

@ -37,21 +37,22 @@
/** Enumeration for the input fields of the SOM neural network */
enum { som_src_ip, som_dst_ip, som_src_port, som_dst_port, som_time, som_gid, som_sid, som_rev, SOM_NUM_ITEMS };
typedef struct {
unsigned int gid;
unsigned int sid;
unsigned int rev;
uint32_t src_ip_addr;
uint32_t dst_ip_addr;
uint16_t src_port;
uint16_t dst_port;
time_t timestamp;
} AI_som_alert_tuple;
PRIVATE time_t latest_serialization_time = ( time_t ) 0;
PRIVATE som_network_t *net = NULL;
PRIVATE AI_alerts_per_neuron *alerts_per_neuron = NULL;
PRIVATE pthread_mutex_t neural_mutex;
/**
 * \brief Get the hash table containing the alerts associated to each output neuron
 * \return The hash table mapping output-neuron (x, y) coordinates to the alerts
 *         placed on that neuron; NULL until the first alerts have been mapped
 *         (the table starts out NULL and is filled elsewhere in this file)
 */
AI_alerts_per_neuron*
AI_get_alerts_per_neuron ()
{
return alerts_per_neuron;
} /* ----- end of function AI_get_alerts_per_neuron ----- */
/**
* \brief Get the current weight of the neural correlation index using a hyperbolic tangent function with a parameter expressed in function of the current number of alerts in the database
* \return The weight of the correlation index ( 0 <= weight < 1 )
@ -126,6 +127,11 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
x2 = 0,
y2 = 0;
int i;
BOOL is_found = false;
AI_alerts_per_neuron *found = NULL;
AI_alerts_per_neuron_key key;
if ( !( input1 = (double*) alloca ( SOM_NUM_ITEMS * sizeof ( double ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
@ -136,24 +142,128 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
pthread_mutex_lock ( &neural_mutex );
if ( !net )
{
pthread_mutex_unlock ( &neural_mutex );
return 0.0;
}
__AI_alert_to_som_data ( alert1, &input1 );
__AI_alert_to_som_data ( alert2, &input2 );
pthread_mutex_lock ( &neural_mutex );
som_set_inputs ( net, input1 );
som_get_best_neuron_coordinates ( net, &x1, &y1 );
__AI_alert_to_som_data ( alert2, &input2 );
som_set_inputs ( net, input2 );
som_get_best_neuron_coordinates ( net, &x2, &y2 );
pthread_mutex_unlock ( &neural_mutex );
/* Check if there are already entries in the hash table for these two neurons, otherwise
* it creates them and append these two alerts */
key.x = x1;
key.y = y1;
HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
if ( !found )
{
if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->key = key;
found->n_alerts = 1;
if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->alerts[0] = alert1;
HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
} else {
is_found = false;
for ( i=0; i < found->n_alerts && !is_found; i++ )
{
if (
alert1.gid == found->alerts[i].gid &&
alert1.sid == found->alerts[i].sid &&
alert1.rev == found->alerts[i].rev &&
alert1.src_ip_addr == found->alerts[i].src_ip_addr &&
alert1.dst_ip_addr == found->alerts[i].dst_ip_addr &&
alert1.src_port == found->alerts[i].src_port &&
alert1.dst_port == found->alerts[i].dst_port )
{
is_found = true;
}
}
if ( !is_found )
{
if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->alerts[ found->n_alerts - 1 ] = alert1;
}
}
key.x = x2;
key.y = y2;
HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
if ( !found )
{
if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->key = key;
found->n_alerts = 1;
if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->alerts[0] = alert2;
HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
} else {
is_found = false;
for ( i=0; i < found->n_alerts && !is_found; i++ )
{
if (
alert2.gid == found->alerts[i].gid &&
alert2.sid == found->alerts[i].sid &&
alert2.rev == found->alerts[i].rev &&
alert2.src_ip_addr == found->alerts[i].src_ip_addr &&
alert2.dst_ip_addr == found->alerts[i].dst_ip_addr &&
alert2.src_port == found->alerts[i].src_port &&
alert2.dst_port == found->alerts[i].dst_port )
{
is_found = true;
}
}
if ( !is_found )
{
if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
{
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
}
found->alerts[ found->n_alerts - 1 ] = alert2;
}
}
/* Return the normalized euclidean distance in [0,1] (the normalization is made considering that the maximum distance
* between two points on the output neurons matrix is the distance between the upper-left and bottom-right points) */
return sqrt ((double) ( (x2-x1)*(x2-x1) + (y2-y1)*(y2-y1) )) /
@ -170,8 +280,6 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
double
AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert *b )
{
size_t i = 0;
unsigned long long int time_sum = 0;
AI_som_alert_tuple t1, t2;
t1.gid = a->gid;
@ -181,18 +289,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
t1.dst_ip_addr = ntohl ( a->ip_dst_addr );
t1.src_port = ntohs ( a->tcp_src_port );
t1.dst_port = ntohs ( a->tcp_dst_port );
time_sum = (unsigned long long int) a->timestamp;
/* The timestamp of this alert is computed like the average timestamp of the grouped alerts */
for ( i=1; i < a->grouped_alerts_count; i++ )
{
if ( a->grouped_alerts[i-1] )
{
time_sum += (unsigned long long int) a->grouped_alerts[i-1]->timestamp;
}
}
t1.timestamp = (time_t) ( time_sum / a->grouped_alerts_count );
t1.timestamp = a->timestamp;
t2.gid = b->gid;
t2.sid = b->sid;
@ -201,17 +298,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
t2.dst_ip_addr = ntohl ( b->ip_dst_addr );
t2.src_port = ntohs ( b->tcp_src_port );
t2.dst_port = ntohs ( b->tcp_dst_port );
time_sum = (unsigned long long int) b->timestamp;
for ( i=1; i < b->grouped_alerts_count; i++ )
{
if ( b->grouped_alerts[i-1] )
{
time_sum += (unsigned long long int) b->grouped_alerts[i-1]->timestamp;
}
}
t2.timestamp = (time_t) ( time_sum / b->grouped_alerts_count );
t2.timestamp = b->timestamp;
return __AI_som_alert_distance ( t1, t2 );
} /* ----- end of function AI_alert_neural_som_correlation ----- */
@ -338,8 +425,9 @@ __AI_som_train ()
void*
AI_neural_thread ( void *arg )
{
BOOL do_train = false;
struct stat st;
BOOL do_train = false;
pthread_t neural_clustering_thread;
pthread_mutex_init ( &neural_mutex, NULL );
@ -353,6 +441,14 @@ AI_neural_thread ( void *arg )
AI_fatal_err ( "AIPreproc: neural network thread launched but netfile option was not specified", __FILE__, __LINE__ );
}
if ( config->neuralClusteringInterval != 0 )
{
if ( pthread_create ( &neural_clustering_thread, NULL, AI_neural_clustering_thread, NULL ) != 0 )
{
AI_fatal_err ( "Failed to create the manual correlations parsing thread", __FILE__, __LINE__ );
}
}
while ( 1 )
{
if ( stat ( config->netfile, &st ) < 0 )

194
neural_cluster.c Normal file
View file

@ -0,0 +1,194 @@
/*
* =====================================================================================
*
* Filename: neural_cluster.c
*
* Description: Perform the clusterization over the output layer of the SOM neural
* network, in order to attempt to find the alerts belonging to the
* same attack scenario. The clusterization is operated through k-means
* using Schwarz criterion in order to find the optimal number of
* clusters, the implementation is in fkmeans/
*
* Version: 0.1
* Created: 19/11/2010 18:37:35
* Revision: none
* Compiler: gcc
*
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
* Licence: GNU GPL v.3
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
*
* =====================================================================================
*/
#include "spp_ai.h"
/** \defgroup neural_cluster Module for clustering the alerts associated to the
* neural network output layer in order to find alerts belonging to the same scenario
* @{ */
#include "fkmeans/kmeans.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/**
 * \brief Print the clusters associated to the SOM output to an XML log file
 * \param km k-means object holding the clustered (x, y) neuron coordinates
 * \param alerts_per_neuron Hash table containing the alerts associated to each neuron
 */
PRIVATE void
__AI_neural_clusters_to_xml ( kmeans_t *km, AI_alerts_per_neuron *alerts_per_neuron )
{
	int i, j, k, l, is_dup;
	FILE *fp = NULL;

	uint32_t src_addr = 0,
	         dst_addr = 0;

	char src_ip[INET_ADDRSTRLEN] = { 0 },
	     dst_ip[INET_ADDRSTRLEN] = { 0 };

	AI_alerts_per_neuron_key key;
	AI_alerts_per_neuron *alert_iterator = NULL;

	if ( !( fp = fopen ( config->neural_clusters_log, "w" )))
	{
		AI_fatal_err ( "Unable to write on the neural clusters XML log file", __FILE__, __LINE__ );
	}

	fprintf ( fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"
		"<clusters>\n" );

	for ( i=0; i < km->k; i++ )
	{
		fprintf ( fp, "\t<cluster id=\"%d\">\n", i );

		for ( j=0; j < km->cluster_sizes[i]; j++ )
		{
			/* Each clustered point is the (x, y) position of an output neuron */
			key.x = km->clusters[i][j][0];
			key.y = km->clusters[i][j][1];
			HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), alert_iterator );

			if ( alert_iterator )
			{
				for ( k=0; k < alert_iterator->n_alerts; k++ )
				{
					/* Deduplicate by comparing only against the alerts already
					 * printed (l < k). The original compared against ALL other
					 * indexes, so each copy of a duplicated alert matched the
					 * other and NONE of them was printed */
					is_dup = 0;

					for ( l=0; l < k && !is_dup; l++ )
					{
						if (
							alert_iterator->alerts[k].gid == alert_iterator->alerts[l].gid &&
							alert_iterator->alerts[k].sid == alert_iterator->alerts[l].sid &&
							alert_iterator->alerts[k].rev == alert_iterator->alerts[l].rev &&
							alert_iterator->alerts[k].src_ip_addr == alert_iterator->alerts[l].src_ip_addr &&
							alert_iterator->alerts[k].dst_ip_addr == alert_iterator->alerts[l].dst_ip_addr &&
							alert_iterator->alerts[k].src_port == alert_iterator->alerts[l].src_port &&
							alert_iterator->alerts[k].dst_port == alert_iterator->alerts[l].dst_port &&
							alert_iterator->alerts[k].timestamp == alert_iterator->alerts[l].timestamp )
						{
							is_dup = 1;
						}
					}

					if ( !is_dup )
					{
						/* Addresses are stored in host byte order — convert back
						 * to network order for inet_ntop() */
						src_addr = htonl ( alert_iterator->alerts[k].src_ip_addr );
						dst_addr = htonl ( alert_iterator->alerts[k].dst_ip_addr );
						inet_ntop ( AF_INET, &src_addr, src_ip, INET_ADDRSTRLEN );
						inet_ntop ( AF_INET, &dst_addr, dst_ip, INET_ADDRSTRLEN );

						/* timestamp is a time_t: cast it so it matches %lu */
						fprintf ( fp, "\t\t<alert gid=\"%d\" sid=\"%d\" rev=\"%d\" src_ip=\"%s\" src_port=\"%d\" "
							"dst_ip=\"%s\" dst_port=\"%d\" timestamp=\"%lu\" xcoord=\"%d\" ycoord=\"%d\"/>\n",
							alert_iterator->alerts[k].gid,
							alert_iterator->alerts[k].sid,
							alert_iterator->alerts[k].rev,
							src_ip, alert_iterator->alerts[k].src_port,
							dst_ip, alert_iterator->alerts[k].dst_port,
							(unsigned long) alert_iterator->alerts[k].timestamp,
							alert_iterator->key.x, alert_iterator->key.y );
					}
				}
			}
		}

		fprintf ( fp, "\t</cluster>\n" );
	}

	fprintf ( fp, "</clusters>\n" );
	fclose ( fp );
} /* ----- end of function __AI_neural_clusters_to_xml ----- */
/**
 * \brief Thread that performs the k-means clustering over the output layer of
 * the SOM neural network
 * \param arg Thread argument (unused)
 * \return Always (void*) 0 (the function normally never returns: it loops and
 *         sleeps config->neuralClusteringInterval seconds between runs)
 */
void*
AI_neural_clustering_thread ( void *arg )
{
	AI_alerts_per_neuron *alerts_per_neuron = NULL,
	                     *alert_iterator = NULL;

	kmeans_t *km = NULL;

	double **dataset = NULL,
	       **tmp_dataset = NULL;

	int i, dataset_size = 0;

	while ( 1 )
	{
		dataset = NULL;
		dataset_size = 0;
		alerts_per_neuron = AI_get_alerts_per_neuron();

		/* Build the dataset: one 2-dimensional point (x, y) per output
		 * neuron that has at least one alert associated to it */
		for ( alert_iterator = alerts_per_neuron; alert_iterator; alert_iterator = (AI_alerts_per_neuron*) alert_iterator->hh.next )
		{
			if ( alert_iterator->n_alerts > 0 )
			{
				/* Grow through a temporary pointer, so the dataset built so
				 * far is not leaked if realloc() fails */
				if ( !( tmp_dataset = (double**) realloc ( dataset, (dataset_size + 1) * sizeof ( double* ))))
				{
					free ( dataset );
					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
				}

				dataset = tmp_dataset;
				dataset_size++;

				if ( !( dataset[dataset_size-1] = (double*) calloc ( 2, sizeof ( double ))))
				{
					AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
				}

				dataset[dataset_size-1][0] = (double) alert_iterator->key.x;
				dataset[dataset_size-1][1] = (double) alert_iterator->key.y;
			}
		}

		if ( dataset && dataset_size != 0 )
		{
			/* kmeans_auto() picks the number of clusters by Schwarz's criterion */
			if ( !( km = kmeans_auto ( dataset, dataset_size, 2 )))
			{
				AI_fatal_err ( "Unable to initialize the k-means clustering object", __FILE__, __LINE__ );
			}

			__AI_neural_clusters_to_xml ( km, alerts_per_neuron );
			kmeans_free ( km );

			for ( i=0; i < dataset_size; i++ )
			{
				free ( dataset[i] );
			}

			free ( dataset );
		}

		sleep ( config->neuralClusteringInterval );
	}

	/* Never reached — the loop above runs for the lifetime of the process */
	pthread_exit ((void*) 0);
	return (void*) 0;
} /* ----- end of function AI_neural_clustering_thread ----- */
/** @} */

View file

@ -217,6 +217,7 @@ static AI_config * AI_parse(char *args)
correlation_graph_interval = 0,
database_parsing_interval = 0,
manual_correlations_parsing_interval = 0,
neural_clustering_interval = 0,
neural_network_training_interval = 0,
neural_train_steps = 0,
output_neurons_per_side = 0,
@ -526,6 +527,27 @@ static AI_config * AI_parse(char *args)
config->neuralNetworkTrainingInterval = neural_network_training_interval;
_dpd.logMsg( " Neural network training interval: %u\n", config->neuralNetworkTrainingInterval );
/* Parsing the neural_clustering_interval option */
if (( arg = (char*) strcasestr( args, "neural_clustering_interval" ) ))
{
for ( arg += strlen("neural_clustering_interval");
*arg && (*arg < '0' || *arg > '9');
arg++ );
if ( !(*arg) )
{
AI_fatal_err ( "neural_clustering_interval option used but "
"no value specified", __FILE__, __LINE__ );
}
neural_clustering_interval = strtoul ( arg, NULL, 10 );
} else {
neural_clustering_interval = DEFAULT_NEURAL_CLUSTERING_INTERVAL;
}
config->neuralClusteringInterval = neural_clustering_interval;
_dpd.logMsg( " Neural network clustering interval: %u\n", config->neuralClusteringInterval );
/* Parsing the output_neurons_per_side option */
if (( arg = (char*) strcasestr( args, "output_neurons_per_side" ) ))
{
@ -796,6 +818,9 @@ static AI_config * AI_parse(char *args)
_dpd.logMsg(" webserv_dir: %s\n", config->webserv_dir);
snprintf ( config->neural_clusters_log, sizeof ( config->neural_clusters_log ), "%s/neural_clusters.xml", config->webserv_dir );
_dpd.logMsg(" neural_clusters_log: %s\n", config->neural_clusters_log);
/* Parsing the corr_modules_dir option */
if (( arg = (char*) strcasestr( args, "corr_modules_dir" ) ))
{

View file

@ -81,6 +81,11 @@
* alert correlations and the next one (this value should usually be high) */
#define DEFAULT_NEURAL_NETWORK_TRAINING_INTERVAL 43200
/** Default interval in seconds between an execution of the thread that attempts to cluster
* the output layer of the neural network searching for alerts belonging to the same
* attack scenario and the next one */
#define DEFAULT_NEURAL_CLUSTERING_INTERVAL 1200
/** Default interval of validity in seconds for an entry in the cache of correlated alerts */
#define DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 600
@ -193,6 +198,11 @@ typedef struct
/** Interval in seconds between an invocation of the thread for parsing XML manual correlations and the next one */
unsigned long manualCorrelationsParsingInterval;
/** Interval in seconds between an execution of the thread that attempts to cluster
* the output layer of the neural network searching for alerts belonging to the same
* attack scenario and the next one */
unsigned long neuralClusteringInterval;
/** Interval in seconds for which an entry in the cache of correlated alerts is valid */
unsigned long bayesianCorrelationCacheValidity;
@ -256,6 +266,9 @@ typedef struct
/** File keeping the serialized neural network used for the alert correlation */
char netfile[1024];
/** File containing the likely clusters computed over the output layer of the neural network */
char neural_clusters_log[1024];
/** Database name, if database logging is used */
char dbname[256];
@ -451,6 +464,34 @@ typedef struct {
UT_hash_handle hh;
} AI_alert_correlation;
/*****************************************************************/
/** Expresses an alert as a numerical tuple manageable by a neural network */
typedef struct {
unsigned int gid;
unsigned int sid;
unsigned int rev;
uint32_t src_ip_addr;
uint32_t dst_ip_addr;
uint16_t src_port;
uint16_t dst_port;
time_t timestamp;
} AI_som_alert_tuple;
/*****************************************************************/
/** Key for the AI_alerts_per_neuron hash table */
typedef struct {
int x;
int y;
} AI_alerts_per_neuron_key;
/*****************************************************************/
/** Struct that holds, for each point of the output layer, the list of associated alerts
* for easily performing the clustering algorithm */
typedef struct {
AI_alerts_per_neuron_key key;
AI_som_alert_tuple *alerts;
int n_alerts;
UT_hash_handle hh;
} AI_alerts_per_neuron;
/*****************************************************************/
/** Enumeration for describing the table in the output database */
enum { ALERTS_TABLE, IPV4_HEADERS_TABLE, TCP_HEADERS_TABLE, PACKET_STREAMS_TABLE, CLUSTERED_ALERTS_TABLE, CORRELATED_ALERTS_TABLE, N_TABLES };
@ -513,6 +554,8 @@ void AI_outdb_mutex_initialize ();
void* AI_store_alert_to_db_thread ( void* );
void* AI_store_cluster_to_db_thread ( void* );
void* AI_store_correlation_to_db_thread ( void* );
void* AI_neural_clustering_thread ( void* );
AI_alerts_per_neuron* AI_get_alerts_per_neuron ();
double(**AI_get_corr_functions ( size_t* ))(const AI_snort_alert*, const AI_snort_alert*);
double(**AI_get_corr_weights ( size_t* ))();