mirror of
https://github.com/BlackLight/Snort_AIPreproc.git
synced 2024-11-24 04:35:11 +01:00
k-means clustering for SOM output
This commit is contained in:
parent
ec196b3968
commit
d41753a8a4
12 changed files with 2650 additions and 54 deletions
|
@ -25,10 +25,12 @@ bayesian.c \
|
|||
cluster.c \
|
||||
correlation.c \
|
||||
db.c \
|
||||
fkmeans/kmeans.c \
|
||||
fsom/fsom.c \
|
||||
modules.c \
|
||||
mysql.c \
|
||||
neural.c \
|
||||
neural_cluster.c \
|
||||
outdb.c \
|
||||
postgresql.c \
|
||||
regex.c \
|
||||
|
|
14
Makefile.in
14
Makefile.in
|
@ -84,8 +84,10 @@ am_libsf_ai_preproc_la_OBJECTS = libsf_ai_preproc_la-alert_history.lo \
|
|||
libsf_ai_preproc_la-cencode.lo libsf_ai_preproc_la-bayesian.lo \
|
||||
libsf_ai_preproc_la-cluster.lo \
|
||||
libsf_ai_preproc_la-correlation.lo libsf_ai_preproc_la-db.lo \
|
||||
libsf_ai_preproc_la-fsom.lo libsf_ai_preproc_la-modules.lo \
|
||||
libsf_ai_preproc_la-mysql.lo libsf_ai_preproc_la-neural.lo \
|
||||
libsf_ai_preproc_la-kmeans.lo libsf_ai_preproc_la-fsom.lo \
|
||||
libsf_ai_preproc_la-modules.lo libsf_ai_preproc_la-mysql.lo \
|
||||
libsf_ai_preproc_la-neural.lo \
|
||||
libsf_ai_preproc_la-neural_cluster.lo \
|
||||
libsf_ai_preproc_la-outdb.lo libsf_ai_preproc_la-postgresql.lo \
|
||||
libsf_ai_preproc_la-regex.lo libsf_ai_preproc_la-spp_ai.lo \
|
||||
libsf_ai_preproc_la-stream.lo libsf_ai_preproc_la-webserv.lo
|
||||
|
@ -267,10 +269,12 @@ bayesian.c \
|
|||
cluster.c \
|
||||
correlation.c \
|
||||
db.c \
|
||||
fkmeans/kmeans.c \
|
||||
fsom/fsom.c \
|
||||
modules.c \
|
||||
mysql.c \
|
||||
neural.c \
|
||||
neural_cluster.c \
|
||||
outdb.c \
|
||||
postgresql.c \
|
||||
regex.c \
|
||||
|
@ -416,6 +420,9 @@ libsf_ai_preproc_la-correlation.lo: correlation.c
|
|||
libsf_ai_preproc_la-db.lo: db.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-db.lo `test -f 'db.c' || echo '$(srcdir)/'`db.c
|
||||
|
||||
libsf_ai_preproc_la-kmeans.lo: fkmeans/kmeans.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-kmeans.lo `test -f 'fkmeans/kmeans.c' || echo '$(srcdir)/'`fkmeans/kmeans.c
|
||||
|
||||
libsf_ai_preproc_la-fsom.lo: fsom/fsom.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-fsom.lo `test -f 'fsom/fsom.c' || echo '$(srcdir)/'`fsom/fsom.c
|
||||
|
||||
|
@ -428,6 +435,9 @@ libsf_ai_preproc_la-mysql.lo: mysql.c
|
|||
libsf_ai_preproc_la-neural.lo: neural.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural.lo `test -f 'neural.c' || echo '$(srcdir)/'`neural.c
|
||||
|
||||
libsf_ai_preproc_la-neural_cluster.lo: neural_cluster.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-neural_cluster.lo `test -f 'neural_cluster.c' || echo '$(srcdir)/'`neural_cluster.c
|
||||
|
||||
libsf_ai_preproc_la-outdb.lo: outdb.c
|
||||
$(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libsf_ai_preproc_la_CFLAGS) $(CFLAGS) -c -o libsf_ai_preproc_la-outdb.lo `test -f 'outdb.c' || echo '$(srcdir)/'`outdb.c
|
||||
|
||||
|
|
1630
fkmeans/Doxyfile
Normal file
1630
fkmeans/Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
3
fkmeans/Makefile
Normal file
3
fkmeans/Makefile
Normal file
|
@ -0,0 +1,3 @@
|
|||
all:
|
||||
gcc -g -O3 -Wall -pedantic -pedantic-errors -std=c99 -o kmeans-test test.c kmeans.c -lm
|
||||
|
88
fkmeans/README
Normal file
88
fkmeans/README
Normal file
|
@ -0,0 +1,88 @@
|
|||
fkmeans is a tiny C library that allows you to run the k-means clustering
|
||||
algorithm over arbitrary sets of n-dimensional data. All you need to do is:
|
||||
|
||||
- Include the file kmeans.h in your sources;
|
||||
|
||||
- Consider your data set as a vector of vectors of double items (double**),
|
||||
where each vector is an n-dimensional item of your data set;
|
||||
|
||||
- If you want to perform the k-means algorithm over your data and you already
|
||||
know the number k of clusters it contains, or an estimate of it, you want to
|
||||
execute some code like this (in this example, the data set is 3-dimensional,
|
||||
i.e. it contains N vectors whose size is 3, and we know it contains n_clus
|
||||
clusters):
|
||||
|
||||
kmeans_t *km;
|
||||
double **dataset;
|
||||
...
|
||||
km = kmeans_new ( dataset, N, 3, n_clus );
|
||||
kmeans ( km );
|
||||
...
|
||||
kmeans_free ( km );
|
||||
|
||||
If you don't already know the number of clusters contained in your data set,
|
||||
you can use the function kmeans_auto() for automatically attempting to find
|
||||
the best one using Schwarz's criterion. Be careful, this operation can be very
|
||||
slow, especially when run on a data set with many elements. The example
|
||||
above would simply become something like:
|
||||
|
||||
kmeans_t *km;
|
||||
double **dataset;
|
||||
...
|
||||
km = kmeans_auto ( dataset, N, 3 );
|
||||
...
|
||||
kmeans_free ( km );
|
||||
|
||||
- Once the clustering has been performed, the clusters of data can be simply
|
||||
accessed from your kmeans_t* structure, as they are held as a double*** field
|
||||
named "clusters". Each vector in this structure represents a cluster, whose
|
||||
size is specified in the field cluster_sizes[i] of the structure. Each cluster
|
||||
contains the items that form it, each of which is an n-dimensional vector. The
|
||||
number of clusters is specified in the field "k" of the structure, the
|
||||
number of dimensions of each element is specified in the field "dataset_dim"
|
||||
and the number of elements in the original data set is specified in the field
|
||||
"dataset_size". So, for example:
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
printf ( "cluster %d: [ ", i );
|
||||
|
||||
for ( j=0; j < km->cluster_sizes[i]; j++ )
|
||||
{
|
||||
printf ( "(" );
|
||||
|
||||
for ( k=0; k < km->dataset_dim; k++ )
|
||||
{
|
||||
printf ( "%f, ", km->clusters[i][j][k] );
|
||||
}
|
||||
|
||||
printf ( "), ");
|
||||
}
|
||||
|
||||
printf ( "]\n" );
|
||||
}
|
||||
|
||||
The library however already comes with a sample implementation, contained in
|
||||
"test.c", and typing "make" this example will be built. This example takes 0,
|
||||
1, 2 or 3 command-line arguments, in format
|
||||
|
||||
$ ./kmeans-test [num_elements] [min_value] [max_value]
|
||||
|
||||
and randomly generates a 2-dimensional data set containing num_elements, whose
|
||||
coordinates are between min_value and max_value. The clustering is then
|
||||
performed and the results are shown on stdout, with the clusters coloured in
|
||||
different ways;
|
||||
|
||||
- After you write your source, remember to include the file "kmeans.c",
|
||||
containing the implementation of the library, in the list of your sources
|
||||
files;
|
||||
|
||||
- That's all. Include "kmeans.h", write your code using
|
||||
kmeans_new()+kmeans()+kmeans_free() or kmeans_auto()+kmeans_free(), explore
|
||||
your clusters, remember to include "kmeans.c" in the list of your source
|
||||
files, and you're ready for k-means clustering.
|
||||
|
||||
Author: Fabio "BlackLight" Manganiello,
|
||||
<blacklight@autistici.org>,
|
||||
http://0x00.ath.cx
|
||||
|
445
fkmeans/kmeans.c
Normal file
445
fkmeans/kmeans.c
Normal file
|
@ -0,0 +1,445 @@
|
|||
/*
|
||||
* =====================================================================================
|
||||
*
|
||||
* Filename: kmeans.c
|
||||
*
|
||||
* Description: k-means clusterization algorithm implementation in C
|
||||
*
|
||||
* Version: 1.0
|
||||
* Created: 12/11/2010 10:43:28
|
||||
* Revision: none
|
||||
* Compiler: gcc
|
||||
*
|
||||
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
|
||||
* Licence: GNU GPL v.3
|
||||
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
|
||||
*
|
||||
* =====================================================================================
|
||||
*/
|
||||
|
||||
#include "kmeans.h"
|
||||
|
||||
#include <alloca.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/**
|
||||
* \brief Initialize the centers of the clusters taking the K most distant elements in the dataset
|
||||
* \param km k-means object
|
||||
*/
|
||||
|
||||
static void
|
||||
__kmeans_init_centers ( kmeans_t *km )
|
||||
{
|
||||
int i, j, k, l,
|
||||
index_found = 0,
|
||||
max_index = 0,
|
||||
assigned_centers = 0,
|
||||
*assigned_centers_indexes = NULL;
|
||||
|
||||
double dist = 0.0,
|
||||
max_dist = 0.0;
|
||||
|
||||
for ( i=0; i < km->dataset_size; i++ )
|
||||
{
|
||||
dist = 0.0;
|
||||
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
dist += ( km->dataset[i][j] ) * ( km->dataset[i][j] );
|
||||
}
|
||||
|
||||
if ( dist > max_dist )
|
||||
{
|
||||
max_dist = dist;
|
||||
max_index = i;
|
||||
}
|
||||
}
|
||||
|
||||
for ( i=0; i < km->dataset_dim; i++ )
|
||||
{
|
||||
km->centers[0][i] = km->dataset[max_index][i];
|
||||
}
|
||||
|
||||
if ( !( assigned_centers_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
assigned_centers_indexes[ assigned_centers - 1 ] = max_index;
|
||||
|
||||
for ( i=1; i < km->k; i++ )
|
||||
{
|
||||
max_dist = 0.0;
|
||||
max_index = 0;
|
||||
|
||||
for ( j=0; j < km->dataset_size; j++ )
|
||||
{
|
||||
index_found = 0;
|
||||
|
||||
for ( k=0; k < assigned_centers && !index_found; k++ )
|
||||
{
|
||||
if ( assigned_centers_indexes[k] == j )
|
||||
{
|
||||
index_found = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ( index_found )
|
||||
continue;
|
||||
|
||||
dist = 0.0;
|
||||
|
||||
for ( k=0; k < assigned_centers; k++ )
|
||||
{
|
||||
for ( l=0; l < km->dataset_dim; l++ )
|
||||
{
|
||||
dist += ( km->dataset[j][l] - km->centers[k][l] ) * ( km->dataset[j][l] - km->centers[k][l] );
|
||||
}
|
||||
}
|
||||
|
||||
if ( dist > max_dist )
|
||||
{
|
||||
max_dist = dist;
|
||||
max_index = j;
|
||||
}
|
||||
}
|
||||
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
km->centers[i][j] = km->dataset[max_index][j];
|
||||
}
|
||||
|
||||
if ( !( assigned_centers_indexes = (int*) realloc ( assigned_centers_indexes, (++assigned_centers) * sizeof ( int ))))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
assigned_centers_indexes[ assigned_centers - 1 ] = max_index;
|
||||
}
|
||||
|
||||
free ( assigned_centers_indexes );
|
||||
} /* ----- end of function kmeans_init_centers ----- */
|
||||
|
||||
/**
|
||||
* \brief Create a new k-means object
|
||||
* \param dataset Dataset to be clustered
|
||||
* \param dataset_size Number of elements in the dataset
|
||||
* \param dataset_dim Dimension of each element of the dataset
|
||||
* \param K Number of clusters
|
||||
* \return Reference to the newly created k-means object, if successfull, NULL otherwise
|
||||
*/
|
||||
|
||||
kmeans_t*
|
||||
kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K )
|
||||
{
|
||||
int i, j;
|
||||
kmeans_t *km = NULL;
|
||||
|
||||
if ( !( km = (kmeans_t*) malloc ( sizeof ( kmeans_t ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !( km->dataset = (double**) calloc ( dataset_size, sizeof ( double* ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for ( i=0; i < dataset_size; i++ )
|
||||
{
|
||||
if ( !( km->dataset[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for ( j=0; j < dataset_dim; j++ )
|
||||
{
|
||||
km->dataset[i][j] = dataset[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
km->dataset_size = dataset_size;
|
||||
km->dataset_dim = dataset_dim;
|
||||
km->k = K;
|
||||
|
||||
if ( !( km->clusters = (double***) calloc ( K, sizeof ( double** ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !( km->cluster_sizes = (int*) calloc ( K, sizeof ( int* ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !( km->centers = (double**) calloc ( K, sizeof ( double* ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for ( i=0; i < K; i++ )
|
||||
{
|
||||
if ( !( km->centers[i] = (double*) calloc ( dataset_dim, sizeof ( double ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
__kmeans_init_centers ( km );
|
||||
return km;
|
||||
} /* ----- end of function kmeans_new ----- */
|
||||
|
||||
/**
|
||||
* \brief Function that performs a single step for k-means algorithm
|
||||
* \param km k-means object
|
||||
* \return 0 if no changes were performed by this step, 1 otherwise, -1 in case of error
|
||||
*/
|
||||
|
||||
static int
|
||||
__kmeans_step ( kmeans_t *km )
|
||||
{
|
||||
int i, j, k,
|
||||
best_center = 0;
|
||||
|
||||
double dist = 0.0,
|
||||
min_dist = DBL_MAX,
|
||||
**old_centers = NULL;
|
||||
|
||||
if ( km->clusters[0] )
|
||||
{
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
for ( j=0; j < km->cluster_sizes[i]; j++ )
|
||||
{
|
||||
free ( km->clusters[i][j] );
|
||||
km->clusters[i][j] = NULL;
|
||||
}
|
||||
|
||||
free ( km->clusters[i] );
|
||||
km->clusters[i] = NULL;
|
||||
km->cluster_sizes[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if ( !( old_centers = (double**) alloca ( km->k * sizeof ( double* ))))
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
if ( !( old_centers[i] = (double*) alloca ( km->dataset_dim * sizeof ( double ))))
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
old_centers[i][j] = km->centers[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
for ( i=0; i < km->dataset_size; i++ )
|
||||
{
|
||||
min_dist = DBL_MAX;
|
||||
best_center = 0;
|
||||
|
||||
for ( j=0; j < km->k; j++ )
|
||||
{
|
||||
dist = 0.0;
|
||||
|
||||
for ( k=0; k < km->dataset_dim; k++ )
|
||||
{
|
||||
dist += ( km->dataset[i][k] - km->centers[j][k] ) * ( km->dataset[i][k] - km->centers[j][k] );
|
||||
}
|
||||
|
||||
if ( dist < min_dist )
|
||||
{
|
||||
min_dist = dist;
|
||||
best_center = j;
|
||||
}
|
||||
}
|
||||
|
||||
if ( !( km->clusters[best_center] = (double**) realloc ( km->clusters[best_center], (++(km->cluster_sizes[best_center])) * sizeof ( double* ))))
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ( !( km->clusters [best_center] [km->cluster_sizes[best_center]-1] = (double*) calloc ( km->dataset_dim, sizeof ( double ))))
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
km->clusters [best_center] [km->cluster_sizes[best_center]-1] [j] = km->dataset[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
km->centers[i][j] = 0.0;
|
||||
|
||||
for ( k=0; k < km->cluster_sizes[i]; k++ )
|
||||
{
|
||||
km->centers[i][j] += km->clusters[i][k][j];
|
||||
}
|
||||
|
||||
if ( km->cluster_sizes[i] != 0 )
|
||||
{
|
||||
km->centers[i][j] /= (double) km->cluster_sizes[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
for ( j=0; j < km->dataset_dim; j++ )
|
||||
{
|
||||
if ( km->centers[i][j] != old_centers[i][j] )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* ----- end of function __kmeans_step ----- */
|
||||
|
||||
/**
|
||||
* \brief Perform the k-means algorithm over a k-means object
|
||||
* \param km k-means object
|
||||
*/
|
||||
|
||||
void
|
||||
kmeans ( kmeans_t *km )
|
||||
{
|
||||
while ( __kmeans_step ( km ) != 0 );
|
||||
} /* ----- end of function kmeans ----- */
|
||||
|
||||
/**
|
||||
* \brief Compute the heuristic coefficient associated to the current number of clusters through Schwarz's criterion
|
||||
* \param km k-means object
|
||||
* \return Real value expressing how well that number of clusters models the dataset
|
||||
*/
|
||||
|
||||
static double
|
||||
__kmeans_heuristic_coefficient ( kmeans_t *km )
|
||||
{
|
||||
int i, j, k;
|
||||
double distorsion = 0.0;
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
for ( j=0; j < km->cluster_sizes[i]; j++ )
|
||||
{
|
||||
for ( k=0; k < km->dataset_dim; k++ )
|
||||
{
|
||||
distorsion += ( km->centers[i][k] - km->clusters[i][j][k] ) * ( km->centers[i][k] - km->clusters[i][j][k] );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return distorsion + km->k * log ( km->dataset_size );
|
||||
} /* ----- end of function __kmeans_heuristic_coefficient ----- */
|
||||
|
||||
/**
|
||||
* \brief Remove a k-means object
|
||||
* \param km k-means object to be deallocaed
|
||||
*/
|
||||
|
||||
void
|
||||
kmeans_free ( kmeans_t *km )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
for ( j=0; j < km->cluster_sizes[i]; j++ )
|
||||
{
|
||||
free ( km->clusters[i][j] );
|
||||
km->clusters[i][j] = NULL;
|
||||
}
|
||||
|
||||
free ( km->clusters[i] );
|
||||
km->clusters[i] = NULL;
|
||||
}
|
||||
|
||||
free ( km->clusters );
|
||||
km->clusters = NULL;
|
||||
|
||||
free ( km->cluster_sizes );
|
||||
km->cluster_sizes = NULL;
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
free ( km->centers[i] );
|
||||
km->centers[i] = NULL;
|
||||
}
|
||||
|
||||
free ( km->centers );
|
||||
km->centers = NULL;
|
||||
|
||||
for ( i=0; i < km->dataset_size; i++ )
|
||||
{
|
||||
free ( km->dataset[i] );
|
||||
km->dataset[i] = NULL;
|
||||
}
|
||||
|
||||
free ( km->dataset );
|
||||
km->dataset = NULL;
|
||||
|
||||
free ( km );
|
||||
km = NULL;
|
||||
} /* ----- end of function kmeans_free ----- */
|
||||
|
||||
/**
|
||||
* \brief Perform a k-means clustering over a dataset automatically choosing the best value of k using Schwarz's criterion
|
||||
* \param dataset Dataset to be clustered
|
||||
* \param dataset_size Number of elements in the dataset
|
||||
* \param dataset_dim Dimension of each element of the dataset
|
||||
* \return Reference to the newly created k-means object, if successfull, NULL otherwise
|
||||
*/
|
||||
|
||||
kmeans_t*
|
||||
kmeans_auto ( double **dataset, int dataset_size, int dataset_dim )
|
||||
{
|
||||
int i;
|
||||
|
||||
double heuristic = 0.0,
|
||||
best_heuristic = DBL_MAX;
|
||||
|
||||
kmeans_t *km = NULL,
|
||||
*best_km = NULL;
|
||||
|
||||
for ( i=1; i <= dataset_size; i++ )
|
||||
{
|
||||
if ( !( km = kmeans_new ( dataset, dataset_size, dataset_dim, i )))
|
||||
return NULL;
|
||||
|
||||
kmeans ( km );
|
||||
heuristic = __kmeans_heuristic_coefficient ( km );
|
||||
|
||||
if ( heuristic < best_heuristic )
|
||||
{
|
||||
if ( best_km )
|
||||
{
|
||||
kmeans_free ( best_km );
|
||||
}
|
||||
|
||||
best_km = km;
|
||||
best_heuristic = heuristic;
|
||||
} else {
|
||||
kmeans_free ( km );
|
||||
}
|
||||
}
|
||||
|
||||
return best_km;
|
||||
} /* ----- end of function kmeans_auto ----- */
|
||||
|
52
fkmeans/kmeans.h
Normal file
52
fkmeans/kmeans.h
Normal file
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* =====================================================================================
|
||||
*
|
||||
* Filename: kmeans.h
|
||||
*
|
||||
* Description: Header file for C k-means implementation
|
||||
*
|
||||
* Version: 1.0
|
||||
* Created: 12/11/2010 10:43:55
|
||||
* Revision: none
|
||||
* Compiler: gcc
|
||||
*
|
||||
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
|
||||
* Licence: GNU GPL v.3
|
||||
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
|
||||
*
|
||||
* =====================================================================================
|
||||
*/
|
||||
|
||||
#ifndef __KMEANS_H
|
||||
#define __KMEANS_H
|
||||
|
||||
/** State of a k-means clustering: the data set, the number of clusters and the current result */
typedef struct __kmeans_t {
	double **dataset;      /**< Input data set: dataset_size vectors of dataset_dim items */
	int dataset_size;      /**< Number of elements in the data set */
	int dataset_dim;       /**< Dimension of each element of the data set */
	int k;                 /**< Number of clusters */
	int *cluster_sizes;    /**< Vector containing the number of elements in each cluster */
	double ***clusters;    /**< Clusters: clusters[i][j] is the j-th element of the i-th cluster */
	double **centers;      /**< Coordinates of the centers of the clusters */
} kmeans_t;
|
||||
|
||||
/** Create a k-means object for the given data set and number of clusters K; NULL on failure */
kmeans_t*
kmeans_new ( double **dataset, const int dataset_size, const int dataset_dim, const int K );

/** Cluster the data set choosing the number of clusters automatically; NULL on failure */
kmeans_t*
kmeans_auto ( double **dataset, int dataset_size, int dataset_dim );

/** Perform the k-means algorithm over a k-means object */
void
kmeans ( kmeans_t *km );

/** Release a k-means object */
void
kmeans_free ( kmeans_t *km );
|
||||
|
||||
#endif
|
||||
|
8
mysql.c
8
mysql.c
|
@ -48,18 +48,26 @@ __mysql_do_init ( MYSQL **__DB, BOOL is_out )
|
|||
return (void*) *__DB;
|
||||
|
||||
if ( !( *__DB = (MYSQL*) malloc ( sizeof ( MYSQL ))))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !( mysql_init ( *__DB )))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( is_out )
|
||||
{
|
||||
if ( !mysql_real_connect ( *__DB, config->outdbhost, config->outdbuser, config->outdbpass, NULL, 0, NULL, 0 ))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( mysql_select_db ( *__DB, config->outdbname ))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
if ( !mysql_real_connect ( *__DB, config->dbhost, config->dbuser, config->dbpass, NULL, 0, NULL, 0 ))
|
||||
return NULL;
|
||||
|
|
184
neural.c
184
neural.c
|
@ -37,21 +37,22 @@
|
|||
/** Enumeration for the input fields of the SOM neural network */
|
||||
enum { som_src_ip, som_dst_ip, som_src_port, som_dst_port, som_time, som_gid, som_sid, som_rev, SOM_NUM_ITEMS };
|
||||
|
||||
typedef struct {
|
||||
unsigned int gid;
|
||||
unsigned int sid;
|
||||
unsigned int rev;
|
||||
uint32_t src_ip_addr;
|
||||
uint32_t dst_ip_addr;
|
||||
uint16_t src_port;
|
||||
uint16_t dst_port;
|
||||
time_t timestamp;
|
||||
} AI_som_alert_tuple;
|
||||
|
||||
PRIVATE time_t latest_serialization_time = ( time_t ) 0;
|
||||
PRIVATE som_network_t *net = NULL;
|
||||
PRIVATE time_t latest_serialization_time = ( time_t ) 0;
|
||||
PRIVATE som_network_t *net = NULL;
|
||||
PRIVATE AI_alerts_per_neuron *alerts_per_neuron = NULL;
|
||||
PRIVATE pthread_mutex_t neural_mutex;
|
||||
|
||||
/**
|
||||
* \brief Get the hash table containing the alerts associated to each output neuron
|
||||
* \return The hash table
|
||||
*/
|
||||
|
||||
AI_alerts_per_neuron*
|
||||
AI_get_alerts_per_neuron ()
|
||||
{
|
||||
return alerts_per_neuron;
|
||||
} /* ----- end of function AI_get_alerts_per_neuron ----- */
|
||||
|
||||
/**
|
||||
* \brief Get the current weight of the neural correlation index using a hyperbolic tangent function with a parameter expressed in function of the current number of alerts in the database
|
||||
* \return The weight of the correlation index ( 0 <= weight < 1 )
|
||||
|
@ -126,6 +127,11 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
|
|||
x2 = 0,
|
||||
y2 = 0;
|
||||
|
||||
int i;
|
||||
BOOL is_found = false;
|
||||
AI_alerts_per_neuron *found = NULL;
|
||||
AI_alerts_per_neuron_key key;
|
||||
|
||||
if ( !( input1 = (double*) alloca ( SOM_NUM_ITEMS * sizeof ( double ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
|
@ -136,24 +142,128 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
|
|||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
pthread_mutex_lock ( &neural_mutex );
|
||||
|
||||
if ( !net )
|
||||
{
|
||||
pthread_mutex_unlock ( &neural_mutex );
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
__AI_alert_to_som_data ( alert1, &input1 );
|
||||
__AI_alert_to_som_data ( alert2, &input2 );
|
||||
|
||||
pthread_mutex_lock ( &neural_mutex );
|
||||
|
||||
som_set_inputs ( net, input1 );
|
||||
som_get_best_neuron_coordinates ( net, &x1, &y1 );
|
||||
|
||||
__AI_alert_to_som_data ( alert2, &input2 );
|
||||
som_set_inputs ( net, input2 );
|
||||
som_get_best_neuron_coordinates ( net, &x2, &y2 );
|
||||
|
||||
pthread_mutex_unlock ( &neural_mutex );
|
||||
|
||||
/* Check if there are already entries in the hash table for these two neurons, otherwise
|
||||
* it creates them and append these two alerts */
|
||||
key.x = x1;
|
||||
key.y = y1;
|
||||
HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
|
||||
|
||||
if ( !found )
|
||||
{
|
||||
if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->key = key;
|
||||
found->n_alerts = 1;
|
||||
|
||||
if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->alerts[0] = alert1;
|
||||
HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
|
||||
} else {
|
||||
is_found = false;
|
||||
|
||||
for ( i=0; i < found->n_alerts && !is_found; i++ )
|
||||
{
|
||||
if (
|
||||
alert1.gid == found->alerts[i].gid &&
|
||||
alert1.sid == found->alerts[i].sid &&
|
||||
alert1.rev == found->alerts[i].rev &&
|
||||
alert1.src_ip_addr == found->alerts[i].src_ip_addr &&
|
||||
alert1.dst_ip_addr == found->alerts[i].dst_ip_addr &&
|
||||
alert1.src_port == found->alerts[i].src_port &&
|
||||
alert1.dst_port == found->alerts[i].dst_port )
|
||||
{
|
||||
is_found = true;
|
||||
}
|
||||
}
|
||||
|
||||
if ( !is_found )
|
||||
{
|
||||
if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
|
||||
(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->alerts[ found->n_alerts - 1 ] = alert1;
|
||||
}
|
||||
}
|
||||
|
||||
key.x = x2;
|
||||
key.y = y2;
|
||||
HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), found );
|
||||
|
||||
if ( !found )
|
||||
{
|
||||
if ( !( found = (AI_alerts_per_neuron*) calloc ( 1, sizeof ( AI_alerts_per_neuron ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->key = key;
|
||||
found->n_alerts = 1;
|
||||
|
||||
if ( !( found->alerts = (AI_som_alert_tuple*) calloc ( 1, sizeof ( AI_som_alert_tuple ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->alerts[0] = alert2;
|
||||
HASH_ADD ( hh, alerts_per_neuron, key, sizeof ( key ), found );
|
||||
} else {
|
||||
is_found = false;
|
||||
|
||||
for ( i=0; i < found->n_alerts && !is_found; i++ )
|
||||
{
|
||||
if (
|
||||
alert2.gid == found->alerts[i].gid &&
|
||||
alert2.sid == found->alerts[i].sid &&
|
||||
alert2.rev == found->alerts[i].rev &&
|
||||
alert2.src_ip_addr == found->alerts[i].src_ip_addr &&
|
||||
alert2.dst_ip_addr == found->alerts[i].dst_ip_addr &&
|
||||
alert2.src_port == found->alerts[i].src_port &&
|
||||
alert2.dst_port == found->alerts[i].dst_port )
|
||||
{
|
||||
is_found = true;
|
||||
}
|
||||
}
|
||||
|
||||
if ( !is_found )
|
||||
{
|
||||
if ( !( found->alerts = (AI_som_alert_tuple*) realloc ( found->alerts,
|
||||
(++(found->n_alerts)) * sizeof ( AI_som_alert_tuple ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
found->alerts[ found->n_alerts - 1 ] = alert2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return the normalized euclidean distance in [0,1] (the normalization is made considering that the maximum distance
|
||||
* between two points on the output neurons matrix is the distance between the upper-left and bottom-right points) */
|
||||
return sqrt ((double) ( (x2-x1)*(x2-x1) + (y2-y1)*(y2-y1) )) /
|
||||
|
@ -170,9 +280,7 @@ __AI_som_alert_distance ( const AI_som_alert_tuple alert1, const AI_som_alert_tu
|
|||
double
|
||||
AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert *b )
|
||||
{
|
||||
size_t i = 0;
|
||||
unsigned long long int time_sum = 0;
|
||||
AI_som_alert_tuple t1, t2;
|
||||
AI_som_alert_tuple t1, t2;
|
||||
|
||||
t1.gid = a->gid;
|
||||
t1.sid = a->sid;
|
||||
|
@ -181,18 +289,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
|
|||
t1.dst_ip_addr = ntohl ( a->ip_dst_addr );
|
||||
t1.src_port = ntohs ( a->tcp_src_port );
|
||||
t1.dst_port = ntohs ( a->tcp_dst_port );
|
||||
time_sum = (unsigned long long int) a->timestamp;
|
||||
|
||||
/* The timestamp of this alert is computed like the average timestamp of the grouped alerts */
|
||||
for ( i=1; i < a->grouped_alerts_count; i++ )
|
||||
{
|
||||
if ( a->grouped_alerts[i-1] )
|
||||
{
|
||||
time_sum += (unsigned long long int) a->grouped_alerts[i-1]->timestamp;
|
||||
}
|
||||
}
|
||||
|
||||
t1.timestamp = (time_t) ( time_sum / a->grouped_alerts_count );
|
||||
t1.timestamp = a->timestamp;
|
||||
|
||||
t2.gid = b->gid;
|
||||
t2.sid = b->sid;
|
||||
|
@ -201,17 +298,7 @@ AI_alert_neural_som_correlation ( const AI_snort_alert *a, const AI_snort_alert
|
|||
t2.dst_ip_addr = ntohl ( b->ip_dst_addr );
|
||||
t2.src_port = ntohs ( b->tcp_src_port );
|
||||
t2.dst_port = ntohs ( b->tcp_dst_port );
|
||||
time_sum = (unsigned long long int) b->timestamp;
|
||||
|
||||
for ( i=1; i < b->grouped_alerts_count; i++ )
|
||||
{
|
||||
if ( b->grouped_alerts[i-1] )
|
||||
{
|
||||
time_sum += (unsigned long long int) b->grouped_alerts[i-1]->timestamp;
|
||||
}
|
||||
}
|
||||
|
||||
t2.timestamp = (time_t) ( time_sum / b->grouped_alerts_count );
|
||||
t2.timestamp = b->timestamp;
|
||||
return __AI_som_alert_distance ( t1, t2 );
|
||||
} /* ----- end of function AI_alert_neural_som_correlation ----- */
|
||||
|
||||
|
@ -338,8 +425,9 @@ __AI_som_train ()
|
|||
void*
|
||||
AI_neural_thread ( void *arg )
|
||||
{
|
||||
BOOL do_train = false;
|
||||
struct stat st;
|
||||
BOOL do_train = false;
|
||||
pthread_t neural_clustering_thread;
|
||||
|
||||
pthread_mutex_init ( &neural_mutex, NULL );
|
||||
|
||||
|
@ -353,6 +441,14 @@ AI_neural_thread ( void *arg )
|
|||
AI_fatal_err ( "AIPreproc: neural network thread launched but netfile option was not specified", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
if ( config->neuralClusteringInterval != 0 )
|
||||
{
|
||||
if ( pthread_create ( &neural_clustering_thread, NULL, AI_neural_clustering_thread, NULL ) != 0 )
|
||||
{
|
||||
AI_fatal_err ( "Failed to create the manual correlations parsing thread", __FILE__, __LINE__ );
|
||||
}
|
||||
}
|
||||
|
||||
while ( 1 )
|
||||
{
|
||||
if ( stat ( config->netfile, &st ) < 0 )
|
||||
|
|
194
neural_cluster.c
Normal file
194
neural_cluster.c
Normal file
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
* =====================================================================================
|
||||
*
|
||||
* Filename: neural_cluster.c
|
||||
*
|
||||
* Description: Perform the clusterization over the output layer of the SOM neural
|
||||
* network, in order to attempt to find the alerts belonging to the
|
||||
* same attack scenario. The clusterization is operated through k-means
|
||||
* using Schwarz criterion in order to find the optimal number of
|
||||
* clusters, the implementation is in fkmeans/
|
||||
*
|
||||
* Version: 0.1
|
||||
* Created: 19/11/2010 18:37:35
|
||||
* Revision: none
|
||||
* Compiler: gcc
|
||||
*
|
||||
* Author: BlackLight (http://0x00.ath.cx), <blacklight@autistici.org>
|
||||
* Licence: GNU GPL v.3
|
||||
* Company: DO WHAT YOU WANT CAUSE A PIRATE IS FREE, YOU ARE A PIRATE!
|
||||
*
|
||||
* =====================================================================================
|
||||
*/
|
||||
|
||||
#include "spp_ai.h"
|
||||
|
||||
/** \defgroup neural_cluster Module for clustering the alerts associated to the
|
||||
* neural network output layer in order to find alerts belonging to the same scenario
|
||||
* @{ */
|
||||
|
||||
#include "fkmeans/kmeans.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/**
|
||||
* \brief Print the clusters associated to the SOM output to an XML log file
|
||||
* \param km k-means object
|
||||
* \param alerts_per_neuron Hash table containing the alerts associated to each neuron
|
||||
*/
|
||||
|
||||
PRIVATE void
|
||||
__AI_neural_clusters_to_xml ( kmeans_t *km, AI_alerts_per_neuron *alerts_per_neuron )
|
||||
{
|
||||
int i, j, k, l, are_equal;
|
||||
FILE *fp = NULL;
|
||||
|
||||
uint32_t src_addr = 0,
|
||||
dst_addr = 0;
|
||||
|
||||
char src_ip[INET_ADDRSTRLEN] = { 0 },
|
||||
dst_ip[INET_ADDRSTRLEN] = { 0 };
|
||||
|
||||
AI_alerts_per_neuron_key key;
|
||||
AI_alerts_per_neuron *alert_iterator = NULL;
|
||||
|
||||
if ( !( fp = fopen ( config->neural_clusters_log, "w" )))
|
||||
{
|
||||
AI_fatal_err ( "Unable to write on the neural clusters XML log file", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
fprintf ( fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"
|
||||
"<clusters>\n" );
|
||||
|
||||
for ( i=0; i < km->k; i++ )
|
||||
{
|
||||
fprintf ( fp, "\t<cluster id=\"%d\">\n", i );
|
||||
|
||||
for ( j=0; j < km->cluster_sizes[i]; j++ )
|
||||
{
|
||||
key.x = km->clusters[i][j][0];
|
||||
key.y = km->clusters[i][j][1];
|
||||
HASH_FIND ( hh, alerts_per_neuron, &key, sizeof ( key ), alert_iterator );
|
||||
|
||||
if ( alert_iterator )
|
||||
{
|
||||
for ( k=0; k < alert_iterator->n_alerts; k++ )
|
||||
{
|
||||
are_equal = 0;
|
||||
|
||||
for ( l=0; l < alert_iterator->n_alerts && !are_equal; l++ )
|
||||
{
|
||||
if ( k != l )
|
||||
{
|
||||
if (
|
||||
alert_iterator->alerts[k].gid == alert_iterator->alerts[l].gid &&
|
||||
alert_iterator->alerts[k].sid == alert_iterator->alerts[l].sid &&
|
||||
alert_iterator->alerts[k].rev == alert_iterator->alerts[l].rev &&
|
||||
alert_iterator->alerts[k].src_ip_addr == alert_iterator->alerts[l].src_ip_addr &&
|
||||
alert_iterator->alerts[k].dst_ip_addr == alert_iterator->alerts[l].dst_ip_addr &&
|
||||
alert_iterator->alerts[k].src_port == alert_iterator->alerts[l].src_port &&
|
||||
alert_iterator->alerts[k].dst_port == alert_iterator->alerts[l].dst_port &&
|
||||
alert_iterator->alerts[k].timestamp == alert_iterator->alerts[l].timestamp )
|
||||
{
|
||||
are_equal = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !are_equal )
|
||||
{
|
||||
src_addr = htonl ( alert_iterator->alerts[k].src_ip_addr );
|
||||
dst_addr = htonl ( alert_iterator->alerts[k].dst_ip_addr );
|
||||
inet_ntop ( AF_INET, &src_addr, src_ip, INET_ADDRSTRLEN );
|
||||
inet_ntop ( AF_INET, &dst_addr, dst_ip, INET_ADDRSTRLEN );
|
||||
|
||||
fprintf ( fp, "\t\t<alert gid=\"%d\" sid=\"%d\" rev=\"%d\" src_ip=\"%s\" src_port=\"%d\" "
|
||||
"dst_ip=\"%s\" dst_port=\"%d\" timestamp=\"%lu\" xcoord=\"%d\" ycoord=\"%d\"/>\n",
|
||||
alert_iterator->alerts[k].gid,
|
||||
alert_iterator->alerts[k].sid,
|
||||
alert_iterator->alerts[k].rev,
|
||||
src_ip, alert_iterator->alerts[k].src_port,
|
||||
dst_ip, alert_iterator->alerts[k].dst_port,
|
||||
alert_iterator->alerts[k].timestamp,
|
||||
alert_iterator->key.x, alert_iterator->key.y );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fprintf ( fp, "\t</cluster>\n" );
|
||||
}
|
||||
|
||||
fprintf ( fp, "</clusters>\n" );
|
||||
fclose ( fp );
|
||||
} /* ----- end of function __AI_neural_clusters_to_xml ----- */
|
||||
|
||||
/**
|
||||
* \brief Thread that performs the k-means clustering over the output layer of
|
||||
* the SOM neural network
|
||||
*/
|
||||
|
||||
void*
|
||||
AI_neural_clustering_thread ( void *arg )
|
||||
{
|
||||
AI_alerts_per_neuron *alerts_per_neuron = NULL,
|
||||
*alert_iterator = NULL;
|
||||
|
||||
kmeans_t *km = NULL;
|
||||
double **dataset = NULL;
|
||||
int i, dataset_size = 0;
|
||||
|
||||
while ( 1 )
|
||||
{
|
||||
dataset = NULL;
|
||||
dataset_size = 0;
|
||||
alerts_per_neuron = AI_get_alerts_per_neuron();
|
||||
|
||||
for ( alert_iterator = alerts_per_neuron; alert_iterator; alert_iterator = (AI_alerts_per_neuron*) alert_iterator->hh.next )
|
||||
{
|
||||
if ( alert_iterator->n_alerts > 0 )
|
||||
{
|
||||
if ( !( dataset = (double**) realloc ( dataset, (++dataset_size) * sizeof ( double* ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
if ( !( dataset[dataset_size-1] = (double*) calloc ( 2, sizeof ( double ))))
|
||||
{
|
||||
AI_fatal_err ( "Fatal dynamic memory allocation error", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
dataset[dataset_size-1][0] = (double) alert_iterator->key.x;
|
||||
dataset[dataset_size-1][1] = (double) alert_iterator->key.y;
|
||||
}
|
||||
}
|
||||
|
||||
if ( dataset && dataset_size != 0 )
|
||||
{
|
||||
if ( !( km = kmeans_auto ( dataset, dataset_size, 2 )))
|
||||
{
|
||||
AI_fatal_err ( "Unable to initialize the k-means clustering object", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
__AI_neural_clusters_to_xml ( km, alerts_per_neuron );
|
||||
kmeans_free ( km );
|
||||
|
||||
for ( i=0; i < dataset_size; i++ )
|
||||
{
|
||||
free ( dataset[i] );
|
||||
}
|
||||
|
||||
free ( dataset );
|
||||
}
|
||||
|
||||
sleep ( config->neuralClusteringInterval );
|
||||
}
|
||||
|
||||
pthread_exit ((void*) 0);
|
||||
return (void*) 0;
|
||||
} /* ----- end of function AI_neural_clustering_thread ----- */
|
||||
|
||||
/** @} */
|
||||
|
41
spp_ai.c
41
spp_ai.c
|
@ -172,14 +172,14 @@ static AI_config * AI_parse(char *args)
|
|||
{
|
||||
char *arg;
|
||||
char *match;
|
||||
char alertfile[1024] = { 0 },
|
||||
alert_history_file[1024] = { 0 },
|
||||
clusterfile[1024] = { 0 },
|
||||
corr_alerts_dir[1024] = { 0 },
|
||||
corr_modules_dir[1024] = { 0 },
|
||||
corr_rules_dir[1024] = { 0 },
|
||||
webserv_dir[1024] = { 0 },
|
||||
webserv_banner[1024] = { 0 };
|
||||
char alertfile[1024] = { 0 },
|
||||
alert_history_file[1024] = { 0 },
|
||||
clusterfile[1024] = { 0 },
|
||||
corr_alerts_dir[1024] = { 0 },
|
||||
corr_modules_dir[1024] = { 0 },
|
||||
corr_rules_dir[1024] = { 0 },
|
||||
webserv_dir[1024] = { 0 },
|
||||
webserv_banner[1024] = { 0 };
|
||||
|
||||
char **matches = NULL;
|
||||
int nmatches = 0;
|
||||
|
@ -217,6 +217,7 @@ static AI_config * AI_parse(char *args)
|
|||
correlation_graph_interval = 0,
|
||||
database_parsing_interval = 0,
|
||||
manual_correlations_parsing_interval = 0,
|
||||
neural_clustering_interval = 0,
|
||||
neural_network_training_interval = 0,
|
||||
neural_train_steps = 0,
|
||||
output_neurons_per_side = 0,
|
||||
|
@ -526,6 +527,27 @@ static AI_config * AI_parse(char *args)
|
|||
config->neuralNetworkTrainingInterval = neural_network_training_interval;
|
||||
_dpd.logMsg( " Neural network training interval: %u\n", config->neuralNetworkTrainingInterval );
|
||||
|
||||
/* Parsing the neural_clustering_interval option */
|
||||
if (( arg = (char*) strcasestr( args, "neural_clustering_interval" ) ))
|
||||
{
|
||||
for ( arg += strlen("neural_clustering_interval");
|
||||
*arg && (*arg < '0' || *arg > '9');
|
||||
arg++ );
|
||||
|
||||
if ( !(*arg) )
|
||||
{
|
||||
AI_fatal_err ( "neural_clustering_interval option used but "
|
||||
"no value specified", __FILE__, __LINE__ );
|
||||
}
|
||||
|
||||
neural_clustering_interval = strtoul ( arg, NULL, 10 );
|
||||
} else {
|
||||
neural_clustering_interval = DEFAULT_NEURAL_CLUSTERING_INTERVAL;
|
||||
}
|
||||
|
||||
config->neuralClusteringInterval = neural_clustering_interval;
|
||||
_dpd.logMsg( " Neural network clustering interval: %u\n", config->neuralClusteringInterval );
|
||||
|
||||
/* Parsing the output_neurons_per_side option */
|
||||
if (( arg = (char*) strcasestr( args, "output_neurons_per_side" ) ))
|
||||
{
|
||||
|
@ -796,6 +818,9 @@ static AI_config * AI_parse(char *args)
|
|||
|
||||
_dpd.logMsg(" webserv_dir: %s\n", config->webserv_dir);
|
||||
|
||||
snprintf ( config->neural_clusters_log, sizeof ( config->neural_clusters_log ), "%s/neural_clusters.xml", config->webserv_dir );
|
||||
_dpd.logMsg(" neural_clusters_log: %s\n", config->neural_clusters_log);
|
||||
|
||||
/* Parsing the corr_modules_dir option */
|
||||
if (( arg = (char*) strcasestr( args, "corr_modules_dir" ) ))
|
||||
{
|
||||
|
|
43
spp_ai.h
43
spp_ai.h
|
@ -81,6 +81,11 @@
|
|||
* alert correlations and the next one (this value should usually be high) */
|
||||
#define DEFAULT_NEURAL_NETWORK_TRAINING_INTERVAL 43200
|
||||
|
||||
/** Default interval in seconds between an execution of the thread that attempts to cluster
|
||||
* the output layer of the neural network searching for alerts belonging to the same
|
||||
* attack scenario and the next one */
|
||||
#define DEFAULT_NEURAL_CLUSTERING_INTERVAL 1200
|
||||
|
||||
/** Default interval of validity in seconds for an entry in the cache of correlated alerts */
|
||||
#define DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 600
|
||||
|
||||
|
@ -193,6 +198,11 @@ typedef struct
|
|||
/** Interval in seconds between an invocation of the thread for parsing XML manual correlations and the next one */
|
||||
unsigned long manualCorrelationsParsingInterval;
|
||||
|
||||
/** Interval in seconds between an execution of the thread that attempts to cluster
|
||||
* the output layer of the neural network searching for alerts belonging to the same
|
||||
* attack scenario and the next one */
|
||||
unsigned long neuralClusteringInterval;
|
||||
|
||||
/** Interval in seconds for which an entry in the cache of correlated alerts is valid */
|
||||
unsigned long bayesianCorrelationCacheValidity;
|
||||
|
||||
|
@ -256,6 +266,9 @@ typedef struct
|
|||
/** File keeping the serialized neural network used for the alert correlation */
|
||||
char netfile[1024];
|
||||
|
||||
/** File containing the likely clusters computed over the output layer of the neural network */
|
||||
char neural_clusters_log[1024];
|
||||
|
||||
/** Database name, if database logging is used */
|
||||
char dbname[256];
|
||||
|
||||
|
@ -451,6 +464,34 @@ typedef struct {
|
|||
UT_hash_handle hh;
|
||||
} AI_alert_correlation;
|
||||
/*****************************************************************/
|
||||
/** Expresses an alert as a numerical tuple manageable by a neural network.
 * Two tuples are treated as the same alert by the neural clusters XML writer
 * only when every field below matches. */
|
||||
typedef struct {
|
||||
/** Written as the "gid" attribute of an alert in the neural clusters XML log */
unsigned int gid;
|
||||
/** Written as the "sid" attribute of an alert in the neural clusters XML log */
unsigned int sid;
|
||||
/** Written as the "rev" attribute of an alert in the neural clusters XML log */
unsigned int rev;
|
||||
/** Source IPv4 address; the XML writer applies htonl to it before calling
 * inet_ntop, so it is presumably kept in host byte order - TODO confirm */
uint32_t src_ip_addr;
|
||||
/** Destination IPv4 address, handled the same way as src_ip_addr */
uint32_t dst_ip_addr;
|
||||
/** Source port, printed as a plain decimal number by the XML writer */
uint16_t src_port;
|
||||
/** Destination port, printed as a plain decimal number by the XML writer */
uint16_t dst_port;
|
||||
/** Time associated to the alert */
time_t timestamp;
|
||||
} AI_som_alert_tuple;
|
||||
/*****************************************************************/
|
||||
/** Key for the AI_alerts_per_neuron hash table. The whole struct
 * (sizeof the key) is used as the lookup key by the clustering code. */
|
||||
typedef struct {
|
||||
/** First coordinate of the neuron; logged as "xcoord" in the clusters XML file */
int x;
|
||||
/** Second coordinate of the neuron; logged as "ycoord" in the clusters XML file */
int y;
|
||||
} AI_alerts_per_neuron_key;
|
||||
/*****************************************************************/
|
||||
/** Struct that holds, for each point of the output layer, the list of associated alerts
|
||||
* for easily performing the clustering algorithm */
|
||||
typedef struct {
|
||||
/** (x, y) coordinates of the neuron, used as the hash table key */
AI_alerts_per_neuron_key key;
|
||||
/** Array of the alerts associated to this neuron; the clustering code reads
 * indexes 0 .. n_alerts-1 */
AI_som_alert_tuple *alerts;
|
||||
/** Number of entries in alerts; neurons with a value of 0 or less are
 * left out of the k-means dataset */
int n_alerts;
|
||||
/** Hash handle; the clustering thread walks the table through hh.next */
UT_hash_handle hh;
|
||||
} AI_alerts_per_neuron;
|
||||
/*****************************************************************/
|
||||
|
||||
|
||||
/** Enumeration for describing the table in the output database */
|
||||
enum { ALERTS_TABLE, IPV4_HEADERS_TABLE, TCP_HEADERS_TABLE, PACKET_STREAMS_TABLE, CLUSTERED_ALERTS_TABLE, CORRELATED_ALERTS_TABLE, N_TABLES };
|
||||
|
@ -513,6 +554,8 @@ void AI_outdb_mutex_initialize ();
|
|||
void* AI_store_alert_to_db_thread ( void* );
|
||||
void* AI_store_cluster_to_db_thread ( void* );
|
||||
void* AI_store_correlation_to_db_thread ( void* );
|
||||
void* AI_neural_clustering_thread ( void* );
|
||||
AI_alerts_per_neuron* AI_get_alerts_per_neuron ();
|
||||
|
||||
double(**AI_get_corr_functions ( size_t* ))(const AI_snort_alert*, const AI_snort_alert*);
|
||||
double(**AI_get_corr_weights ( size_t* ))();
|
||||
|
|
Loading…
Reference in a new issue