Bayesian correlation now working

This commit is contained in:
BlackLight 2010-09-23 21:57:20 +02:00
parent 0ac6af9921
commit d7e0b426f4
7 changed files with 324 additions and 54 deletions

14
README
View file

@ -152,6 +152,8 @@ preprocessor ai: \
alert_serialization_interval 3600 \
alert_bufsize 30 \
alert_clustering_interval 300 \
bayesian_correlation_interval 1200 \
bayesian_correlation_cache_validity 600 \
correlation_graph_interval 300 \
correlation_rules_dir "/your/snort/dir/etc/corr_rules" \
correlated_alerts_dir "/your/snort/dir/log/correlated_alerts" \
@ -200,6 +202,18 @@ not specified: 30)
of the alerts in the log according to the provided clustering hierarchies and
the next one (default if not specified: 300 seconds)
- bayesian_correlation_interval: Maximum interval, in seconds, between two alerts
in the history for them to be considered, more or less strongly, correlated
(default: 1200 seconds). NOTE: A value of 0 disables the bayesian correlation.
Setting it to 0 is strongly suggested while your alert log is still "learning",
i.e. when you don't have enough alerts yet. After this period, you can set the
correlation interval to any value.
- bayesian_correlation_cache_validity: interval, in seconds, for which an entry
in the bayesian correlation hash table (i.e. a pair of alerts with the
associated historical bayesian correlation) is considered as valid
before being updated (default: 600 seconds)
- correlation_graph_interval: Interval, in seconds, between one build of the
correlation graph of the clustered alerts and the next one (default if not
specified: 300 seconds)

6
TODO
View file

@ -2,16 +2,14 @@
AVERAGE/HIGH PRIORITY:
======================
- Add alerts' history serialization to db.c as well
- Testing more scenarios, making more hyperalert models
- Bayesian learning among alerts in alert log
- libgc support
=============
LOW PRIORITY:
=============
- Managing clusters for addresses, timestamps (and more?)
- libgc support
=====
DONE:
@ -22,4 +20,6 @@ DONE:
+ Managing hyperalert graph connection inside the alert structure itself
+ Keeping track of all the streams and alerts even after clustered
+ Dynamic cluster_min_size algorithm
+ Add alerts' history serialization to db.c as well
+ Bayesian learning among alerts in alert log

View file

@ -21,26 +21,14 @@
#include <sys/stat.h>
/** Key of an entry in the alert history hash table: the (gid, sid, rev) triple of a Snort alert */
typedef struct {
int gid;
int sid;
int rev;
} AI_alert_event_key;
/** Entry of the alert history hash table (see AI_get_alert_events_by_key) */
typedef struct _AI_alert_event {
/** Lookup key (gid, sid, rev) of this entry */
AI_alert_event_key key;
/** Number of alerts counted for this entry; AI_get_history_alert_number sums this field over the table */
unsigned int count;
/** Timestamp of the event */
time_t timestamp;
/** Next event in the list linked to this entry (presumably the events sharing the same key - verify against the deserializer) */
struct _AI_alert_event *next;
/** uthash handle, makes the struct hashable */
UT_hash_handle hh;
} AI_alert_event;
/** \defgroup alert_history Manage the serialization and deserialization of alert history to the history file
* @{ */
PRIVATE AI_alert_event *alerts_hash = NULL;
/**
* FUNCTION: AI_alerts_hash_free
* \brief Free a hash table of alert events
* \param events Hash table to be freed
*/
@ -237,3 +225,44 @@ AI_serialize_alerts ( AI_snort_alert **alerts_pool, unsigned int alerts_pool_cou
fclose ( fp );
} /* ----- end of function AI_serialize_alerts ----- */
/**
 * \brief  Look up, in the alert history hash table, the events recorded for a given type of alert
 * \param  key 	Snort ID (gid, sid, rev) of the alert to search for
 * \return The entry of the history associated to that key, or NULL if the lookup finds none
 */
const AI_alert_event*
AI_get_alert_events_by_key ( AI_alert_event_key key )
{
	AI_alert_event *match = NULL;

	/* The whole key structure is used as hash key */
	HASH_FIND ( hh, alerts_hash, &key, sizeof ( AI_alert_event_key ), match );

	return match;
}		/* -----  end of function AI_get_alert_events_by_key  ----- */
/**
 * \brief  Count the alerts stored in the alert history
 * \return The sum of the 'count' fields of all the entries in the history hash table,
 *         i.e. the number of single alerts (not of alert types)
 */
unsigned int
AI_get_history_alert_number ()
{
	unsigned int    total = 0;
	AI_alert_event *entry = NULL;

	/* Load the history first if the hash table is still empty */
	if ( alerts_hash == NULL )
		AI_deserialize_alerts();

	entry = alerts_hash;

	while ( entry != NULL )
	{
		total += entry->count;
		entry = ( AI_alert_event* ) entry->hh.next;
	}

	return total;
}		/* -----  end of function AI_get_history_alert_number  ----- */
/* @} */

View file

@ -30,6 +30,8 @@
#include <sys/stat.h>
#include <pthread.h>
/** \defgroup alert_parser Parse the alert log into binary structures
* @{ */
PRIVATE AI_snort_alert *alerts = NULL;
PRIVATE FILE *alert_fp = NULL;
@ -40,10 +42,6 @@ AI_snort_alert **alerts_pool = NULL;
unsigned int alerts_pool_count = 0;
/** \defgroup alert_parser Parse the alert log into binary structures
* @{ */
/**
* \brief Serialize the pool of alerts in a separated thread
* \param arg void* pointer to the alert to be added to the pool, if any

View file

@ -66,10 +66,40 @@ typedef struct {
UT_hash_handle hh;
} AI_alert_correlation;
PRIVATE AI_hyperalert_info *hyperalerts = NULL;
PRIVATE AI_snort_alert *alerts = NULL;
PRIVATE AI_alert_correlation *correlation_table = NULL;
PRIVATE pthread_mutex_t mutex;
/** Key for the bayesian correlation table: the ordered pair (a, b) of alert types whose correlation a -> b is stored */
typedef struct {
/** Snort ID of the first alert */
AI_alert_event_key a;
/** Snort ID of the second alert */
AI_alert_event_key b;
} AI_bayesian_correlation_key;
/** Entry of the bayesian alert correlation hash table, caching the correlation computed for a pair of alert types */
typedef struct {
/** Key for the hash table */
AI_bayesian_correlation_key key;
/** Correlation value */
double correlation;
/** Timestamp of the last acquired correlation value, compared against the configured cache validity to decide whether the value must be recomputed */
time_t latest_computation_time;
/** Make the struct 'hashable' */
UT_hash_handle hh;
} AI_bayesian_correlation;
/** Cache of the bayesian correlation values, looked up by AI_bayesian_correlation_key */
PRIVATE AI_bayesian_correlation *bayesian_cache = NULL;
PRIVATE AI_hyperalert_info *hyperalerts = NULL;
PRIVATE AI_snort_alert *alerts = NULL;
/** Hash table of the pairs of alerts with their correlation coefficient */
PRIVATE AI_alert_correlation *correlation_table = NULL;
/** Denominator k of the decay function exp(-(ta - tb)^2 / k); stays 0.0 until it is computed on first use */
PRIVATE double k_exp_value = 0.0;
PRIVATE pthread_mutex_t mutex;
/**
* \brief Clean up the correlation hash table
@ -92,11 +122,10 @@ _AI_correlation_table_cleanup ()
* \brief Recursively write a flow of correlated alerts to a .dot file, ready for being rendered as graph
* \param corr Correlated alerts
* \param fp File pointer
* \param strong Boolean value set if the correlation between the alerts is 'strong' (greater than avg + 2*k*deviation)
*/
PRIVATE void
_AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong )
_AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp )
{
char src_addr1[INET_ADDRSTRLEN],
dst_addr1[INET_ADDRSTRLEN],
@ -141,7 +170,7 @@ _AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong
"\"[%d.%d.%d] %s\\n"
"%s:%s -> %s:%s\\n"
"%s\\n"
"(%d alerts grouped)\"%s;\n",
"(%d alerts grouped)\";\n",
corr->key.a->gid, corr->key.a->sid, corr->key.a->rev, corr->key.a->desc,
src_addr1, src_port1, dst_addr1, dst_port1,
@ -151,8 +180,7 @@ _AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong
corr->key.b->gid, corr->key.b->sid, corr->key.b->rev, corr->key.b->desc,
src_addr2, src_port2, dst_addr2, dst_port2,
timestamp2,
corr->key.b->grouped_alerts_count,
strong ? "" : "[style=dotted]"
corr->key.b->grouped_alerts_count
);
} /* ----- end of function _AI_correlation_flow_to_file ----- */
@ -233,14 +261,125 @@ _AI_get_function_arguments ( char *orig_stmt, int *n_args )
} /* ----- end of function _AI_get_function_arguments ----- */
/**
* \brief Compute the correlation coefficient between two alerts, as #INTERSECTION(pre(B), post(A) / #UNION(pre(B), post(A))
* \brief Function used for computing the correlation probability A->B of two alerts (A,B) given their timestamps: f(ta, tb) = exp ( -(tb - ta)^2 / k )
* \param ta Timestamp of A
* \param tb Timestamp of B
* \return The correlation probability A->B
*/
PRIVATE double
_AI_bayesian_correlation_function ( time_t ta, time_t tb )
{
	/* Work in floating point: squaring an integer time_t difference can overflow */
	double interval = (double) config->bayesianCorrelationInterval;
	double delta    = difftime ( ta, tb );

	/* An interval of 0 means that the bayesian correlation is disabled. It would also
	 * make k = 0, turning the quotient below into a division by zero (NaN for ta == tb) */
	if ( interval == 0.0 )
		return 0.0;

	/* k is chosen so that f = CUTOFF_Y_VALUE when |ta - tb| equals the configured
	 * interval. It is computed on the first call and then reused */
	if ( k_exp_value == 0.0 )
		k_exp_value = - ( interval * interval ) / log ( CUTOFF_Y_VALUE );

	return exp ( - ( delta * delta ) / k_exp_value );
}		/* -----  end of function _AI_bayesian_correlation_function  ----- */
/**
 * \brief  Compute the historical (bayesian) correlation between two alerts, A -> B, using the
 *         events saved in the alert history: the average of the decay function over the pairs
 *         of events (A,B) closer than bayesian_correlation_interval, minus the fraction of
 *         events of type A that have no event of type B within that interval.
 *         Computed values are cached for bayesian_correlation_cache_validity seconds.
 * \param  a 	First alert
 * \param  b 	Second alert
 * \return A real coefficient representing the historical correlation A -> B. 0.0 is returned when
 *         an argument is NULL, when the bayesian correlation is disabled (interval set to 0),
 *         when one of the alerts has no events in the history, or when no pair of events
 *         falls within the interval
 */
PRIVATE double
_AI_alert_bayesian_correlation ( AI_snort_alert *a, AI_snort_alert *b )
{
	double                      corr            = 0.0;
	double                      interval        = 0.0;
	unsigned int                corr_count      = 0,
	                            corr_count_a    = 0;
	BOOL                        is_a_correlated = false;
	time_t                      now;

	AI_bayesian_correlation_key bayesian_key;
	AI_bayesian_correlation     *found          = NULL;

	AI_alert_event_key          key_a,
	                            key_b;

	AI_alert_event              *events_a       = NULL,
	                            *events_b       = NULL;

	AI_alert_event              *events_iterator_a,
	                            *events_iterator_b;

	if ( !a || !b )
		return 0.0;

	/* A correlation interval of 0 disables the bayesian correlation */
	if ( config->bayesianCorrelationInterval == 0 )
		return 0.0;

	interval = (double) config->bayesianCorrelationInterval;

	key_a.gid = a->gid;
	key_a.sid = a->sid;
	key_a.rev = a->rev;

	key_b.gid = b->gid;
	key_b.sid = b->sid;
	key_b.rev = b->rev;

	/* Check if this correlation value is already in our cache */
	bayesian_key.a = key_a;
	bayesian_key.b = key_b;
	HASH_FIND ( hh, bayesian_cache, &bayesian_key, sizeof ( bayesian_key ), found );

	now = time ( NULL );

	/* A cached value is used only while it is still valid. The absolute value is
	 * not needed until the time starts running backwards, but it's better going safe */
	if ( found && fabs ( difftime ( now, found->latest_computation_time )) <= (double) config->bayesianCorrelationCacheValidity )
		return found->correlation;

	if ( !( events_a = (AI_alert_event*) AI_get_alert_events_by_key ( key_a )) ||
			!( events_b = (AI_alert_event*) AI_get_alert_events_by_key ( key_b )))
		return 0.0;

	for ( events_iterator_a = events_a; events_iterator_a; events_iterator_a = events_iterator_a->next )
	{
		is_a_correlated = false;

		for ( events_iterator_b = events_b; events_iterator_b; events_iterator_b = events_iterator_b->next )
		{
			if ( fabs ( difftime ( events_iterator_a->timestamp, events_iterator_b->timestamp )) <= interval )
			{
				is_a_correlated = true;
				corr_count++;
				corr += _AI_bayesian_correlation_function ( events_iterator_a->timestamp, events_iterator_b->timestamp );
			}
		}

		if ( is_a_correlated )
			corr_count_a++;
	}

	if ( corr_count > 0 )
	{
		corr /= (double) corr_count;

		/* Penalty: fraction of the events of type A without any close event of type B.
		 * It must be a floating point division (an integer one only yields 0 or 1), and
		 * the guard avoids both a division by zero and an unsigned wrap-around */
		if ( events_a->count > corr_count_a )
			corr -= (double) ( events_a->count - corr_count_a ) / (double) events_a->count;
	} else {
		/* No pair of events within the interval: no historical evidence (and no 0/0 = NaN) */
		corr = 0.0;
	}

	if ( !found )
	{
		if ( !( found = ( AI_bayesian_correlation* ) malloc ( sizeof ( AI_bayesian_correlation ))))
			_dpd.fatalMsg ( "AIPreproc: Fatal dynamic memory allocation error at %s:%d\n", __FILE__, __LINE__ );

		/* A new entry must be linked into the cache, otherwise it is leaked and never found again */
		found->key = bayesian_key;
		HASH_ADD ( hh, bayesian_cache, key, sizeof ( AI_bayesian_correlation_key ), found );
	}

	found->correlation = corr;
	found->latest_computation_time = now;

	return corr;
}		/* -----  end of function _AI_alert_bayesian_correlation  ----- */
/**
* \brief Compute the correlation coefficient between two alerts, as #INTERSECTION(pre(B), post(A)) / #UNION(pre(B), post(A)), on the basis of preconditions and postconditions in the knowledge base's correlation rules
* \param a Alert a
* \param b Alert b
* \return The correlation coefficient between A and B as coefficient in [0,1]
*/
PRIVATE double
_AI_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b )
_AI_kb_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b )
{
unsigned int i, j, k, l,
n_intersection = 0,
@ -444,7 +583,7 @@ _AI_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b )
}
return (double) ((double) n_intersection / (double) n_union );
} /* ----- end of function _AI_correlation_coefficient ----- */
} /* ----- end of function _AI_kb_correlation_coefficient ----- */
/**
@ -691,7 +830,8 @@ AI_alert_correlation_thread ( void *arg )
double avg_correlation = 0.0,
std_deviation = 0.0,
corr_threshold = 0.0,
corr_strong_threshold = 0.0;
kb_correlation = 0.0,
bayesian_correlation = 0.0;
FILE *fp = NULL;
@ -800,7 +940,16 @@ AI_alert_correlation_thread ( void *arg )
corr_key.b = alert_iterator2;
corr->key = corr_key;
corr->correlation = _AI_correlation_coefficient ( corr_key.a, corr_key.b );
kb_correlation = _AI_kb_correlation_coefficient ( corr_key.a, corr_key.b );
bayesian_correlation = _AI_alert_bayesian_correlation ( corr_key.a, corr_key.b );
if ( bayesian_correlation == 0.0 || config->bayesianCorrelationInterval == 0 )
corr->correlation = kb_correlation;
else if ( kb_correlation == 0.0 )
corr->correlation = bayesian_correlation;
else
corr->correlation = ( kb_correlation + bayesian_correlation ) / 2;
HASH_ADD ( hh, correlation_table, key, sizeof ( AI_alert_correlation_key ), corr );
}
}
@ -827,7 +976,6 @@ AI_alert_correlation_thread ( void *arg )
std_deviation = sqrt ( std_deviation / (double) HASH_COUNT ( correlation_table ));
corr_threshold = avg_correlation + ( config->correlationThresholdCoefficient * std_deviation );
corr_strong_threshold = avg_correlation + ( 2.0 * config->correlationThresholdCoefficient * std_deviation );
snprintf ( corr_dot_file, sizeof ( corr_dot_file ), "%s/correlated_alerts.dot", config->corr_alerts_dir );
if ( stat ( config->corr_alerts_dir, &st ) < 0 )
@ -862,7 +1010,7 @@ AI_alert_correlation_thread ( void *arg )
corr->key.a->derived_alerts[ corr->key.a->n_derived_alerts - 1 ] = corr->key.b;
corr->key.b->parent_alerts [ corr->key.b->n_parent_alerts - 1 ] = corr->key.a;
_AI_print_correlated_alerts ( corr, fp, ( corr->correlation >= corr_strong_threshold ));
_AI_print_correlated_alerts ( corr, fp );
}
}

View file

@ -157,18 +157,20 @@ static AI_config * AI_parse(char *args)
hierarchy_node **hierarchy_nodes = NULL;
int n_hierarchy_nodes = 0;
unsigned long cleanup_interval = 0,
stream_expire_interval = 0,
alertfile_len = 0,
alert_history_file_len = 0,
alert_serialization_interval = 0,
alert_bufsize = 0,
clusterfile_len = 0,
corr_rules_dir_len = 0,
corr_alerts_dir_len = 0,
alert_clustering_interval = 0,
database_parsing_interval = 0,
correlation_graph_interval = 0;
unsigned long cleanup_interval = 0,
stream_expire_interval = 0,
alertfile_len = 0,
alert_history_file_len = 0,
alert_serialization_interval = 0,
alert_bufsize = 0,
bayesian_correlation_interval = 0,
bayesian_correlation_cache_validity = 0,
clusterfile_len = 0,
corr_rules_dir_len = 0,
corr_alerts_dir_len = 0,
alert_clustering_interval = 0,
database_parsing_interval = 0,
correlation_graph_interval = 0;
BOOL has_cleanup_interval = false,
has_stream_expire_interval = false,
@ -336,11 +338,56 @@ static AI_config * AI_parse(char *args)
}
corr_threshold_coefficient = strtod ( arg, NULL );
_dpd.logMsg( " Correlation threshold coefficient: %d\n", corr_threshold_coefficient );
_dpd.logMsg( " Correlation threshold coefficient: %f\n", corr_threshold_coefficient );
}
config->correlationThresholdCoefficient = corr_threshold_coefficient;
/* Parsing the bayesian_correlation_interval option */
if (( arg = (char*) strcasestr( args, "bayesian_correlation_interval" ) ))
{
for ( arg += strlen("bayesian_correlation_interval");
*arg && (*arg < '0' || *arg > '9');
arg++ );
if ( !(*arg) )
{
_dpd.fatalMsg("AIPreproc: bayesian_correlation_interval option used but "
"no value specified\n");
}
bayesian_correlation_interval = strtoul ( arg, NULL, 10 );
config->bayesianCorrelationInterval = bayesian_correlation_interval;
} else {
bayesian_correlation_interval = DEFAULT_BAYESIAN_CORRELATION_INTERVAL;
}
config->bayesianCorrelationInterval = bayesian_correlation_interval;
_dpd.logMsg( " Bayesian correlation interval: %u\n", config->bayesianCorrelationInterval );
/* Parsing the bayesian_correlation_cache_validity option */
if (( arg = (char*) strcasestr( args, "bayesian_correlation_cache_validity" ) ))
{
for ( arg += strlen("bayesian_correlation_cache_validity");
*arg && (*arg < '0' || *arg > '9');
arg++ );
if ( !(*arg) )
{
_dpd.fatalMsg("AIPreproc: bayesian_correlation_cache_validity option used but "
"no value specified\n");
}
bayesian_correlation_cache_validity = strtoul ( arg, NULL, 10 );
config->bayesianCorrelationCacheValidity = bayesian_correlation_cache_validity;
} else {
bayesian_correlation_cache_validity = DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY;
}
config->bayesianCorrelationCacheValidity = bayesian_correlation_cache_validity;
_dpd.logMsg( " Bayesian cache validity interval: %u\n", config->bayesianCorrelationCacheValidity );
/* Parsing the alertfile option */
if (( arg = (char*) strcasestr( args, "alertfile" ) ))
{

View file

@ -69,6 +69,15 @@
/** Default timeout in seconds between a serialization of the alerts' buffer and the next one */
#define DEFAULT_ALERT_SERIALIZATION_INTERVAL 3600
/** Default interval between two alerts (a,b) for considering them correlated */
#define DEFAULT_BAYESIAN_CORRELATION_INTERVAL 1200
/** Default interval of validity in seconds for an entry in the cache of correlated alerts */
#define DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 600
/** Cutoff y value in the exponential decay for considering two alerts not correlated */
#define CUTOFF_Y_VALUE 0.01
/****************************/
/* Database support */
#ifdef HAVE_LIBMYSQLCLIENT
@ -143,6 +152,12 @@ typedef struct
/** Interval in seconds between a serialization of the alerts' buffer and the next one */
unsigned long alertSerializationInterval;
/** Interval in seconds between two alerts (a,b) for considering them correlated */
unsigned long bayesianCorrelationInterval;
/** Interval in seconds for which an entry in the cache of correlated alerts is valid */
unsigned long bayesianCorrelationCacheValidity;
/** Size of the alerts' buffer to be periodically sent to the serialization thread */
unsigned long alert_bufsize;
@ -299,6 +314,23 @@ typedef struct _AI_snort_alert {
unsigned int n_derived_alerts;
} AI_snort_alert;
/*****************************************************************/
/** Key for the AI_alert_event structure, containing the Snort ID of the alert as the triple (gid, sid, rev).
 *  The whole structure is used as hash key in the alert history and is embedded in the bayesian correlation key */
typedef struct {
int gid;
int sid;
int rev;
} AI_alert_event_key;
/*****************************************************************/
/** Structure representing the historical information of an alert saved in alert_history */
typedef struct _AI_alert_event {
/** Lookup key (gid, sid, rev) of this entry */
AI_alert_event_key key;
/** Number of alerts counted for this entry; AI_get_history_alert_number sums this field over the table */
unsigned int count;
/** Timestamp of the event */
time_t timestamp;
/** Next event in the list linked to this entry (presumably the events sharing the same key - verify against the deserializer) */
struct _AI_alert_event *next;
/** uthash handle, makes the struct hashable */
UT_hash_handle hh;
} AI_alert_event;
/*****************************************************************/
int preg_match ( const char*, char*, char***, int* );
char* str_replace ( char*, char*, char *);
@ -323,10 +355,12 @@ struct pkt_info* AI_get_stream_by_key ( struct pkt_key );
AI_snort_alert* AI_get_alerts ( void );
AI_snort_alert* AI_get_clustered_alerts ( void );
void AI_serialize_alerts ( AI_snort_alert**, unsigned int );
void* AI_deserialize_alerts ();
void* AI_alerts_pool_thread ( void *arg );
void* AI_serializer_thread ( void *arg );
void AI_serialize_alerts ( AI_snort_alert**, unsigned int );
void* AI_deserialize_alerts ();
void* AI_alerts_pool_thread ( void *arg );
void* AI_serializer_thread ( void *arg );
const AI_alert_event* AI_get_alert_events_by_key ( AI_alert_event_key );
unsigned int AI_get_history_alert_number ();
/** Function pointer to the function used for getting the alert list (from log file, db, ...) */
extern AI_snort_alert* (*get_alerts)(void);