From d7e0b426f41ff50fc956d334f76f5960fa005b28 Mon Sep 17 00:00:00 2001 From: BlackLight Date: Thu, 23 Sep 2010 21:57:20 +0200 Subject: [PATCH] Bayesian correlation now working --- README | 14 ++++ TODO | 6 +- alert_history.c | 57 +++++++++++---- alert_parser.c | 6 +- correlation.c | 180 +++++++++++++++++++++++++++++++++++++++++++----- spp_ai.c | 73 ++++++++++++++++---- spp_ai.h | 42 +++++++++-- 7 files changed, 324 insertions(+), 54 deletions(-) diff --git a/README b/README index 40fed34..b8053f8 100644 --- a/README +++ b/README @@ -152,6 +152,8 @@ preprocessor ai: \ alert_serialization_interval 3600 \ alert_bufsize 30 \ alert_clustering_interval 300 \ + bayesian_correlation_interval 1200 \ + bayesian_correlation_cache_validity 600 \ correlation_graph_interval 300 \ correlation_rules_dir "/your/snort/dir/etc/corr_rules" \ correlated_alerts_dir "/your/snort/dir/log/correlated_alerts" \ @@ -200,6 +202,18 @@ not specified: 30) of the alerts in the log according to the provided clustering hierarchies and the next one (default if not specified: 300 seconds) +- bayesian_correlation_interval: Maximum interval, in seconds, between two +alerts in the history for them to be considered, more or less strongly, +correlated (default: 1200 seconds). NOTE: A value of 0 will disable the bayesian +correlation. Disabling it is strongly suggested while your alert log is still +"learning", i.e. while you don't have enough alerts yet. After this period, you +can set the correlation interval to any value. + +- bayesian_correlation_cache_validity: Interval, in seconds, for which an entry +in the bayesian correlation hash table (i.e. 
a pair of alerts with the + associated historical bayesian correlation) is considered as valid +before being updated (default: 600 seconds) + - correlation_graph_interval: The interval that should occur from the building of the correlation graph between the clustered alerts and the next one (default if not specified: 300 seconds) diff --git a/TODO b/TODO index 55cdf98..b8316ce 100644 --- a/TODO +++ b/TODO @@ -2,16 +2,14 @@ AVERAGE/HIGH PRIORITY: ====================== -- Add alerts' history serialization to db.c as well - Testing more scenarios, making more hyperalert models -- Bayesian learning among alerts in alert log -- libgc support ============= LOW PRIORITY: ============= - Managing clusters for addresses, timestamps (and more?) +- libgc support ===== DONE: @@ -22,4 +20,6 @@ DONE: + Managing hyperalert graph connection inside the alert structure itself + Keeping track of all the streams and alerts even after clustered + Dynamic cluster_min_size algorithm ++ Add alerts' history serialization to db.c as well ++ Bayesian learning among alerts in alert log diff --git a/alert_history.c b/alert_history.c index c7ea7fb..a7af208 100644 --- a/alert_history.c +++ b/alert_history.c @@ -21,26 +21,14 @@ #include -typedef struct { - int gid; - int sid; - int rev; -} AI_alert_event_key; - -typedef struct _AI_alert_event { - AI_alert_event_key key; - unsigned int count; - time_t timestamp; - struct _AI_alert_event *next; - UT_hash_handle hh; -} AI_alert_event; +/** \defgroup alert_history Manage the serialization and deserialization of alert history to the history file + * @{ */ PRIVATE AI_alert_event *alerts_hash = NULL; /** - * FUNCTION: AI_alerts_hash_free * \brief Free a hash table of alert events * \param events Hash table to be freed */ @@ -237,3 +225,44 @@ AI_serialize_alerts ( AI_snort_alert **alerts_pool, unsigned int alerts_pool_cou fclose ( fp ); } /* ----- end of function AI_serialize_alerts ----- */ +/** + * \brief Get the sequence of alerts saved in the 
history file given the ID of the alert + * \param key Key representing the Snort ID of the alert + * \return The flow of events of that type of alert saved in the history, or NULL if no entry with that key is in the hash table. NOTE(review): unlike AI_get_history_alert_number(), this lookup does not call AI_deserialize_alerts() when the hash table is empty - confirm that the history is loaded before it is used + */ + +const AI_alert_event* +AI_get_alert_events_by_key ( AI_alert_event_key key ) +{ + AI_alert_event *found = NULL; + HASH_FIND ( hh, alerts_hash, &key, sizeof ( key ), found ); + return found; +} /* ----- end of function AI_get_alert_events_by_key ----- */ + + +/** + * \brief Get the number of alerts saved in the history file. AI_deserialize_alerts() is called first when the hash table is still empty + * \return The number of single alerts (not alert types) saved in the history file, i.e. the sum of the count field over all the entries of the hash table + */ + +unsigned int +AI_get_history_alert_number () +{ + unsigned int alert_count = 0; + AI_alert_event *event_iterator = NULL; + + if ( !alerts_hash ) + { + AI_deserialize_alerts(); + } + + for ( event_iterator = alerts_hash; event_iterator; event_iterator = ( AI_alert_event* ) event_iterator->hh.next ) + { + alert_count += event_iterator->count; + } + + return alert_count; +} /* ----- end of function AI_get_history_alert_number ----- */ + +/** @} */ + diff --git a/alert_parser.c b/alert_parser.c index 8c1d0c0..8a7cf68 100644 --- a/alert_parser.c +++ b/alert_parser.c @@ -30,6 +30,8 @@ #include #include +/** \defgroup alert_parser Parse the alert log into binary structures + * @{ */ PRIVATE AI_snort_alert *alerts = NULL; PRIVATE FILE *alert_fp = NULL; @@ -40,10 +42,6 @@ AI_snort_alert **alerts_pool = NULL; unsigned int alerts_pool_count = 0; -/** \defgroup alert_parser Parse the alert log into binary structures - * @{ */ - - /** * \brief Serialize the pool of alerts in a separated thread * \param arg void* pointer to the alert to be added to the pool, if any diff --git a/correlation.c b/correlation.c index 839828e..a0bd006 100644 --- a/correlation.c +++ b/correlation.c @@ -66,10 +66,40 @@ typedef struct { UT_hash_handle hh; } AI_alert_correlation; -PRIVATE AI_hyperalert_info *hyperalerts = NULL; -PRIVATE AI_snort_alert *alerts = NULL; -PRIVATE AI_alert_correlation *correlation_table = 
NULL; -PRIVATE pthread_mutex_t mutex; + +/** Key for the bayesian correlation table: the pair (a,b) of Snort IDs of the two alerts */ +typedef struct { + /** Snort ID of the first alert */ + AI_alert_event_key a; + + /** Snort ID of the second alert */ + AI_alert_event_key b; +} AI_bayesian_correlation_key; + + +/** Entry of the bayesian alert correlation hash table (see bayesian_cache) */ +typedef struct { + /** Key for the hash table */ + AI_bayesian_correlation_key key; + + /** Correlation value computed for the pair of alerts in the key */ + double correlation; + + /** Timestamp of the last acquired correlation value, compared against bayesianCorrelationCacheValidity to decide whether the entry can still be returned as it is */ + time_t latest_computation_time; + + /** Make the struct 'hashable' */ + UT_hash_handle hh; +} AI_bayesian_correlation; + + +PRIVATE AI_bayesian_correlation *bayesian_cache = NULL; /* Hash table of bayesian correlation values, looked up by _AI_alert_bayesian_correlation */ +PRIVATE AI_hyperalert_info *hyperalerts = NULL; +PRIVATE AI_snort_alert *alerts = NULL; +PRIVATE AI_alert_correlation *correlation_table = NULL; +PRIVATE double k_exp_value = 0.0; /* k of the exponential decay, left at 0.0 until _AI_bayesian_correlation_function computes it */ +PRIVATE pthread_mutex_t mutex; + /** * \brief Clean up the correlation hash table @@ -92,11 +122,10 @@ _AI_correlation_table_cleanup () * \brief Recursively write a flow of correlated alerts to a .dot file, ready for being rendered as graph * \param corr Correlated alerts * \param fp File pointer - * \param strong Boolean value set if the correlation between the alerts is 'strong' (greater than avg + 2*k*deviation) */ PRIVATE void -_AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong ) +_AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp ) { char src_addr1[INET_ADDRSTRLEN], dst_addr1[INET_ADDRSTRLEN], @@ -141,7 +170,7 @@ _AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong "\"[%d.%d.%d] %s\\n" "%s:%s -> %s:%s\\n" "%s\\n" - "(%d alerts grouped)\"%s;\n", + "(%d alerts grouped)\";\n", corr->key.a->gid, corr->key.a->sid, corr->key.a->rev, corr->key.a->desc, src_addr1, src_port1, dst_addr1, dst_port1, @@ -151,8 +180,7 @@ _AI_print_correlated_alerts ( AI_alert_correlation *corr, FILE *fp, BOOL strong corr->key.b->gid, corr->key.b->sid, 
corr->key.b->rev, corr->key.b->desc, src_addr2, src_port2, dst_addr2, dst_port2, timestamp2, - corr->key.b->grouped_alerts_count, - strong ? "" : "[style=dotted]" + corr->key.b->grouped_alerts_count ); } /* ----- end of function _AI_correlation_flow_to_file ----- */ @@ -233,14 +261,133 @@ _AI_get_function_arguments ( char *orig_stmt, int *n_args ) } /* ----- end of function _AI_get_function_arguments ----- */ /** - * \brief Compute the correlation coefficient between two alerts, as #INTERSECTION(pre(B), post(A) / #UNION(pre(B), post(A)) + * \brief Function used for computing the correlation probability A->B of two alerts (A,B) given their timestamps: f(ta, tb) = exp ( -(tb - ta)^2 / k ), where k = -interval^2 / log(CUTOFF_Y_VALUE), so that f equals CUTOFF_Y_VALUE when the two timestamps are exactly bayesianCorrelationInterval seconds apart + * \param ta Timestamp of A + * \param tb Timestamp of B + * \return The correlation probability A->B. NOTE: must not be called when bayesianCorrelationInterval is 0, since k would be 0 as well + */ + +PRIVATE double +_AI_bayesian_correlation_function ( time_t ta, time_t tb ) +{ + if ( k_exp_value == 0.0 ) /* k only depends on the configuration: compute it at the first call only */ + k_exp_value = - ( (double) config->bayesianCorrelationInterval * (double) config->bayesianCorrelationInterval ) / log ( CUTOFF_Y_VALUE ); + + return exp ( -( difftime ( ta, tb ) * difftime ( ta, tb )) / k_exp_value ); +} /* ----- end of function _AI_bayesian_correlation_function ----- */ + +/** + * \brief Compute the correlation between two alerts, A -> B: p[A|B] = p[Corr(A,B)] / P[B] + * \param a First alert + * \param b Second alert + * \return A real coefficient representing p[A|B] using the historical information; 0.0 if an argument is NULL, if the bayesian correlation is disabled (interval set to 0) or if the history holds no occurrences of the two alerts within the correlation interval + */ + +PRIVATE double +_AI_alert_bayesian_correlation ( AI_snort_alert *a, AI_snort_alert *b ) +{ + double corr = 0.0; + unsigned int corr_count = 0, + corr_count_a = 0; + + BOOL is_a_correlated = false; + AI_bayesian_correlation_key bayesian_key; + AI_bayesian_correlation *found = NULL; + + AI_alert_event_key key_a, + key_b; + + AI_alert_event *events_a = NULL, + *events_b = NULL; + + AI_alert_event *events_iterator_a, + *events_iterator_b; + + if ( !a || !b || config->bayesianCorrelationInterval == 0 ) + return 0.0; + + key_a.gid = a->gid; + key_a.sid = a->sid; + key_a.rev = a->rev; + + key_b.gid = b->gid; + key_b.sid = 
b->sid; + key_b.rev = b->rev; + + /* Check if this correlation value is already in our cache */ + bayesian_key.a = key_a; + bayesian_key.b = key_b; + HASH_FIND ( hh, bayesian_cache, &bayesian_key, sizeof ( bayesian_key ), found ); + + if ( found ) + { + /* difftime() is used instead of abs() so that the time_t difference is not truncated to int. The absolute value is not needed until the time starts running backwards, but it's better going safe... */ + if ( fabs ( difftime ( time ( NULL ), found->latest_computation_time )) <= config->bayesianCorrelationCacheValidity ) + /* If our alert couple is there, just return it */ + return found->correlation; + } + + if ( !( events_a = (AI_alert_event*) AI_get_alert_events_by_key ( key_a )) || + !( events_b = (AI_alert_event*) AI_get_alert_events_by_key ( key_b ))) + return 0.0; + + for ( events_iterator_a = events_a; events_iterator_a; events_iterator_a = events_iterator_a->next ) + { + is_a_correlated = false; + + for ( events_iterator_b = events_b; events_iterator_b; events_iterator_b = events_iterator_b->next ) + { + if ( fabs ( difftime ( events_iterator_a->timestamp, events_iterator_b->timestamp )) <= config->bayesianCorrelationInterval ) + { + is_a_correlated = true; + corr_count++; + corr += _AI_bayesian_correlation_function ( events_iterator_a->timestamp, events_iterator_b->timestamp ); + } + } + + if ( is_a_correlated ) + corr_count_a++; + } + + /* Nothing correlated within the interval: report no correlation instead of computing 0/0 (NaN) */ + if ( corr_count == 0 || events_a->count == 0 ) + return 0.0; + + corr /= (double) corr_count; + corr -= ( (double) events_a->count - (double) corr_count_a ) / (double) events_a->count; + if ( corr < 0.0 ) /* NOTE(review): clamped so that the coefficient stays a probability - confirm this is the intended model */ + corr = 0.0; + /* _dpd.logMsg ( " Number of '%s' alerts correlated to '%s': %u over %u\\n", a->desc, b->desc, corr_count_a, events_a->count ); */ + + if ( found ) + { + found->correlation = corr; + found->latest_computation_time = time ( NULL ); + } else { + if ( !( found = ( AI_bayesian_correlation* ) malloc ( sizeof ( AI_bayesian_correlation )))) + _dpd.fatalMsg ( "AIPreproc: Fatal dynamic memory allocation error at %s:%d\n", __FILE__, __LINE__ ); + + found->key = bayesian_key; + found->correlation = corr; + found->latest_computation_time = time ( NULL ); + /* Store the new entry in the cache, otherwise it would never be found again and would leak */ + HASH_ADD ( hh, bayesian_cache, key, sizeof ( AI_bayesian_correlation_key ), found ); + } + + /* _dpd.logMsg ( "Correlation ('%s') -> ('%s'): 
%f\\n", a->desc, b->desc, corr ); */ + return corr; +} /* ----- end of function _AI_alert_bayesian_correlation ----- */ + + +/** + * \brief Compute the correlation coefficient between two alerts, as #INTERSECTION(pre(B), post(A)) / #UNION(pre(B), post(A)), on the basis of preconditions and postconditions in the knowledge base's correlation rules * \param a Alert a * \param b Alert b * \return The correlation coefficient between A and B as coefficient in [0,1] */ PRIVATE double -_AI_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b ) +_AI_kb_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b ) { unsigned int i, j, k, l, n_intersection = 0, @@ -444,7 +583,7 @@ _AI_correlation_coefficient ( AI_snort_alert *a, AI_snort_alert *b ) } return (double) ((double) n_intersection / (double) n_union ); -} /* ----- end of function _AI_correlation_coefficient ----- */ +} /* ----- end of function _AI_kb_correlation_coefficient ----- */ /** @@ -691,7 +830,8 @@ AI_alert_correlation_thread ( void *arg ) double avg_correlation = 0.0, std_deviation = 0.0, corr_threshold = 0.0, - corr_strong_threshold = 0.0; + kb_correlation = 0.0, + bayesian_correlation = 0.0; FILE *fp = NULL; @@ -800,7 +940,16 @@ AI_alert_correlation_thread ( void *arg ) corr_key.b = alert_iterator2; corr->key = corr_key; - corr->correlation = _AI_correlation_coefficient ( corr_key.a, corr_key.b ); + kb_correlation = _AI_kb_correlation_coefficient ( corr_key.a, corr_key.b ); + bayesian_correlation = _AI_alert_bayesian_correlation ( corr_key.a, corr_key.b ); + + if ( bayesian_correlation == 0.0 || config->bayesianCorrelationInterval == 0 ) + corr->correlation = kb_correlation; + else if ( kb_correlation == 0.0 ) + corr->correlation = bayesian_correlation; + else + corr->correlation = ( kb_correlation + bayesian_correlation ) / 2; + HASH_ADD ( hh, correlation_table, key, sizeof ( AI_alert_correlation_key ), corr ); } } @@ -827,7 +976,6 @@ AI_alert_correlation_thread ( void *arg ) std_deviation = 
sqrt ( std_deviation / (double) HASH_COUNT ( correlation_table )); corr_threshold = avg_correlation + ( config->correlationThresholdCoefficient * std_deviation ); - corr_strong_threshold = avg_correlation + ( 2.0 * config->correlationThresholdCoefficient * std_deviation ); snprintf ( corr_dot_file, sizeof ( corr_dot_file ), "%s/correlated_alerts.dot", config->corr_alerts_dir ); if ( stat ( config->corr_alerts_dir, &st ) < 0 ) @@ -862,7 +1010,7 @@ AI_alert_correlation_thread ( void *arg ) corr->key.a->derived_alerts[ corr->key.a->n_derived_alerts - 1 ] = corr->key.b; corr->key.b->parent_alerts [ corr->key.b->n_parent_alerts - 1 ] = corr->key.a; - _AI_print_correlated_alerts ( corr, fp, ( corr->correlation >= corr_strong_threshold )); + _AI_print_correlated_alerts ( corr, fp ); } } diff --git a/spp_ai.c b/spp_ai.c index 7bc2754..9136487 100644 --- a/spp_ai.c +++ b/spp_ai.c @@ -157,18 +157,20 @@ static AI_config * AI_parse(char *args) hierarchy_node **hierarchy_nodes = NULL; int n_hierarchy_nodes = 0; - unsigned long cleanup_interval = 0, - stream_expire_interval = 0, - alertfile_len = 0, - alert_history_file_len = 0, - alert_serialization_interval = 0, - alert_bufsize = 0, - clusterfile_len = 0, - corr_rules_dir_len = 0, - corr_alerts_dir_len = 0, - alert_clustering_interval = 0, - database_parsing_interval = 0, - correlation_graph_interval = 0; + unsigned long cleanup_interval = 0, + stream_expire_interval = 0, + alertfile_len = 0, + alert_history_file_len = 0, + alert_serialization_interval = 0, + alert_bufsize = 0, + bayesian_correlation_interval = 0, + bayesian_correlation_cache_validity = 0, + clusterfile_len = 0, + corr_rules_dir_len = 0, + corr_alerts_dir_len = 0, + alert_clustering_interval = 0, + database_parsing_interval = 0, + correlation_graph_interval = 0; BOOL has_cleanup_interval = false, has_stream_expire_interval = false, @@ -336,11 +338,56 @@ static AI_config * AI_parse(char *args) } corr_threshold_coefficient = strtod ( arg, NULL ); - _dpd.logMsg( 
" Correlation threshold coefficient: %d\n", corr_threshold_coefficient ); + _dpd.logMsg( " Correlation threshold coefficient: %f\n", corr_threshold_coefficient ); } config->correlationThresholdCoefficient = corr_threshold_coefficient; + /* Parsing the bayesian_correlation_interval option (seconds; DEFAULT_BAYESIAN_CORRELATION_INTERVAL when the option is not specified) */ + if (( arg = (char*) strcasestr( args, "bayesian_correlation_interval" ) )) + { + for ( arg += strlen("bayesian_correlation_interval"); + *arg && (*arg < '0' || *arg > '9'); + arg++ ); + + if ( !(*arg) ) + { + _dpd.fatalMsg("AIPreproc: bayesian_correlation_interval option used but " + "no value specified\n"); + } + + bayesian_correlation_interval = strtoul ( arg, NULL, 10 ); + config->bayesianCorrelationInterval = bayesian_correlation_interval; + } else { + bayesian_correlation_interval = DEFAULT_BAYESIAN_CORRELATION_INTERVAL; + } + + config->bayesianCorrelationInterval = bayesian_correlation_interval; + _dpd.logMsg( " Bayesian correlation interval: %lu\n", config->bayesianCorrelationInterval ); + + /* Parsing the bayesian_correlation_cache_validity option (seconds; DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY when the option is not specified) */ + if (( arg = (char*) strcasestr( args, "bayesian_correlation_cache_validity" ) )) + { + for ( arg += strlen("bayesian_correlation_cache_validity"); + *arg && (*arg < '0' || *arg > '9'); + arg++ ); + + if ( !(*arg) ) + { + _dpd.fatalMsg("AIPreproc: bayesian_correlation_cache_validity option used but " + "no value specified\n"); + } + + bayesian_correlation_cache_validity = strtoul ( arg, NULL, 10 ); + config->bayesianCorrelationCacheValidity = bayesian_correlation_cache_validity; + } else { + bayesian_correlation_cache_validity = DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY; + } + + config->bayesianCorrelationCacheValidity = bayesian_correlation_cache_validity; + _dpd.logMsg( " Bayesian cache validity interval: %lu\n", config->bayesianCorrelationCacheValidity ); + + /* Parsing the alertfile option */ if (( arg = (char*) strcasestr( args, "alertfile" ) )) { diff --git a/spp_ai.h b/spp_ai.h index 0e0d149..99ddc26 100644 --- 
a/spp_ai.h +++ b/spp_ai.h @@ -69,6 +69,15 @@ /** Default timeout in seconds between a serialization of the alerts' buffer and the next one */ #define DEFAULT_ALERT_SERIALIZATION_INTERVAL 3600 +/** Default maximum interval in seconds between two alerts (a,b) for considering them correlated, used when bayesian_correlation_interval is not specified */ +#define DEFAULT_BAYESIAN_CORRELATION_INTERVAL 1200 + +/** Default interval of validity in seconds for an entry in the cache of correlated alerts, used when bayesian_correlation_cache_validity is not specified */ +#define DEFAULT_BAYESIAN_CORRELATION_CACHE_VALIDITY 600 + +/** Cutoff y value in the exponential decay for considering two alerts not correlated: the value the decay function takes when the two timestamps are exactly bayesianCorrelationInterval seconds apart */ +#define CUTOFF_Y_VALUE 0.01 + /****************************/ /* Database support */ #ifdef HAVE_LIBMYSQLCLIENT @@ -143,6 +152,12 @@ typedef struct /** Interval in seconds between a serialization of the alerts' buffer and the next one */ unsigned long alertSerializationInterval; + /** Maximum interval in seconds between two alerts (a,b) for considering them correlated (0 disables the bayesian correlation) */ + unsigned long bayesianCorrelationInterval; + + /** Interval in seconds for which an entry in the cache of correlated alerts is valid */ + unsigned long bayesianCorrelationCacheValidity; + /** Size of the alerts' buffer to be periodically sent to the serialization thread */ unsigned long alert_bufsize; @@ -299,6 +314,23 @@ typedef struct _AI_snort_alert { unsigned int n_derived_alerts; } AI_snort_alert; /*****************************************************************/ +/** Key for the AI_alert_event structure, containing the Snort ID (gid, sid, rev) of the alert */ +typedef struct { + int gid; + int sid; + int rev; +} AI_alert_event_key; +/*****************************************************************/ +/** Structure representing the historical information of an alert saved in alert_history */ +typedef struct _AI_alert_event { + AI_alert_event_key key; /**< Snort ID of the alert, used as key of the hash table */ + unsigned int count; /**< Number of alerts accounted to this entry, summed up by AI_get_history_alert_number() */ + time_t timestamp; /**< Timestamp of the event */ + struct _AI_alert_event *next; /**< Next event in the sequence returned by AI_get_alert_events_by_key(), NULL at the end */ + UT_hash_handle hh; /**< Make the struct 'hashable' */ +} AI_alert_event; +/*****************************************************************/ + int preg_match ( 
const char*, char*, char***, int* ); char* str_replace ( char*, char*, char *); @@ -323,10 +355,12 @@ struct pkt_info* AI_get_stream_by_key ( struct pkt_key ); AI_snort_alert* AI_get_alerts ( void ); AI_snort_alert* AI_get_clustered_alerts ( void ); -void AI_serialize_alerts ( AI_snort_alert**, unsigned int ); -void* AI_deserialize_alerts (); -void* AI_alerts_pool_thread ( void *arg ); -void* AI_serializer_thread ( void *arg ); +void AI_serialize_alerts ( AI_snort_alert**, unsigned int ); +void* AI_deserialize_alerts (); +void* AI_alerts_pool_thread ( void *arg ); +void* AI_serializer_thread ( void *arg ); +const AI_alert_event* AI_get_alert_events_by_key ( AI_alert_event_key ); +unsigned int AI_get_history_alert_number (); /** Function pointer to the function used for getting the alert list (from log file, db, ...) */ extern AI_snort_alert* (*get_alerts)(void);