Changeset 1431
- Timestamp:
- 11/05/07 23:35:07 (1 year ago)
- Files:
-
- nebula/trunk/ChangeLog (modified) (1 diff)
- nebula/trunk/configure.in (modified) (1 diff)
- nebula/trunk/src/Makefile.am (modified) (1 diff)
- nebula/trunk/src/cluster.c (modified) (5 diffs)
- nebula/trunk/src/cluster.h (modified) (3 diffs)
- nebula/trunk/src/nebula.c (modified) (13 diffs)
- nebula/trunk/src/nebula.h (modified) (2 diffs)
- nebula/trunk/src/signals.c (modified) (1 diff)
- nebula/trunk/src/spamsum.h (modified) (1 diff)
- nebula/trunk/src/util.c (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
nebula/trunk/ChangeLog
r1418 r1431 1 0.1.0 2 - queues as default data structures (restrictable in size) 3 0.0.1 1 4 - throughput profiling 2 5 - clean configure script nebula/trunk/configure.in
r1418 r1431 1 1 # $Id$ 2 2 AC_PREREQ(02.50) 3 AC_INIT([nebula], [0. 0.1], [tillmann.werner@gmx.de])3 AC_INIT([nebula], [0.1.0], [tillmann.werner@gmx.de]) 4 4 AM_CONFIG_HEADER(config.h) 5 AM_INIT_AUTOMAKE(nebula,0. 0.1)5 AM_INIT_AUTOMAKE(nebula,0.1.0) 6 6 7 7 AC_PROG_CC nebula/trunk/src/Makefile.am
r1418 r1431 9 9 util.c util.h \ 10 10 trie.c trie.h \ 11 hashq.c hashq.h \12 ngram.c ngram.h \11 queue.c queue.h \ 12 hash.c hash.h \ 13 13 cluster.c cluster.h \ 14 14 nebula.c nebula.h nebula/trunk/src/cluster.c
r1419 r1431 23 23 24 24 #include "cluster.h" 25 #include "hashq.h"26 25 #include "nebula.h" 27 #include " trie.h"26 #include "queue.h" 28 27 29 28 … … 39 38 40 39 /* create element queue */ 41 if ((new->hq = calloc(1, sizeof( hashq))) == NULL) {40 if ((new->hq = calloc(1, sizeof(queue))) == NULL) { 42 41 fprintf(stderr, "Error - Unable to allocate memory: %m.\n"); 43 42 exit(EXIT_FAILURE); … … 49 48 50 49 /* delete cluster */ 51 void cluster_free( cluster *cl, u_char list_files, void(*cbfn)(hashq *hq, u_char flag)) {50 void cluster_free(void *cl, u_char list_flag) { 52 51 if (!cl) return; 53 52 54 if (cbfn) cbfn(cl->hq, list_files); 53 if (list_flag & list_files) printf("-- Cluster has %d elements\n", ((cluster *)cl)->hq->size); 54 queue_free(((cluster *)cl)->hq, list_flag & list_files, hash_free); 55 55 free(cl); 56 56 57 return;58 }59 60 61 /* free a cluster's hash queue and its hashes */62 void cluster_hashq_free(hashq *hq, u_char list_files) {63 hashq_free(hq, list_files, hash_free);64 57 return; 65 58 } … … 69 62 qelem *new; 70 63 71 if (cl->hq->size < cluster q_max) {64 if (cl->hq->size < clusterhashq_max) { 72 65 // set cluster pointer in hash struct 73 66 entry->cl = cl; 74 67 75 68 // add hash to queue 76 if ((new = hashq_prepend(cl->hq, entry)) == NULL) {69 if ((new = queue_prepend(cl->hq, entry)) == NULL) { 77 70 fprintf(stderr, "Error - Could not add entry to cluster.\n"); 78 71 exit(EXIT_FAILURE); … … 93 86 add_entry_to_cluster(cl, l2); 94 87 95 /* prepend new cluster to list */96 cl->next = cluster_list;97 if(cluster_list) cluster_list->prev = cl;98 cluster_list = cl;99 100 num_of_clusters++;101 102 88 return(cl); 103 89 } 104 90 105 91 106 cluster *clusters_merge(cluster *dst, cluster *new) { 107 cluster *cl1, *cl2; 92 cluster *clusters_merge(queue *q, cluster *dst, cluster *src) { 108 93 qelem *entry; 109 94 110 cl1 = dst; 111 cl2 = new; 95 // concatenate hash queues 96 dst->hq->tail->next = src->hq->head; 97 dst->hq->tail = src->hq->tail; 98 dst->hq->size += src->hq->size; 112 99 113 // join lists 114 cl1->hq->tail->next = cl2->hq->head; 115 cl1->hq->tail = cl2->hq->tail; 116 cl1->hq->size += cl2->hq->size; 100 // let elements of src point to dst 101 for (entry = src->hq->head; entry; entry = entry->next) ((hash*)entry->data)->cl = dst; 102 src->hq->head = src->hq->tail = NULL; 117 103 118 // let elements of cl2 point to cl1 119 for (entry = cl2->hq->head; entry; entry = entry->next) ((hash*)entry->data)->cl = cl1; 120 cl2->hq->head = cl2->hq->tail = NULL; 121 122 // unlink and free cluster 2 123 if(cl2->prev) cl2->prev->next = cl2->next; 124 if(cl2->next) cl2->next->prev = cl2->prev; 125 126 // check whether cl2 is the list head 127 if (cluster_list == cl2) cluster_list = cl2->next; 128 129 130 hashq_free(cl2->hq, 0, NULL); 131 cluster_free(cl2, 0, NULL); 132 num_of_clusters--; 104 // unlink and free old cluster 105 queue_free(src->hq, 0, NULL); 106 entry = queue_unlink(q, src->parent); 107 cluster_free(entry->data, 1); 108 free(entry); 133 109 134 110 return(dst); 135 111 } 136 137 138 void clusterlist_delete(cluster *list, u_char list_files) {139 cluster *cl;140 141 while(list) {142 cl = list;143 list = cl->next;144 if (!hide_result) printf("Cluster has %u entries.\n", cl->hq->size);145 cluster_free(cl, list_files, cluster_hashq_free);146 }147 }nebula/trunk/src/cluster.h
r1414 r1431 19 19 */ 20 20 21 #ifndef __NEBULA_CLUSTER _H22 #define __NEBULA_CLUSTER _H 121 #ifndef __NEBULA_CLUSTERQ_H 22 #define __NEBULA_CLUSTERQ_H 1 23 23 24 24 #if HAVE_CONFIG_H … … 28 28 #include <stdlib.h> 29 29 30 #include "hash q.h"30 #include "hash.h" 31 31 #include "trie.h" 32 #include "queue.h" 32 33 33 34 /* this struct can be extended when using different metrics, … … 36 37 typedef struct cluster { 37 38 u_int16_t cnt; 39 void *parent; 38 40 struct cluster *prev; 39 41 struct cluster *next; 40 hashq*hq;42 queue *hq; 41 43 } cluster; 42 44 45 46 47 inline cluster *cluster_new(void); 48 void cluster_free(void *cl, u_char list_flag); 43 49 44 50 inline cluster *create_hashlist(hash *hl1, hash *hl2); 45 51 cluster *add_entry_to_cluster(cluster *cl, hash *entry); 46 52 cluster *create_cluster(hash *l1, hash *l2); 47 cluster *clusters_merge( cluster *dst, cluster *new);53 cluster *clusters_merge(queue *q, cluster *dst, cluster *src); 48 54 void clusterlist_delete(cluster *list, u_char list_files); 49 55 nebula/trunk/src/nebula.c
r1421 r1431 46 46 47 47 void usage(const char* progname, const int exit_val) { 48 #ifdef PROFILE 49 printf("Usage: %s [-lpstv] [-c maximum cluster size] [-r cluster radius] [-q maximum outlier queue size] [-d directory] [file ...]\n", progname); 50 #else 51 printf("Usage: %s [-lpstv] [-a profiling interval] [-c maximum cluster size] [-r cluster radius] [-q maximum outlier queue size] [-d directory] [file ...]\n", progname); 52 #endif 48 printf("Usage: %s " 49 " -C <size>\t cluster queue size\n" 50 "\t\t -E <size>\t cluster element queue size\n" 51 "\t\t -O <size>\t outlier queue size\n" 52 #ifdef PROFILE 53 "\t\t -a <seconds>\t alarm interval for profiling output\n" 54 #endif 55 "\t\t -c <similarity> cluster criteria (a similarity measure in percent)\n" 56 "\t\t -d <directory>\t process direcory content\n" 57 "\t\t -h\t\t show this help\n" 58 "\t\t -l\t\t list clustered files in result\n" 59 "\t\t -o\t\t list outlier in result\n" 60 "\t\t -p\t\t show progress\n" 61 "\t\t -r\t\t hide result (only calculate cluster)\n" 62 "\t\t -t\t\t sort input files by creation time\n" 63 "\t\t -v\t\t be verbose\n" 64 "\t\t\t\t [files ...]\n", 65 progname); 53 66 54 67 exit(exit_val); … … 56 69 57 70 int main(int argc, char *argv[]) { 71 //int j; 58 72 int i, qsize; 59 73 u_char time_sort, *content, *tmpbuf; 60 74 char option, *dirname, *filename; 61 75 hash *tmp_hash; 62 qelem *cur_hq, *tmp_hq; 63 cluster *cl; 76 qelem *cur_cqelem, *tmp_cqelem, *cur_hqelem, *tmp_hqelem; 64 77 double score; 65 78 trie_node *t; 66 79 bstring bstr; 67 80 int n; 68 struct dirent** namelist;69 81 70 82 71 83 dirname = NULL; 84 namelist = NULL; 72 85 73 86 outlierq = NULL; 74 cluster _list= NULL;87 clusterq = NULL; 75 88 76 89 content = NULL; … … 91 104 show_progress = 0; // don't show progress dots 92 105 list_files = 0; // don't list cluster objects 106 list_outlier = 0; // don't list outlier 93 107 cluster_radius = 95.0; // 95% similarity as cluster criteria 94 108 outlierq_max = 500000; 95 clusterq_max = 500000; 109 clusterhashq_max = 500000; 110 clusterq_max = 5000; 96 111 97 112 memset(&md5sum_trie, 0, sizeof(trie_node)); … … 102 117 // process args 103 118 #ifdef PROFILE 104 while((option = getopt(argc, argv, " c:q:a:stplvd:r:h?")) > 0) {119 while((option = getopt(argc, argv, "a:c:C:E:O:rtplovd:h?")) > 0) { 105 120 #else 106 while((option = getopt(argc, argv, "c: q:stplvd:r:h?")) > 0) {121 while((option = getopt(argc, argv, "c:C:E:O:rtplovd:h?")) > 0) { 107 122 #endif 108 123 switch(option) { … … 110 125 case 'a': 111 126 alarm_time = atoi(optarg); 112 if (alarm_time < 1) {113 fprintf(stderr, "Error - Profile time interval must be a t least one second.\n");127 if (alarm_time < 0) { 128 fprintf(stderr, "Error - Profile time interval must be a non-negative value (0 means 'off').\n"); 114 129 exit(EXIT_FAILURE); 115 130 } … … 117 132 #endif 118 133 case 'c': 134 cluster_radius = atof(optarg); 135 if ((cluster_radius < 0) || (cluster_radius > 100)) { 136 fprintf(stderr, "Error - Cluster radius must be a value between 0 and 100 (in percent).\n"); 137 exit(EXIT_FAILURE); 138 } 139 break; 140 case 'C': 119 141 clusterq_max = atoi(optarg); 120 if (clusterq_max < 2) {121 fprintf(stderr, "Error - Maximum cluster size must be at least 2.\n");142 if (clusterq_max < 1) { 143 fprintf(stderr, "Error - Cluster queue size must be a non-negative value.\n"); 122 144 exit(EXIT_FAILURE); 123 145 } … … 129 151 } 130 152 break; 153 case 'E': 154 clusterhashq_max = atoi(optarg); 155 if (clusterhashq_max < 2) { 156 fprintf(stderr, "Error - Cluster element queue size must be at least 2.\n"); 157 exit(EXIT_FAILURE); 158 } 159 break; 131 160 case 'l': 132 161 list_files = 1; 133 162 break; 134 case ' p':135 show_progress= 1;136 break; 137 case ' q':163 case 'o': 164 list_outlier = 1; 165 break; 166 case 'O': 138 167 outlierq_max = atoi(optarg); 139 168 if (outlierq_max < 1) { … … 142 171 } 143 172 break; 173 case 'p': 174 show_progress = 1; 175 break; 144 176 case 'r': 145 cluster_radius = atof(optarg);146 if ((cluster_radius < 0) || (cluster_radius > 100)) {147 fprintf(stderr, "Error - Cluster radius must be a value between 0 and 100 (in percent).\n");148 exit(EXIT_FAILURE);149 }150 break;151 case 's':152 177 hide_result = 1; 153 178 break; … … 195 220 196 221 // initialize outlier queue 197 outlierq = hashq_new(); 198 199 200 #ifdef PROFILE 201 alarm(alarm_time); 202 printf("Profiling enabled, printing statistics every %d seconds.\n", alarm_time); 203 files_in_interval = 0; 204 bytes_in_interval = 0; 205 checkpoint = 0; 222 outlierq = queue_new(); 223 clusterq = queue_new(); 224 225 226 #ifdef PROFILE 227 if (alarm_time) { 228 alarm(alarm_time); 229 printf("Profiling enabled, printing statistics every %d seconds.\n", alarm_time); 230 files_in_interval = 0; 231 bytes_in_interval = 0; 232 checkpoint = 0; 233 } 206 234 #endif 207 235 … … 227 255 printf("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"); 228 256 if (num_of_files < total_files) 229 printf("files processed: %05.2f%% (%04d clusters)", num_of_files/total_files*100, num_of_clusters);257 printf("files processed: %05.2f%% (%04d clusters)", num_of_files/total_files*100, clusterq->size); 230 258 else 231 printf("files processed: 100.00%% (%04d clusters)\n", num_of_clusters);259 printf("files processed: 100.00%% (%04d clusters)\n", clusterq->size); 232 260 fflush(stdout); 233 261 } … … 271 299 272 300 // connect all clusters within range 273 for (c l = cluster_list; cl; cl = cl->next) {274 for (cur_hq = cl->hq->head; cur_hq; cur_hq = cur_hq->next)275 if ((score = spamsum_match(((hash*)t->data)->spamsum, ((hash*)cur_hq ->data)->spamsum)) >= cluster_radius) {301 for (cur_cqelem = clusterq->head; cur_cqelem; cur_cqelem = cur_cqelem->next) { 302 for (cur_hqelem = ((cluster *)cur_cqelem->data)->hq->head; cur_hqelem; cur_hqelem = cur_hqelem->next) { 303 if ((score = spamsum_match(((hash*)t->data)->spamsum, ((hash*)cur_hqelem->data)->spamsum)) >= cluster_radius) { 276 304 if (!((hash*)t->data)->cl) { 277 add_entry_to_cluster( cl, (hash*)t->data);305 add_entry_to_cluster((cluster *)cur_cqelem->data, (hash*)t->data); 278 306 break; 279 307 } else { 280 if (cl != ((hash*)t->data)->cl) { 281 clusters_merge(cl, ((hash*)t->data)->cl); 282 } 308 if ((cluster *) cur_cqelem->data != ((hash*)t->data)->cl) 309 clusters_merge(clusterq, (cluster *) cur_cqelem->data, ((hash*)t->data)->cl); 283 310 break; 284 311 } 285 312 } 313 } 286 314 } 287 315 288 316 // connect all outliers within range 289 for (cur_hq = outlierq->head; cur_hq; cur_hq = cur_hq->next) {290 if ((score = spamsum_match(((hash*)t->data)->spamsum, ((hash*)cur_hq ->data)->spamsum)) >= cluster_radius) {317 for (cur_hqelem = outlierq->head; cur_hqelem; cur_hqelem = cur_hqelem->next) { 318 if ((score = spamsum_match(((hash*)t->data)->spamsum, ((hash*)cur_hqelem->data)->spamsum)) >= cluster_radius) { 291 319 // unlink match from outlier list 292 tmp_hq = cur_hq;293 cur_hq = cur_hq->next;294 if ((tmp_hash = hashq_unlink(outlierq, tmp_hq)) == NULL) {320 tmp_hqelem = cur_hqelem; 321 cur_hqelem = cur_hqelem->next; 322 if ((tmp_hash = queue_unlink(outlierq, tmp_hqelem)) == NULL) { 295 323 fprintf(stderr, "Error - Unable to unlink outlier from queue.\n"); 296 324 exit(EXIT_FAILURE); … … 298 326 if (((hash*)t->data)->cl) { 299 327 // add other outliers to cluster 300 if (add_entry_to_cluster(((hash*)t->data)->cl, tmp_hash) == NULL) 328 if (add_entry_to_cluster(((hash*)t->data)->cl, tmp_hash) == NULL) { 329 trie_del_memstr(&md5sum_trie, (u_char *) tmp_hash->md5sum, strlen(tmp_hash->md5sum)); 330 trie_del_memstr(&spamsum_trie, (u_char *) tmp_hash->spamsum, strlen(tmp_hash->spamsum)); 301 331 hash_free(tmp_hash, 0); 332 } 302 333 } else { 303 334 // create new cluster of two outliers 304 create_cluster(((hash*)t->data), tmp_hash); 335 if ((tmp_cqelem = queue_ins(clusterq, create_cluster(((hash*)t->data), tmp_hash), clusterq_max)) != NULL) { 336 /* cluster queue is full, last element was dropped and must be free()d */ 337 338 /* first remove hashes from tries */ 339 for (tmp_hqelem = ((cluster *)tmp_cqelem->data)->hq->head; tmp_hqelem; tmp_hqelem = tmp_hqelem->next) { 340 trie_del_memstr(&md5sum_trie, (u_char *) ((hash *)tmp_hqelem->data)->md5sum, 341 strlen(((hash *)tmp_hqelem->data)->md5sum)); 342 trie_del_memstr(&spamsum_trie, (u_char *) ((hash *)tmp_hqelem->data)->spamsum, 343 strlen(((hash *)tmp_hqelem->data)->spamsum)); 344 } 345 /* now free cluster */ 346 cluster_free((cluster *)tmp_cqelem->data, 0); 347 free(tmp_cqelem); 348 } 349 /* set cluster element's parent pointer to cluster queue head 350 * we need this to be able to unlink a cluster from the queue */ 351 ((cluster *) clusterq->head->data)->parent = clusterq->head; 305 352 } 306 353 } 307 if (!cur_hq ) break;354 if (!cur_hqelem) break; 308 355 } 309 356 … … 314 361 // insert outlier into queue 315 362 if (outlierq->size >= outlierq_max) { 316 tmp_hash = hashq_unlink(outlierq, outlierq->tail);363 tmp_hash = queue_unlink(outlierq, outlierq->tail); 317 364 trie_del_memstr(&md5sum_trie, (u_char *) tmp_hash->md5sum, strlen(tmp_hash->md5sum)); 318 365 trie_del_memstr(&spamsum_trie, (u_char *) tmp_hash->spamsum, strlen(tmp_hash->spamsum)); 319 366 hash_free(tmp_hash, 0); 320 367 } 321 hashq_ins(outlierq, t->data, outlierq_max);368 queue_ins(outlierq, t->data, outlierq_max); 322 369 } 323 370 } nebula/trunk/src/nebula.h
r1419 r1431 29 29 30 30 #include "cluster.h" 31 #include "hash q.h"31 #include "hash.h" 32 32 33 u_char verbose, list_files, show_progress, hide_result; 34 int clusterq_max, outlierq_max; 33 u_char verbose, list_files, list_outlier, show_progress, hide_result; 34 struct dirent **namelist; 35 ssize_t clusterq_max, clusterhashq_max, outlierq_max; 35 36 u_int16_t num_of_clusters; 36 37 u_int32_t num_of_duplicates; … … 38 39 double cluster_radius; 39 40 trie_node spamsum_trie, md5sum_trie; 40 cluster *cluster_list;41 hashq*outlierq;41 queue *clusterq; 42 queue *outlierq; 42 43 43 44 #ifdef PROFILE nebula/trunk/src/signals.c
r1419 r1431 34 34 printf("Premature termination forced (signal %d caught).\n", sig); 35 35 36 exit(EXIT_FAILURE); 36 37 cleanup(); 37 38 exit(EXIT_SUCCESS); nebula/trunk/src/spamsum.h
r1414 r1431 26 26 #endif 27 27 28 #include "hashq.h"29 28 30 29 char *spamsum(const u_char *in, size_t length, u_int32_t bsize); nebula/trunk/src/util.c
r1420 r1431 30 30 31 31 #include "cluster.h" 32 #include "hash q.h"32 #include "hash.h" 33 33 #include "nebula.h" 34 #include "queue.h" 34 35 #include "signals.h" 35 36 #include "trie.h" … … 113 114 114 115 void cleanup(void) { 116 int i; 115 117 #ifdef PROFILE 116 118 struct sigaction s_action; … … 129 131 130 132 // free data structures 131 if (!hide_result) printf("%u files form %u clustes and %u outliers (%u duplicates).\n%s\n", 132 (unsigned int) num_of_files, num_of_clusters, outlierq->size, num_of_duplicates, 133 "-----------------------------------------------------------------"); 133 if (namelist) { 134 if (namelist) for (i = 0; namelist[i]; free(namelist[i++])); 135 free(namelist); 136 } 134 137 135 hashq_free(outlierq, 0, hash_free); 136 clusterlist_delete(cluster_list, list_files); 138 if (!hide_result) printf("%u files form %u clustes and %u outliers (%u duplicates).\n", 139 (unsigned int) num_of_files, clusterq->size, outlierq->size, num_of_duplicates); 140 141 if (list_outlier) printf("========== Outlier ==========\n"); 142 queue_free(outlierq, list_outlier, hash_free); 143 if (list_files) printf("========== Cluster ==========\n"); 144 queue_free(clusterq, list_files, cluster_free); 137 145 trie_delete(md5sum_trie.childlist, md5sum_trie.childlist_len, NULL); 138 146 trie_delete(spamsum_trie.childlist, spamsum_trie.childlist_len, NULL);
