Changeset 1546
- Timestamp:
- 02/08/08 21:52:02 (7 months ago)
- Files:
-
- nebula/trunk/gst/Makefile.am (modified) (1 diff)
- nebula/trunk/gst/cstr.c (added)
- nebula/trunk/gst/cstr.h (added)
- nebula/trunk/gst/gst.c (modified) (13 diffs)
- nebula/trunk/gst/gst.h (modified) (2 diffs)
- nebula/trunk/gst/lca.c (modified) (1 diff)
- nebula/trunk/gst/stree.c (modified) (20 diffs)
- nebula/trunk/gst/stree.h (modified) (3 diffs)
- nebula/trunk/gst/util.c (modified) (3 diffs)
- nebula/trunk/gst/util.h (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
nebula/trunk/gst/Makefile.am
r1524 r1546 1 1 AM_CFLAGS=-Wall -Werror 2 3 LIBS += -lm 2 4 3 5 bin_PROGRAMS = gst 4 6 gst_SOURCES = signals.c signals.h \ 7 cstr.c cstr.h \ 5 8 util.c util.h \ 6 9 stree.c stree.h \ nebula/trunk/gst/gst.c
r1524 r1546 1 1 /* gst.c 2 2 * 3 * Copyright (C) 2007 Tillmann Werner <tillmann.werner@gmx.de>3 * Copyright (C) 2007-2008 Tillmann Werner <tillmann.werner@gmx.de> 4 4 * 5 5 * This program is free software; you can redistribute it and/or modify … … 32 32 #include <unistd.h> 33 33 34 #include "cstr.h" 34 35 #include "gst.h" 35 36 #include "signals.h" … … 40 41 void usage(const char* progname, const int exit_val) { 41 42 printf("Usage: %s " 42 "\t -c\t\t only print common subtree (requires -g)\n"43 "\t -c\t\t only print common subtree\n" 43 44 "\t\t -d <directory>\t process direcory content\n" 44 45 "\t\t -g\t\t print generalized suffix tree\n" … … 52 53 } 53 54 55 54 56 int main(int argc, char *argv[]) { 55 57 int i, acnt; … … 60 62 struct dirent *dir_entry; 61 63 struct stat statbuf; 62 stnode *lca ;63 u_int32_t node_id ;64 u_char print_gst, print_sslist, print_cgst;65 64 stnode *lca, **leaves; 65 u_int32_t node_id, num_of_files, num_leaves, strllen; 66 u_char verbose, print_gst, print_sslist, print_cgst; 67 substr_list cstr_list; 66 68 lchar *strlist; 67 u_int32_t strllen;68 u_int32_t cstr_num;69 69 ssize_t min_sstr_len; 70 70 71 71 lca = NULL; 72 leaves = NULL; 72 73 string_offset = NULL; 73 74 strlist = NULL; 74 75 strllen = 0; 75 cstr_num = 0;76 76 77 77 dirp = NULL; 78 curfile = NULL; 78 79 gst = NULL; 79 80 content = NULL; 80 curfile = NULL;81 81 82 82 verbose = 0; … … 95 95 switch(option) { 96 96 case 'c': 97 print_cgst = 1; 97 print_cgst = 1; 98 print_gst = 1; 98 99 break; 99 100 case 'd': … … 127 128 } 128 129 130 129 131 set_signal_handlers(); 130 132 … … 134 136 } 135 137 138 136 139 // create suffix tree 137 140 if ((gst = ST_CreateTree()) == NULL) { 138 fprintf(stderr, "Error - Unable to create suffix tree.\n"); 139 exit(EXIT_FAILURE); 140 } 141 fprintf(stderr, "Error - Unable to add %s to suffix tree.\n", curfile); 142 exit(EXIT_FAILURE); 143 } 144 141 145 142 146 // process files … … 190 194 } 191 195 192 if (verbose) printf("%u files processed (%u bytes).\n ", num_of_files, strllen-num_of_files);196 if (verbose) printf("%u files processed (%u bytes).\n\n", num_of_files, strllen-num_of_files); 193 197 if (dirp) closedir(dirp); 194 198 … … 196 200 // print concatenated string 197 201 if (verbose > 1) { 198 printf(" concatenated string (%u) is '", strllen);202 printf("Concatenated string (%u) is '", strllen); 199 203 for (i=1; i<=strllen; i++) { 200 204 byte = (char *) &strlist[i]; … … 205 209 } 206 210 } 207 printf("'\n"); 208 } 209 if (verbose > 2) putchar('\n'); 211 printf("'\n\n"); 212 } 210 213 211 214 … … 218 221 } 219 222 223 220 224 // compute LCA table for tree nodes 221 node_id = 0; 222 set_ids(gst->root, &node_id); 225 node_id = 0; 226 num_leaves = 0; 227 if ((leaves = calloc(1, sizeof(stnode *))) == NULL) { 228 fprintf(stderr, "Error - Unable to allocate memory: %m.\n"); 229 exit(EXIT_FAILURE); 230 } 231 set_ids(gst, gst->root, &leaves, &num_leaves, &node_id); 232 223 233 if (verbose) printf("Computing LCA table.\n"); 224 234 if ((lca_table = lca_prep(gst)) == NULL) { … … 227 237 } 228 238 239 240 // set string IDs 241 if (verbose) printf("Setting string IDs.\n"); 242 if ((string_ids = avl_tree_new()) == NULL) { 243 fprintf(stderr, "Error - Unable to create AVL tree.\n"); 244 exit(EXIT_FAILURE); 245 } 246 string_ids->cmpfn = idcmp; 247 248 229 249 // calculate common subtree 230 250 if (verbose) printf("Calculating common subtree.\n"); 231 if ((string_ids = avl_tree_new()) == NULL) { 232 fprintf(stderr, "Error - Unable to create AVL tree.\n"); 233 exit(EXIT_FAILURE); 234 } 235 string_ids->cmpfn = idcmp; 236 if (ST_CalcCommonSubtree(gst, gst->root, lca_table, &lca, string_ids, &cstr_num, 0) == -1) { 251 memset(&cstr_list, 0, sizeof(substr_list)); 252 if (calc_common_subtree(gst, &cstr_list, min_sstr_len, lca_table, leaves, num_leaves, num_of_files) == -1) { 237 253 fprintf(stderr, "Error - Unable to find substrings.\n"); 238 254 exit(EXIT_FAILURE); 239 255 } 256 if (verbose) printf("Generalized suffix tree contains a common subtree with %u leaves.\n", cstr_list.len); 257 240 258 241 259 // print (common subtree of) GST 242 260 if (print_gst) ST_PrintTree(gst, print_cgst); 243 261 244 // get common substrings 262 263 // Extract common substrings 264 if (verbose) printf("Sorting common substrings after length.\n"); 265 qsort(cstr_list.elem, cstr_list.len, sizeof(substr), substr_len_cmp); 266 245 267 if (verbose) printf("Extracting common substrings with length >= %u.\n", min_sstr_len); 246 get_substrings(gst, print_sslist, min_sstr_len);247 248 if (verbose) printf("Generalized suffix tree contains a common subtree with %u nodes (including the root).\n", cstr_num+1); 268 if (cstr_list.len && print_sslist) for (i=cstr_list.len; i && cstr_list.elem[i-1].len >= min_sstr_len; i--) 269 list_substrings(gst, cstr_list.elem[i-1].n, cstr_list.elem[i-1].len); 270 249 271 250 272 // free data structures 273 free(cstr_list.elem); 251 274 ST_DeleteTree(gst); 252 275 avl_tree_free(string_ids); 253 276 lca_free(lca_table); 277 free(leaves); 254 278 free(string_offset); 255 279 nebula/trunk/gst/gst.h
r1524 r1546 1 1 /* gst.h 2 2 * 3 * Copyright (C) 2007 Tillmann Werner <tillmann.werner@gmx.de>3 * Copyright (C) 2007-2008 Tillmann Werner <tillmann.werner@gmx.de> 4 4 * 5 5 * This program is free software; you can redistribute it and/or modify … … 31 31 #include "stree.h" 32 32 33 u_char verbose;34 u_int16_t num_of_files;35 36 33 stree *gst; 37 34 lcatbl *lca_table; 38 35 avl_tree *string_ids; 39 40 36 u_int32_t *string_offset; 41 37 nebula/trunk/gst/lca.c
r1524 r1546 191 191 192 192 void lca_free(lcatbl *lca) { 193 if (lca == NULL) return; 194 193 195 if (lca->I != NULL) free(lca->I); 194 196 if (lca->A != NULL) free(lca->A); nebula/trunk/gst/stree.c
r1524 r1546 29 29 30 30 #include "avl.h" 31 #include "cstr.h" 31 32 #include "gst.h" 32 33 #include "stree.h" … … 51 52 52 53 // set integers as node IDs while traversing a tree in inorder 53 void set_ids(stnode *n, u_int32_t *id) { 54 stnode *tmpnode; 54 void set_ids(stree *t, stnode *n, stnode ***leaves, u_int32_t *num_leaves, u_int32_t *id) { 55 u_int32_t start, end; 56 stnode *tmpnode; 57 58 if (!n) return; 55 59 56 60 n->id = ++(*id); 57 for (tmpnode = n->sons; tmpnode; tmpnode = tmpnode->right_sibling) set_ids(tmpnode, id); 58 } 59 60 61 /******************************************************************************/ 61 if (!n->sons) { 62 if ((*leaves = realloc(*leaves, sizeof(stnode *) * ((*num_leaves)+1))) == NULL) { 63 fprintf(stderr, "Error - Unable to allocate memory: %m.\n"); 64 exit(EXIT_FAILURE); 65 } 66 (*leaves)[*num_leaves] = n; 67 (*num_leaves)++; 68 69 // set string id 70 start = n->edge_label_start; 71 end = get_node_label_end(t, n); 72 if (start) for (; start <= end; start++) { 73 if (t->tree_string[start] && !(t->tree_string[start] & 0x000000ff)) { 74 n->string_id = t->tree_string[start] >> 8; 75 break; 76 } 77 } 78 } 79 for (tmpnode = n->sons; tmpnode; tmpnode = tmpnode->right_sibling) set_ids(t, tmpnode, leaves, num_leaves, id); 80 return; 81 } 82 83 62 84 stnode * create_node(stree *tree, stnode *father, u_int32_t start, u_int32_t end, u_int32_t position) { 63 85 stnode *node; … … 80 102 } 81 103 82 /******************************************************************************/ 104 83 105 inline stnode * find_son(stree * tree, stnode * node, lchar sym) { 84 106 for (node = node->sons; node != 0 && tree->tree_string[node->edge_label_start] != sym; node = node->right_sibling); … … 86 108 } 87 109 88 /******************************************************************************/ 110 89 111 inline u_int32_t get_node_label_end(stree *tree, stnode* node) { 90 112 if (node->sons == 0) return tree->e; /* If it's a leaf - return e */ … … 92 114 } 93 115 94 /******************************************************************************/ 116 95 117 inline u_int32_t get_node_label_length(stree *tree, stnode* node) { 96 118 return(get_node_label_end(tree, node)-node->edge_label_start+1); 97 119 } 98 120 99 /******************************************************************************/ 121 100 122 inline char is_last_char_in_edge (stree *tree, stnode *node, u_int32_t edge_pos) { 101 123 return(edge_pos == get_node_label_length(tree, node) - 1 ? 1 : 0); 102 124 } 103 125 104 /******************************************************************************/ 126 105 127 inline void connect_siblings (stnode * left_sib, stnode * right_sib) { 106 128 if (left_sib) left_sib->right_sibling = right_sib; … … 108 130 } 109 131 110 /******************************************************************************/ 132 111 133 stnode *apply_extension_rule_2(stree *tree, stnode *node, u_int32_t edge_label_begin, u_int32_t edge_label_end, 112 134 u_int32_t path_pos, u_int32_t edge_pos, RULE_2_TYPE type) { … … 152 174 } 153 175 154 /******************************************************************************/ 176 155 177 stnode * trace_single_edge(stree *tree, stnode *node, PATH str, u_int32_t *edge_pos, u_int32_t *chars_found, SKIP_TYPE type, int *search_done) { 156 178 stnode *cont_node; … … 206 228 } 207 229 208 /******************************************************************************/ 230 209 231 stnode * trace_string(stree * tree, stnode *node, PATH str, u_int32_t *edge_pos, u_int32_t *chars_found, SKIP_TYPE type) { 210 232 u_int32_t edge_chars_found; … … 222 244 } 223 245 224 /******************************************************************************/ 246 225 247 u_int32_t ST_FindSubstring(stree *tree, lchar *W, u_int32_t P) { 226 248 stnode *node = find_son(tree, tree->root, W[0]); … … 243 265 } 244 266 245 /******************************************************************************/ 267 246 268 void follow_suffix_link(stree *tree, POS *pos) { 247 269 PATH gama; /* gama is the string between node and its father, in case node doesn't have a suffix link */ … … 278 300 279 301 280 /******************************************************************************/ 302 281 303 void SEA(stree *tree, POS *pos, PATH str, u_int32_t *rule_applied, char after_rule_3) { 282 304 u_int32_t chars_found = 0, … … 360 382 } 361 383 362 /******************************************************************************/ 384 363 385 void SPA (stree * tree, POS * pos, u_int32_t phase, u_int32_t * extension, char *repeated_extension) { 364 386 u_int32_t rule_applied = 0; … … 414 436 415 437 416 /******************************************************************************/417 438 stree *ST_CreateTree(void) { 418 439 stree *tree; … … 433 454 434 455 435 /******************************************************************************/436 456 void ST_DeleteSubTree (stnode * node) { 437 457 if (node == 0) return; /* Recoursion stoping condition */ … … 441 461 } 442 462 443 /******************************************************************************/ 463 444 464 void ST_DeleteTree (stree * tree) { 445 465 if (tree == 0) return; … … 449 469 } 450 470 451 /******************************************************************************/ 471 452 472 int ST_PrintNode(stree *tree, stnode *node1, long depth, int cs_only) { 453 473 stnode *node2 = node1->sons; … … 474 494 } 475 495 if (node1->flag) printf("\033[1;0m"); 476 477 // printf(" \t\t\t(%u,%u | %u)\n", node1->edge_label_start, end, node1->path_position);478 496 putchar('\n'); 479 497 } … … 488 506 489 507 490 /******************************************************************************/491 508 void ST_PrintTree(stree *tree, int cs_only) { 492 509 ST_PrintNode(tree, tree->root, 0, cs_only); … … 529 546 530 547 531 // returns 0 on success, -1 on failure 532 int ST_CalcCommonSubtree(stree *t, stnode *n, lcatbl *lca_table, stnode **lca, avl_tree *string_ids, u_int32_t *num_of_substrings, int depth) { 533 u_int32_t id, start, end; 534 stnode *tmpnode, *mylca; 535 avl_node *a; 536 537 id = 0; 538 start = 0; 539 tmpnode = NULL; 540 541 if (string_ids == NULL) { 542 fprintf(stderr, "Error - Cannot extract substrings from GST: No AVL tree given.\n"); 543 return(-1); 544 } 545 546 // check if node is a leaf and if so, get corresponding string ID 547 if (!n->sons) { 548 void list_substrings(stree *t, stnode *cstr_node, ssize_t length) { 549 u_int32_t id, start, end, i; 550 stnode *n; 551 552 id = 0; 553 554 if (!cstr_node || !length) return; 555 556 n = cstr_node; 557 558 // DFS to find leaf nodes 559 for (;;) { 560 while (n->sons) n = n->sons; 561 562 // now we're at a leaf so we found a substring 548 563 start = n->edge_label_start; 549 564 end = get_node_label_end(t, n); … … 554 569 } 555 570 } 556 } 557 558 // track a list of found string IDs in an AVL tree 559 if (id) { 560 // prepare new node 561 if (((a = calloc(1, sizeof(avl_node))) == NULL) || 562 ((a->data = malloc(sizeof(id))) == NULL)) { 563 fprintf(stderr, "Error - Unable to create AVL node: %s.\n", strerror(errno)); 564 exit(EXIT_FAILURE); 565 } 566 memcpy(a->data, &id, sizeof(id)); 567 a->size = sizeof(id); 568 569 // lookup id in avl tree 570 if (avl_find(string_ids, string_ids->root, a) == NULL) { 571 // store leaf id in avl tree 572 avl_insert(string_ids, &(string_ids->root), a); 573 string_ids->size++; 571 572 // print substring and meta infos 573 start = n->path_position; 574 printf("common substring starts in string \033[1;31m[%u]\033[1;0m\t at offset \033[1;33m%6u\033[1;0m " 575 "(absolute offset %7u) and has length \033[1;32m%3u\033[1;0m (entropy: %f):\t'", 576 id, start-string_offset[id-1]-1, start-1, length, entropy(t->tree_string+start, length)); 577 for (i=0; i<length; i++) putchar(isprint(t->tree_string[i+start]) ? t->tree_string[i+start] : '.'); 578 printf("'\n"); 579 580 if (!n->right_sibling) { 581 n = n->father; 582 while (n != cstr_node && !n->right_sibling) n = n->father; 583 if (n == cstr_node) break; 584 } 585 n = n->right_sibling; 586 } 587 return; 588 } 589 590 void label(stree *t, stnode *n) { 591 u_int32_t start, end, id; 592 char *byte; 593 594 start = n->edge_label_start; 595 end = get_node_label_end(t, n); 596 597 for (; start <= end; start++) { 598 if (t->tree_string[start] && !(t->tree_string[start] & 0xffff00ff)) { 599 id = t->tree_string[start] >> 8; 600 printf("\033[1;31m[%u]\033[1;0m", id); 601 break; 574 602 } else { 575 // id is already present, free prepared node 576 free(a->data); 577 free(a); 578 } 579 580 // update lowest common ancestor 581 if (!*lca) *lca = n; 582 else if (*lca != t->root) (*lca = lca_lookup(lca_table, *lca, n)); 583 } else { 584 for (tmpnode = n->sons; tmpnode; tmpnode = tmpnode->right_sibling) 585 if (!tmpnode->flag) 586 if (ST_CalcCommonSubtree(t, tmpnode, lca_table, lca, string_ids, num_of_substrings, depth+1) == -1) return(-1); 587 } 588 589 // flag all nodes on the path from the root to the lca 590 if (string_ids->size == num_of_files) { 591 if (*lca && ((stnode *)*lca)->flag == 0) { 592 (*lca)->flag = 1; 593 (*num_of_substrings)++; 594 595 for (tmpnode = (*lca)->father; tmpnode && !tmpnode->flag && tmpnode != t->root; tmpnode = tmpnode->father) { 596 tmpnode->flag = 1; 597 (*num_of_substrings)++; 598 } 599 600 // recursively process subtrees below lca 601 for (tmpnode = (*lca)->sons; tmpnode; tmpnode = tmpnode->right_sibling) { 602 mylca = NULL; 603 avl_tree_reset(string_ids); // reset AVL tree 604 if (ST_CalcCommonSubtree(t, tmpnode, lca_table, &mylca, string_ids, num_of_substrings, 0) == -1) return(-1); 605 } 606 } 607 avl_tree_reset(string_ids); // reset AVL tree 608 *lca = NULL; 609 610 } 611 612 if (n->father == t->root) { 613 if (string_ids->root) { 614 avl_tree_reset(string_ids); // reset AVL tree 615 *lca = NULL; 616 } 617 } 618 619 return(0); 620 } 621 622 623 void get_substrings_from_subtree(stree *t, stnode *n, ssize_t min_length, u_char print_sslist, ssize_t length) { 624 u_int32_t id, start, end, i; 625 stnode *tmpnode; 626 627 id = 0; 628 629 if (!n) return; 630 631 if (!n->sons && length && length >= min_length) { 632 // it's a leaf so we found a substring 633 634 start = n->edge_label_start; 635 end = get_node_label_end(t, n); 636 if (start) for (start = n->edge_label_start; start <= end; start++) { 637 if (t->tree_string[start] && !(t->tree_string[start] & 0x000000ff)) { 638 id = t->tree_string[start] >> 8; 639 break; 640 } 641 } 642 643 start = n->path_position; 644 if (print_sslist) { 645 printf("common substring starts in string \033[1;31m[%u]\033[1;0m\t at offset \033[1;33m%6u\033[1;0m " 646 "(absolute offset %7u) and has length \033[1;32m%3u\033[1;0m:\t'", 647 id, start-string_offset[id-1]-1, start-1, length); 648 for (i=0; i<length; i++) putchar(isprint(t->tree_string[i+start]) ? t->tree_string[i+start] : '.'); 649 printf("'\n"); 650 } 651 return; 652 } 653 654 if (n->flag) length += n->edge_label_end - n->edge_label_start + 1; 655 656 for (tmpnode = n->sons; tmpnode; tmpnode = tmpnode->right_sibling) 657 get_substrings_from_subtree(t, tmpnode, min_length, print_sslist, length); 658 } 659 660 661 void get_substrings(stree *t, u_char print_sslist, ssize_t min_length) { 662 if (!t) return; 663 664 if (print_sslist) putchar('\n'); 665 get_substrings_from_subtree(t, t->root, min_length, print_sslist, 0); 666 if (print_sslist) putchar('\n'); 667 } 603 byte = (char *) &(t->tree_string[start]); 604 putchar(isprint(*byte) ? *byte : '.'); 605 } 606 } 607 } 608 609 610 ssize_t calc_common_subtree(stree *t, substr_list *cstr_list, ssize_t min_length, lcatbl *lca_table, stnode **leaves, ssize_t num_leaves, ssize_t num_of_files) { 611 u_int32_t id, num_cstr, left; 612 ssize_t i, nl; 613 stnode *lca, *tmp, **l; 614 615 num_cstr = 0; 616 nl = 0; 617 left = 0; 618 619 if ((l = calloc(num_of_files, sizeof(stnode *))) == NULL) { 620 fprintf(stderr, "Error - Unable to allocate memory: %m.\n"); 621 exit(EXIT_FAILURE); 622 } 623 624 t->root->flag = 1; // always flag the root node 625 626 // process leaves from left to right by sequentially moving left and right interval boundaries 627 for (i=0; i<num_leaves; i++) { 628 id = leaves[i]->string_id; // get string ID of current leaf 629 630 if (nl == 0) { 631 // spool over equal string IDs to find left boundary 632 while (i+1 < num_leaves && leaves[i+1]->string_id == leaves[i]->string_id) i++; 633 634 // initialize left interval boundary 635 left = i; 636 } 637 638 if (!l[id-1]) nl++; // increase leaf counter if list entry for ID is empty 639 l[id-1] = leaves[i]; // update list entry so that it always contains the right-most leaf for this ID 640 641 // check if left interval boundary must be moved 642 if (left < i && id == leaves[left]->string_id) { 643 left++; 644 645 // spool over equal string IDs to find new left boundary 646 while (left < i && leaves[left+1]->string_id == leaves[left]->string_id) { 647 left++; 648 } 649 l[leaves[left]->string_id-1] = leaves[left]; 650 } 651 652 if (nl == num_of_files) { 653 // interval contains every ID at least one time, so it covers a common subtree 654 655 lca = lca_lookup(lca_table, leaves[left], leaves[i]); 656 if (!lca->flag) { 657 num_cstr++; 658 substr_add(cstr_list, lca, get_node_label_end(t, lca)-lca->path_position+1); 659 } 660 661 // flag all nodes on the (reverse) path from the root to the lca 662 for (tmp = lca; !tmp->flag; tmp = tmp->father) tmp->flag = 1; 663 664 if (i+1 < num_leaves && lca_lookup(lca_table, lca, leaves[i+1]) != lca) { 665 // next leaf and current lca are in differen subtrees, so we can reset the ID interval 666 nl = 0; 667 memset(l, 0, num_of_files * sizeof(stnode *)); 668 } else { 669 // remove ID corresponding to the left interval boundary from the interval 670 l[leaves[left]->string_id-1] = NULL; 671 nl--; 672 left++; 673 674 // spool over equal string IDs to find new left boundary 675 while (left < i && leaves[left+1]->string_id == leaves[left]->string_id) left++; 676 l[leaves[left]->string_id-1] = leaves[left]; 677 } 678 } else { 679 if (lca_lookup(lca_table, leaves[left], leaves[i]) == t->root) { 680 // interval does not cover a real subtree, so skip it 681 memset(l, 0, num_of_files * sizeof(stnode *)); 682 683 // now restore list entry 684 l[id-1] = leaves[i]; 685 left = i; 686 nl = 1; 687 } 688 } 689 } 690 free(l); 691 692 return(num_cstr); 693 } nebula/trunk/gst/stree.h
r1524 r1546 42 42 u_int32_t edge_label_start; /* Start index of the incoming edge */ 43 43 u_int32_t edge_label_end; /* End index of the incoming edge */ 44 u_int32_t string_id; 44 45 u_int32_t id; 45 46 u_char flag; … … 55 56 } stree; 56 57 57 typedef struct ss_list { 58 59 60 typedef struct substr { 61 ssize_t len; 58 62 stnode *n; 59 struct ss_list *next;60 63 } substr; 61 64 62 substr *substr_list; 65 66 typedef struct substr_list { 67 ssize_t len; 68 substr *elem; 69 } substr_list; 63 70 64 71 … … 81 88 u_int32_t ST_SelfTest(stree* tree); 82 89 stree *ST_AddStringList(stree *tree, lchar *str, u_int32_t length); 83 int ST_CalcCommonSubtree(stree *t, stnode *n, lcatbl *lca_table, stnode **lca, avl_tree *string_ids, u_int32_t *num_of_substrings, int depth); 84 void set_ids(stnode *n, u_int32_t *id); 85 void get_substrings(stree *t, u_char print_sslist, ssize_t min_length); 90 void set_ids(stree *t, stnode *n, stnode ***leaves, u_int32_t *num_leaves, u_int32_t *id); 91 void get_substring_endnodes(stnode *root, substr_list *cstr_list, ssize_t min_length); 92 void list_substrings(stree *t, stnode *cstr_node, ssize_t length); 93 ssize_t calc_common_subtree(stree *t, substr_list *cstr_list, ssize_t min_length, lcatbl *lca_table, stnode **leaves, ssize_t num_leaves, ssize_t num_of_files); 94 avl_tree *build_substring_list(stree *t, ssize_t min_length); 86 95 87 96 // lca prototypes nebula/trunk/gst/util.c
r1524 r1546 1 1 /* util.c 2 2 * 3 * Copyright (C) 2007 Tillmann Werner <tillmann.werner@gmx.de>3 * Copyright (C) 2007-2008 Tillmann Werner <tillmann.werner@gmx.de> 4 4 * 5 5 * This program is free software; you can redistribute it and/or modify … … 20 20 21 21 #include <fcntl.h> 22 #include <math.h> 22 23 #include <stdlib.h> 23 24 #include <stdio.h> … … 79 80 80 81 82 inline double entropy(lchar *data, ssize_t len) { 83 ssize_t i; 84 u_int32_t bytefreqs[256]; 85 double e; 86 double log2; 87 88 e = 0; 89 log2 = log(2); // factor for changing the log base 90 91 // calculate byte frequencies 92 memset(bytefreqs, 0, 256 * sizeof(u_int32_t)); 93 for (i=0; i<len; i++) bytefreqs[(u_char) data[i]]++; 94 95 96 // calculate entropy 97 for (i=0; i<=255; i++) 98 if (bytefreqs[i]) 99 e -= (double)bytefreqs[i]/(double)len * log((double)bytefreqs[i]/(double)len) / log2; 100 101 return(e); 102 } 103 104 81 105 void avl_subtree_print(avl_node *n) { 82 106 if (n == NULL) return; nebula/trunk/gst/util.h
r1524 r1546 1 1 /* util.h 2 2 * 3 * Copyright (C) 2007 Tillmann Werner <tillmann.werner@gmx.de>3 * Copyright (C) 2007-2008 Tillmann Werner <tillmann.werner@gmx.de> 4 4 * 5 5 * This program is free software; you can redistribute it and/or modify … … 44 44 void cleanup(void); 45 45 void avl_subtree_print(avl_node *n); 46 inline double entropy(lchar *data, ssize_t len); 46 47 int idcmp(const void *a, ssize_t sizea, const void *b, ssize_t sizeb); 47 48
