C语言实现 trie 统计单词频率

前面在解决圣经背诵问题使用 C 语言解决圣经背诵问题中,涉及到一个问题,就是统计文件中各个单词数量的问题,在网上找到了一个解决办法就是 Trie。
Trie,又称单词查找树或键树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计和排序大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:最大限度地减少无谓的字符串比较,查询效率比哈希表高。

依照基本原理实现了一个基于链表的 Trie
一下是 trie.h 的代码

#ifndef _ZPZ_TRIE_H
#define _ZPZ_TRIE_H
#include <stdio.h>
#include <stdlib.h>

typedef int (*trie_char_to_index)(char c);
typedef char (*trie_index_to_char)(int i);

typedef struct _trie_node{
	int count;
	int depth;
	struct _trie_node ** list;
} trie_node;

typedef struct _trie_tree{
	int char_width;
	int max_depth;
	trie_node * root;
	trie_char_to_index char_to_index;
	trie_index_to_char index_to_char;
} trie_tree;

inline trie_node * trie_node_new(int depth){	
	trie_node * node = (trie_node *)malloc(sizeof(trie_node));
	if(NULL != node){
		memset(node, 0, sizeof(trie_node));
		node->depth = depth;
	}
	return node;
}

static inline trie_node ** trie_node_alloc_children(int char_width){
	trie_node ** nodes = (trie_node **)malloc(char_width * sizeof(trie_node *));
	if(NULL != nodes){
		memset(nodes, 0, char_width * sizeof(trie_node *));
	}
	return nodes;
}

inline trie_tree * trie_init(
	trie_tree * tree,
	int char_width,
	trie_char_to_index char_to_index,
	trie_index_to_char index_to_char
){
	memset(tree, 0, sizeof(trie_tree));
	tree->char_to_index = char_to_index;
	tree->index_to_char = index_to_char;
	tree->root = trie_node_new(0);
	tree->char_width = char_width;
	if(NULL != tree->root){
		tree->root->list = trie_node_alloc_children(tree->char_width);
	}
}

int trie_add(trie_tree * tree, char * word, int len){
	char * p = word, * end = p + len;
	trie_node * trie =  tree->root;
	int index, depth = 0;
	if(NULL == p || end == p){
		return 0;
	}
	while(p < end){
		index = tree->char_to_index(*p);
		if(NULL == trie->list){
			trie->list = trie_node_alloc_children(tree->char_width);
		}
		if(NULL == trie->list){
			return 0;
		}
		depth++;
		if(NULL == trie->list[index]){
			trie->list[index] = trie_node_new(depth);
		}
		if(NULL == trie->list[index]){
			return 0;
		}
		trie = trie->list[index];
		p++;
	}
	if(depth > tree->max_depth + 1){
		tree->max_depth = depth - 1;
	}
	trie->count++;	
	return 1;
}

#define trie_add_ex(tree,word) trie_add((tree), (word), strlen(word))

static int trie_node_release(trie_node * node, int char_width){
	int i;
	if(NULL != node->list){
		for(i = 0; i < char_width; i++){
			if(NULL != node->list[i]){
				trie_node_release(node->list[i], char_width);
			}
		}
		free(node->list);
	}
	free(node);
}

#define trie_release(tree) 	\
	trie_node_release((tree)->root, (tree)->char_width);\

trie_node * trie_find(trie_tree * tree, char * word, int len){
	int index;
	char * p = word, * end = p + len;
	trie_node * trie =  tree->root;
	if(NULL == p || p == end){
		return NULL;
	}
	while('\0' != *p){
		if(NULL == trie->list){
			return NULL;
		}
		index = tree->char_to_index(*p);
		if(NULL == trie->list[index]){
			return NULL;
		}
		trie = trie->list[index];
		p++;
	}
	return trie;

}
#define trie_find_ex(tree,word) trie_find((tree),(word),strlen(word))

static inline void trie_node_dump(
	trie_tree * tree,
	trie_node * node,
	char * word
){
	int i = 0;
	if(node->count > 0){
		printf("%-8d %s\n", node->count, word);
	}
	if(NULL != node->list){
		for(; i < tree->char_width; i++){
			if(NULL != node->list[i]){
				word[node->depth] = tree->index_to_char(i);
				word[node->depth + 1] = '\0';
				trie_node_dump(tree, node->list[i], word);
			}
		}
	}
}

void trie_dump(trie_tree * tree){
	char * word = (char *)malloc(tree->max_depth + 1);
	memset(word, 0, tree->max_depth + 1);
	trie_node_dump(tree, tree->root, word);
}

static inline int char_to_index(char c){
	if('0' <= c && c <= '9'){
		return c - '0';
	}
	else if('a' <= c && c <= 'z'){
		return 10 + c - 'a';
	}
	else if('A' <= c && c <= 'Z'){
		return 10 + 26 + c - 'A';
	}
	return 10 + 26 + 26;
}

static inline char index_to_char(int index){
	if(0 <= index && index < 10){
		return '0' + index;
	}
	else if(10 <= index && index < (10 + 26)){
		return 'a' + (index - 10);
	}
	else if((10 + 26) <= index && index < (10 + 26 + 26)){
		return 'A' + (index - 10 - 26);
	}
	return ' ';
}

inline trie_init_alnum(trie_tree * tree){
	trie_init(tree, 26 + 26 + 10 + 1, char_to_index, index_to_char);
}

#endif

下面实现了一个单词统计的程序:

#include "trie.h"

typedef void (*word_handler)(char * start, char * end);

#define is_digit_or_letter(c) (((c) >='a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || ((c) >= 'A' && (c) <= 'Z'))

void find_word_in_buffer(
	char * start,
	int len,
	word_handler handler
){
	int stat = 0;
	char * end = start + len, * last = start;

	while(start < end){
		//printf("%c", *start);
		if(is_digit_or_letter(*start)){
			if(0 == stat){
				last = start;	
				stat = 1;
			}
		}
		else{
			if(1 == stat){
				//word
				handler(last, start);
				stat = 0;
			}
		}
		start++;
	}
	if(1 == stat && last < start){
		handler(last, start);
	}
}

void count_words_of_file(
	FILE * file,
	word_handler handler
){
	int line = 1,  words = 1;
	long file_size = 0;
	char * buffer;

	fseek (file, 0 , SEEK_END);
	file_size = ftell(file);
	rewind (file);

	buffer = (char *)malloc(file_size);

	if(NULL == buffer){
		printf("Alloc failed");
		return;
	}

	fread(buffer, file_size, 1, file);
	find_word_in_buffer(buffer, file_size, handler);
}

static trie_tree tree;

void count_words_handler(char * begin, char * end){
	trie_add(&tree, begin, end - begin);
}

int main(int argc, char ** argv){
	if(argc < 2){
		printf("parameter 2 must be the file name.\n");
		return 1;
	}
	FILE * file = fopen(argv[1], "r");
	if(NULL == file){
		printf("File %s not exitst.", argv[1]);
		return 2;
	}
	trie_init_alnum(&tree);
	count_words_of_file(file, count_words_handler);
	fclose(file);
	trie_dump(&tree);
	trie_release(&tree);
	return 0;
}

参考:http://zh.wikipedia.org/wiki/Trie

Post a Comment

Your email is never shared. Required fields are marked *

*
*