使用 C 语言解决圣经背诵问题

PHP 解决圣经背诵问题 这一篇中我们使用 PHP 解决了圣经背诵问题,但是程序运行速度不尽人意,一般要3秒或更多的时间才能得出结果。
我们可以使用 time 命令来看一下运行时间 (关于 time 命令的说明见这一篇 Linux 下使用 time 命令查看程序的执行时间)。
结果如下:
首先我们用下面的 shell 命令来初步统计一下单词的数量,这样便于验证结果的正确性。

tr -cs "[:alnum:]" "\n" < bible.txt | sort | uniq -c

我们在圣经中查找 apples ,这个单词出现了 3 次。

time php find-word-in-bible.php apples
apples	17125,10 17560,13 17636,42 [3]
real		3.85s
user		3.82s
sys		0.01s

消耗的时间比较多。
为了验证是算法导致,还是 PHP 本身本身导致,决定使用 C 实现同样的代码。
代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define is_digit_or_letter(c) (((c) >='a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || ((c) >= 'A' && (c) <= 'Z'))

static void str_copy(char * dest, char * src, unsigned int len){
	if(NULL == dest || NULL == src || len < 1){
		return;
	}

	while(len--){
		dest[len] = src[len];
		if('\n' == src[len]){
			return;
		}
	}
}

unsigned int str_len(char * str){
	unsigned int len = 0;
	if(NULL != str){
		while('\0' != *str++){
			len++;
		}
	}
	return len;
}

void find_word_in_buffer(
	char * word,
	int word_len,
	char * start,
	char * end,
	int * line,
	int * words,
	int * found
){
	int j;
	char c;

	while(start < end){
		c = *start++;
		if('\r' == c){
			if(start < end && '\n' == *start){
				start++;
			}
			(*line)++;
			(*words) = 1;
		}
		else if('\n' == c){
			(*line)++;
			(*words) = 1;
		}
		else if(is_digit_or_letter(c)){
			j = 1;
			while(start < end && j < word_len && (*start == word[j])){
				start++;
				j++;
			}
			if(j == word_len && !is_digit_or_letter(*start)){
				(*found)++;
				printf("%d,%d ", *line, *words);
			}
			while(start < end && is_digit_or_letter(*start)){
				start++;
			}
			(*words)++;
		}
	}
}

void find_word_in_bible_all(
	char * word,
	int word_len,
	FILE * bible_file,
	int * found
){
	int line = 1,  words = 1;
	long file_size = 0;
	char * buffer;

	fseek (bible_file , 0 , SEEK_END);
	file_size = ftell(bible_file);
	rewind (bible_file);

	buffer = (char *)malloc(file_size);

	if(NULL == buffer){
		printf("Alloc failed");
		return;
	}

	fread(buffer, file_size, 1, bible_file);
	find_word_in_buffer(
		word, word_len,
		buffer, buffer + file_size,
		&line,
		&words,
		found
	);
}

void find_word_in_bible(
	char * word,
	int word_len,
	FILE * bible_file,
	int * found,
	long buffer_size
){
	long file_size = 0, read_size = 0;
	int line = 1 ,words = 1, last_len = 0;
	char swap[256], * start, *end, *p, *buffer = (char *)malloc(buffer_size);

	if(NULL == buffer){
		printf("Alloc failed");
		return;
	}

	fseek (bible_file , 0 , SEEK_END);
	file_size = ftell(bible_file);
	rewind (bible_file);

	while(!feof(bible_file)){
		end = buffer + buffer_size;
		if((read_size + buffer_size) > file_size){
			end += (read_size - file_size);
		}

		fread(buffer, buffer_size, 1, bible_file);

		start  = buffer;
		while(start <= end && is_digit_or_letter(*start)){
			start++;
		}
		while(start <= end && !is_digit_or_letter(*start)){
			start++;
		}

		p = end = buffer + buffer_size;
		while(p >= start && !is_digit_or_letter(*p)){
			p--;
		}
		while(p >= start && is_digit_or_letter(*p)){
			p--;
		}
		p++;

		str_copy(swap + last_len, buffer, start - buffer);
		find_word_in_buffer(
			word, word_len,
			swap, (char *)((int)swap + last_len + start - buffer),
			&line, &words,
			found
		);

		last_len = end  - p;
		if(last_len > 0){
			str_copy(swap, p, last_len);
		}
		find_word_in_buffer(
			word, word_len,
			start, p,
			&line, &words,
			found
		);
	}

	find_word_in_buffer(
		word, word_len,
		swap, swap + last_len,
		&line, &words,
		found
	);
}

static unsigned int is_valid_word(char * word, int word_len){
	char * end = word + word_len;
	while(word < end){
		if(!is_digit_or_letter(*word)){
			return 0;
		}
		word++;
	}
	return 1;
}

int main(int argc, char ** argv){
	char * word = NULL;
	int word_len, found = 0;
	long buffer_size = 0;
	time_t ticks;
	char * bible_file = "bible.txt";
	FILE * file;

	if(argc < 2){
		printf("Parameter 1 (word) should not be empty.\n");
		return 1;
	}

	if(argc > 2){
		buffer_size = atol(argv[2]);
	}
	if(buffer_size <= 1024){
		buffer_size = 1024;
	}

	word = argv[1];
	word_len = str_len(word);
	if(word_len < 1){
		printf("\nSearch word should not be empty.");
		return 2;
	}

	if(0 == is_valid_word(word, word_len)){
		printf("\nSearch word can just contain char in a-z and 0-9.\n");
		return 3;
	}

	file = fopen(bible_file, "r");
	if(NULL == file){
		printf("Can't open bible file at : %s\n", bible_file);
		return 5;
	}

	printf("Find [%s] in bible\n", word);
	find_word_in_bible(word, word_len, file, &found, buffer_size);

	fclose(file);
	ticks = clock();
	printf(
		"\nAll %d found in %d'ms.\n",
		found,
		(ticks * 1000) / CLOCKS_PER_SEC
	);
	return 0;
}

测试一下:

time ./a.out apples
Find [apples] in bible
17140,10 17576,13 17652,42 
All 3 found in 50'ms.
real		0.05
user		0.05
sys		0.00

不错,竟然能之 50毫秒里面搞定,速度确实快了几个数量级啊。

One Trackback

  1. By C语言实现 trie 统计单词频率 – 薹翮 on 2012 年 6 月 16 日 at 下午 2:37

    […] 薹翮 我悄悄地离开,在昨天;今天我又静静地到来 SEO 查询工具   « 使用 C 语言解决圣经背诵问题 […]

Post a Comment

Your email is never shared. Required fields are marked *

*
*