C语言如何对包含一千万不重复整数的文件进行排序

《编程珠玑》中有着样一题:
限制使用内存 1M
磁盘大小不限
输入为一个文件,此文件包含最多n个正整数,每个数都小于 n,n=10000000 ,所有的正整数不重复
要求对文件输入的整数进行升序排序

常规解法是分块排序,这种解法易懂,也容易实现,实现步骤为,
将 1-10000000个整数分为 40部分,0 – 249 999、250 000- 499 999 。。。
分为 40次,每次遍历一次文件中的所有数字并找出当前分段中的存在的数据
并将此部分中的数据存储到文件中。

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#define MAXSIZE 250000
static inline int comp(const void * p, const void * q){
    return ( * ( int * ) p - * ( int * ) q) ;
}

int main(){
	printf("Start \n");
	time_t starttime = clock(), endtime;
	int *s = (int *)malloc(1000000);
	int *t;
	FILE *p,*pw;
	pw = fopen("qsort-result.txt","w");
	int temp,count=0,i,round;
	for(round=0;round<40;round++){
		t=s; count=0;
		p = fopen("test.txt","r");
		while(fscanf(p,"%d",&temp)!=-1){
			if(temp>=MAXSIZE*round && temp<(MAXSIZE*(round+1))){
				*t++ = temp;
				++count;
			}
		}
		qsort(s, count, sizeof(int), comp);
		for(i=0;i<count;i++)
			fprintf(pw,"%d\n",s[i]);
	}
	endtime = clock();
	printf("Time : %d\n", endtime - starttime);
	return 0;
}

一种比较奇妙的算法就是位图法,使用1000 0000 个位,每一个位标志
一个数字,为 1表示对应的数字存在,否则就是不存在,这样就能极大程度节省了空间
并且将只需要扫描一遍文件,并且不需排序比较。

#include <stdio.h>
#include <stdlib.h>

#define WORD_LENGTH 32
#define SHIFT 5
#define MASK 0x1F

#define NUMBER_COUNT 10000000
#define ALLOC_COUNT (1+(NUMBER_COUNT / WORD_LENGTH))

static unsigned int bitmap[ALLOC_COUNT];

inline void bitmap_set(unsigned int i){
	bitmap[i >> SHIFT] |= (1 << (i & MASK));
}

inline void bitmap_clear(unsigned int i){
	bitmap[i >> SHIFT] &= ~(1 << (i & MASK));
}

inline int bitmap_test(unsigned int i){
 	return bitmap[i >> SHIFT] & (1 << (i & MASK));
}

void do_sort(char * org_file_name, char * result_file_name){
	int i;
	FILE * org_file, * result_file;

	org_file = fopen(org_file_name, "r");
	result_file = fopen(result_file_name, "w");

	if(NULL == org_file){
		printf("Faild to open org File\n");
		return ;
	}
	if(NULL == result_file){
		printf("Faild to open result File\n");
		return ;
	}

	i = NUMBER_COUNT;
	while(i--){
		bitmap_clear(i);
	} 

	while(EOF != fscanf(org_file, "%d", &i)){
		bitmap_set(i);
	}
	for(i = 0; i < NUMBER_COUNT; i++){
		if(bitmap_test(i)){
			fprintf(result_file, "%d\n", i);
		}
	}

	fclose(org_file);
	fclose(result_file);
}

int main(int argc, char ** argv){
	printf("Start\n");
	do_sort("test.txt", "result.txt");
	printf("Finished\n");
}

下面有一段简单的 PHP 代码创建测试文件

#!/usr/bin/php
<?php
$aArr = array();
$iC = 10000000 - rand(100, 10000);
for($i = 0; $i < $iC; ){
	$n = rand(0, 10000000);
	if(!isset($aArr[$n])){
		$aArr[$n] = true;
		$i++;
	}
}

$hFile = fopen('test.txt', 'wa');
foreach($aArr as $iKey => $bTrue){
	if($bTrue){
		fwrite($hFile, $iKey . "\n");
	}
}
fclose($hFile);

这是一个使用 c 语言生成的程序:

#include <stdio.h>
#include <stdlib.h>

#define NUMBER_COUNT 10000000
int  vector[NUMBER_COUNT];

static inline void swap(int i, int j){
	int t = vector[i];
	vector[i] = vector[j];
	vector[j] = t;
}

static void create(char * result_file_name, int create_count){
	int i;
	FILE * result_file;

	result_file = fopen(result_file_name, "w");

	if(NULL == result_file){
		printf("Faild to open result File\n");
		return ;
	}

	i = NUMBER_COUNT;
	while(i--){
		vector[i] = i;
	}

	printf("Number to Create %d \n", create_count);
	while(create_count--){
		swap(create_count, (rand() % NUMBER_COUNT));
		fprintf(result_file, "%d\n", vector[create_count]);
	}

	fclose(result_file);
}

int main(int argc, char ** argv){
	printf("Start\n");
	int create_count = NUMBER_COUNT, delta = 0;

	if(argc > 0){
		if(argc > 1){
			delta = atoi(argv[2]);
			if(delta > 0 && delta < NUMBER_COUNT){
				create_count  -= delta;
			}
		}

		create(argv[1], create_count);
		printf("Finished\n");
	}
}

Post a Comment

Your email is never shared. Required fields are marked *

*
*