/*
 * bench_select.c
 * Aaron Stump, June 2006
 *
 * Randomly select a subset of lines of a specified size from an ASCII text
 * data file, and print those lines to an output file.
 *
 * The C library's random number generator is seeded either with a seed
 * given on the command line, or with the sum of the seeds listed in a
 * seedfile. Every line of the seedfile should be of the form
 *
 *     [some text] : [integer]
 *
 * By default, the data file is stdin and the output file is stdout. The
 * names of all files can be specified using command-line arguments.
 *
 * Define BENCH_SELECT_NO_MAIN to compile the helper functions without the
 * command-line driver (e.g. to link them into a unit test).
 */

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The data file must have fewer than MAX_LINES lines. */
#define MAX_LINES 1048576
/* Size passed to fgets(); longer physical lines are read in several pieces. */
#define MAX_STR 65536

/*
 * Print the message s to stderr and terminate the program with status 1.
 * The message is written verbatim (it is never interpreted as a format).
 */
void die(const char *s)
{
    fputs(s, stderr);
    exit(1);
}

/*
 * Extract the integer from one seedfile line of the form
 * "[some text] : [integer]".
 *
 * The number is read after the first ':' that is followed by at least one
 * more character. Trailing blanks, tabs and a carriage return are accepted
 * before the end of the line. If no such ':' exists, or what follows it is
 * not a decimal integer that fits in an int, an error mentioning the line is
 * printed to stderr and the program exits with status 1.
 */
int parse_line(char *_buf)
{
    for (char *p = _buf; *p; p++) {
        char *end = NULL;
        long num;

        if (*p != ':' || p[1] == '\0')
            continue;

        errno = 0;
        num = strtol(p + 1, &end, 10 /* the base */);

        /* end == p + 1 means strtol() consumed no digits at all. */
        int bad = (end == p + 1) || errno == ERANGE ||
                  num < INT_MIN || num > INT_MAX;
        if (!bad) {
            while (*end == ' ' || *end == '\t' || *end == '\r')
                end++;
            bad = (*end != '\0' && *end != '\n');
        }
        if (bad) {
            fprintf(stderr, "Could not convert number for this line:\n%s\n",
                    _buf);
            exit(1);
        }
        return (int)num;
    }

    fprintf(stderr, "Did not find a \": [integer]\" in this line:\n%s\n",
            _buf);
    exit(1);
}

/*
 * Read seedfile to end-of-file and return the sum of the integers found by
 * parse_line() on each line. Any malformed line terminates the program
 * (see parse_line).
 */
int compute_seed(FILE *seedfile)
{
    char buf[MAX_STR + 10];
    /* Accumulate as unsigned: signed overflow would be undefined behavior. */
    unsigned int seed = 0;

    while (fgets(buf, MAX_STR, seedfile))
        seed += (unsigned int)parse_line(buf);

    return (int)seed;
}

/*
 * Read all lines of datafile, then write num_to_select distinct lines,
 * chosen with rand() after srand(seed), to outfile.
 *
 * - If num_to_select exceeds the number of lines read, a warning is printed
 *   to stderr and every line is written.
 * - If num_to_select is zero or negative, or the input is empty, nothing is
 *   written.
 * - A final input line lacking a newline gets one appended, so every
 *   output line is newline-terminated.
 * - The program exits with status 1 if the input reaches MAX_LINES lines,
 *   if memory runs out, or if writing fails.
 */
void select_data(FILE *datafile, FILE *outfile, int num_to_select, int seed)
{
    char buf[MAX_STR + 10];
    int numlines = 0;
    char **lines = malloc(sizeof *lines * MAX_LINES);

    if (lines == NULL)
        die("Out of memory.\n");

    /* read lines into lines array */
    while (fgets(buf, MAX_STR, datafile)) {
        size_t len = strlen(buf);
        /* +2: room for the terminator and for a newline appended below */
        char *copy = malloc(len + 2);

        if (copy == NULL)
            die("Out of memory.\n");
        memcpy(copy, buf, len + 1);
        lines[numlines++] = copy;

        if (numlines >= MAX_LINES)
            die("Datafile is too long (more than MAX_LINES lines).\n");
    }

    /* make sure last line read ends in a newline (append, never overwrite) */
    if (numlines > 0) {
        char *last = lines[numlines - 1];
        size_t len = strlen(last);

        if (len == 0 || last[len - 1] != '\n') {
            last[len] = '\n';
            last[len + 1] = '\0';
        }
    }

    if (num_to_select > numlines) {
        fprintf(stderr, "Warning, number to select exceeds number of lines in data file.\n");
        num_to_select = numlines;
    }

    srand((unsigned int)seed);

    /*
     * Draw indices until enough distinct lines have been emitted; an index
     * whose line was already used is simply redrawn. The draw sequence is
     * kept exactly as-is so that a given seed keeps selecting the same
     * lines. remaining <= numlines here, so the loop always terminates and
     * numlines is nonzero whenever the modulo is evaluated.
     */
    for (int remaining = num_to_select; remaining > 0;) {
        unsigned ind = ((unsigned)rand()) % (unsigned)numlines;

        if (lines[ind] == NULL)
            continue;

        /* fputs, not fprintf: data lines may contain '%' characters */
        if (fputs(lines[ind], outfile) == EOF)
            die("Error writing to output file.\n");
        free(lines[ind]);
        lines[ind] = NULL;
        remaining--;
    }

    /* release the lines that were not selected, and the array itself */
    for (int i = 0; i < numlines; i++)
        free(lines[i]);
    free(lines);
}

#ifndef BENCH_SELECT_NO_MAIN

/*
 * Return the argument following argv[*curarg] and advance *curarg past it;
 * die with missing_msg if there is no such argument.
 */
static const char *option_value(int argc, char **argv, int *curarg,
                                const char *missing_msg)
{
    if (*curarg + 1 >= argc)
        die(missing_msg);
    *curarg += 1;
    return argv[*curarg];
}

/*
 * Convert text to a long in base 10; die with errmsg if text is empty, has
 * trailing garbage, or is out of range for long.
 */
static long parse_long_or_die(const char *text, const char *errmsg)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10 /* the base */);
    if (end == text || *end != '\0' || errno == ERANGE)
        die(errmsg);
    return value;
}

/*
 * Command-line driver. Options are matched by prefix ("--seedf", "--seed",
 * "--d", "--o", "--n", "--h"); arguments that match none of them are
 * ignored. A seed given with --seed takes precedence over --seedfile; one
 * of the two is required.
 */
int main(int argc, char **argv)
{
    FILE *seedfile = NULL;
    FILE *datafile = stdin;
    FILE *outfile = stdout;
    int have_seed = 0;
    int seed = 0;
    int num_to_select = 1;

    for (int curarg = 1; curarg < argc; curarg++) {
        const char *opt = argv[curarg];

        /* "--seedf" must be tested before its prefix "--seed" */
        if (strncmp("--seedf", opt, 7) == 0) {
            const char *name = option_value(argc, argv, &curarg,
                "Name of seedfile must follow --seedfile.\n");

            if (!(seedfile = fopen(name, "r")))
                die("Couldn't open seed file for reading.\n");
        }
        else if (strncmp("--seed", opt, 6) == 0) {
            const char *text = option_value(argc, argv, &curarg,
                "Seed must follow --seed.\n");
            long value = parse_long_or_die(text,
                "Seed could not be converted to an int.\n");

            /* reduce to unsigned int first, as srand() will see it */
            seed = (int)(unsigned int)value;
            have_seed = 1;
        }
        else if (strncmp("--d", opt, 3) == 0) {
            const char *name = option_value(argc, argv, &curarg,
                "Name of datafile must follow --datafile.\n");

            if (!(datafile = fopen(name, "r")))
                die("Couldn't open data file for reading.\n");
        }
        else if (strncmp("--o", opt, 3) == 0) {
            const char *name = option_value(argc, argv, &curarg,
                "Name of output file must follow --outfile.\n");

            if (!(outfile = fopen(name, "w")))
                die("Couldn't open output file for writing.\n");
        }
        else if (strncmp("--n", opt, 3) == 0) {
            const char *text = option_value(argc, argv, &curarg,
                "Number of lines to select must follow --num.\n");
            long value = parse_long_or_die(text,
                "Number of lines to select could not be converted to an int.\n");

            if (value < INT_MIN || value > INT_MAX)
                die("Number of lines to select could not be converted to an int.\n");
            num_to_select = (int)value;
        }
        else if (strncmp("--h", opt, 3) == 0) {
            fprintf(stderr,
                    "%s [options]\n\n"
                    "Select lines randomly from a datafile.\n\n"
                    "  --seedfile [name of seed file]\n"
                    "  --seed [integer seed (takes precedence over --seedfile)]\n"
                    "  --num [integer for number of lines to select from the datafile]\n"
                    "  --datafile [name of data file (stdin by default)]\n"
                    "  --outfile [name of file to which to write output (stdout by default)]\n",
                    argv[0]);
            exit(1);
        }
    } // end for

    if (have_seed) {
        select_data(datafile, outfile, num_to_select, seed);
    }
    else if (seedfile != NULL) {
        seed = compute_seed(seedfile);
        fprintf(stderr, "Computed seed %d.\n", seed);
        select_data(datafile, outfile, num_to_select, seed);
    }
    else {
        die("No seed or seedfile specified (use --seed or --seedfile).\n");
    }

    if (datafile != stdin)
        fclose(datafile);
    /* fclose() flushes; a failure here means the output is incomplete */
    if (outfile != stdout && fclose(outfile) != 0)
        die("Error closing output file.\n");
    if (seedfile)
        fclose(seedfile);

    return 0;
}

#endif /* BENCH_SELECT_NO_MAIN */