Sunday, January 31, 2010

Grok

A friend of mine wrote a Python script to replace Ack (which is much faster than grep) and the resulting script turned out to be significantly faster than it. I decided then to write my own rough equivalent, called 'grok' (name from another similar program):

grok.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pcre.h>
#include <dirent.h>
#include <limits.h>
#define BLOCK_SIZE 1024
#define OFFSET_COUNT 1
struct {
char * bytes;
off_t length;
} blocks = {NULL, 0};


int search(const char * path, pcre * re) {
FILE * input = fopen(path, "r");
off_t n = 1;
int ovector[OFFSET_COUNT];
int rc;
char first_match = 1;
if(!input) {
perror(path);
return 1;
}
if(!blocks.bytes && !blocks.length) {
// Initialize blocks on first time
blocks.bytes = malloc(BLOCK_SIZE);
memset(blocks.bytes, '\0', BLOCK_SIZE);
blocks.length = BLOCK_SIZE;
}
// Search
while(!feof(input)) {
// Read a line
blocks.bytes[blocks.length - 2] = '\0'; // [blocks.length - 1] will always be '\0' due to fgets() behavior
fgets(blocks.bytes, blocks.length, input);
//printf("%lu/%lu; %d\n", strlen(blocks.bytes), blocks.length, blocks.bytes[blocks.length - 2]);
while(!feof(input) && blocks.bytes[blocks.length - 2] && blocks.bytes[blocks.length - 2] != '\n') {
// Expand blocks as necessary
blocks.length += BLOCK_SIZE;
blocks.bytes = realloc(blocks.bytes, blocks.length);
blocks.bytes[blocks.length - 2] = '\0';
fgets(blocks.bytes + strlen(blocks.bytes), BLOCK_SIZE + 1, input);
//printf("%lu/%lu; %d\n", strlen(blocks.bytes), blocks.length, blocks.bytes[blocks.length - 2]);
}
if(!(feof(input) && !*blocks.bytes)) {
n++;
rc = pcre_exec(re, NULL, blocks.bytes, strlen(blocks.bytes), 0, 0, ovector, OFFSET_COUNT);
if(rc < 0) {
switch(rc) {
case PCRE_ERROR_NOMATCH:
break;
case PCRE_ERROR_BADUTF8:
fprintf(stderr, "Bad UTF-8 at line %lu in %s\nSkipping file (try running with -U option to disable Unicode).\n", n, path);
fclose(input);
return 1;
break;
default:
fprintf(stderr, "Error: %d\n", rc);
break;
}
blocks.bytes[0] = '\0';
continue;
}
if(first_match) {
first_match = 0;
printf("%s:\n", path);
}
printf("%6lu:%s", n, blocks.bytes);
blocks.bytes[0] = '\0';
}
}
// Cleanup
fclose(input);
return 0;
}

int recursive_search(const char * path, pcre * re) {
DIR * dirinfo;
struct dirent * file;
struct stat info;
char fullpath[PATH_MAX], * filepart;
if(!stat(path, &info)) {
if(S_ISREG(info.st_mode)) {
// Regular files
if(search(path, re) == -1)
return -1;
}
else if(S_ISDIR(info.st_mode)) {
// Directories
strcpy(fullpath, path);
strcat(fullpath, "/");
filepart = fullpath + strlen(fullpath);
dirinfo = opendir(path);
while((file = readdir(dirinfo)) != NULL) {
if(*file->d_name != '.') {
strcpy(filepart, file->d_name);
if(recursive_search(fullpath, re) == -1)
return -1;
}
}
closedir(dirinfo);
}
}
else
perror(path);
return 0;
}


int main(int argc, const char * argv[]) {
// Defaults
const char * default_dirs[] = {"."};
const char * progname = *argv;
// General Variables
const char ** dirs;
int erroroffset;
int options = PCRE_UTF8;
off_t i, dirs_length;
const char * error;
pcre * re;
// Initialization
argc--; argv++;
// Parse flags
while(argc && (*argv)[0] == '-' && (*argv)[1] != '-') {
switch((*argv)[1]) {
case 'i':
options |= PCRE_CASELESS;
case 'U':
options &= ~PCRE_UTF8;
case '\0': break;
default:
fprintf(stderr, "Invalid option: %s", *argv);
return -1;
break;
}
argc--; argv++;
}
// Parse arguments
if(argc) {
re = pcre_compile(*argv, options, &error, &erroroffset, NULL);
if(argc > 1) {
dirs = argv + 1;
dirs_length = argc - 1;
}
else {
dirs = default_dirs;
dirs_length = 1;
}
}
else {
fprintf(stderr, "Usage: %s [ -i ] [ -u ] expr [ path1 .. pathN ]\n", progname);
return 1;
}
if(!re) {
fprintf(stderr, "PCRE compilation error at offset %d: %s\n", erroroffset, error);
return 2;
}
// Recursive search
for(i = 0; i < dirs_length; i++) {
if(recursive_search(dirs[i], re) == -1) {
fputs("Ran out of memory.", stderr);
return 128;
}
}
// Cleanup
if(blocks.bytes && blocks.length) {
free(blocks.bytes);
blocks.bytes = NULL;
blocks.length = 0;
}
pcre_free(re);
return 0;
}

As it turns out, my program is able to run twice as fast as his on a given directory tree (small to large).