/* Copyright 2006 Neil Edelman, distributed under the terms of the GNU General Public License, see copying.txt */

/* Sunday, April 2, 2006 */

/* Find Patterns: reads printable text from a file or stdin and reports every
 sequence that ends at the current input position and also occurs earlier in
 the input without overlapping it. */

#include <stdlib.h> /* realloc free strtoul */
#include <stdio.h>  /* FILE printf fprintf fopen fclose fgetc putchar */
#include <limits.h> /* ULONG_MAX */

/* constants */
static const char *programme  = "Find Patterns";
static const char *year       = "2006";
static const int versionMajor = 1;
static const int versionMinor = 0;

/* One recorded match. Matches live in a growable array and are chained into a
 singly-linked list by array index; ULONG_MAX is the end-of-list marker. */
typedef struct TAG_match_t {
	unsigned long next;  /* index of the next match in list order, or ULONG_MAX */
	unsigned long pos;   /* offset in the input buffer of the earlier occurrence */
	unsigned long len;   /* number of symbols in the match */
	unsigned long count; /* how many times this (pos, len) pair has been found */
} match_t;

int main(int argc, char **argv);
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr);
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr);
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo);

/* bits of the flag word; bit n corresponds to character n of "beinosvh?" */
#define MATCHBUF    1
#define ECHO        2
#define NOCASE      4
#define NOSHOWBUF   8
#define INTERACTIVE 16
#define SILENT      32
#define VERBOSE     64
#define HELP        (128 | 256)

#define DEF_MATCHLIMIT 4096
#define DEF_MINLENGTH  3
#define DEF_MEMGRAN    1024

/* Parses the switches, reads the input one character at a time, and after
 each stored character searches for earlier copies of every suffix that is at
 least minLength symbols long and at most half of the input read so far.
 Returns 0 on success, 1 if help was shown or processing was cut short, 2 if
 the input could not be opened, 3 on a nonsensical argc. */
int main(int argc, char **argv) {
	match_t        *matchBuf = 0, *newMatchBuf;
	FILE           *in;
	size_t         matchBufSize = 0, matchBufPos = 0;
	size_t         bufSize = 0, bufPos = 0;
	unsigned long  hiPos, loPos, hiDat, loDat, firstMatch = ULONG_MAX, numMatches = 0;
	unsigned long  flags = 0, matchLimit = DEF_MATCHLIMIT, minLength = DEF_MINLENGTH, memGran = DEF_MEMGRAN, matchMemGran;
	int            readValue;
	unsigned short matched, worked = 1;
	char           *buf = 0, *newBuf, ch, *filename = 0;

	/* skip the first command argument */
	argv++;
	if(--argc < 0) {
		printf("Invoked with erreneous argument data (%d arguments.)\n", argc);
		return 3;
	}

	/* check for the binary flags */
	set_binary_switch_string(argc, argv, "beinosvh?", &flags);
	if(flags & HELP) {
		/*              "________________________________________________________________________________" */
		fprintf(stderr, "Version %d.%d.\n\n", versionMajor, versionMinor);
		fprintf(stderr, "%s Copyright %s Neil Edelman\n", programme, year);
		fprintf(stderr, "This program comes with ABSOLUTELY NO WARRANTY.\n");
		fprintf(stderr, "This is free software, and you are welcome to redistribute it\n");
		fprintf(stderr, "under certain conditions; see copying.txt.\n\n");
		printf("Searches through input and outputs sequences that are repeated. Because it's\n");
		printf("intended for text files, control characters are ignored.\n\n");
		printf("FINDPAT [filename] [-b] [-e] [-i] [-o] [-v] [-m<n>] [-l<n>] [-g<n>] [-?|h]\n\n");
		printf(" filename Attempt to read input from this file, otherwise uses stdin.\n");
		printf(" -b Keep a buffer to count repeated matches (!o -> b.)\n");
		printf(" -e Echo input.\n");
		printf(" -i Case-insensitive (not implemented.)\n");
		printf(" -n Don't display matches at the end.\n");
		printf(" -o Output matches immediately as they are found.\n");
		printf(" -s Silent mode - plain output with no extra characters.\n");
		printf(" -v Verbose comments while outputting.\n");
		printf(" -g<n> Set memory buffer granularity to the closest power of two\n");
		printf(" lower than <n> bytes (default 1024.)\n");
		printf(" -l<n> Set match limit to <n> matches (default 4096; 0 -> no limit.)\n");
		printf(" -m<n> Set minimum match length to <n> symbols (default 3).\n");
		printf(" -?|h Display this help screen and exit.\n\n");
		printf(" Adding -<switch>- will turn off switch <switch>.\n");
		return 1;
	}

	/* without immediate output the match buffer is the only way to report */
	if(!(flags & INTERACTIVE)) {
		flags |= MATCHBUF;
	}
	if((flags & VERBOSE)) printf("Octal flags: 0%o.\n", (unsigned int)flags);

	/* set memory buffer granularity (round it down to a power of two; the
	 masks below rely on that) and derive the match buffer granularity */
	set_ulong_flag(argc, argv, 'g', &memGran);
	{
		unsigned int memPower;
		for(memPower = 0; memGran >>= 1; memPower++);
		/* 1UL: memPower can reach the width of unsigned long minus one */
		memGran = 1UL << memPower;
	}
	if(memGran > 16) matchMemGran = memGran >> 4;
	else             matchMemGran = 1;
	if((flags & VERBOSE)) printf("Allocation granularity: %lu.\n", memGran);

	/* set the limit on the number of matches */
	set_ulong_flag(argc, argv, 'l', &matchLimit);
	if((flags & VERBOSE)) printf("Match limit: %lu.\n", matchLimit);

	/* get the minimum match length */
	set_ulong_flag(argc, argv, 'm', &minLength);
	if(!minLength) minLength = 1;
	if((flags & VERBOSE)) printf("Minimum length: %lu.\n", minLength);

	/* get the filename for input */
	if((filename = get_unswitched_arg(argc, argv, 0))) {
		if(!(in = fopen(filename, "r"))) {
			printf("Error opening file \"%s\" for reading.\n", filename);
			return 2;
		}
	} else if(!(in = stdin)) {
		printf("Error reading from stdin.\n");
		return 2;
	}
	if((flags & VERBOSE) && filename) printf("Opened file \"%s\" at %p.\n", filename, (void *)in);

	/* explain what's going on so those who unwittingly run the app won't get stuck */
	if(!(flags & SILENT)) printf("Finding repeated substrings; EOF (UNIX Ctrl-D, DOS Ctrl-Z.) to end.\n");

	/* run through the buffer, expanding and filling it with input as it goes;
	 bufPos only advances when a character is actually stored, so skipped
	 characters leave no holes in the buffer */
	for( ; ; ) {
		/* read in a byte */
		if((readValue = fgetc(in)) == EOF) break;
		/* since this is text mode, ignore control characters */
		if(readValue < ' ' || readValue > '~') continue;
		/* assign the inputed int to a char to save on repeated conversions */
		ch = (char)readValue;
		if((flags & ECHO)) putchar(ch);

		/* make sure the buffer size isn't going to exceed the limits of the data type */
		if(bufPos + 2 >= ULONG_MAX - 1) {
			printf("Exceeded maximum data modulus of %lu.", ULONG_MAX - 1);
			worked = 0;
			break;
		}
		/* expand the buffer if required */
		if(bufPos + 2 >= bufSize) {
			size_t newSize = (size_t)(((bufPos + 2) & ~(memGran - 1)) + memGran);
			/* go through a temporary so the data read so far survives a failure */
			if(!(newBuf = realloc(buf, sizeof(char) * newSize))) {
				printf("Ran out of memory allocating %lu bytes.\n", (unsigned long)(sizeof(char) * newSize));
				worked = 0;
				break;
			}
			buf     = newBuf;
			bufSize = newSize;
			if((flags & VERBOSE)) printf("Input buffer sized to %lu characters (%lu bytes) at %p.\n", (unsigned long)bufSize, (unsigned long)(sizeof(char) * bufSize), (void *)buf);
		}
		/* write the symbol to the buffer with a null after it for string output */
		buf[bufPos]     = ch;
		buf[bufPos + 1] = 0;

		/* starting at or past half way through the buffer and work towards the
		 end; yes, the + 1 is right not + (bufPos & 1); think zero-offset */
		for(hiPos = (bufPos >> 1) + 1; hiPos + minLength - 1 <= bufPos; hiPos++) {
			/* initial flags no match */
			matched = 0;
			/* search forwards from the buffer start up to the last point where
			 a full match might exist with the hi sequence */
			for(loPos = 0; loPos < hiPos - (bufPos - hiPos); loPos++) {
				/* search through the entire hi and lo sequences for a match */
				for(hiDat = hiPos, loDat = loPos; ; hiDat++, loDat++) {
					/* break on a non-match FIXME: add a case-insensitive version */
					if(buf[hiDat] != buf[loDat]) break;
					/* if at the end, the whole thing matched */
					if(hiDat >= bufPos) { matched = 1; break; }
				}
				/* check if the above resulted in a full match */
				if(matched) {
					match_t match;

					/* don't exceed the limit */
					if(matchLimit && numMatches++ >= matchLimit) {
						printf("Match limit of %lu exceeded.\n", matchLimit);
						worked = 0;
						break;
					}
					/* fill in info about the match */
					match.pos = loPos;
					match.len = 1 + bufPos - hiPos;
					if((flags & INTERACTIVE)) printf("<%-*.*s", (int)match.len, (int)match.len, buf + match.pos);

					/* if there's a match buffer, put the match into it */
					if((flags & MATCHBUF)) {
						unsigned long  node, prevNode, hit;
						unsigned short found = 0;

						/* the list is kept ordered by position and then by
						 length; stop at the identical entry or at the first
						 entry that sorts after the new match */
						for(node = firstMatch, prevNode = ULONG_MAX; node != ULONG_MAX; prevNode = node, node = matchBuf[node].next) {
							/* skip past all earlier positions */
							if(matchBuf[node].pos < match.pos) continue;
							/* no match if the next position is later */
							if(matchBuf[node].pos > match.pos) break;
							/* same position: skip past all smaller lengths */
							if(matchBuf[node].len < match.len) continue;
							/* identical position and length is a repeat */
							if(matchBuf[node].len == match.len) found = 1;
							break;
						}

						if(found) {
							/* increment the existing match count */
							hit = node;
							matchBuf[hit].count++;
						} else {
							/* otherwise, a new match must be created */
							match_t *newMatch;

							/* make sure the buffer size isn't going to exceed the limits
							 of the data type; need the -1 because ULONG_MAX is a NULL flag */
							if(matchBufPos + 1 >= ULONG_MAX - 1) {
								printf("Match buffer exceeded maximum data modulus of %lu.", ULONG_MAX - 1);
								worked = 0;
								break;
							}
							/* expand the buffer if required */
							if(matchBufPos + 1 >= matchBufSize) {
								size_t newSize = (size_t)(((matchBufPos + 1) & ~(matchMemGran - 1)) + matchMemGran);
								/* keep the old array on failure so the matches
								 found so far can still be reported and freed */
								if(!(newMatchBuf = realloc(matchBuf, sizeof(match_t) * newSize))) {
									printf("Ran out of memory allocating %lu bytes.\n", (unsigned long)(sizeof(match_t) * newSize));
									worked = 0;
									break;
								}
								matchBuf     = newMatchBuf;
								matchBufSize = newSize;
								if((flags & VERBOSE)) printf("Match buffer sized to %lu matches (%lu bytes) at %p.\n", (unsigned long)matchBufSize, (unsigned long)(sizeof(match_t) * matchBufSize), (void *)matchBuf);
							}
							/* get the address of the new match and move ahead for next time */
							hit      = (unsigned long)matchBufPos++;
							newMatch = matchBuf + hit;
							/* copy the match info into the new node */
							newMatch->pos   = match.pos;
							newMatch->len   = match.len;
							newMatch->count = 1;
							/* insert into the linked list */
							if(prevNode != ULONG_MAX) {
								newMatch->next          = matchBuf[prevNode].next;
								matchBuf[prevNode].next = hit;
							} else {
								newMatch->next = firstMatch;
								firstMatch     = hit;
							}
						}
						/* report the count of the entry that was just touched */
						if((flags & INTERACTIVE)) printf("|#%lu>\n", matchBuf[hit].count);
					}
					/* if there is no match buffer */
					else if((flags & INTERACTIVE)) printf(">\n");

					/* all other matches will have already been listed from
					 previous searches so stop looking */
					break;
				}
				/* make sure everything is still working */
				if(!worked) break;
			}
			/* make sure everything is still working */
			if(!worked) break;
		}
		/* make sure everything is still working */
		if(!worked) break;

		/* the symbol has been stored and searched; move to the next slot */
		bufPos++;
	}

	/* if the buffer exists */
	if(matchBuf) {
		unsigned long node;

		/* FIXME: filter out substrings of matches that match the same as the whole string */

		/* print out the results */
		if(!(flags & NOSHOWBUF)) {
			for(node = firstMatch; node != ULONG_MAX; node = matchBuf[node].next) {
				const match_t *m = matchBuf + node;
				if((flags & SILENT)) printf("%-*.*s\n", (int)m->len, (int)m->len, buf + m->pos);
				else                 printf("%lu*<%-*.*s>@%lu\n", m->count, (int)m->len, (int)m->len, buf + m->pos, m->pos);
			}
		}
		/* free the buffer */
		if((flags & VERBOSE)) printf("Freeing match buffer at %p.\n", (void *)matchBuf);
		free(matchBuf);
	}

	/* free the memory allocated to the buffer */
	if(buf) {
		if((flags & VERBOSE)) printf("Freeing input buffer at %p.\n", (void *)buf);
		free(buf);
	}

	/* close the file */
	if(filename) {
		if((flags & VERBOSE)) printf("Closing input file at %p.\n", (void *)in);
		fclose(in);
	}

	/* return the error state */
	return worked ? 0 : 1;
}

/* Scans argv[0..argc-1] for single-letter switches and updates *bvPtr.
 Character n of flagStr controls bit n of *bvPtr. "-a", "/a", "-a+..." and
 "/a+..." set the bit for 'a'; "-a-..." and "/a-..." clear it. Arguments whose
 second character after the prefix is anything else (for example "-g1024") and
 letters not present in flagStr are left alone. */
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr) {
	unsigned long mask;
	int           i, turnOn;
	const char    *str, *flagPos;

	/* go through all the argv's */
	for(i = 0; i < argc; i++) {
		str = argv[i];
		/* this argument must look like -a... or /a... */
		if(*str != '-' && *str != '/') continue;
		if(!*(++str)) continue;
		/* /a -a /a+ or -a+ switch on; /a- or -a- switch off */
		if(!str[1] || str[1] == '+') turnOn = 1;
		else if(str[1] == '-')       turnOn = 0;
		else continue;
		/* the mask test stops the scan once the bits of the word run out */
		for(mask = 1, flagPos = flagStr; *flagPos && mask; flagPos++, mask <<= 1) {
			if(*str != *flagPos) continue;
			if(turnOn) *bvPtr |= mask;
			else       *bvPtr &= ~mask;
			break;
		}
	}
}

/* Scans argv[0..argc-1] for "-<flagChar><number>" or "/<flagChar><number>"
 and stores the number in *lPtr; when several arguments qualify the last one
 wins. The number is parsed by strtoul with base 0 (decimal, 0-prefixed octal
 or 0x-prefixed hexadecimal), must start with a digit and must take up the
 rest of the argument. Returns 1 if *lPtr was written, 0 otherwise. */
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr) {
	unsigned long  num;
	int            i;
	unsigned short gotNum = 0;
	char           *str, *end;

	/* go through all the argv's */
	for(i = 0; i < argc; i++) {
		str = argv[i];
		/* looks like -fa... or /fa... */
		if(*str != '-' && *str != '/') continue;
		if(*(++str) != flagChar) continue;
		str++;
		/* strtoul would accept a sign and wrap a negative value around to a
		 huge number, so insist on a leading digit */
		if(*str < '0' || *str > '9') continue;
		num = strtoul(str, &end, 0);
		/* reject trailing junk */
		if(*end) continue;
		*lPtr  = num;
		gotNum = 1;
	}

	return gotNum;
}

/* Returns the argument number argNo (counting from zero) among those
 arguments that are non-empty and do not start with '-' or '/', or a null
 pointer if there are not that many. Note that an argument starting with '/'
 is always treated as a switch here, so such a path cannot be selected. */
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo) {
	int  i;
	char *str;

	/* go through all the argv's */
	for(i = 0; i < argc; i++) {
		str = argv[i];
		/* skip empty arguments and anything that looks like -... or /... */
		if(!*str || *str == '/' || *str == '-') continue;
		if(!argNo) return str;
		argNo--;
	}

	return 0;
}