/* find-patterns/findpat.c

298 lines
14 KiB
C */

/* Copyright 2006 Neil Edelman, distributed under the terms of the
GNU General Public License, see copying.txt */
/* Sunday, April 2, 2006 */
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
/* constants: programme identity; main() prints these on the help screen
 (-h or -?) as "Version <major>.<minor>" and "<programme> Copyright <year>" */
static const char *programme = "Find Patterns";
static const char *year = "2006";
static const int versionMajor = 1;
static const int versionMinor = 0;
/* One recorded repeated substring. main() keeps these in a growable array
 (matchBuf) and threads them into a singly linked list by array index rather
 than by pointer, so the links survive the array being moved by realloc. */
typedef struct TAG_match_t {
/* index in matchBuf of the next node in the list; ULONG_MAX marks the end */
unsigned long next;
/* offset in the input buffer of the earlier occurrence of the substring */
unsigned long pos;
/* length of the substring in characters */
unsigned long len;
/* set to 1 when the node is created, incremented each time the same
 position and length is matched again */
unsigned long count;
} match_t;
/* programme entry point; returns 0 on success and non-zero otherwise */
int main(int argc, char **argv);
/* for every argument of the form -c, -c+, /c or /c+ sets, and for -c- or /c-
 clears, the bit of *bvPtr whose number is the index of c within flagStr */
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr);
/* looks for -<flagChar><number> or /<flagChar><number>; stores the number of
 the last well-formed one in *lPtr and returns non-zero if there was one */
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr);
/* returns the argNo-th (zero-based) non-empty argument that does not begin
 with '-' or '/', or a null pointer if there are not that many */
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo);
/* Bit masks for the flags word in main(). Each value is 1 << (index of the
 switch letter in the string "beinosvh?" that main() passes to
 set_binary_switch_string), so the two must be kept in step. */
/* -b: record matches in the match buffer so repeats can be counted; main()
 forces this on whenever INTERACTIVE is off */
#define MATCHBUF 1
/* -e: echo each accepted input character to stdout */
#define ECHO 2
/* -i: case-insensitive matching; described as not implemented by the help
 text and not tested anywhere in main() */
#define NOCASE 4
/* -n: do not list the match buffer at the end */
#define NOSHOWBUF 8
/* -o: print every match as soon as it is found */
#define INTERACTIVE 16
/* -s: suppress the start-up banner and print bare matches at the end */
#define SILENT 32
/* -v: print diagnostic messages about options and allocations */
#define VERBOSE 64
/* -h (128) or -? (256): either one requests the help screen */
#define HELP (128 | 256)
/* default for -l: number of matches after which the search is abandoned */
#define DEF_MATCHLIMIT 4096
/* default for -m: shortest substring that is reported */
#define DEF_MINLENGTH 3
/* default for -g: allocation granularity of the input buffer */
#define DEF_MEMGRAN 1024
/* Programme entry point.

 Reads the input (the first unswitched argument as a file name, otherwise
 stdin) one byte at a time into a growing buffer. After every printable
 character it takes each suffix of the buffer that is at least minLength long
 and no longer than half the buffer, and searches from the start of the buffer
 for an earlier, non-overlapping copy of it. The first copy found for each
 suffix is a match; matches are optionally printed at once (-o) and/or
 recorded in a list ordered by position then length, which is printed at the
 end.

 Bytes outside ' '..'~' are not searched for and are stored as a zero byte, so
 reported positions still count them.

 Returns 0 on success, 1 after showing help or when the search was abandoned
 (match limit, size limit or out of memory), 2 if the input could not be
 opened and 3 if argc was invalid. */
int main(int argc, char **argv) {
	match_t *matchBuf = 0;
	FILE *in;
	size_t matchBufSize = 0, matchBufPos = 0;
	size_t bufSize, bufPos;
	unsigned long hiPos, loPos, hiDat, loDat, firstMatch = ULONG_MAX, numMatches;
	unsigned long flags = 0, matchLimit = DEF_MATCHLIMIT, minLength = DEF_MINLENGTH, memGran = DEF_MEMGRAN, matchMemGran;
	int readValue;
	unsigned short matched, printable, worked = 1;
	char *buf = 0, ch, *filename = 0;

	/* skip the first command argument */
	argv++;
	if(--argc < 0) { printf("Invoked with erreneous argument data (%d arguments.)\n", argc); return 3; }

	/* check for the binary flags; the letter order fixes the bit values of
	 MATCHBUF .. HELP */
	set_binary_switch_string(argc, argv, "beinosvh?", &flags);
	if(flags & HELP) {
		/* "________________________________________________________________________________" */
		fprintf(stderr, "Version %d.%d.\n\n", versionMajor, versionMinor);
		fprintf(stderr, "%s Copyright %s Neil Edelman\n", programme, year);
		fprintf(stderr, "This program comes with ABSOLUTELY NO WARRANTY.\n");
		fprintf(stderr, "This is free software, and you are welcome to redistribute it\n");
		fprintf(stderr, "under certain conditions; see copying.txt.\n\n");
		printf("Searches through input and outputs sequences that are repeated. Because it's\n");
		printf("intended for text files, control characters are ignored.\n\n");
		printf("FINDPAT [filename] [-b] [-e] [-i] [-o] [-v] [-m<n>] [-l<n>] [-g<n>] [-?|h]\n\n");
		printf(" filename Attempt to read input from this file, otherwise uses stdin.\n");
		printf(" -b Keep a buffer to count repeated matches (!o -> b.)\n");
		printf(" -e Echo input.\n");
		printf(" -i Case-insensitive (not implemented.)\n");
		printf(" -n Don't display matches at the end.\n");
		printf(" -o Output matches immediately as they are found.\n");
		printf(" -s Silent mode - plain output with no extra characters.\n");
		printf(" -v Verbose comments while outputting.\n");
		printf(" -g<n> Set memory buffer granularity to the closest power of two\n");
		printf(" lower than <n> bytes (default 1024.)\n");
		printf(" -l<n> Set match limit to <n> matches (default 4096; 0 -> no limit.)\n");
		printf(" -m<n> Set minimum match length to <n> symbols (default 3).\n");
		printf(" -?|h Display this help screen and exit.\n\n");
		printf(" Adding -<s>- will turn off switch <s>.\n");
		return 1;
	}

	/* without immediate output the match buffer is the only way to report */
	if(!(flags & INTERACTIVE)) { flags |= MATCHBUF; }
	if((flags & VERBOSE)) printf("Octal flags: 0%o.\n", (unsigned int)flags);

	/* set memory buffer granularity, rounded down to a power of two because
	 the growth code below masks with (granularity - 1); the shift is done in
	 unsigned long so that a large -g value cannot overflow an int */
	set_ulong_flag(argc, argv, 'g', &memGran);
	{ unsigned int memPower; for(memPower = 0; memGran >>= 1; memPower++); memGran = 1UL << memPower; }
	/* the match buffer grows in steps of a sixteenth as many elements */
	if(memGran > 16) matchMemGran = memGran >> 4; else matchMemGran = 1;
	if((flags & VERBOSE)) printf("Allocation granularity: %u.\n", (unsigned int)memGran);

	/* set the limit on the number of matches */
	set_ulong_flag(argc, argv, 'l', &matchLimit);
	if((flags & VERBOSE)) printf("Match limit: %u.\n", (unsigned int)matchLimit);

	/* get the minimum match length; zero would make no sense */
	set_ulong_flag(argc, argv, 'm', &minLength);
	if(!minLength) minLength = 1;
	if((flags & VERBOSE)) printf("Minimum length: %u.\n", (unsigned int)minLength);

	/* get the filename for input */
	if((filename = get_unswitched_arg(argc, argv, 0))) {
		if(!(in = fopen(filename, "r")))
			{ printf("Error opening file \"%s\" for reading.\n", filename); return 2; }
	}
	else if(!(in = stdin))
		{ printf("Error reading from stdin.\n"); return 2; }
	if((flags & VERBOSE) && filename) printf("Opened file \"%s\" at %p.\n", filename, (void *)in);

	/* explain what's going on so those who unwittingly run the app won't get stuck */
	if(!(flags & SILENT)) printf("Finding repeated substrings; EOF (UNIX Ctrl-D, DOS Ctrl-Z.) to end.\n");

	/* run through the buffer, expanding and filling it with input as it goes;
	 bufPos advances for every byte read, printable or not */
	for(bufSize = 0, bufPos = 0, numMatches = 0; ; bufPos++) {
		/* read in a byte */
		if((readValue = fgetc(in)) == EOF) break;
		/* since this is text mode, control characters are not searched for;
		 their slot is filled with a zero byte so that it never holds
		 uninitialised memory when later comparisons or output run across it */
		printable = (unsigned short)(readValue >= ' ' && readValue <= '~');
		/* assign the inputed int to a char to save on repeated conversions */
		ch = printable ? (char)readValue : '\0';
		if(printable && (flags & ECHO)) putchar(ch);
		/* make sure the buffer size isn't going to exceed the limits of the data type */
		if(bufPos + 2 >= ULONG_MAX - 1)
			{ printf("Exceeded maximum data modulus of %u.", (unsigned int)ULONG_MAX - 1); worked = 0; break; }
		/* expand the buffer if required */
		if(bufPos + 2 >= bufSize) {
			size_t newSize = ((bufPos + 2) & ~(memGran - 1)) + memGran;
			/* keep the old block until the new one is known to exist, so a
			 failure neither leaks it nor leaves buf null for the report below */
			char *grown = realloc(buf, sizeof(char) * newSize);
			if(!grown)
				{ printf("Ran out of memory allocating %u bytes.\n", (unsigned int)(sizeof(char) * newSize)); worked = 0; break; }
			buf = grown;
			bufSize = newSize;
			if((flags & VERBOSE)) printf("Input buffer sized to %u characters (%u bytes) at %p.\n", (unsigned int)bufSize, (unsigned int)(sizeof(char) * bufSize), (void *)buf);
		}
		/* write the symbol to the buffer with a null after it for string output */
		buf[bufPos] = ch;
		buf[bufPos + 1] = 0;
		/* nothing to search for at a control character */
		if(!printable) continue;

		/* starting at or past half way through the buffer and work towards the
		 end; yes, the + 1 is right not + (bufPos & 1); think zero-offset.
		 The "hi" sequence is buf[hiPos .. bufPos], longest first. */
		for(hiPos = (bufPos >> 1) + 1; hiPos + minLength - 1 <= bufPos; hiPos++) {
			/* initial flags no match */
			matched = 0;
			/* search forwards from the buffer start up to the last point where
			 a full match might exist with the hi sequence without overlapping it */
			for(loPos = 0; loPos < hiPos - (bufPos - hiPos); loPos++) {
				/* search through the entire hi and lo sequences for a match */
				for(hiDat = hiPos, loDat = loPos; ; hiDat++, loDat++) {
					/* break on a non-match FIXME: add a case-insensitive version */
					if(buf[hiDat] != buf[loDat]) break;
					/* if at the end, the whole thing matched */
					if(hiDat >= bufPos) { matched = 1; break; }
				}
				/* try the next lo position unless the above was a full match */
				if(!matched) continue;
				{
					match_t match;
					/* don't exceed the limit */
					if(matchLimit && numMatches++ >= matchLimit)
						{ printf("Match limit of %u exceeded.\n", (unsigned int)matchLimit); worked = 0; break; }
					/* fill in info about the match */
					match.pos = loPos;
					match.len = 1 + bufPos - hiPos;
					if((flags & INTERACTIVE)) printf("<%-*.*s", (int)match.len, (int)match.len, buf + match.pos);
					/* if there's a match buffer, put the match into it */
					if((flags & MATCHBUF)) {
						/* countNode is the node whose count describes this match,
						 whether it already existed or is created below */
						unsigned long node, prevNode, countNode;
						/* the list is ordered by position; find the first node at
						 this position, leaving prevNode on the node before it */
						for(matched = 0, node = firstMatch, prevNode = ULONG_MAX; node != ULONG_MAX; prevNode = node, node = matchBuf[node].next) {
							/* skip past all earlier positions */
							if(matchBuf[node].pos < match.pos) continue;
							/* no match if the next position is later */
							if(matchBuf[node].pos > match.pos) break;
							/* otherwise, this position must have matched */
							matched = 1; break;
						}
						if(matched) {
							/* within this position the list is ordered by length */
							for(matched = 0; node != ULONG_MAX; prevNode = node, node = matchBuf[node].next) {
								/* stop at the end of the run of nodes for this position,
								 otherwise a node at a later position with the same
								 length would be counted by mistake */
								if(matchBuf[node].pos != match.pos) break;
								/* skip past all smaller lengths */
								if(matchBuf[node].len < match.len) continue;
								/* no match if the next length is larger */
								if(matchBuf[node].len > match.len) break;
								/* otherwise, this length must have matched */
								matched = 1; break;
							}
						}
						/* if the length and position of both matches are the same, increment the existing match count */
						if(matched) { matchBuf[node].count++; countNode = node; }
						/* otherwise, a new match must be created */
						else {
							match_t *newMatch;
							/* make sure the buffer size isn't going to exceed the limits of the data type */
							if(matchBufPos + 1 >= ULONG_MAX - 1) /* need the -1 because ULONG_MAX is a NULL flag */
								{ printf("Match buffer exceeded maximum data modulus of %u.", (unsigned int)ULONG_MAX - 1); worked = 0; break; }
							/* expand the buffer if required */
							if(matchBufPos + 1 >= matchBufSize) {
								size_t newSize = ((matchBufPos + 1) & ~(matchMemGran - 1)) + matchMemGran;
								/* keep the old block if the new one cannot be had, so
								 the matches found so far can still be reported and freed */
								match_t *grown = realloc(matchBuf, sizeof(match_t) * newSize);
								if(!grown)
									{ printf("Ran out of memory allocating %u bytes.\n", (unsigned int)(sizeof(match_t) * newSize)); worked = 0; break; }
								matchBuf = grown;
								matchBufSize = newSize;
								if((flags & VERBOSE)) printf("Match buffer sized to %u matches (%u bytes) at %p.\n", (unsigned int)matchBufSize, (unsigned int)(sizeof(match_t) * matchBufSize), (void *)matchBuf);
							}
							/* get the address of the new match and move ahead for next time */
							countNode = matchBufPos++;
							newMatch = matchBuf + countNode;
							/* copy the match info into the new node */
							newMatch->pos = match.pos; newMatch->len = match.len; newMatch->count = 1;
							/* insert into the linked list after prevNode, or at the head */
							if(prevNode != ULONG_MAX) { newMatch->next = matchBuf[prevNode].next; matchBuf[prevNode].next = countNode; }
							else { newMatch->next = firstMatch; firstMatch = countNode; }
						}
						if((flags & INTERACTIVE)) printf("|#%u>\n", (unsigned int)matchBuf[countNode].count);
					}
					/* if there is no match buffer */
					else if((flags & INTERACTIVE)) printf(">\n");
					/* all other matches will have already been listed from
					 previous searches so stop looking */
					break;
				}
			}
			/* make sure everything is still working */
			if(!worked) break;
		}
		/* make sure everything is still working */
		if(!worked) break;
	}

	/* if the buffer exists */
	if(matchBuf) {
		unsigned long node;
		/* FIXME: filter out substrings of matches that match the same as the whole string */
		/* print out the results */
		if(!(flags & NOSHOWBUF)) {
			for(node = firstMatch; node != ULONG_MAX; node = matchBuf[node].next) {
				if((flags & SILENT))
					printf("%-*.*s\n", (int)matchBuf[node].len, (int)matchBuf[node].len, buf + matchBuf[node].pos);
				else
					printf("%u*<%-*.*s>@%u\n", (unsigned int)matchBuf[node].count, (int)matchBuf[node].len, (int)matchBuf[node].len, buf + matchBuf[node].pos, (unsigned int)matchBuf[node].pos);
			}
		}
		/* free the buffer */
		if((flags & VERBOSE)) printf("Freeing match buffer at %p.\n", (void *)matchBuf);
		free(matchBuf);
	}
	/* free the memory allocated to the buffer */
	if(buf) { if((flags & VERBOSE)) printf("Freeing input buffer at %p.\n", (void *)buf); free(buf); }
	/* close the file; stdin is left alone */
	if(filename) { if((flags & VERBOSE)) printf("Closing input file at %p.\n", (void *)in); fclose(in); }
	/* return the error state */
	if(worked) return 0; else return 1;
}
/* Applies single-letter on/off switches from the command line to a bit set.

 An argument is a switch if it starts with '-' or '/' followed by one letter
 and then either nothing or '+' (turn on), or '-' (turn off). The letter is
 looked up in flagStr; the letter at index k controls bit k of *bvPtr. Letters
 that are not in flagStr, and arguments of any other shape, are ignored.

 argc, argv  the arguments to scan
 flagStr     the recognised letters, lowest bit first
 bvPtr       the bit set to update in place */
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr) {
	int idx;

	for(idx = 0; idx < argc; idx++) {
		const char *sw = argv[idx];
		const char *candidate;
		unsigned long bit;
		int turnOff;

		/* needs a switch character and a letter after it */
		if(sw[0] != '-' && sw[0] != '/') continue;
		if(sw[1] == '\0') continue;

		/* what follows the letter decides between on, off and "not for us" */
		if(sw[2] == '\0' || sw[2] == '+') turnOff = 0;
		else if(sw[2] == '-') turnOff = 1;
		else continue;

		/* walk the letters and their bits together; the walk also ends when
		 the bit has been shifted out of the word */
		for(bit = 1, candidate = flagStr; *candidate && bit; candidate++, bit <<= 1) {
			if(*candidate != sw[1]) continue;
			if(turnOff) *bvPtr &= ~bit;
			else *bvPtr |= bit;
			break;
		}
	}
}
/* Reads a numeric switch of the form -<flagChar><number> or
 /<flagChar><number> from the command line.

 The number is parsed by strtoul with base 0, so decimal, 0x-prefixed
 hexadecimal and 0-prefixed octal are accepted. It must start with a digit and
 run to the end of the argument; strtoul on its own would skip white space and
 accept a sign, turning something like -l-1 into a huge value without
 complaint. If several arguments qualify, the last one wins.

 argc, argv  the arguments to scan
 flagChar    the switch letter to look for
 lPtr        receives the value; left untouched if none is found
 returns     1 if a value was stored, 0 otherwise */
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr) {
	unsigned long num;
	int i;
	unsigned short gotNum = 0;
	char *str, *end;

	/* go through all the argv's */
	for(i = 0; i < argc; i++) {
		str = argv[i];
		/* looks like -f... or /f... */
		if(*str != '-' && *str != '/') continue;
		++str;
		if(*str == '\0' || *str != flagChar) continue;
		++str;
		/* only an unsigned number with nothing in front of it will do */
		if(*str < '0' || *str > '9') continue;
		num = strtoul(str, &end, 0);
		/* and nothing may follow it either */
		if(!*end) { *lPtr = num; gotNum = 1; }
	}
	return gotNum;
}
/* Finds a positional (non-switch) argument.

 Empty arguments and arguments that start with '-' or '/' are passed over;
 the remaining ones are numbered from zero in the order they appear.

 argc, argv  the arguments to scan
 argNo       zero-based number of the positional argument wanted
 returns     that argument, or a null pointer if there are too few */
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo) {
	unsigned int seen = 0;
	int idx;

	for(idx = 0; idx < argc; idx++) {
		char *candidate = argv[idx];

		if(candidate[0] == '\0' || candidate[0] == '/' || candidate[0] == '-') continue;
		if(seen == argNo) return candidate;
		seen++;
	}
	return 0;
}