/* Copyright 2006 Neil Edelman, distributed under the terms of the
 GNU General Public License, see copying.txt */

/* Sunday, April 2, 2006 */

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* constants: identification strings and version numbers shown by the
 help screen */

/* program name printed in the copyright banner */
static const char *programme = "Find Patterns";

/* copyright year printed in the copyright banner */
static const char *year = "2006";

/* version, printed as "<major>.<minor>" */
static const int versionMajor = 1;
static const int versionMinor = 0;

/* One recorded repeated substring.  Records live in a growable array and are
 chained into a singly linked list through array indices rather than
 pointers, so the chain survives the array being moved by realloc. */
typedef struct TAG_match_t {
	/* index of the next record in the list; ULONG_MAX marks the end */
	unsigned long next;
	/* offset in the input buffer where the earlier occurrence starts */
	unsigned long pos;
	/* number of symbols in the repeated substring */
	unsigned long len;
	/* how many times this (pos, len) pair has been found */
	unsigned long count;
} match_t;

/* program entry point */
int main(int argc, char **argv);

/* sets or clears one bit of *bvPtr for every single-letter switch found in
 argv; the n-th character of flagStr controls bit n */
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr);

/* stores the number following switch letter flagChar into *lPtr; returns
 non-zero when a number was stored */
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr);

/* returns the argNo-th (zero-based) argument that is not a switch, or a null
 pointer when there is none */
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo);

/* Switch bits.  Bit n belongs to the n-th letter of the switch string
 "beinosvh?" that main() hands to set_binary_switch_string(). */
#define MATCHBUF (1 << 0) /* -b: record matches in the match buffer */
#define ECHO (1 << 1) /* -e: echo accepted input characters */
#define NOCASE (1 << 2) /* -i: case-insensitive (not implemented) */
#define NOSHOWBUF (1 << 3) /* -n: do not list the matches at the end */
#define INTERACTIVE (1 << 4) /* -o: print matches as they are found */
#define SILENT (1 << 5) /* -s: plain output, no decoration */
#define VERBOSE (1 << 6) /* -v: print diagnostic messages */
#define HELP ((1 << 7) | (1 << 8)) /* -h or -?: show help and exit */

/* defaults for the numeric switches */
#define DEF_MATCHLIMIT 4096 /* -l: maximum number of matches */
#define DEF_MINLENGTH 3 /* -m: shortest match that is reported */
#define DEF_MEMGRAN 1024 /* -g: buffer growth step */

int main(int argc, char **argv) {
|
|
match_t *matchBuf = 0;
|
|
FILE *in;
|
|
size_t matchBufSize = 0, matchBufPos = 0;
|
|
size_t bufSize, bufPos;
|
|
unsigned long hiPos, loPos, hiDat, loDat, firstMatch = ULONG_MAX, numMatches;
|
|
unsigned long flags = 0, matchLimit = DEF_MATCHLIMIT, minLength = DEF_MINLENGTH, memGran = DEF_MEMGRAN, matchMemGran;
|
|
int readValue;
|
|
unsigned short matched, worked = 1;
|
|
char *buf = 0, ch, *filename = 0;
|
|
|
|
/* skip the first command argument */
|
|
argv++; if(--argc < 0) { printf("Invoked with erreneous argument data (%d arguments.)\n", argc); return 3; }
|
|
/* check for the binary flags */
|
|
set_binary_switch_string(argc, argv, "beinosvh?", &flags);
|
|
if(flags & HELP) {
|
|
/* "________________________________________________________________________________" */
|
|
fprintf(stderr, "Version %d.%d.\n\n", versionMajor, versionMinor);
|
|
fprintf(stderr, "%s Copyright %s Neil Edelman\n", programme, year);
|
|
fprintf(stderr, "This program comes with ABSOLUTELY NO WARRANTY.\n");
|
|
fprintf(stderr, "This is free software, and you are welcome to redistribute it\n");
|
|
fprintf(stderr, "under certain conditions; see copying.txt.\n\n");
|
|
printf("Searches through input and outputs sequences that are repeated. Because it's\n");
|
|
printf("intended for text files, control characters are ignored.\n\n");
|
|
printf("FINDPAT [filename] [-b] [-e] [-i] [-o] [-v] [-m<n>] [-l<n>] [-g<n>] [-?|h]\n\n");
|
|
printf(" filename Attempt to read input from this file, otherwise uses stdin.\n");
|
|
printf(" -b Keep a buffer to count repeated matches (!o -> b.)\n");
|
|
printf(" -e Echo input.\n");
|
|
printf(" -i Case-insensitive (not implemented.)\n");
|
|
printf(" -n Don't display matches at the end.\n");
|
|
printf(" -o Output matches immediately as they are found.\n");
|
|
printf(" -s Silent mode - plain output with no extra characters.\n");
|
|
printf(" -v Verbose comments while outputting.\n");
|
|
printf(" -g<n> Set memory buffer granularity to the closest power of two\n");
|
|
printf(" lower than <n> bytes (default 1024.)\n");
|
|
printf(" -l<n> Set match limit to <n> matches (default 4096; 0 -> no limit.)\n");
|
|
printf(" -m<n> Set minimum match length to <n> symbols (default 3).\n");
|
|
printf(" -?|h Display this help screen and exit.\n\n");
|
|
printf(" Adding -<s>- will turn off switch <s>.\n");
|
|
return 1;
|
|
}
|
|
if(!(flags & INTERACTIVE)) { flags |= MATCHBUF; }
|
|
if((flags & VERBOSE)) printf("Octal flags: 0%o.\n", (unsigned int)flags);
|
|
/* set memory buffer granularity (make it a power of two) and make matchMemGran memGran / sizeof(match_t) */
|
|
set_ulong_flag(argc, argv, 'g', &memGran);
|
|
{ unsigned int memPower; for(memPower = 0; memGran >>= 1; memPower++); memGran = 1 << memPower; }
|
|
if(memGran > 16) matchMemGran = memGran >> 4; else matchMemGran = 1;
|
|
if((flags & VERBOSE)) printf("Allocation granularity: %u.\n", (unsigned int)memGran);
|
|
/* set the limit on the number of matches */
|
|
set_ulong_flag(argc, argv, 'l', &matchLimit);
|
|
if((flags & VERBOSE)) printf("Match limit: %u.\n", (unsigned int)matchLimit);
|
|
/* get the minimum match length */
|
|
set_ulong_flag(argc, argv, 'm', &minLength);
|
|
if(!minLength) minLength = 1;
|
|
if((flags & VERBOSE)) printf("Minimum length: %u.\n", (unsigned int)minLength);
|
|
/* get the filename for input */
|
|
if((filename = get_unswitched_arg(argc, argv, 0))) {
|
|
if(!(in = fopen(filename, "r")))
|
|
{ printf("Error opening file \"%s\" for reading.\n", filename); return 2; }
|
|
}
|
|
else if(!(in = stdin))
|
|
{ printf("Error reading from stdin.\n"); return 2; }
|
|
if((flags & VERBOSE) && filename) printf("Opened file \"%s\" at %p.\n", filename, (void *)in);
|
|
/* explain what's going on so those who unwittingly run the app won't get stuck */
|
|
if(!(flags & SILENT)) printf("Finding repeated substrings; EOF (UNIX Ctrl-D, DOS Ctrl-Z.) to end.\n");
|
|
/* run through the buffer, expanding and filling it with input as it goes */
|
|
for(bufSize = 0, bufPos = 0, numMatches = 0; ; bufPos++) {
|
|
/* read in a byte */
|
|
if((readValue = fgetc(in)) == EOF) break;
|
|
/* haha, binary mode - I don't see how this could possibly be useful */
|
|
/* if(readValue < 0 || readValue > UCHAR_MAX) continue; */
|
|
/* since this is text mode, ignore control characters */
|
|
if(readValue < ' ' || readValue > '~') continue;
|
|
/* assign the inputed int to a char to save on repeated conversions */
|
|
ch = (char)readValue;
|
|
if((flags & ECHO)) putchar(ch);
|
|
/* make sure the buffer size isn't going to exceed the limits of the data type */
|
|
if(bufPos + 2 >= ULONG_MAX - 1)
|
|
{ printf("Exceeded maximum data modulus of %u.", (unsigned int)ULONG_MAX - 1); worked = 0; break; }
|
|
/* expand the buffer if required */
|
|
if(bufPos + 2 >= bufSize) {
|
|
bufSize = ((bufPos + 2) & ~(memGran - 1)) + memGran;
|
|
if(!(buf = (char *)realloc((void *)buf, sizeof(char) * bufSize)))
|
|
{ printf("Ran out of memory allocating %u bytes.\n", (unsigned int)(sizeof(char) * bufSize)); worked = 0; break; }
|
|
else if((flags & VERBOSE)) printf("Input buffer sized to %u characters (%u bytes) at %p.\n", (unsigned int)bufSize, (unsigned int)(sizeof(char) * bufSize), buf);
|
|
}
|
|
/* write the symbol to the buffer with a null after it for string output */
|
|
buf[bufPos] = (char)ch;
|
|
buf[bufPos + 1] = 0;
|
|
/* starting at or past half way through the buffer and work towards the
|
|
end; yes, the + 1 is right not + (bufPos & 1); think zero-offset */
|
|
for(hiPos = (bufPos >> 1) + 1; hiPos + minLength - 1 <= bufPos; hiPos++) {
|
|
/* initial flags no match */
|
|
matched = 0;
|
|
/* search forwards from the buffer start up to the last point where
|
|
a full match might exist with the hi sequence */
|
|
for(loPos = 0; loPos < hiPos - (bufPos - hiPos); loPos++) {
|
|
/* search through the entire hi and lo sequences for a match */
|
|
for(hiDat = hiPos, loDat = loPos; ; hiDat++, loDat++) {
|
|
/* break on a non-match FIXME: add a case-insensitive version */
|
|
if(buf[hiDat] != buf[loDat]) break;
|
|
/* if at the end, the whole thing matched */
|
|
if(hiDat >= bufPos) { matched = 1; break; }
|
|
}
|
|
/* check if the above resulted in a full match */
|
|
if(matched) {
|
|
match_t match;
|
|
|
|
/* check if this match is the same as the previous match */
|
|
/* don't exceed the limit */
|
|
if(matchLimit && numMatches++ >= matchLimit)
|
|
{ printf("Match limit of %u exceeded.\n", (unsigned int)matchLimit); worked = 0; break; }
|
|
/* fill in info about the match */
|
|
match.pos = loPos;
|
|
match.len = 1 + bufPos - hiPos;
|
|
if((flags & INTERACTIVE)) printf("<%-*.*s", (int)match.len, (int)match.len, buf + match.pos);
|
|
/* if there's a match buffer, put the match into it */
|
|
if((flags & MATCHBUF)) {
|
|
unsigned long node, prevNode;
|
|
|
|
/* skip past all matches up to this position */
|
|
for(matched = 0, node = firstMatch, prevNode = ULONG_MAX; node != ULONG_MAX; prevNode = node, node = matchBuf[node].next) {
|
|
/* skip past all earlier positions */
|
|
if(matchBuf[node].pos < match.pos) continue;
|
|
/* no match if the next position is later */
|
|
if(matchBuf[node].pos > match.pos) break;
|
|
/* otherwise, this position must have matched */
|
|
matched = 1; break;
|
|
}
|
|
if(matched) {
|
|
/* skip past all matches up to this length FIXME: 1 unneccessary test for if(node) and unneccesary assignment of prevNode */
|
|
for(matched = 0; node != ULONG_MAX; prevNode = node, node = matchBuf[node].next) {
|
|
/* skip past all smaller lengths */
|
|
if(matchBuf[node].len < match.len) continue;
|
|
/* no match if the next length is larger */
|
|
if(matchBuf[node].len > match.len) break;
|
|
/* otherwise, this length must have matched */
|
|
matched = 1; break;
|
|
}
|
|
}
|
|
/* if the length and position of both matches are the same, increment the existing match count */
|
|
if(matched) matchBuf[node].count++;
|
|
/* otherwise, a new match must be created */
|
|
else {
|
|
match_t *newMatch;
|
|
|
|
/* make sure the buffer size isn't going to exceed the limits of the data type */
|
|
if(matchBufPos + 1 >= ULONG_MAX - 1) /* need the -1 because ULONG_MAX is a NULL flag */
|
|
{ printf("Match buffer exceeded maximum data modulus of %u.", (unsigned int)ULONG_MAX - 1); worked = 0; break; }
|
|
/* expand the buffer if required */
|
|
if(matchBufPos + 1 >= matchBufSize) {
|
|
matchBufSize = ((matchBufPos + 1) & ~(matchMemGran - 1)) + matchMemGran;
|
|
if(!(matchBuf = (match_t *)realloc((void *)matchBuf, sizeof(match_t) * matchBufSize)))
|
|
{ printf("Ran out of memory allocating %u bytes.\n", (unsigned int)(sizeof(match_t) * matchBufSize)); worked = 0; break; }
|
|
else if((flags & VERBOSE)) printf("Match buffer sized to %u matches (%u bytes) at %p.\n", (unsigned int)matchBufSize, (unsigned int)(sizeof(match_t) * matchBufSize), (void *)matchBuf);
|
|
}
|
|
/* get the address of the new match and move ahead for next time */
|
|
newMatch = matchBuf + matchBufPos++;
|
|
/* copy the match info into the new node */
|
|
newMatch->pos = match.pos; newMatch->len = match.len; newMatch->count = 1;
|
|
/* insert into the linked list */
|
|
if(prevNode != ULONG_MAX) { newMatch->next = matchBuf[prevNode].next; matchBuf[prevNode].next = matchBufPos - 1; }
|
|
else { newMatch->next = firstMatch; firstMatch = matchBufPos - 1; }
|
|
}
|
|
if((flags & INTERACTIVE)) printf("|#%u>\n", (unsigned int)matchBuf[node].count);
|
|
}
|
|
/* if there is no match buffer */
|
|
else if((flags & INTERACTIVE)) printf(">\n");
|
|
/* all other matches will have already been listed from
|
|
previous searches so stop looking */
|
|
break;
|
|
}
|
|
/* make sure everything is still working */
|
|
if(!worked) break;
|
|
}
|
|
/* make sure everything is still working */
|
|
if(!worked) break;
|
|
}
|
|
/* make sure everything is still working */
|
|
if(!worked) break;
|
|
}
|
|
/* if the buffer exists */
|
|
if(matchBuf) {
|
|
unsigned long node;
|
|
|
|
/* FIXME: filter out substrings of matches that match the same as the whole string */
|
|
for(node = firstMatch; node != ULONG_MAX; node = matchBuf[node].next) {
|
|
}
|
|
/* print out the results */
|
|
if(!(flags & NOSHOWBUF)) {
|
|
if((flags & SILENT))
|
|
{ for(node = firstMatch; node != ULONG_MAX; node = matchBuf[node].next) printf("%-*.*s\n", (int)matchBuf[node].len, (int)matchBuf[node].len, buf + matchBuf[node].pos); }
|
|
else
|
|
{ for(node = firstMatch; node != ULONG_MAX; node = matchBuf[node].next) printf("%u*<%-*.*s>@%u\n", (unsigned int)matchBuf[node].count, (int)matchBuf[node].len, (int)matchBuf[node].len, buf + matchBuf[node].pos, (unsigned int)matchBuf[node].pos); }
|
|
}
|
|
/* free the buffer */
|
|
if((flags & VERBOSE)) printf("Freeing match buffer at %p.\n", (void *)matchBuf);
|
|
free(matchBuf);
|
|
}
|
|
/* free the memory allocated to the buffer */
|
|
if(buf) { if((flags & VERBOSE)) printf("Freeing input buffer at %p.\n", buf); free(buf); }
|
|
/* close the file */
|
|
if(filename) { if((flags & VERBOSE)) printf("Closing input file at %p.\n", (void *)in); fclose(in); }
|
|
/* return the error state */
|
|
if(worked) return 0; else return 1;
|
|
}
|
|
|
|
/* Applies single-letter on/off switches from the command line to a bit set.

 An argument is a switch when it starts with '-' or '/' and a letter
 follows.  "-x" and "-x+..." turn the letter's bit on, "-x-..." turns it
 off; anything else after the letter makes the argument be ignored.  The
 bit for a letter is its index in flagStr (first letter -> bit 0); letters
 not in flagStr, or beyond the width of unsigned long, do nothing.

 argc, argv  arguments to scan (argv[0] is scanned too)
 flagStr     the recognised letters, in bit order
 bvPtr       bit set that is updated in place */
void set_binary_switch_string(int argc, char **argv, char *flagStr, unsigned long *bvPtr) {
	int n;

	for(n = 0; n < argc; n++) {
		const char *opt = argv[n];
		const char *letter;
		unsigned long bit;
		int turnOn;

		/* needs a switch character followed by a letter */
		if(opt[0] != '-' && opt[0] != '/') continue;
		if(opt[1] == '\0') continue;

		/* whatever follows the letter decides the direction */
		if(opt[2] == '\0' || opt[2] == '+') turnOn = 1;
		else if(opt[2] == '-') turnOn = 0;
		else continue;

		/* find the letter; the bit moves in step with the position */
		for(bit = 1, letter = flagStr; *letter != '\0' && bit != 0; letter++, bit <<= 1) {
			if(*letter != opt[1]) continue;
			if(turnOn) *bvPtr |= bit;
			else *bvPtr &= ~bit;
			break;
		}
	}
}

/* Looks for numeric switches of the form -<flagChar><number> or
 /<flagChar><number> and stores the number in *lPtr.

 The number is parsed by strtoul with base 0, so decimal, 0x-prefixed hex
 and 0-prefixed octal are accepted.  It must begin with a digit and extend
 to the end of the argument; values with a sign, leading white space,
 trailing junk, or too large for unsigned long are ignored rather than
 silently wrapped or clamped.  When several arguments qualify the last one
 wins.

 argc, argv  arguments to scan (argv[0] is scanned too)
 flagChar    the switch letter to look for
 lPtr        receives the value; left untouched when nothing valid is found

 Returns 1 when *lPtr was written, 0 otherwise. */
unsigned short set_ulong_flag(int argc, char **argv, char flagChar, unsigned long *lPtr) {
	unsigned long num;
	int i;
	unsigned short gotNum = 0;
	char *str, *end;

	/* go through all the argv's */
	for(i = 0; i < argc; i++) {
		str = argv[i];

		/* looks like -f... or /f...; the explicit terminator test keeps a
		 NUL flagChar from walking past the end of the string */
		if(*str != '-' && *str != '/') continue;
		if(str[1] == '\0' || str[1] != flagChar) continue;
		str += 2;

		/* strtoul would accept white space and a sign, turning e.g. "-5"
		 into a huge value, so insist on a digit up front */
		if(*str < '0' || *str > '9') continue;

		errno = 0;
		num = strtoul(str, &end, 0);
		/* trailing junk or an out-of-range value */
		if(*end != '\0' || errno == ERANGE) continue;

		*lPtr = num;
		gotNum = 1;
	}
	return gotNum;
}

/* Returns the argNo-th (zero-based) argument that is not a switch.

 An argument counts when it is non-empty and does not begin with '-' or
 '/'.  Note that this also rules out file names that start with '/'.

 argc, argv  arguments to scan (argv[0] is scanned too)
 argNo       how many qualifying arguments to pass over first

 Returns the argument itself (not a copy), or a null pointer when there are
 not enough qualifying arguments. */
char *get_unswitched_arg(int argc, char **argv, unsigned int argNo) {
	unsigned int remaining = argNo;
	int idx = 0;

	while(idx < argc) {
		char *candidate = argv[idx++];
		char lead = candidate[0];

		/* empty strings and switches don't count */
		if(lead == '\0' || lead == '/' || lead == '-') continue;
		if(remaining == 0) return candidate;
		remaining--;
	}
	return 0;
}