/* Listing metadata: 653 lines, 14 KiB, C */
/* Copyright 1989 GROUPE BULL -- See license conditions in file COPYRIGHT
|
|
* Copyright 1989 Massachusetts Institute of Technology
|
|
*/
|
|
/***********************\
|
|
* *
|
|
* WOOL_OBJECT String *
|
|
* BODY *
|
|
* *
|
|
\***********************/
|
|
|
|
#include "EXTERN.h"
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include "wool.h"
|
|
#include "wl_number.h"
|
|
#include "wl_atom.h"
|
|
#include "wl_active.h"
|
|
#include "wl_pointer.h"
|
|
#include "wl_name.h"
|
|
#include "wl_list.h"
|
|
#include "INTERN.h"
|
|
#include "wl_string.h"
|
|
|
|
char * re_comp();
|
|
|
|
/*
|
|
* Constructor:
|
|
* WLString_make
|
|
* argument 1: the string, which will be COPIED.
|
|
*/
|
|
|
|
WOOL_String
|
|
WLString_make(s)
|
|
char *s; /* the string itself */
|
|
{
|
|
WOOL_String object;
|
|
|
|
if (s[0] == '\0')
|
|
return NIL_STRING;
|
|
object = (WOOL_String)
|
|
Malloc(sizeof(struct _WOOL_String) + strlen(s));
|
|
zrt_put(object);
|
|
object -> type = WLString;
|
|
strcpy(object -> string, s);
|
|
return object;
|
|
}
|
|
|
|
WOOL_String
|
|
WLString_n_make(n)
|
|
int n;
|
|
{
|
|
WOOL_String object = (WOOL_String)
|
|
Malloc(sizeof(struct _WOOL_String) + n);
|
|
|
|
zrt_put(object);
|
|
object -> type = WLString;
|
|
return object;
|
|
}
|
|
|
|
NIL_STRING_make()
|
|
{
|
|
NIL_STRING = (WOOL_String) Malloc(sizeof(struct _WOOL_String));
|
|
zrt_put(NIL_STRING);
|
|
NIL_STRING -> type = WLString;
|
|
NIL_STRING -> string[0] = '\0';
|
|
increase_reference(NIL_STRING);
|
|
}
|
|
|
|
/*
|
|
* WLString_print:
|
|
* We print strings surrounded by double quotes.
|
|
*/
|
|
|
|
WOOL_OBJECT
|
|
WLString_print(obj)
|
|
WOOL_String obj;
|
|
{
|
|
wool_puts(obj -> string);
|
|
return (WOOL_OBJECT) obj;
|
|
}
|
|
|
|
/*
|
|
* WLString_free
|
|
* just frees the structure, so we use WLNumber_free.
|
|
*/
|
|
|
|
/*
|
|
* WLString_execute
|
|
* just the same error as with atoms, let's use it.
|
|
*/
|
|
|
|
/*
|
|
* WLString_equal
|
|
* tests 2 strings for equality (returns it if true)
|
|
*/
|
|
|
|
WOOL_OBJECT
|
|
WLString_equal(s1, s2)
|
|
WOOL_String s1, s2;
|
|
{
|
|
if (!is_a_string(s2) || strcmp(s1 -> string, s2 -> string))
|
|
return NIL;
|
|
else
|
|
return (WOOL_OBJECT) s1;
|
|
}
|
|
|
|
/*
|
|
* string manipulation routines
|
|
*/
|
|
|
|
static char *strings_temp_buffer;
|
|
static int strings_temp_buffer_size = 256;
|
|
|
|
WOOL_OBJECT
|
|
add_strings(argc,argv)
|
|
int argc;
|
|
WOOL_String argv[];
|
|
{
|
|
int required_length = 0, i;
|
|
|
|
for (i = 0; i < argc; i++) {
|
|
must_be_string(argv[i], i);
|
|
if (argv[i] != (WOOL_String) NIL)
|
|
required_length += strlen(argv[i] -> string);
|
|
}
|
|
if (!strings_temp_buffer)
|
|
strings_temp_buffer = (char *) Malloc(strings_temp_buffer_size);
|
|
if (required_length >= strings_temp_buffer_size) {
|
|
strings_temp_buffer_size = Max(2 * strings_temp_buffer_size,
|
|
required_length+1);
|
|
strings_temp_buffer = (char *)
|
|
Realloc(strings_temp_buffer, strings_temp_buffer_size);
|
|
}
|
|
strings_temp_buffer[0] = '\0';
|
|
for (i = 0; i < argc; i++)
|
|
if (argv[i] != (WOOL_String) NIL)
|
|
strcat(strings_temp_buffer, argv[i] -> string);
|
|
return (WOOL_OBJECT) WLString_make(strings_temp_buffer);
|
|
}
|
|
|
|
/*
|
|
* To know if an object can be used as a string (atom, pointer or active)
|
|
*/
|
|
|
|
int
|
|
must_be_string(obj, n)
|
|
WOOL_OBJECT obj;
|
|
int n;
|
|
{
|
|
if (obj -> type != WLString
|
|
&& obj -> type != WLAtom
|
|
&& obj -> type != WLActive
|
|
&& obj -> type != WLPointer
|
|
&& obj -> type != WLName
|
|
&& obj != NIL)
|
|
bad_argument(obj, n, WOOL_TYPE_P_NAME(WLString));
|
|
}
|
|
|
|
int
|
|
is_a_string(obj)
|
|
WOOL_OBJECT obj;
|
|
{
|
|
if (obj -> type != WLString
|
|
&& obj -> type != WLAtom
|
|
&& obj -> type != WLActive
|
|
&& obj -> type != WLPointer
|
|
&& obj -> type != WLName
|
|
&& obj != NIL)
|
|
return 0;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
|
|
/**************************************************************************\
|
|
* *
|
|
* the general match package *
|
|
* (match regular-expression string [level]) *
|
|
* returns the sub-string in the levelth enclosing \( and \) or NIL_STRING *
|
|
* or string or NIL if no level given *
|
|
* *
|
|
\**************************************************************************/
|
|
|
|
WOOL_String
|
|
WLString_match(argc, argv)
|
|
int argc;
|
|
WOOL_String argv[];
|
|
{
|
|
int result, i;
|
|
char *subst, *s, *comp_error;
|
|
WOOL_String wl_subst;
|
|
WOOL_List wl_list;
|
|
|
|
if (argc < 2)
|
|
wool_error(BAD_NUMBER_OF_ARGS, argc);
|
|
must_be_string(argv[0], 0);
|
|
must_be_string(argv[1], 1);
|
|
if ((comp_error = re_comp(argv[0] -> string)) ||
|
|
((result = re_exec(argv[1] -> string)) == -1)) {
|
|
if (comp_error)
|
|
wool_printf("%\n", comp_error);
|
|
wool_error("match: Bad regular expression, %s", argv[0] -> string);
|
|
}
|
|
if (result) {
|
|
switch (argc) {
|
|
case 2:
|
|
return argv[1];
|
|
case 3:
|
|
must_be_number(argv[2], 2);
|
|
if (result =
|
|
re_subst(((WOOL_Number) argv[2]) -> number, &subst)) {
|
|
wl_subst = WLString_n_make(result + 1);
|
|
strncpy(wl_subst -> string, subst, result);
|
|
wl_subst -> string[result] = '\0';
|
|
return wl_subst;
|
|
} else {
|
|
return NIL_STRING;
|
|
}
|
|
default:
|
|
wl_list = wool_list_make(argc - 2);
|
|
bzero(wl_list -> list, wl_list -> size * sizeof (WOOL_OBJECT));
|
|
for (i = 2; i< argc; i++) {
|
|
must_be_number(argv[i], i);
|
|
if (result =
|
|
re_subst(((WOOL_Number) argv[i]) -> number, &subst)) {
|
|
wl_subst = WLString_n_make(result + 1);
|
|
strncpy(wl_subst -> string, subst, result);
|
|
wl_subst -> string[result] = '\0';
|
|
increase_reference(wl_list -> list[i - 2] =
|
|
(WOOL_OBJECT) wl_subst);
|
|
} else {
|
|
increase_reference(wl_list -> list[i - 2] =
|
|
(WOOL_OBJECT) NIL_STRING);
|
|
}
|
|
}
|
|
return (WOOL_String) wl_list;
|
|
}
|
|
} else {
|
|
return (argc == 2 ? (WOOL_String) NIL : NIL_STRING);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* routines to do regular expression matching
|
|
*
|
|
* Entry points:
|
|
*
|
|
* re_comp(s)
|
|
* char *s;
|
|
* ... returns 0 if the string s was compiled successfully,
|
|
* a pointer to an error message otherwise.
|
|
* If passed 0 or a null string returns without changing
|
|
* the currently compiled re (see note 11 below).
|
|
*
|
|
* re_exec(s)
|
|
* char *s;
|
|
* ... returns 1 if the string s matches the last compiled regular
|
|
* expression,
|
|
* 0 if the string s failed to match the last compiled
|
|
* regular expression, and
|
|
* -1 if the compiled regular expression was invalid
|
|
* (indicating an internal error).
|
|
*
|
|
* re_subst(n, &p)
|
|
* int n;
|
|
* char *p;
|
|
* ... returns in p the string matching the nth \(, returns the
|
|
* number of chars matching
|
|
*
|
|
* The strings passed to both re_comp and re_exec may have trailing or
|
|
* embedded newline characters; they are terminated by nulls.
|
|
*
|
|
* The regular expressions recognized are described below. This description
|
|
* is essentially the same as that for ed.
|
|
*
|
|
* A regular expression specifies a set of strings of characters.
|
|
* A member of this set of strings is said to be matched by
|
|
* the regular expression. In the following specification for
|
|
* regular expressions the word `character' means any character but NUL.
|
|
*
|
|
* 1. Any character except a special character matches itself.
|
|
* Special characters are the regular expression delimiter plus
|
|
* \ [ . and sometimes ^ * $.
|
|
* 2. A . matches any character.
|
|
* 3. A \ followed by any character except a digit or ( )
|
|
* matches that character.
|
|
* 4. A nonempty string s bracketed [s] (or [^s]) matches any
|
|
* character in (or not in) s. In s, \ has no special meaning,
|
|
* and ] may only appear as the first letter. A substring
|
|
* a-b, with a and b in ascending ASCII order, stands for
|
|
* the inclusive range of ASCII characters.
|
|
* 5. A regular expression of form 1-4 followed by * matches a
|
|
* sequence of 0 or more matches of the regular expression.
|
|
* 6. A regular expression, x, of form 1-8, bracketed \(x\)
|
|
* matches what x matches.
|
|
* 7. A \ followed by a digit n matches a copy of the string that the
|
|
* bracketed regular expression beginning with the nth \( matched.
|
|
* 8. A regular expression of form 1-8, x, followed by a regular
|
|
* expression of form 1-7, y matches a match for x followed by
|
|
* a match for y, with the x match being as long as possible
|
|
* while still permitting a y match.
|
|
* 9. A regular expression of form 1-8 preceded by ^ (or followed
|
|
* by $), is constrained to matches that begin at the left
|
|
* (or end at the right) end of a line.
|
|
* 10. A regular expression of form 1-9 picks out the longest among
|
|
* the leftmost matches in a line.
|
|
* 11. An empty regular expression stands for a copy of the last
|
|
* regular expression encountered.
|
|
*/
|
|
|
|
/*
 * constants for re's
 *
 * Opcodes of the compiled expression.  An item is an opcode byte followed
 * by its operand, if any.  CSTAR is OR-ed into the opcode of a CCHR, CDOT,
 * CCL, NCCL or CBACK item to mean "zero or more times".
 */
#define	CBRA	1		/* \(  + group index */
#define	CCHR	2		/* literal  + the character */
#define	CDOT	4		/* .   any character */
#define	CCL	6		/* [..]  + count byte + members */
#define	NCCL	8		/* [^..] + count byte + members */
#define	CDOL	10		/* $   end of string */
#define	CEOF	11		/* end of the compiled expression */
#define	CKET	12		/* \)  + group index */
#define	CBACK	18		/* \n  + group index */

#define	CSTAR	01

#define	ESIZE	512		/* size of the compiled expression buffer */
#define	NBRA	9		/* maximum number of \( \) pairs */
#define	CCLMAX	255		/* largest value the class count byte holds */

/* abort a compilation: forget the half-built expression, return msg */
#define	comerr(msg) do {expbuf[0] = 0; numbra = 0; return(msg); } while (0)

static char     expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
static char     circf;		/* expression started with ^ */
static int      numbra;		/* number of \( seen in the expression */
static int      advance();
int             backref();
int             cclass();

/*
 * compile the regular expression argument into expbuf
 *
 * Returns 0 on success, or a pointer to an error message.  An empty (or
 * null) pattern keeps the previously compiled expression, group count
 * included, and is an error only if there is none.
 */

char *
re_comp(sp)
char *sp;
{
    int             c;
    char           *ep = expbuf;
    char           *endbuf = &expbuf[ESIZE];
    int             cclcnt;
    char           *lastep = 0;	/* start of the last compiled item */
    char            bracket[NBRA];	/* stack of open group indexes */
    char           *bracketp = &bracket[0];
    static char    *retoolong = "Regular expression too long";

    if (sp == 0 || *sp == '\0') {
	if (*ep == 0)
	    return ("No previous regular expression");
	return (0);
    }
    /* reset only now: an empty pattern must leave the group count of the
     * expression it re-uses untouched */
    numbra = 0;
    if (*sp == '^') {
	circf = 1;
	sp++;
    } else
	circf = 0;
    for (;;) {
	/* Outside of character classes an item stores at most two bytes,
	 * and a successful compilation always ends with the two bytes
	 * CEOF, 0: with fewer than two free bytes it cannot succeed. */
	if (endbuf - ep < 2)
	    comerr(retoolong);
	if ((c = *sp++) == '\0') {
	    if (bracketp != bracket)
		comerr("unmatched \\(");
	    *ep++ = CEOF;
	    *ep++ = 0;
	    return (0);
	}
	if (c != '*')
	    lastep = ep;
	switch (c) {

	case '.':
	    *ep++ = CDOT;
	    continue;

	case '*':
	    /* literal when there is nothing to repeat, or after \( or \) */
	    if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
		goto defchar;
	    *lastep |= CSTAR;
	    continue;

	case '$':
	    /* special only as the very last character */
	    if (*sp != '\0')
		goto defchar;
	    *ep++ = CDOL;
	    continue;

	case '[':
	    *ep++ = CCL;
	    *ep++ = 0;		/* count byte, filled in below */
	    cclcnt = 1;		/* the count includes the count byte */
	    if ((c = *sp++) == '^') {
		c = *sp++;
		ep[-2] = NCCL;
	    }
	    do {
		if (c == '\0')
		    comerr("missing ]");
		/* ep[-1] is 0 only right after the count byte: a leading
		 * '-' is an ordinary member */
		if (c == '-' && ep[-1] != 0) {
		    if ((c = *sp++) == ']') {
			if (ep >= endbuf)
			    comerr(retoolong);
			*ep++ = '-';
			cclcnt++;
			break;
		    }
		    if (c == '\0')	/* pattern ends inside "x-" */
			comerr("missing ]");
		    while (ep[-1] < c) {
			if (ep >= endbuf)
			    comerr(retoolong);
			*ep = ep[-1] + 1;
			ep++;
			cclcnt++;
		    }
		}
		if (ep >= endbuf)
		    comerr(retoolong);
		*ep++ = c;
		cclcnt++;
	    } while ((c = *sp++) != ']');
	    if (cclcnt > CCLMAX)
		comerr("character class too large");
	    /* stored and read back as unsigned char so that counts above
	     * 127 survive where plain char is signed */
	    *(unsigned char *) (lastep + 1) = (unsigned char) cclcnt;
	    continue;

	case '\\':
	    if ((c = *sp++) == '\0')	/* nothing left to escape */
		comerr("trailing \\");
	    if (c == '(') {
		if (numbra >= NBRA)
		    comerr("too many \\(\\) pairs");
		*bracketp++ = numbra;
		*ep++ = CBRA;
		*ep++ = numbra++;
		continue;
	    }
	    if (c == ')') {
		if (bracketp <= bracket)
		    comerr("unmatched \\)");
		*ep++ = CKET;
		*ep++ = *--bracketp;
		continue;
	    }
	    if (c >= '1' && c < ('1' + NBRA)) {
		*ep++ = CBACK;
		*ep++ = c - '1';
		continue;
	    }
	    *ep++ = CCHR;
	    *ep++ = c;
	    continue;

	defchar:
	default:
	    *ep++ = CCHR;
	    *ep++ = c;
	}
    }
}

/*
 * match the argument string against the compiled re
 *
 * Returns 1 if some part of p1 matches, 0 if none does, and -1 if the
 * compiled expression is unusable.  The start and end of every \( \) pair
 * are recorded in braslist / braelist.
 */

int
re_exec(p1)
char *p1;
{
    char           *p2 = expbuf;
    int             c;
    int             rv;

    for (c = 0; c < NBRA; c++) {
	braslist[c] = 0;
	braelist[c] = 0;
    }
    /* anchored expression: only the start of the string can match */
    if (circf)
	return ((advance(p1, p2)));

    /*
     * fast check for first character
     */
    if (*p2 == CCHR) {
	c = p2[1];
	do {
	    if (*p1 != c)
		continue;
	    if ((rv = advance(p1, p2)) != 0)
		return (rv);
	} while (*p1++);
	return (0);
    }

    /*
     * regular algorithm: try every starting position, the empty tail
     * included
     */
    do
	if ((rv = advance(p1, p2)) != 0)
	    return (rv);
    while (*p1++);
    return (0);
}

/*
 * try to match the compiled items starting at ep against the text at lp
 *
 * Returns 1 on a match, 0 on a mismatch, -1 on an unknown opcode or on a
 * back reference to a group that has not been closed yet.
 */

static int
advance(lp, ep)
char *lp, *ep;
{
    char           *curlp;
    int             ct, i;
    int             rv;

    for (;;) {
	switch (*ep++) {

	case CCHR:
	    if (*ep++ == *lp++)
		continue;
	    return (0);

	case CDOT:
	    if (*lp++)
		continue;
	    return (0);

	case CDOL:
	    if (*lp == '\0')
		continue;
	    return (0);

	case CEOF:
	    return (1);

	case CCL:
	    if (cclass(ep, *lp++, 1)) {
		ep += *(unsigned char *) ep;	/* skip count + members */
		continue;
	    }
	    return (0);

	case NCCL:
	    if (cclass(ep, *lp++, 0)) {
		ep += *(unsigned char *) ep;
		continue;
	    }
	    return (0);

	case CBRA:
	    braslist[(int) *ep++] = lp;
	    continue;

	case CKET:
	    braelist[(int) *ep++] = lp;
	    continue;

	case CBACK:
	    if (braelist[i = *ep++] == 0)
		return (-1);
	    if (backref(i, lp)) {
		lp += braelist[i] - braslist[i];
		continue;
	    }
	    return (0);

	case CBACK | CSTAR:
	    if (braelist[i = *ep++] == 0)
		return (-1);
	    ct = braelist[i] - braslist[i];
	    /* any number of copies of an empty group is still empty: go on
	     * with the rest (stepping by 0 would never terminate) */
	    if (ct <= 0)
		continue;
	    curlp = lp;
	    while (backref(i, lp))
		lp += ct;
	    /* longest run of copies first, then give them back one at a
	     * time; once zero copies has failed too, there is no match */
	    for (;;) {
		if ((rv = advance(lp, ep)) != 0)
		    return (rv);
		if (lp == curlp)
		    return (0);
		lp -= ct;
	    }

	case CDOT | CSTAR:
	    curlp = lp;
	    while (*lp++);
	    goto star;

	case CCHR | CSTAR:
	    curlp = lp;
	    while (*lp++ == *ep);
	    ep++;
	    goto star;

	case CCL | CSTAR:
	case NCCL | CSTAR:
	    curlp = lp;
	    while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR)));
	    ep += *(unsigned char *) ep;
	    goto star;

	star:
	    /* lp is one past the first character that stopped the run:
	     * retry the rest of the expression from the longest run down
	     * to the empty one */
	    do {
		lp--;
		if ((rv = advance(lp, ep)) != 0)
		    return (rv);
	    } while (lp > curlp);
	    return (0);

	default:
	    return (-1);
	}
    }
}

/*
 * backref
 *   returns 1 if the text at lp starts with a copy of what group i
 *   matched, 0 otherwise.  An empty group matches anywhere.
 */

int
backref(i, lp)
int i;
char *lp;
{
    char           *bp = braslist[i];
    char           *group_end = braelist[i];

    while (bp < group_end)
	if (*bp++ != *lp++)
	    return (0);
    return (1);
}

/*
 * cclass
 *   looks c up in a compiled character class.  set points at the count
 *   byte (which counts itself) followed by the members.
 *   Returns af if c is a member and !af if it is not; the end-of-string
 *   character 0 never matches, whatever af is.
 */

int
cclass(set, c, af)
char *set, c;
int af;
{
    int             n;

    if (c == 0)
	return (0);
    n = *(unsigned char *) set;	/* counts above 127 must stay positive */
    set++;
    while (--n > 0)
	if (*set++ == c)
	    return (af);
    return (!af);
}
|
|
|
|
int
|
|
re_subst(n, pp)
|
|
int n;
|
|
char **pp;
|
|
{
|
|
int length;
|
|
|
|
if(n > numbra) {
|
|
return (int) wool_error("match: Too many \"(\" asked (%d) ", n);
|
|
}
|
|
*pp = braslist[n-1];
|
|
length = braelist[n-1] - *pp;
|
|
return length;
|
|
}
|