$viewtab = TRUE to visualize hardcoded tabs.

Refactor &lef and &mid with stronger assertion on utf8_to_unicode().
2025-07-05 16:37:38 -04:00 · 2021-08-16 11:05:24 +08:00 · 2021-08-16 11:05:24 +08:00 · 2758464a2e
commit 2758464a2e
parent 946c603a39
5 changed files with 111 additions and 122 deletions
--- a/display.c
+++ b/display.c
@@ -69,9 +69,11 @@ static int taboff = 0 ;	/* tab offset for display       */
 int mpresf = FALSE ;	/* TRUE if message in last line */
 int scrollcount = 1 ;	/* number of lines to scroll */
-int discmd = TRUE ;	/* display command flag         */
+int discmd = TRUE ;		/* display command flag         */
-int disinp = TRUE ;	/* display input characters (echo)	*/
+int disinp = TRUE ;		/* display input characters (echo)	*/
 /* global variables */
 boolean viewtab = FALSE ;	/* $viewtab = TRUE to visualize hardcoded tab */
 static int reframe( window_p wp) ;
 static void updone( window_p wp) ;
@@ -198,11 +200,11 @@ static void vtputc( unicode_t c) {
 	if( c > 0x10FFFF)	/* Let's assume this is due to sign extension */
 		c &= 0xFF ;
-	if( c == '\t')
+	if( c == '\t') {
-		do {
+		sane_vtputc( viewtab ? 0x226B : ' ') ;	/* 0x226B: '≫' */
 		while( ((vtcol + taboff) % tabwidth) != 0)
 			sane_vtputc( ' ') ;
-		} while( ((vtcol + taboff) % tabwidth) != 0) ;
+	} else if( c < 0x20 || c == 0x7F) {
 	else if( c < 0x20 || c == 0x7F) {
 		sane_vtputc( '^') ;
 		sane_vtputc( c ^ 0x40) ;
 	} else if( c >= 0x80 && c <= 0xA0) {
--- a/display.h
+++ b/display.h
@@ -15,6 +15,9 @@ extern int disinp ;         /* display input characters (echo)  */
 extern int gfcolor ;        /* global forgrnd color (white) */
 extern int gbcolor ;        /* global backgrnd color (black) */
 /* global variables */
 extern boolean viewtab ;    /* $viewtab = TRUE to visualize hardcoded tab */
 /* Bindable functions */
 BINDABLE( upscreen) ;
--- a/eval.c
+++ b/eval.c
@@ -129,6 +129,7 @@ static const char *envars[] = {
 	"rval",			/* child process return value */
 	"tab",			/* tab width, 1... */
 	"hardtab",		/* TRUE for hard coded tab, FALSE for soft ones */
 	"viewtab",		/* TRUE to visualize hard coded tabs */
 	"overlap",
 	"jump",
 #if SCROLLCODE
@@ -177,9 +178,10 @@ static const char *envars[] = {
 #define	EVRVAL		36
 #define EVTAB		37
 #define EVHARDTAB	38
-#define EVOVERLAP	39
+#define EVVIEWTAB	39
-#define EVSCROLLCOUNT	40
+#define EVOVERLAP	40
-#define EVSCROLL	41
+#define EVSCROLLCOUNT	41
 #define EVSCROLL	42
 enum function_type {
 	NILNAMIC	= 0,
@@ -212,7 +214,7 @@ static struct {
 	{ "bno", UFBNOT		| MONAMIC },	/* bitwise not */
 	{ "bor", UFBOR		| DYNAMIC },	/* bitwise or    9-10-87  jwm */
 	{ "bxo", UFBXOR		| DYNAMIC },	/* bitwise xor   9-10-87  jwm */
-	{ "cat", UFCAT		| DYNAMIC },	/* concatinate string */
+	{ "cat", UFCAT		| DYNAMIC },	/* concatenate string */
 	{ "chr", UFCHR		| MONAMIC },	/* integer to char conversion */
 	{ "div", UFDIV		| DYNAMIC },	/* division */
 	{ "env", UFENV		| MONAMIC },	/* retrieve a system environment var */
@@ -237,7 +239,7 @@ static struct {
 	{ "sgr", UFSGREAT	| DYNAMIC },	/* string logical greater than */
 	{ "sin", UFSINDEX	| DYNAMIC },	/* find the index of one string in another */
 	{ "sle", UFSLESS	| DYNAMIC },	/* string logical less than */
-	{ "sub", UFSUB		| DYNAMIC },	/* subtraction */
+	{ "sub", UFSUB		| DYNAMIC },	/* substraction */
 	{ "tim", UFTIMES	| DYNAMIC },	/* multiplication */
 	{ "tru", UFTRUTH	| MONAMIC },	/* Truth of the universe logical test */
 	{ "upp", UFUPPER	| MONAMIC },	/* uppercase string */
@@ -413,19 +415,14 @@ static const char *gtfun( char *fname) {
 	}
 		break ;
 	case UFLEFT | DYNAMIC: {
-		int	sz1, i ;
+		int sz1 = strlen( arg1) ;
 		sz1 = strlen( arg1) ;
 		sz = 0 ;
-		for( i = atoi( arg2) ; i > 0 ; i -= 1) {
+		for( int i = atoi( arg2) ; i > 0 ; i -= 1) {
 			unicode_t c ;
 			int bytc ;
-			bytc = utf8_to_unicode( arg1, sz, sz1, &c) ;
+			sz += utf8_to_unicode( arg1, sz, sz1, &c) ;
-			if( bytc == 0)
+			if( sz == sz1)
 				break ;
 			else
 				sz += bytc ;
 		}
 		if( sz >= ressize) {
@@ -449,26 +446,23 @@ static const char *gtfun( char *fname) {
 		retstr = strcpy( result, &arg1[ strlen( arg1) - sz]) ;
 		break ;
 	case UFMID | TRINAMIC: {
-		int sz1, start, i, bytc ;
+		int i ;
 		unicode_t c ;
-		sz1 = strlen( arg1) ;
+		int sz1 = strlen( arg1) ;
-		start = 0 ;
+		int start = 0 ;
 		for( i = atoi( arg2) - 1 ; i > 0 ; i -= 1) {
-			bytc = utf8_to_unicode( arg1, start, sz1, &c) ;
+			start +=  utf8_to_unicode( arg1, start, sz1, &c) ;
-			if( bytc == 0)
+			if( start == sz1)
 				break ;
 			else
 				start += bytc ;
 		}
 		sz = start ;
 		if( sz < sz1)
 		for( i = atoi( arg3) ; i > 0 ; i -= 1) {
-			bytc = utf8_to_unicode( arg1, sz, sz1, &c) ;
+			sz += utf8_to_unicode( arg1, sz, sz1, &c) ;
-			if( bytc == 0)
+			if( sz == sz1)
 				break ;
 			else
 				sz += bytc ;
 		}
 		sz -= start ;
@@ -656,8 +650,8 @@ static char *gtusr( char *vname) {
 	return errorm;
 }
-/*
+
- * gtenv()
+/* gtenv()
 *
 * char *vname;			name of environment variable to retrieve
 */
@@ -665,24 +659,20 @@ static char *gtenv( char *vname) {
 	unsigned vnum ;	/* ordinal number of var referenced */
 	/* scan the list, looking for the referenced name */
-	for (vnum = 0; vnum < ARRAY_SIZE(envars); vnum++)
+	for( vnum = 0 ; vnum < ARRAY_SIZE( envars) ; vnum++)
-		if (strcmp(vname, envars[vnum]) == 0)
+		if( strcmp( vname, envars[ vnum]) == 0)
-			break;
+			break ;
 	/* return errorm on a bad reference */
-	if (vnum == ARRAY_SIZE(envars))
+	if( vnum == ARRAY_SIZE( envars)) {
 #if	ENVFUNC
 	{
 		char *ename = getenv(vname);
-		if (ename != NULL)
+		if( ename != NULL)
-			return ename;
+			return ename ;
 		else
 			return errorm;
 	}
 #else
 		return errorm;
 #endif
 		return errorm ;
 	}
 	/* otherwise, fetch the appropriate value */
 	switch (vnum) {
@@ -778,6 +768,8 @@ static char *gtenv( char *vname) {
 		return i_to_a( tabwidth) ;
 	case EVHARDTAB:
 		return ltos( hardtab) ;
 	case EVVIEWTAB:
 		return ltos( viewtab) ;
 	case EVOVERLAP:
 		return i_to_a(overlap);
 	case EVSCROLLCOUNT:
@@ -1100,6 +1092,9 @@ static int svar(struct variable_description *var, char *value)
 		case EVHARDTAB:
 			hardtab = stol( value) ;
 			break ;
 		case EVVIEWTAB:
 			viewtab = stol( value) ;
 			break ;
 		case EVOVERLAP:
 			overlap = atoi(value);
 			break;
@@ -1368,30 +1363,26 @@ static int ernd( int i) {
 	return (i <= 0) ? s : s % i + 1 ;
 }
-/*
+
- * find pattern within source
+/* find pattern within source
 *
 * char *source;	source string to search
 * char *pattern;	string to look for
 */
 static int sindex( char *source, char *pattern) {
-	char *sp;		/* ptr to current position to scan */
+/* scanning through the source string */
-
+	char *sp = source ;		/* ptr to current position to scan */
 	/* scanning through the source string */
 	sp = source;
 	int idx = 1 ;
 	int pos = 0 ;
 	int len = strlen( source) ;
-	while (*sp) {
+	while( *sp) {
 		char *csp;		/* ptr to source string during comparison */
 		char *cp;		/* ptr to place to check for equality */
 		char c ;
 		unicode_t uc ;
 		/* scan through the pattern */
-		cp = pattern;
+		char *cp = pattern ;		/* ptr to place to check for equality */
-		csp = sp;
+		char *csp = sp ;			/* ptr to source string during comparison */
 		while( (c = *cp++) && eq( c, *csp))
 			csp++ ;
@@ -1406,7 +1397,7 @@ static int sindex( char *source, char *pattern) {
 	}
 	/* no match at all.. */
-	return 0;
+	return 0 ;
 }
 /*
--- a/utf8.c
+++ b/utf8.c
@@ -1,9 +1,8 @@
-/* utf8.c -- implements utf8.h, converts between unicode and UTF-8 */
+/* utf8.c -- implements utf8.h, conversion between unicode and UTF-8 */
 #define _XOPEN_SOURCE	/* wcwidth in wchar.h */
 #include "utf8.h"
 #define _XOPEN_SOURCE   /* wcwidth in wchar.h */
 #include <assert.h>
 #include <wchar.h>
@@ -11,17 +10,17 @@
 /* Display width of UTF-8 character */
 int _utf8_width( unicode_t c) {
 #if CYGWIN
-	assert( sizeof( wchar_t) == 2) ;	/* wcwidth only supports UTF-16 */
+    assert( sizeof( wchar_t) == 2) ;    /* wcwidth only supports UTF-16 */
-	return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
+    return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
 #else
-	return wcwidth( (wchar_t) c) ;
+    return wcwidth( (wchar_t) c) ;
 #endif
 }
 int utf8_width( unicode_t c) {
-	int w = _utf8_width( c) ;
+    int w = _utf8_width( c) ;
-	return (w < 0) ? 2 : w ;		/* display \u if can't figure out width */
+    return (w < 0) ? 2 : w ;        /* display \u if can't figure out width */
 }
@@ -38,55 +37,49 @@ int utf8_width( unicode_t c) {
 * are happily accepted and decoded, as are the various "invalid values".
 */
 unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
-															unicode_t *res) {
+                                                            unicode_t *res) {
-    unicode_t   value ;
+    assert( index < len) ;
-    unsigned	c ;
+    unsigned c = *res = (unsigned char) line[ index] ;
    unsigned	bytes, mask, i;
-	if( index >= len)
+    /* 0xxxxxxx is valid one byte utf8
 		return 0 ;
    *res = c = line[ index] & 0xFFU ;
    /*
     * 0xxxxxxx is valid one byte utf8
     * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
     * 1100000x is start of overlong encoding sequence
     * Sequence longer than 4 bytes are invalid
     * Last valid code is 0x10FFFF, encoding start with 0xF4
     */
    if( c <= 0xC1 || c > 0xF4)
-        return 1;
+        return 1 ;
    /* Ok, it's 11xxxxxx, do a stupid decode */
-    mask = 0x20;
+    unsigned mask = 0x20 ;
-    bytes = 2;
+    unsigned bytes = 2 ;
    while( (c & mask) != 0) {
-        bytes++;
+        bytes++ ;
-        mask >>= 1;
+        mask >>= 1 ;
    }
-	/* bytes is in range [2..4] as c was in range [C2..F4] */
+    /* bytes is in range [2..4] as c was in range [C2..F4] */
-    len -= index;
+    len -= index ;
-    if (bytes > len)
+    if( bytes > len)
-        return 1;
+        return 1 ;
-    value = c & (mask-1);
+    unicode_t value = c & (mask - 1) ;
    /* Ok, do the bytes */
-    line += index;
+    line += index ;
-    for (i = 1; i < bytes; i++) {
+    for( unsigned i = 2 ; i <= bytes ; i++) {
-        c = line[i] & 0xFFU ;
+        c = (unsigned char) *++line ;
-        if ((c & 0xc0) != 0x80)
+        if( (c & 0xc0) != 0x80)
-            return 1;
+            return 1 ;
-        value = (value << 6) | (c & 0x3f);
+
        value = (value << 6) | (c & 0x3f) ;
    }
    if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
        return 1 ;
-    *res = value;
+    *res = value ;
-    return bytes;
+    return bytes ;
 }
@@ -106,12 +99,12 @@ unsigned unicode_to_utf8( unicode_t c, char *utf8) {
    assert( c <= 0x10FFFF) ;
 #ifdef NDEBUG
-	if( c > 0x10FFFF)	/* Let's assume this is due to sign extension */
+    if( c > 0x10FFFF)   /* Let's assume this is due to sign extension */
-		c &= 0xFF ;
+        c &= 0xFF ;
 #endif
    if( c <= 0x7f)
-	    *utf8 = (char) c ;
+        *utf8 = (char) c ;
    else {
        unsigned prefix = 0x40 ;
        char *p = utf8 ;
@@ -122,38 +115,38 @@ unsigned unicode_to_utf8( unicode_t c, char *utf8) {
            c >>= 6 ;
        } while( c >= prefix) ;
-		*p-- = *utf8 ;
+        *p-- = *utf8 ;
-		*utf8++ = (char) (c - 2 * prefix) ;
+        *utf8++ = (char) (c - 2 * prefix) ;
-		if( utf8 < p) {	/* swap middle two bytes if 4 bytes utf-8 code */
+        if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */
-			char c = *p ;
+            char c = *p ;
-			*p = *utf8 ;
+            *p = *utf8 ;
-			*utf8 = c ;
+            *utf8 = c ;
-		}
+        }
    }
    return bytes ;
 }
 unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
-	unsigned delta = 0 ;
+    unsigned delta = 0 ;
-	if( (*p & 0xC0) == 0x80) {
+    if( (*p & 0xC0) == 0x80) {
-		unsigned char c ;
+        unsigned char c ;
-		c = *--p ;
+        c = *--p ;
-		if( (c & 0xE0) == 0xC0)	/* valid 2 bytes unicode seq */
+        if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */
-			delta = 1 ;
+            delta = 1 ;
-		else if( ((c & 0xC0) == 0x80) && (pos > 1)) {
+        else if( ((c & 0xC0) == 0x80) && (pos > 1)) {
-			c = *--p ;
+            c = *--p ;
-			if( (c & 0xF0) == 0xE0)	/* valid 3 bytes unicode seq */
+            if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */
-				delta = 2 ;
+                delta = 2 ;
-			else if( ((c & 0xC0) == 0x80) && (pos > 2))
+            else if( ((c & 0xC0) == 0x80) && (pos > 2))
-				if( (p[ -1] & 0xF8) == 0xF0)	/* valid 4 bytes unicode seq */
+                if( (p[ -1] & 0xF8) == 0xF0)    /* valid 4 bytes unicode seq */
-					delta = 3 ;
+                    delta = 3 ;
-		}
+        }
-	}
+    }
-	return delta ;
+    return delta ;
 }
--- a/utf8.h
+++ b/utf8.h
@@ -1,4 +1,4 @@
-/* utf8.h -- */
+/* utf8.h -- conversion between unicode and UTF-8 */
 #ifndef _UTF8_H_
 #define _UTF8_H_