$viewtab = TRUE to visualize hardcoded tabs.

Refactor &lef and &mid with stronger assertion on utf8_to_unicode().
This commit is contained in:
Renaud 2021-08-16 11:05:24 +08:00
parent 946c603a39
commit 2758464a2e
5 changed files with 111 additions and 122 deletions

View File

@ -69,9 +69,11 @@ static int taboff = 0 ; /* tab offset for display */
int mpresf = FALSE ; /* TRUE if message in last line */ int mpresf = FALSE ; /* TRUE if message in last line */
int scrollcount = 1 ; /* number of lines to scroll */ int scrollcount = 1 ; /* number of lines to scroll */
int discmd = TRUE ; /* display command flag */ int discmd = TRUE ; /* display command flag */
int disinp = TRUE ; /* display input characters (echo) */ int disinp = TRUE ; /* display input characters (echo) */
/* global variables */
boolean viewtab = FALSE ; /* $viewtab = TRUE to visualize hardcoded tab */
static int reframe( window_p wp) ; static int reframe( window_p wp) ;
static void updone( window_p wp) ; static void updone( window_p wp) ;
@ -198,11 +200,11 @@ static void vtputc( unicode_t c) {
if( c > 0x10FFFF) /* Let's assume this is due to sign extension */ if( c > 0x10FFFF) /* Let's assume this is due to sign extension */
c &= 0xFF ; c &= 0xFF ;
if( c == '\t') if( c == '\t') {
do { sane_vtputc( viewtab ? 0x226B : ' ') ; /* 0x226B: '≫' */
while( ((vtcol + taboff) % tabwidth) != 0)
sane_vtputc( ' ') ; sane_vtputc( ' ') ;
} while( ((vtcol + taboff) % tabwidth) != 0) ; } else if( c < 0x20 || c == 0x7F) {
else if( c < 0x20 || c == 0x7F) {
sane_vtputc( '^') ; sane_vtputc( '^') ;
sane_vtputc( c ^ 0x40) ; sane_vtputc( c ^ 0x40) ;
} else if( c >= 0x80 && c <= 0xA0) { } else if( c >= 0x80 && c <= 0xA0) {

View File

@ -15,6 +15,9 @@ extern int disinp ; /* display input characters (echo) */
extern int gfcolor ; /* global forgrnd color (white) */ extern int gfcolor ; /* global forgrnd color (white) */
extern int gbcolor ; /* global backgrnd color (black) */ extern int gbcolor ; /* global backgrnd color (black) */
/* global variables */
extern boolean viewtab ; /* $viewtab = TRUE to visualize hardcoded tab */
/* Bindable functions */ /* Bindable functions */
BINDABLE( upscreen) ; BINDABLE( upscreen) ;

93
eval.c
View File

@ -129,6 +129,7 @@ static const char *envars[] = {
"rval", /* child process return value */ "rval", /* child process return value */
"tab", /* tab width, 1... */ "tab", /* tab width, 1... */
"hardtab", /* TRUE for hard coded tab, FALSE for soft ones */ "hardtab", /* TRUE for hard coded tab, FALSE for soft ones */
"viewtab", /* TRUE to visualize hard coded tabs */
"overlap", "overlap",
"jump", "jump",
#if SCROLLCODE #if SCROLLCODE
@ -177,9 +178,10 @@ static const char *envars[] = {
#define EVRVAL 36 #define EVRVAL 36
#define EVTAB 37 #define EVTAB 37
#define EVHARDTAB 38 #define EVHARDTAB 38
#define EVOVERLAP 39 #define EVVIEWTAB 39
#define EVSCROLLCOUNT 40 #define EVOVERLAP 40
#define EVSCROLL 41 #define EVSCROLLCOUNT 41
#define EVSCROLL 42
enum function_type { enum function_type {
NILNAMIC = 0, NILNAMIC = 0,
@ -212,7 +214,7 @@ static struct {
{ "bno", UFBNOT | MONAMIC }, /* bitwise not */ { "bno", UFBNOT | MONAMIC }, /* bitwise not */
{ "bor", UFBOR | DYNAMIC }, /* bitwise or 9-10-87 jwm */ { "bor", UFBOR | DYNAMIC }, /* bitwise or 9-10-87 jwm */
{ "bxo", UFBXOR | DYNAMIC }, /* bitwise xor 9-10-87 jwm */ { "bxo", UFBXOR | DYNAMIC }, /* bitwise xor 9-10-87 jwm */
{ "cat", UFCAT | DYNAMIC }, /* concatinate string */ { "cat", UFCAT | DYNAMIC }, /* concatenate string */
{ "chr", UFCHR | MONAMIC }, /* integer to char conversion */ { "chr", UFCHR | MONAMIC }, /* integer to char conversion */
{ "div", UFDIV | DYNAMIC }, /* division */ { "div", UFDIV | DYNAMIC }, /* division */
{ "env", UFENV | MONAMIC }, /* retrieve a system environment var */ { "env", UFENV | MONAMIC }, /* retrieve a system environment var */
@ -237,7 +239,7 @@ static struct {
{ "sgr", UFSGREAT | DYNAMIC }, /* string logical greater than */ { "sgr", UFSGREAT | DYNAMIC }, /* string logical greater than */
{ "sin", UFSINDEX | DYNAMIC }, /* find the index of one string in another */ { "sin", UFSINDEX | DYNAMIC }, /* find the index of one string in another */
{ "sle", UFSLESS | DYNAMIC }, /* string logical less than */ { "sle", UFSLESS | DYNAMIC }, /* string logical less than */
{ "sub", UFSUB | DYNAMIC }, /* subtraction */ { "sub", UFSUB | DYNAMIC }, /* substraction */
{ "tim", UFTIMES | DYNAMIC }, /* multiplication */ { "tim", UFTIMES | DYNAMIC }, /* multiplication */
{ "tru", UFTRUTH | MONAMIC }, /* Truth of the universe logical test */ { "tru", UFTRUTH | MONAMIC }, /* Truth of the universe logical test */
{ "upp", UFUPPER | MONAMIC }, /* uppercase string */ { "upp", UFUPPER | MONAMIC }, /* uppercase string */
@ -413,19 +415,14 @@ static const char *gtfun( char *fname) {
} }
break ; break ;
case UFLEFT | DYNAMIC: { case UFLEFT | DYNAMIC: {
int sz1, i ; int sz1 = strlen( arg1) ;
sz1 = strlen( arg1) ;
sz = 0 ; sz = 0 ;
for( i = atoi( arg2) ; i > 0 ; i -= 1) { for( int i = atoi( arg2) ; i > 0 ; i -= 1) {
unicode_t c ; unicode_t c ;
int bytc ;
bytc = utf8_to_unicode( arg1, sz, sz1, &c) ; sz += utf8_to_unicode( arg1, sz, sz1, &c) ;
if( bytc == 0) if( sz == sz1)
break ; break ;
else
sz += bytc ;
} }
if( sz >= ressize) { if( sz >= ressize) {
@ -449,26 +446,23 @@ static const char *gtfun( char *fname) {
retstr = strcpy( result, &arg1[ strlen( arg1) - sz]) ; retstr = strcpy( result, &arg1[ strlen( arg1) - sz]) ;
break ; break ;
case UFMID | TRINAMIC: { case UFMID | TRINAMIC: {
int sz1, start, i, bytc ; int i ;
unicode_t c ; unicode_t c ;
sz1 = strlen( arg1) ; int sz1 = strlen( arg1) ;
start = 0 ; int start = 0 ;
for( i = atoi( arg2) - 1 ; i > 0 ; i -= 1) { for( i = atoi( arg2) - 1 ; i > 0 ; i -= 1) {
bytc = utf8_to_unicode( arg1, start, sz1, &c) ; start += utf8_to_unicode( arg1, start, sz1, &c) ;
if( bytc == 0) if( start == sz1)
break ; break ;
else
start += bytc ;
} }
sz = start ; sz = start ;
if( sz < sz1)
for( i = atoi( arg3) ; i > 0 ; i -= 1) { for( i = atoi( arg3) ; i > 0 ; i -= 1) {
bytc = utf8_to_unicode( arg1, sz, sz1, &c) ; sz += utf8_to_unicode( arg1, sz, sz1, &c) ;
if( bytc == 0) if( sz == sz1)
break ; break ;
else
sz += bytc ;
} }
sz -= start ; sz -= start ;
@ -656,8 +650,8 @@ static char *gtusr( char *vname) {
return errorm; return errorm;
} }
/*
* gtenv() /* gtenv()
* *
* char *vname; name of environment variable to retrieve * char *vname; name of environment variable to retrieve
*/ */
@ -665,24 +659,20 @@ static char *gtenv( char *vname) {
unsigned vnum ; /* ordinal number of var referenced */ unsigned vnum ; /* ordinal number of var referenced */
/* scan the list, looking for the referenced name */ /* scan the list, looking for the referenced name */
for (vnum = 0; vnum < ARRAY_SIZE(envars); vnum++) for( vnum = 0 ; vnum < ARRAY_SIZE( envars) ; vnum++)
if (strcmp(vname, envars[vnum]) == 0) if( strcmp( vname, envars[ vnum]) == 0)
break; break ;
/* return errorm on a bad reference */ /* return errorm on a bad reference */
if (vnum == ARRAY_SIZE(envars)) if( vnum == ARRAY_SIZE( envars)) {
#if ENVFUNC #if ENVFUNC
{
char *ename = getenv(vname); char *ename = getenv(vname);
if (ename != NULL) if( ename != NULL)
return ename; return ename ;
else
return errorm;
}
#else
return errorm;
#endif #endif
return errorm ;
}
/* otherwise, fetch the appropriate value */ /* otherwise, fetch the appropriate value */
switch (vnum) { switch (vnum) {
@ -778,6 +768,8 @@ static char *gtenv( char *vname) {
return i_to_a( tabwidth) ; return i_to_a( tabwidth) ;
case EVHARDTAB: case EVHARDTAB:
return ltos( hardtab) ; return ltos( hardtab) ;
case EVVIEWTAB:
return ltos( viewtab) ;
case EVOVERLAP: case EVOVERLAP:
return i_to_a(overlap); return i_to_a(overlap);
case EVSCROLLCOUNT: case EVSCROLLCOUNT:
@ -1100,6 +1092,9 @@ static int svar(struct variable_description *var, char *value)
case EVHARDTAB: case EVHARDTAB:
hardtab = stol( value) ; hardtab = stol( value) ;
break ; break ;
case EVVIEWTAB:
viewtab = stol( value) ;
break ;
case EVOVERLAP: case EVOVERLAP:
overlap = atoi(value); overlap = atoi(value);
break; break;
@ -1368,30 +1363,26 @@ static int ernd( int i) {
return (i <= 0) ? s : s % i + 1 ; return (i <= 0) ? s : s % i + 1 ;
} }
/*
* find pattern within source /* find pattern within source
* *
* char *source; source string to search * char *source; source string to search
* char *pattern; string to look for * char *pattern; string to look for
*/ */
static int sindex( char *source, char *pattern) { static int sindex( char *source, char *pattern) {
char *sp; /* ptr to current position to scan */ /* scanning through the source string */
char *sp = source ; /* ptr to current position to scan */
/* scanning through the source string */
sp = source;
int idx = 1 ; int idx = 1 ;
int pos = 0 ; int pos = 0 ;
int len = strlen( source) ; int len = strlen( source) ;
while (*sp) { while( *sp) {
char *csp; /* ptr to source string during comparison */
char *cp; /* ptr to place to check for equality */
char c ; char c ;
unicode_t uc ; unicode_t uc ;
/* scan through the pattern */ /* scan through the pattern */
cp = pattern; char *cp = pattern ; /* ptr to place to check for equality */
csp = sp; char *csp = sp ; /* ptr to source string during comparison */
while( (c = *cp++) && eq( c, *csp)) while( (c = *cp++) && eq( c, *csp))
csp++ ; csp++ ;
@ -1406,7 +1397,7 @@ static int sindex( char *source, char *pattern) {
} }
/* no match at all.. */ /* no match at all.. */
return 0; return 0 ;
} }
/* /*

121
utf8.c
View File

@ -1,9 +1,8 @@
/* utf8.c -- implements utf8.h, converts between unicode and UTF-8 */ /* utf8.c -- implements utf8.h, conversion between unicode and UTF-8 */
#define _XOPEN_SOURCE /* wcwidth in wchar.h */
#include "utf8.h" #include "utf8.h"
#define _XOPEN_SOURCE /* wcwidth in wchar.h */
#include <assert.h> #include <assert.h>
#include <wchar.h> #include <wchar.h>
@ -11,17 +10,17 @@
/* Display width of UTF-8 character */ /* Display width of UTF-8 character */
int _utf8_width( unicode_t c) { int _utf8_width( unicode_t c) {
#if CYGWIN #if CYGWIN
assert( sizeof( wchar_t) == 2) ; /* wcwidth only supports UTF-16 */ assert( sizeof( wchar_t) == 2) ; /* wcwidth only supports UTF-16 */
return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ; return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
#else #else
return wcwidth( (wchar_t) c) ; return wcwidth( (wchar_t) c) ;
#endif #endif
} }
int utf8_width( unicode_t c) { int utf8_width( unicode_t c) {
int w = _utf8_width( c) ; int w = _utf8_width( c) ;
return (w < 0) ? 2 : w ; /* display \u if can't figure out width */ return (w < 0) ? 2 : w ; /* display \u if can't figure out width */
} }
@ -38,55 +37,49 @@ int utf8_width( unicode_t c) {
* are happily accepted and decoded, as are the various "invalid values". * are happily accepted and decoded, as are the various "invalid values".
*/ */
unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len, unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
unicode_t *res) { unicode_t *res) {
unicode_t value ; assert( index < len) ;
unsigned c ; unsigned c = *res = (unsigned char) line[ index] ;
unsigned bytes, mask, i;
if( index >= len) /* 0xxxxxxx is valid one byte utf8
return 0 ;
*res = c = line[ index] & 0xFFU ;
/*
* 0xxxxxxx is valid one byte utf8
* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1 * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
* 1100000x is start of overlong encoding sequence * 1100000x is start of overlong encoding sequence
* Sequence longer than 4 bytes are invalid * Sequence longer than 4 bytes are invalid
* Last valid code is 0x10FFFF, encoding start with 0xF4 * Last valid code is 0x10FFFF, encoding start with 0xF4
*/ */
if( c <= 0xC1 || c > 0xF4) if( c <= 0xC1 || c > 0xF4)
return 1; return 1 ;
/* Ok, it's 11xxxxxx, do a stupid decode */ /* Ok, it's 11xxxxxx, do a stupid decode */
mask = 0x20; unsigned mask = 0x20 ;
bytes = 2; unsigned bytes = 2 ;
while( (c & mask) != 0) { while( (c & mask) != 0) {
bytes++; bytes++ ;
mask >>= 1; mask >>= 1 ;
} }
/* bytes is in range [2..4] as c was in range [C2..F4] */ /* bytes is in range [2..4] as c was in range [C2..F4] */
len -= index; len -= index ;
if (bytes > len) if( bytes > len)
return 1; return 1 ;
value = c & (mask-1); unicode_t value = c & (mask - 1) ;
/* Ok, do the bytes */ /* Ok, do the bytes */
line += index; line += index ;
for (i = 1; i < bytes; i++) { for( unsigned i = 2 ; i <= bytes ; i++) {
c = line[i] & 0xFFU ; c = (unsigned char) *++line ;
if ((c & 0xc0) != 0x80) if( (c & 0xc0) != 0x80)
return 1; return 1 ;
value = (value << 6) | (c & 0x3f);
value = (value << 6) | (c & 0x3f) ;
} }
if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */ if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
return 1 ; return 1 ;
*res = value; *res = value ;
return bytes; return bytes ;
} }
@ -106,12 +99,12 @@ unsigned unicode_to_utf8( unicode_t c, char *utf8) {
assert( c <= 0x10FFFF) ; assert( c <= 0x10FFFF) ;
#ifdef NDEBUG #ifdef NDEBUG
if( c > 0x10FFFF) /* Let's assume this is due to sign extension */ if( c > 0x10FFFF) /* Let's assume this is due to sign extension */
c &= 0xFF ; c &= 0xFF ;
#endif #endif
if( c <= 0x7f) if( c <= 0x7f)
*utf8 = (char) c ; *utf8 = (char) c ;
else { else {
unsigned prefix = 0x40 ; unsigned prefix = 0x40 ;
char *p = utf8 ; char *p = utf8 ;
@ -122,38 +115,38 @@ unsigned unicode_to_utf8( unicode_t c, char *utf8) {
c >>= 6 ; c >>= 6 ;
} while( c >= prefix) ; } while( c >= prefix) ;
*p-- = *utf8 ; *p-- = *utf8 ;
*utf8++ = (char) (c - 2 * prefix) ; *utf8++ = (char) (c - 2 * prefix) ;
if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */ if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */
char c = *p ; char c = *p ;
*p = *utf8 ; *p = *utf8 ;
*utf8 = c ; *utf8 = c ;
} }
} }
return bytes ; return bytes ;
} }
unsigned utf8_revdelta( unsigned char *p, unsigned pos) { unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
unsigned delta = 0 ; unsigned delta = 0 ;
if( (*p & 0xC0) == 0x80) { if( (*p & 0xC0) == 0x80) {
unsigned char c ; unsigned char c ;
c = *--p ; c = *--p ;
if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */ if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */
delta = 1 ; delta = 1 ;
else if( ((c & 0xC0) == 0x80) && (pos > 1)) { else if( ((c & 0xC0) == 0x80) && (pos > 1)) {
c = *--p ; c = *--p ;
if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */ if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */
delta = 2 ; delta = 2 ;
else if( ((c & 0xC0) == 0x80) && (pos > 2)) else if( ((c & 0xC0) == 0x80) && (pos > 2))
if( (p[ -1] & 0xF8) == 0xF0) /* valid 4 bytes unicode seq */ if( (p[ -1] & 0xF8) == 0xF0) /* valid 4 bytes unicode seq */
delta = 3 ; delta = 3 ;
} }
} }
return delta ; return delta ;
} }

2
utf8.h
View File

@ -1,4 +1,4 @@
/* utf8.h -- */ /* utf8.h -- conversion between unicode and UTF-8 */
#ifndef _UTF8_H_ #ifndef _UTF8_H_
#define _UTF8_H_ #define _UTF8_H_