From 7b544bbd35fc5eb9cdab2235006fd09e9f94cc03 Mon Sep 17 00:00:00 2001 From: John Zaitseff Date: Sat, 20 Aug 2011 13:03:40 +1000 Subject: [PATCH] Add the functions xwcrtomb() and xmbstowcs() The xwcrtomb() and xmbstowcs() functions replace illegal characters with EILSEQ_REPL ('?'). Adjust other functions, particularly mkchstr() and friends, to suit. --- src/intf.c | 114 +++++++++++++++------------------------------------ src/system.h | 1 + src/utils.c | 76 ++++++++++++++++++++++++++++++++++ src/utils.h | 33 +++++++++++++++ 4 files changed, 143 insertions(+), 81 deletions(-) diff --git a/src/intf.c b/src/intf.c index 09b6501..46cc3e8 100644 --- a/src/intf.c +++ b/src/intf.c @@ -1069,68 +1069,51 @@ int mkchstr_parse (const wchar_t *restrict format, void mkchstr_conv (chtype *restrict chbuf, int chbufsize, wchar_t *restrict wcbuf, chtype *restrict attrbuf) { - char *convbuf = xmalloc(chbufsize); - mbstate_t mbstate; - wchar_t *wp; + char convbuf[MB_LEN_MAX + 1]; + char endbuf[MB_LEN_MAX]; + mbstate_t mbstate, mbcopy; + size_t endsize, n; char *p; bool done; - size_t n; - /* Perform a preliminary conversion to weed out any problems with - EILSEQ and insufficient buffer space. */ - while (true) { - memset(&mbstate, 0, sizeof(mbstate)); - wp = wcbuf; - if (wcsrtombs(convbuf, (const wchar_t **) &wp, chbufsize, &mbstate) - == (size_t) -1) { - if (errno == EILSEQ) { - /* Replace problematic wide characters with a known-good - (ASCII) one. This is better than terminating! */ - *wp = EILSEQ_REPL; - } else { - errno_exit(_("mkchstr_conv: `%ls'"), wcbuf); - } - } else if (wp != NULL) { - // convbuf is too small: truncate wcbuf if possible - if (wp == wcbuf) { - errno = E2BIG; - errno_exit(_("mkchstr_conv: `%ls'"), wcbuf); - } else { - *(wp - 1) = '\0'; - } - } else { - // wcbuf CAN fit into convbuf when converted - break; - } - } - - // Convert for real, combining each multibyte character with attrbuf memset(&mbstate, 0, sizeof(mbstate)); done = false; while (! 
done) { - // Yes, we want to convert a wide NUL, too! - if ((n = wcrtomb(convbuf, *wcbuf, &mbstate)) == (size_t) -1) { - errno_exit(_("mkchstr_conv: `%ls'"), wcbuf); + // Make sure we always have enough space for ending shift sequence + memcpy(&mbcopy, &mbstate, sizeof(mbstate)); + endsize = wcrtomb(endbuf, '\0', &mbcopy); + if (endsize == (size_t) -1) { + errno_exit(_("mkchstr_conv: NUL")); } - for (p = convbuf; n > 0; n--, p++, chbuf++) { - if (*p == '\0' || *p == '\n') { - /* This code assumes '\n' can never appear in a multibyte - string except as a control character---which is true - of all multibyte encodings (I believe!) */ - *chbuf = (unsigned char) *p; - } else { - *chbuf = (unsigned char) *p | *attrbuf; + // Yes, we want to convert a wide NUL, too! + n = xwcrtomb(convbuf, *wcbuf, &mbstate); + + if (chbufsize > endsize + n) { + for (p = convbuf; n > 0; n--, p++, chbuf++, chbufsize--) { + if (*p == '\0' || *p == '\n') { + /* This code assumes '\n' can never appear in a + multibyte string except as a control character--- + which is true of all multibyte encodings (I + believe!) 
*/ + *chbuf = (unsigned char) *p; + } else { + *chbuf = (unsigned char) *p | *attrbuf; + } } + } else { + // Not enough space for *wcbuf, so terminate chbuf early + for (p = endbuf; endsize > 0; endsize--, p++, chbuf++) { + *chbuf = (unsigned char) *p; + } + break; } done = (*wcbuf == '\0'); wcbuf++; attrbuf++; } - - free(convbuf); } @@ -1147,7 +1130,6 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, struct convspec *spec; const wchar_t *wcformat; wchar_t *orig_wcformat; - mbstate_t mbstate; wchar_t *outbuf, *orig_outbuf; chtype *attrbuf, *orig_attrbuf; @@ -1176,16 +1158,7 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, fmtbuf = xmalloc(BUFSIZE * sizeof(wchar_t)); // Convert format to a wide-character string - { - memset(&mbstate, 0, sizeof(mbstate)); - const char *p = format; - if (mbsrtowcs(orig_wcformat, &p, BUFSIZE, &mbstate) == (size_t) -1) { - goto error; - } else if (p != NULL) { - errno = E2BIG; - goto error; - } - } + xmbstowcs(orig_wcformat, format, BUFSIZE); if (mkchstr_parse(wcformat, format_arg, format_spec, args) < 0) { goto error; @@ -1269,8 +1242,7 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, } if (wc == '\0' || wc == WEOF) { - errno = EILSEQ; - goto error; + wc = EILSEQ_REPL; } fmtbuf[0] = wc; @@ -1319,7 +1291,6 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, /* strfmon() is not available in a wide-char version, so we need a multibyte char buffer */ char *buf = xmalloc(BUFSIZE); - const char *p = buf; if (l_strfmon(buf, BUFSIZE, spec->flag_nosym ? 
"%!n" : "%n", format_arg[spec->arg_num].a.a_double) < 0) { @@ -1329,19 +1300,7 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, goto error; } - memset(&mbstate, 0, sizeof(mbstate)); - if (mbsrtowcs(fmtbuf, &p, BUFSIZE, &mbstate) - == (size_t) -1) { - saved_errno = errno; - free(buf); - errno = saved_errno; - goto error; - } else if (p != NULL) { - free(buf); - errno = E2BIG; - goto error; - } - + xmbstowcs(fmtbuf, buf, BUFSIZE); free(buf); } @@ -1357,14 +1316,7 @@ int vmkchstr (chtype *restrict chbuf, int chbufsize, chtype attr_norm, if (p == NULL) { str = NULL; } else { - memset(&mbstate, 0, sizeof(mbstate)); - if (mbsrtowcs(fmtbuf, &p, BUFSIZE, &mbstate) - == (size_t) -1) { - goto error; - } else if (p != NULL) { - errno = E2BIG; - goto error; - } + xmbstowcs(fmtbuf, p, BUFSIZE); str = fmtbuf; } } diff --git a/src/system.h b/src/system.h index 4b1732a..c9342a7 100644 --- a/src/system.h +++ b/src/system.h @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include diff --git a/src/utils.c b/src/utils.c index 7b0adac..3d9cfd5 100644 --- a/src/utils.c +++ b/src/utils.c @@ -507,5 +507,81 @@ wchar_t *xwcsdup (const wchar_t *str) } +/***********************************************************************/ +// xmbstowcs: Convert a multibyte string to a wide-character string + +size_t xmbstowcs (wchar_t *restrict dest, const char *restrict src, size_t len) +{ + assert(dest != NULL); + assert(len > 0); + + char *s = xstrdup(src); + size_t n; + + while (true) { + mbstate_t mbstate; + char *p = s; + + memset(&mbstate, 0, sizeof(mbstate)); + if ((n = mbsrtowcs(dest, (const char **) &p, len, &mbstate)) + == (size_t) -1) { + if (errno == EILSEQ) { + // Illegal sequence detected: replace it and try again + *p = EILSEQ_REPL; + } else { + errno_exit(_("xmbstowcs: `%s'"), src); + } + } else if (p != NULL) { + // Multibyte string was too long: truncate dest + dest[len - 1] = '\0'; + n--; + break; + } else { + break; + } + } + + 
free(s); + return n; +} + + +/***********************************************************************/ +// xwcrtomb: Convert a wide character to a multibyte sequence + +size_t xwcrtomb (char *restrict dest, wchar_t wc, mbstate_t *restrict mbstate) +{ + mbstate_t mbcopy; + size_t n; + + + assert(dest != NULL); + assert(mbstate != NULL); + + memcpy(&mbcopy, mbstate, sizeof(mbcopy)); + + if ((n = wcrtomb(dest, wc, &mbcopy)) == (size_t) -1) { + if (errno == EILSEQ) { + /* wc cannot be represented in current locale. + + Note that the shift state in mbcopy is now undefined. + Hence, restore the original, try to store an ending shift + sequence, then EILSEQ_REPL. */ + memcpy(&mbcopy, mbstate, sizeof(mbcopy)); + if ((n = wcrtomb(dest, '\0', &mbcopy)) == (size_t) -1) { + errno_exit(_("xwcrtomb: NUL")); + } + dest[n] = EILSEQ_REPL; + dest[n++] = '\0'; + } else { + errno_exit(_("xwcrtomb: `%lc'"), wc); + } + } + + memcpy(mbstate, &mbcopy, sizeof(mbcopy)); + return n; +} + + /***********************************************************************/ // End of file diff --git a/src/utils.h b/src/utils.h index 88699ba..6d0079f 100644 --- a/src/utils.h +++ b/src/utils.h @@ -343,4 +343,37 @@ extern char *xstrdup (const char *str); extern wchar_t *xwcsdup (const wchar_t *str); +/* + Function: xmbstowcs - Convert a multibyte string to a wide-character string + Parameters: dest - Location of wide-string buffer + src - String to convert + len - Size of dest, in multiples of wchar_t + Returns: size_t - Number of characters placed in dest (excluding NUL) + + This wrapper function converts a multibyte string to a wide-character + one by calling mbrtowc() continually until the whole string is + converted. If any illegal sequences are present, they are converted to + the EILSEQ_REPL character. If the destination buffer is too small, the + string is truncated. 
+*/ +extern size_t xmbstowcs (wchar_t *restrict dest, const char *restrict src, + size_t len); + + +/* + Function: xwcrtomb - Convert a wide character to a multibyte sequence + Parameters: dest - Location of multibyte buffer (size >= MB_CUR_MAX + 1) + wc - Character to convert + mbstate - Pointer to current multibyte shift state + Returns: size_t - Number of bytes placed in dest + + This wrapper function converts the wide character in wc (which may be + NUL) by calling wcrtomb(). If wc cannot be represented in the current + locale, EILSEQ_REPL is used instead (preceded by any bytes needed to + return to the initial shift state). +*/ +extern size_t xwcrtomb (char *restrict dest, wchar_t wc, + mbstate_t *restrict mbstate); + + #endif /* included_UTILS_H */