From 216495188a0b326dd0524635806e983a6c8913eb Mon Sep 17 00:00:00 2001 From: Kalle Olavi Niemitalo Date: Sat, 2 Sep 2006 18:28:31 +0300 Subject: [PATCH] UTF-8: New functions for stepping forward and backward in a string. --- src/intl/charsets.c | 140 ++++++++++++++++++++++++++++++++++++++++++++ src/intl/charsets.h | 20 +++++++ 2 files changed, 160 insertions(+) diff --git a/src/intl/charsets.c b/src/intl/charsets.c index 11aedf28..edd9e484 100644 --- a/src/intl/charsets.c +++ b/src/intl/charsets.c @@ -376,6 +376,146 @@ utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end) return bytes; } +/* Take @max steps forward from @string in the specified @way, but + * not going past @end. Return the resulting address. Store the + * number of steps taken to *@count, unless @count is NULL. + * + * This assumes the text is valid UTF-8, and @string and @end point to + * character boundaries. If not, it doesn't crash but the results may + * be inconsistent. + * + * This function can do some of the same jobs as utf8charlen(), + * utf8_cells2bytes(), and strlen_utf8(). */ +unsigned char * +utf8_step_forward(unsigned char *string, unsigned char *end, + int max, enum utf8_step way, int *count) +{ + int steps = 0; + unsigned char *current = string; + + assert(string); + assert(max >= 0); + if_assert_failed return string; + if (end == NULL) + end = strchr(string, '\0'); + + switch (way) { + case utf8_step_characters: + while (steps < max && current < end) { + ++current; + if (utf8_islead(*current)) + ++steps; + } + break; + + case utf8_step_cells_fewer: + case utf8_step_cells_more: + while (steps < max) { + unicode_val_T u; + unsigned char *prev = current; + int width; + + u = utf_8_to_unicode(¤t, end); + if (u == UCS_NO_CHAR) { + /* Assume the incomplete sequence + * costs one cell. */ + current = end; + ++steps; + break; + } + + width = unicode_to_cell(u); + if (way == utf8_step_cells_fewer + && steps + width > max) { + /* Back off. 
*/ + current = prev; + break; + } + steps += width; + } + break; + + default: + INTERNAL("impossible enum utf8_step"); + } + + if (count) + *count = steps; + return current; +} + +/* Take @max steps backward from @string in the specified @way, but + * not going past @start. Return the resulting address. Store the + * number of steps taken to *@count, unless @count is NULL. + * + * This assumes the text is valid UTF-8, and @string and @start point + * to character boundaries. If not, it doesn't crash but the results + * may be inconsistent. + * + * This function can do some of the same jobs as utf8_prevchar(). */ +unsigned char * +utf8_step_backward(unsigned char *string, unsigned char *start, + int max, enum utf8_step way, int *count) +{ + int steps = 0; + unsigned char *current = string; + + assert(string); + assert(start); + assert(max >= 0); + if_assert_failed return string; + + switch (way) { + case utf8_step_characters: + while (steps < max && current > start) { + --current; + if (utf8_islead(*current)) + ++steps; + } + break; + + case utf8_step_cells_fewer: + case utf8_step_cells_more: + while (steps < max) { + unsigned char *prev = current; + unsigned char *look; + unicode_val_T u; + int width; + + if (current <= start) + break; + do { + --current; + } while (current > start && !utf8_islead(*current)); + + look = current; + u = utf_8_to_unicode(&look, prev); + if (u == UCS_NO_CHAR) { + /* Assume the incomplete sequence + * costs one cell. */ + width = 1; + } else + width = unicode_to_cell(u); + + if (way == utf8_step_cells_fewer + && steps + width > max) { + /* Back off. */ + current = prev; + break; + } + steps += width; + } + break; + + default: + INTERNAL("impossible enum utf8_step"); + } + + if (count) + *count = steps; + return current; +} + /* * Find out number of standard terminal collumns needed for displaying symbol * (glyph) which represents Unicode character c. 
/*
 * Declarations for src/intl/charsets.h.  They belong after the
 * prototypes of utf8_char2cells(), utf8_ptr2cells(), utf8_ptr2chars()
 * and utf8_cells2bytes(), and before those of unicode_to_cell(),
 * unicode_fold_label_case() and strlen_utf8().
 */

/* How utf8_step_forward and utf8_step_backward count steps. */
enum utf8_step {
	/* Each step is one character, even if it is a combining or
	 * double-width character. */
	utf8_step_characters,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character, do not
	 * include the character. */
	utf8_step_cells_fewer,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character,
	 * include the whole character. */
	utf8_step_cells_more
};

/* Take at most @max steps forward from @string, counted according to
 * @way, without going past @end.  Returns the resulting address and
 * stores the number of steps taken to *@count unless @count is NULL. */
unsigned char *utf8_step_forward(unsigned char *string, unsigned char *end,
				 int max, enum utf8_step way, int *count);

/* Take at most @max steps backward from @string, counted according to
 * @way, without going past @start.  Returns the resulting address and
 * stores the number of steps taken to *@count unless @count is NULL. */
unsigned char *utf8_step_backward(unsigned char *string, unsigned char *start,
				  int max, enum utf8_step way, int *count);