UTF-8: New functions for stepping forward and backward in a string.

2025-02-02 15:09:23 -05:00 · 2006-09-02 18:28:31 +03:00 · 2006-09-02 18:28:31 +03:00 · 216495188a
commit 216495188a
parent a8c573a174
2 changed files with 160 additions and 0 deletions
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@ -376,6 +376,146 @@ utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 	return bytes;
 }
 /* Take @max steps forward from @string in the specified @way, but
 * not going past @end.  Return the resulting address.  Store the
 * number of steps taken to *@count, unless @count is NULL.
 *
 * @string	Where to start.  Must not be NULL.
 * @end		Upper bound for the returned address.  If NULL, the
 *		terminating NUL of @string is used instead.
 * @max		Maximum number of steps; must not be negative.
 * @way		Whether a step is a character or a cell, and how to
 *		round when a character is wider than the steps left.
 * @count	Optional output for the number of steps taken.  With
 *		utf8_step_cells_more it can exceed @max.
 *
 * If an assertion on the arguments fails, @string is returned and
 * *@count is left untouched.
 *
 * This assumes the text is valid UTF-8, and @string and @end point to
 * character boundaries.  If not, it doesn't crash but the results may
 * be inconsistent.  The byte at @end itself is never read, and no
 * step is counted once @end has been reached.
 *
 * This function can do some of the same jobs as utf8charlen(),
 * utf8_cells2bytes(), and strlen_utf8().  */
 unsigned char *
 utf8_step_forward(unsigned char *string, unsigned char *end,
 		  int max, enum utf8_step way, int *count)
 {
 	int steps = 0;
 	unsigned char *current = string;
 	assert(string);
 	assert(max >= 0);
 	if_assert_failed return string;
 	if (end == NULL)
 		end = (unsigned char *) strchr((const char *) string, '\0');
 	switch (way) {
 	case utf8_step_characters:
 		while (steps < max && current < end) {
 			++current;
 			/* Arriving at @end always completes a character.
 			 * Test that first so *@end is not read: when the
 			 * caller supplied @end, it may point just past
 			 * the buffer.  */
 			if (current == end || utf8_islead(*current))
 				++steps;
 		}
 		break;
 	case utf8_step_cells_fewer:
 	case utf8_step_cells_more:
 		/* Stop at @end; otherwise the decoder would report
 		 * UCS_NO_CHAR for the empty remainder and a step would
 		 * be counted without anything being consumed.  */
 		while (steps < max && current < end) {
 			unicode_val_T u;
 			unsigned char *prev = current;
 			int width;
 			u = utf_8_to_unicode(&current, end);
 			if (u == UCS_NO_CHAR) {
 				/* Assume the incomplete sequence
 				 * costs one cell.  */
 				current = end;
 				++steps;
 				break;
 			}
 			width = unicode_to_cell(u);
 			if (way == utf8_step_cells_fewer
 			    && steps + width > max) {
 				/* The character does not fit in the
 				 * remaining cells.  Back off.  */
 				current = prev;
 				break;
 			}
 			steps += width;
 		}
 		break;
 	default:
 		INTERNAL("impossible enum utf8_step");
 	}
 	if (count)
 		*count = steps;
 	return current;
 }
 /* Walk backward from @string, taking at most @max steps of the kind
 * selected by @way, and never moving before @start.  The address
 * reached is returned.  When @count is not NULL, the number of steps
 * actually taken is written to *@count.
 *
 * The text is expected to be valid UTF-8, with @string and @start
 * both on character boundaries.  Otherwise nothing crashes, but the
 * results may be inconsistent.
 *
 * This function overlaps in purpose with utf8_prevchar().  */
 unsigned char *
 utf8_step_backward(unsigned char *string, unsigned char *start,
 		   int max, enum utf8_step way, int *count)
 {
 	unsigned char *pos = string;
 	int taken = 0;
 	assert(string);
 	assert(start);
 	assert(max >= 0);
 	if_assert_failed return string;
 	switch (way) {
 	case utf8_step_characters:
 		/* Every lead byte passed over is one character.  */
 		for (; taken < max && pos > start; ) {
 			--pos;
 			if (utf8_islead(*pos))
 				taken++;
 		}
 		break;
 	case utf8_step_cells_fewer:
 	case utf8_step_cells_more:
 		while (taken < max && pos > start) {
 			unsigned char *after = pos;
 			unsigned char *decode_from;
 			unicode_val_T ch;
 			/* An undecodable sequence is assumed to
 			 * occupy a single cell.  */
 			int cells = 1;
 			/* Retreat to the lead byte of the preceding
 			 * character, or to @start if none is found.  */
 			for (--pos; pos > start && !utf8_islead(*pos); --pos)
 				;
 			decode_from = pos;
 			ch = utf_8_to_unicode(&decode_from, after);
 			if (ch != UCS_NO_CHAR)
 				cells = unicode_to_cell(ch);
 			if (way == utf8_step_cells_fewer
 			    && taken + cells > max) {
 				/* Too wide for what is left: undo
 				 * the retreat and stop.  */
 				pos = after;
 				break;
 			}
 			taken += cells;
 		}
 		break;
 	default:
 		INTERNAL("impossible enum utf8_step");
 	}
 	if (count != NULL)
 		*count = taken;
 	return pos;
 }
 /*
 * Find out number of standard terminal columns needed for displaying symbol
 * (glyph) which represents Unicode character c.
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@ -71,6 +71,26 @@ int utf8_char2cells(unsigned char *, unsigned char *);
 /* NOTE(review): only the prototypes are visible here.  Going by the
 * names, these two measure a UTF-8 range in terminal cells and in
 * characters respectively -- confirm against charsets.c.  */
 int utf8_ptr2cells(unsigned char *, unsigned char *);
 int utf8_ptr2chars(unsigned char *, unsigned char *);
 /* Arguments are (string, max_cells, end); the return value is a byte
 * count.  utf8_step_forward() can do some of the same jobs.  */
 int utf8_cells2bytes(unsigned char *, int, unsigned char *);
 /* How utf8_step_forward and utf8_step_backward count steps.  A value
 * of this type is passed to both functions to select the unit of one
 * step and the rounding applied to characters wider than one cell.  */
 enum utf8_step {
 	/* Each step is one character, even if it is a combining or
 	 * double-width character.  */
 	utf8_step_characters,
 	/* Each step is one cell.  If the specified number of steps
 	 * would end in the middle of a double-width character, do not
 	 * include the character; the number of steps taken then stays
 	 * below the requested maximum.  */
 	utf8_step_cells_fewer,
 	/* Each step is one cell.  If the specified number of steps
 	 * would end in the middle of a double-width character,
 	 * include the whole character; the number of steps taken can
 	 * then exceed the requested maximum.  */
 	utf8_step_cells_more
 };
 /* Step forward through UTF-8 text.  Arguments, in order: the starting
 * address; the end address not to go past (NULL means the terminating
 * NUL of the string); the maximum number of steps; how steps are
 * counted; and an optional place to store the number of steps taken
 * (may be NULL).  Returns the address reached.  */
 unsigned char *utf8_step_forward(unsigned char *, unsigned char *,
 				 int, enum utf8_step, int *);
 /* Step backward through UTF-8 text.  Arguments, in order: the starting
 * address; the start address not to go before (must not be NULL); the
 * maximum number of steps; how steps are counted; and an optional
 * place to store the number of steps taken (may be NULL).  Returns the
 * address reached.  */
 unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
 				  int, enum utf8_step, int *);
 /* Returns the width of a Unicode character in terminal cells;
 * utf8_step_forward() and utf8_step_backward() add this value to
 * their cell counts.  */
 inline int unicode_to_cell(unicode_val_T);
 /* NOTE(review): presumably folds the case of a character for label
 * comparison -- confirm against the definition.  */
 unicode_val_T unicode_fold_label_case(unicode_val_T);
 /* NOTE(review): presumably counts the characters of a UTF-8 string;
 * utf8_step_forward() is documented as covering some of the same
 * jobs -- confirm the meaning of the pointer-to-pointer argument.  */
 inline int strlen_utf8(unsigned char **);