UTF-8: New functions for stepping forward and backward in a string.

2025-06-30 22:19:29 -04:00 · 2006-09-02 18:28:31 +03:00 · 2006-09-02 18:28:31 +03:00 · 216495188a
commit 216495188a
parent a8c573a174
2 changed files with 160 additions and 0 deletions
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -376,6 +376,146 @@ utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 	return bytes;
 }

+/* Take @max steps forward from @string in the specified @way, but
+ * not going past @end; if @end is NULL, stop at the terminating NUL.
+ * Return the resulting address.  Store the number of steps taken to
+ * *@count, unless @count is NULL.  In utf8_step_cells_more mode the
+ * stored number can be greater than @max.
+ *
+ * This assumes the text is valid UTF-8, and @string and @end point to
+ * character boundaries.  If not, it doesn't crash but the results may
+ * be inconsistent.  This function can do some of the same jobs as
+ * utf8charlen(), utf8_cells2bytes(), and strlen_utf8().  */
+unsigned char *
+utf8_step_forward(unsigned char *string, unsigned char *end,
+		  int max, enum utf8_step way, int *count)
+{
+	int steps = 0;
+	unsigned char *current = string;
+
+	assert(string);
+	assert(max >= 0);
+	if_assert_failed return string;
+	if (end == NULL)
+		end = (unsigned char *) strchr((char *) string, '\0');
+
+	switch (way) {
+	case utf8_step_characters:
+		while (steps < max && current < end) {
+			++current;
+			/* Never read the byte at @end itself.  */
+			if (current == end || utf8_islead(*current))
+				++steps;
+		}
+		break;
+
+	case utf8_step_cells_fewer:
+	case utf8_step_cells_more:
+		/* At @end there is no sequence left to charge a cell for.  */
+		while (steps < max && current < end) {
+			unsigned char *prev = current;
+			unicode_val_T u = utf_8_to_unicode(&current, end);
+			int width;
+
+			if (u == UCS_NO_CHAR) {
+				/* Assume an incomplete sequence: one cell.  */
+				current = end;
+				++steps;
+				break;
+			}
+
+			width = unicode_to_cell(u);
+			if (way == utf8_step_cells_fewer
+			    && steps + width > max) {
+				/* Back off.  */
+				current = prev;
+				break;
+			}
+			steps += width;
+		}
+		break;
+
+	default:
+		INTERNAL("impossible enum utf8_step");
+	}
+
+	if (count)
+		*count = steps;
+	return current;
+}
+
+/* Walk backward from @string, taking at most @max steps counted in
+ * the specified @way, and never moving below @start.  The address
+ * reached is returned.  If @count is not NULL, the number of steps
+ * actually taken is written to *@count.
+ *
+ * The text is expected to be valid UTF-8, with @string and @start
+ * both on character boundaries.  Otherwise it doesn't crash, but
+ * the results may be inconsistent.
+ *
+ * This function can do some of the same jobs as utf8_prevchar().  */
+unsigned char *
+utf8_step_backward(unsigned char *string, unsigned char *start,
+		   int max, enum utf8_step way, int *count)
+{
+	unsigned char *pos = string;
+	int taken = 0;
+
+	assert(string);
+	assert(start);
+	assert(max >= 0);
+	if_assert_failed return string;
+
+	if (way == utf8_step_characters) {
+		/* Each lead byte passed on the way back is one step.  */
+		while (taken < max && pos > start) {
+			--pos;
+			if (utf8_islead(*pos))
+				++taken;
+		}
+
+	} else if (way == utf8_step_cells_fewer
+		   || way == utf8_step_cells_more) {
+		/* Go back one whole character at a time and add up
+		 * the cells that each of them occupies.  */
+		while (taken < max && pos > start) {
+			unsigned char *after = pos;
+			unsigned char *scan;
+			unicode_val_T ch;
+			int cells;
+
+			/* Retreat to the nearest lead byte, or to @start.  */
+			do {
+				--pos;
+			} while (pos > start && !utf8_islead(*pos));
+
+			/* Decode the bytes between the new position and
+			 * the old one; assume that an incomplete sequence
+			 * costs one cell.  */
+			scan = pos;
+			ch = utf_8_to_unicode(&scan, after);
+			cells = (ch == UCS_NO_CHAR) ? 1 : unicode_to_cell(ch);
+
+			if (way == utf8_step_cells_fewer
+			    && taken + cells > max) {
+				/* Too wide to fit: leave it out.  */
+				pos = after;
+				break;
+			}
+			taken += cells;
+		}
+
+	} else {
+		/* @way is not a known enum utf8_step value.  */
+		INTERNAL("impossible enum utf8_step");
+	}
+
+	/* The step count is optional output.  */
+	if (count)
+		*count = taken;
+	return pos;
+}
+
 /*
 * Find out number of standard terminal collumns needed for displaying symbol
 * (glyph) which represents Unicode character c.
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -71,6 +71,26 @@ int utf8_char2cells(unsigned char *, unsigned char *);
 int utf8_ptr2cells(unsigned char *, unsigned char *);
 int utf8_ptr2chars(unsigned char *, unsigned char *);
 int utf8_cells2bytes(unsigned char *, int, unsigned char *);
+/* Unit in which utf8_step_forward() and utf8_step_backward() count steps.  */
+enum utf8_step {
+	/* One step per character; combining and double-width
+	 * characters each count as a single step.  */
+	utf8_step_characters,
+
+	/* One step per terminal cell.  A double-width character that
+	 * would exceed the requested number of cells is left out.  */
+	utf8_step_cells_fewer,
+
+	/* One step per terminal cell.  A double-width character that would
+	 * exceed the requested number of cells is included whole.  */
+	utf8_step_cells_more
+};
+/* Arguments for both: string, limit (end or start), maximum steps,
+ * way of counting, optional step count output.  */
+unsigned char *utf8_step_forward(unsigned char *, unsigned char *,
+				 int, enum utf8_step, int *);
+unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
+				  int, enum utf8_step, int *);
 inline int unicode_to_cell(unicode_val_T);
 unicode_val_T unicode_fold_label_case(unicode_val_T);
 inline int strlen_utf8(unsigned char **);