1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-11-04 08:17:17 -05:00

UTF-8: New functions for stepping forward and backward in a string.

This commit is contained in:
Kalle Olavi Niemitalo 2006-09-02 18:28:31 +03:00 committed by Kalle Olavi Niemitalo
parent a8c573a174
commit 216495188a
2 changed files with 160 additions and 0 deletions

View File

@ -376,6 +376,146 @@ utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
return bytes;
}
/* Take @max steps forward from @string in the specified @way, but
 * not going past @end.  Return the resulting address.  Store the
 * number of steps taken to *@count, unless @count is NULL.
 *
 * If @end is NULL, the terminating NUL of @string is used as the end.
 * This function does not itself read the byte stored at @end, so @end
 * may be a plain buffer limit rather than a NUL terminator.  Decoding
 * of characters is delegated to utf_8_to_unicode(), which is given
 * @end as its limit.
 *
 * Merely reaching @end is not a step: if the text runs out before
 * @max steps have been taken, *@count is the number of characters or
 * cells actually passed.  An incomplete sequence found before @end is
 * still charged as one cell.
 *
 * This assumes the text is valid UTF-8, and @string and @end point to
 * character boundaries.  If not, it doesn't crash but the results may
 * be inconsistent.
 *
 * This function can do some of the same jobs as utf8charlen(),
 * utf8_cells2bytes(), and strlen_utf8().  */
unsigned char *
utf8_step_forward(unsigned char *string, unsigned char *end,
		  int max, enum utf8_step way, int *count)
{
	int steps = 0;
	unsigned char *current = string;

	assert(string);
	assert(max >= 0);
	if_assert_failed return string;

	if (end == NULL)
		end = (unsigned char *) strchr((const char *) string, '\0');

	switch (way) {
	case utf8_step_characters:
		while (steps < max && current < end) {
			++current;
			/* A character is complete once the next lead
			 * byte is reached.  Arriving at @end completes
			 * it too, and must be checked first so that
			 * the byte at @end is never dereferenced.  */
			if (current == end || utf8_islead(*current))
				++steps;
		}
		break;

	case utf8_step_cells_fewer:
	case utf8_step_cells_more:
		/* Stop at @end instead of handing an exhausted string
		 * to the decoder, so that running out of text cannot
		 * be mistaken for an incomplete sequence and charged
		 * as an extra cell.  */
		while (steps < max && current < end) {
			unicode_val_T u;
			unsigned char *prev = current;
			int width;

			u = utf_8_to_unicode(&current, end);
			if (u == UCS_NO_CHAR) {
				/* Assume the incomplete sequence
				 * costs one cell.  */
				current = end;
				++steps;
				break;
			}

			width = unicode_to_cell(u);
			if (way == utf8_step_cells_fewer
			    && steps + width > max) {
				/* The character would overshoot @max;
				 * back off to where it begins.  */
				current = prev;
				break;
			}

			steps += width;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

	if (count)
		*count = steps;
	return current;
}
/* Walk backward from @string by at most @max steps, counted according
 * to @way, and never move before @start.  The return value is the
 * address reached.  When @count is not NULL, the number of steps that
 * were actually taken is written to *@count.
 *
 * The text is expected to be valid UTF-8, with @string and @start both
 * on character boundaries.  Otherwise the function still doesn't crash
 * but the results may be inconsistent.
 *
 * This function can do some of the same jobs as utf8_prevchar().  */
unsigned char *
utf8_step_backward(unsigned char *string, unsigned char *start,
		   int max, enum utf8_step way, int *count)
{
	unsigned char *pos = string;
	int taken = 0;

	assert(string);
	assert(start);
	assert(max >= 0);
	if_assert_failed return string;

	switch (way) {
	case utf8_step_characters:
		/* Every lead byte passed on the way back is one step.  */
		while (taken < max && pos > start) {
			--pos;
			if (utf8_islead(*pos))
				++taken;
		}
		break;

	case utf8_step_cells_fewer:
	case utf8_step_cells_more:
		while (taken < max && pos > start) {
			unsigned char *char_end = pos;
			unsigned char *lead = pos;
			unsigned char *scan;
			unicode_val_T ucs;
			int cells;

			/* Locate the beginning of the preceding
			 * character: the nearest lead byte, or @start
			 * if none is found first.  */
			do {
				--lead;
			} while (lead > start && !utf8_islead(*lead));

			scan = lead;
			ucs = utf_8_to_unicode(&scan, char_end);
			if (ucs != UCS_NO_CHAR)
				cells = unicode_to_cell(ucs);
			else
				cells = 1; /* An incomplete sequence is
					    * assumed to cost one cell.  */

			/* In "fewer" mode a character that would
			 * overshoot @max is left out entirely, so the
			 * position stays after it.  */
			if (way == utf8_step_cells_fewer
			    && taken + cells > max)
				break;

			pos = lead;
			taken += cells;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

	if (count)
		*count = taken;
	return pos;
}
/*
* Find out the number of standard terminal columns needed for displaying the symbol
* (glyph) which represents Unicode character c.

View File

@ -71,6 +71,26 @@ int utf8_char2cells(unsigned char *, unsigned char *);
int utf8_ptr2cells(unsigned char *, unsigned char *);
int utf8_ptr2chars(unsigned char *, unsigned char *);
int utf8_cells2bytes(unsigned char *, int, unsigned char *);
/* How utf8_step_forward and utf8_step_backward count steps.  The
 * chosen mode decides what one unit of their @max argument and of the
 * reported step count means. */
enum utf8_step {
/* Each step is one character, even if it is a combining or
 * double-width character.  Display width is not consulted in this
 * mode. */
utf8_step_characters,
/* Each step is one cell.  If the specified number of steps
 * would end in the middle of a double-width character, do not
 * include the character; the reported step count then stays at or
 * below the requested number. */
utf8_step_cells_fewer,
/* Each step is one cell.  If the specified number of steps
 * would end in the middle of a double-width character,
 * include the whole character; the reported step count may then
 * exceed the requested number. */
utf8_step_cells_more
};
/* Step through UTF-8 text from @string by at most @max steps, counted
 * according to @way, without passing @end (forward) or @start
 * (backward).  Both return the address reached and store the number
 * of steps taken to *@count unless @count is NULL. */
unsigned char *utf8_step_forward(unsigned char *string, unsigned char *end,
				 int max, enum utf8_step way, int *count);
unsigned char *utf8_step_backward(unsigned char *string, unsigned char *start,
				  int max, enum utf8_step way, int *count);
inline int unicode_to_cell(unicode_val_T);
unicode_val_T unicode_fold_label_case(unicode_val_T);
inline int strlen_utf8(unsigned char **);