combined: Added combining characters support.

Combining characters require a UTF-8 locale. Handling them slows down rendering. There is still an unresolved issue with combining characters at the end of a document. This patch wasn't heavily tested; in particular, "garbage" input may cause unpredictable results.
2024-12-04 14:46:47 -05:00 · 2008-01-03 13:03:08 +01:00 · 2008-01-03 13:03:08 +01:00 · 83a4d815ae
commit 83a4d815ae
parent 560818568a
3 changed files with 53 additions and 12 deletions
--- a/src/document/document.c
+++ b/src/document/document.c
@ -57,6 +57,10 @@ init_document(struct cache_entry *cached, struct document_options *options)
 	init_list(document->onload_snippets);
 #endif

+#ifdef CONFIG_UTF8
+	document->comb_x = -1;
+	document->comb_y = -1;
+#endif
 	object_nolock(document, "document");
 	object_lock(document);

--- a/src/document/document.h
+++ b/src/document/document.h
@ -201,6 +201,12 @@ struct document {
 #ifdef CONFIG_UTF8
 	unsigned char buf[7];
 	unsigned char buf_length;
+	/* base char + 5 combining chars = 6 */
+	unicode_val_T combi[UCS_MAX_LENGTH_COMBINED];
+	/* the number of combining characters. The base char is not counted. */
+	unsigned int combi_length;
+	/* Position (x, y) of the last base character. */
+	int comb_x, comb_y;
 #endif
 	unsigned int id; /**< Used to check cache entries. */

--- a/src/document/html/renderer.c
+++ b/src/document/html/renderer.c
@ -8,6 +8,11 @@
 #include <stdarg.h>
 #include <string.h>

+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
+#define __USE_XOPEN
+#include <wchar.h>
+#endif
+
 #include "elinks.h"

 #include "cache/cache.h"
@ -393,6 +398,7 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen,
 	 * has mapped those characters to NBSP_CHAR.  */

 	if (part->document) {
+		struct document *const document = part->document;
 		/* Reallocate LINE(y).chars[] to large enough.  The
 		 * last parameter of realloc_line is the index of the
 		 * last element to which we may want to write,
@ -402,10 +408,10 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen,
 		 * (All double-cell characters take up at least two
 		 * bytes in UTF-8, and there are no triple-cell or
 		 * wider characters.)  However, if there already is an
-		 * incomplete character in part->document->buf, then
+		 * incomplete character in document->buf, then
 		 * the first byte of input can result in a double-cell
 		 * character, so we must reserve one extra element.  */
-		orig_length = realloc_line(html_context, part->document,
+		orig_length = realloc_line(html_context, document,
 					   Y(y), X(x) + charslen);
 		if (orig_length < 0) /* error */
 			return 0;
@ -413,17 +419,17 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen,
 			unsigned char *const end = chars + charslen;
 			unicode_val_T data;

-			if (part->document->buf_length) {
+			if (document->buf_length) {
 				/* previous char was broken in the middle */
-				int length = utf8charlen(part->document->buf);
+				int length = utf8charlen(document->buf);
 				unsigned char i;
-				unsigned char *buf_ptr = part->document->buf;
+				unsigned char *buf_ptr = document->buf;

-				for (i = part->document->buf_length; i < length && chars < end;) {
-					part->document->buf[i++] = *chars++;
+				for (i = document->buf_length; i < length && chars < end;) {
+					document->buf[i++] = *chars++;
 				}
-				part->document->buf_length = i;
-				part->document->buf[i] = '\0';
+				document->buf_length = i;
+				document->buf[i] = '\0';
 				data = utf8_to_unicode(&buf_ptr, buf_ptr + i);
 				if (data != UCS_NO_CHAR) {
 					/* FIXME: If there was invalid
@ -436,7 +442,7 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen,
 					 * trivial to implement because
 					 * each byte may have arrived in
 					 * a separate call.  */
-					part->document->buf_length = 0;
+					document->buf_length = 0;
 					goto good_char;
 				} else {
 					/* Still not full char */
@ -465,9 +471,9 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen,
 						unsigned char i;

 						for (i = 0; chars < end;i++) {
-							part->document->buf[i] = *chars++;
+							document->buf[i] = *chars++;
 						}
-						part->document->buf_length = i;
+						document->buf_length = i;
 						break;
 					}
 					/* not reached */
@ -480,6 +486,27 @@ good_char:
 				if (data == UCS_NO_BREAK_SPACE
 				    && html_context->options->wrap_nbsp)
 					data = UCS_SPACE;
+#ifdef HAVE_WCWIDTH
+				if (wcwidth((wchar_t)data)) {
+					if (document->combi_length) {
+						if (document->comb_x != -1) {
+							unicode_val_T prev = get_combined(document->combi, document->combi_length + 1);
+
+							if (prev != UCS_NO_CHAR) {
+								schar->data = prev;
+								copy_screen_chars(&POS(document->comb_x, document->comb_y), schar, 1);
+							}
+						}
+						document->combi_length = 0;
+					}
+					document->combi[0] = data;
+				} else {
+					if (document->combi_length < (UCS_MAX_LENGTH_COMBINED - 1)) {
+						document->combi[++document->combi_length] = data;
+					}
+					continue;
+				}
+#endif
 				part->spaces[x] = (data == UCS_SPACE);

 				if (unicode_to_cell(data) == 2) {
@ -493,6 +520,10 @@ good_char:
 					part->char_width[x] = unicode_to_cell(data);
 					schar->data = (unicode_val_T)data;
 				}
+#ifdef HAVE_WCWIDTH
+				document->comb_x = x;
+				document->comb_y = y;
+#endif
 				copy_screen_chars(&POS(x++, y), schar, 1);
 			} /* while chars < end */
 		} else { /* not UTF-8 */