patch 8.2.0901: formatting CJK text isn't optimal

Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes #3875)
2025-09-29 04:34:16 -04:00 · 2020-06-04 18:22:13 +02:00
parent 9155825b24
commit e52702f003
9 changed files with 329 additions and 9 deletions
--- a/src/textformat.c
+++ b/src/textformat.c
@@ -45,10 +45,12 @@ internal_format(
    int		c) // character to be inserted (can be NUL)
 {
    int		cc;
+    int		skip_pos;
    int		save_char = NUL;
    int		haveto_redraw = FALSE;
    int		fo_ins_blank = has_format_option(FO_INS_BLANK);
    int		fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+    int		fo_rigor_tw  = has_format_option(FO_RIGOROUS_TW);
    int		fo_white_par = has_format_option(FO_WHITE_PAR);
    int		first_line = TRUE;
    colnr_T	leader_len;
@@ -125,6 +127,7 @@ internal_format(

 	curwin->w_cursor.col = startcol;
 	foundcol = 0;
+	skip_pos = 0;

 	// Find position to break at.
 	// Stop at first entered white when 'formatoptions' has 'v'
@@ -189,8 +192,11 @@ internal_format(
 		if (curwin->w_cursor.col <= (colnr_T)wantcol)
 		    break;
 	    }
-	    else if (cc >= 0x100 && fo_multibyte)
+	    else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
 	    {
+		int ncc;
+		int allow_break;
+
 		// Break after or before a multi-byte character.
 		if (curwin->w_cursor.col != startcol)
 		{
@@ -199,8 +205,14 @@ internal_format(
 			break;
 		    col = curwin->w_cursor.col;
 		    inc_cursor();
-		    // Don't change end_foundcol if already set.
-		    if (foundcol != curwin->w_cursor.col)
+		    ncc = gchar_cursor();
+
+		    allow_break =
+			(enc_utf8 && utf_allow_break(cc, ncc))
+			|| enc_dbcs;
+
+		    // If we have already checked this position, skip!
+		    if (curwin->w_cursor.col != skip_pos && allow_break)
 		    {
 			foundcol = curwin->w_cursor.col;
 			end_foundcol = foundcol;
@@ -213,6 +225,7 @@ internal_format(
 		if (curwin->w_cursor.col == 0)
 		    break;

+		ncc = cc;
 		col = curwin->w_cursor.col;

 		dec_cursor();
@@ -220,16 +233,65 @@ internal_format(

 		if (WHITECHAR(cc))
 		    continue;		// break with space
-		// Don't break until after the comment leader
+		// Don't break until after the comment leader.
 		if (curwin->w_cursor.col < leader_len)
 		    break;

 		curwin->w_cursor.col = col;
+		skip_pos = curwin->w_cursor.col;

-		foundcol = curwin->w_cursor.col;
-		end_foundcol = foundcol;
+		allow_break =
+		    (enc_utf8 && utf_allow_break(cc, ncc))
+		    || enc_dbcs;
+
+		// Must handle this to respect line break prohibition.
+		if (allow_break)
+		{
+		    foundcol = curwin->w_cursor.col;
+		    end_foundcol = foundcol;
+		}
 		if (curwin->w_cursor.col <= (colnr_T)wantcol)
-		    break;
+		{
+		    int ncc_allow_break =
+			 (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
+
+		    if (allow_break)
+			break;
+		    if (!ncc_allow_break && !fo_rigor_tw)
+		    {
+			// Enable at most 1 punct hang outside of textwidth.
+			if (curwin->w_cursor.col == startcol)
+			{
+			    // We are inserting a non-breakable char, postpone
+			    // line break check to next insert.
+			    end_foundcol = foundcol = 0;
+			    break;
+			}
+
+			// Neither cc nor ncc is NUL if we are here, so
+			// it's safe to inc_cursor.
+			col = curwin->w_cursor.col;
+
+			inc_cursor();
+			cc  = ncc;
+			ncc = gchar_cursor();
+			// handle insert
+			ncc = (ncc != NUL) ? ncc : c;
+
+			allow_break =
+				(enc_utf8 && utf_allow_break(cc, ncc))
+				|| enc_dbcs;
+
+			if (allow_break)
+			{
+			    // Break only when we are not at end of line.
+			    end_foundcol = foundcol =
+				      ncc == NUL? 0 : curwin->w_cursor.col;
+			    break;
+			}
+			curwin->w_cursor.col = col;
+		    }
+		}
 	    }
 	    if (curwin->w_cursor.col == 0)
 		break;