patch 9.1.1476: no easy way to deduplicate text

Problem: no easy way to deduplicate text Solution: add the :uniq ex command (Hirohito Higashi) closes: #17538 Signed-off-by: Hirohito Higashi <h.east.727@gmail.com> Signed-off-by: Christian Brabandt <cb@256bit.org>
2025-10-23 08:44:20 -04:00 · 2025-06-23 21:42:36 +02:00
parent 159d392427
commit 74f0a77bb9
12 changed files with 929 additions and 14 deletions
--- a/src/ex_cmds.c
+++ b/src/ex_cmds.c
@@ -643,6 +643,237 @@ sortend:
 	emsg(_(e_interrupted));
 }

+/*
+ * ":uniq": delete adjacent duplicate lines in the range "eap->line1" to
+ * "eap->line2".  Each line is compared with the text kept in "sortbuf1",
+ * which holds the compared part of the most recent line that did not match.
+ *
+ * Modes:
+ *   default	a line equal to the remembered text is deleted.
+ *   "!"	"eap->forceit" sets "keep_only_not_unique"; judging by the
+ *		name, only lines that are repeated are meant to remain.
+ *   "u"	sets "keep_only_unique" (ignored when "!" is given); judging
+ *		by the name, only lines that are not repeated are meant to
+ *		remain.
+ *
+ * Other arguments in "eap->arg":
+ *   "i", "l"	set "sort_ic" and "sort_lc"; presumably these change how
+ *		string_compare() compares (not visible here).
+ *   "r"	sets "sort_rx": compare only the text matched by the pattern.
+ *   /pat/	a non-alphabetic character that is not handled otherwise
+ *		starts a pattern; skip_regexp_err() is given that character
+ *		as the delimiter.  Without "r" the text after the match is
+ *		compared.  An empty pattern uses the last search pattern.
+ *		A line the pattern does not match is compared as a whole.
+ *   '"'	starts a comment, parsing of the argument stops there.
+ *
+ * NOTE(review): when interrupted in the deletion loop the code jumps to
+ * "uniqend", skipping mark_adjust() and changed_lines() even when lines
+ * were already deleted - confirm this is acceptable.
+ * NOTE(review): with "u", "is_match" is always FALSE when "i" is zero, so a
+ * run of three equal lines at the start of the range appears to leave one
+ * copy behind - verify with a test.
+ */
+    void
+ex_uniq(exarg_T *eap)
+{
+    regmatch_T	regmatch;		// "regprog" stays NULL without pattern
+    int		len;
+    linenr_T	lnum;
+    long	maxlen = 0;		// longest line length in the range
+    linenr_T	count = eap->line2 - eap->line1 + 1; // lines left in range
+    char_u	*p;
+    char_u	*s;
+    char_u	save_c;			// character replaced by a temporary NUL
+    int		keep_only_unique = FALSE;   // set by the "u" flag
+    int		keep_only_not_unique = eap->forceit ? TRUE : FALSE; // "!"
+    long	deleted = 0;		// number of lines deleted
+    colnr_T	start_col;		// start of the compared text
+    colnr_T	end_col;		// end of the compared text, 0: no cut
+    int		change_occurred = FALSE; // Buffer contents changed.
+
+    // A range of one line (or less) has nothing to compare with.
+    if (count <= 1)
+	return;
+
+    if (u_save((linenr_T)(eap->line1 - 1), (linenr_T)(eap->line2 + 1)) == FAIL)
+	return;
+    sortbuf1 = NULL;
+    regmatch.regprog = NULL;
+
+    sort_abort = sort_ic = sort_lc = sort_rx = sort_nr = 0;
+    sort_flt = 0;
+
+    for (p = eap->arg; *p != NUL; ++p)
+    {
+	if (VIM_ISWHITE(*p))
+	    ;
+	else if (*p == 'i')
+	    sort_ic = TRUE;
+	else if (*p == 'l')
+	    sort_lc = TRUE;
+	else if (*p == 'r')
+	    sort_rx = TRUE;
+	else if (*p == 'u')
+	{
+	    // 'u' is only valid when '!' is not given.
+	    if (!keep_only_not_unique)
+		keep_only_unique = TRUE;
+	}
+	else if (*p == '"')	// comment start
+	    break;
+	else if (eap->nextcmd == NULL && check_nextcmd(p) != NULL)
+	{
+	    eap->nextcmd = check_nextcmd(p);
+	    break;
+	}
+	else if (!ASCII_ISALPHA(*p) && regmatch.regprog == NULL)
+	{
+	    s = skip_regexp_err(p + 1, *p, TRUE);
+	    if (s == NULL)
+		goto uniqend;
+	    *s = NUL;		// terminate the pattern text in place
+	    // Use last search pattern if uniq pattern is empty.
+	    if (s == p + 1)
+	    {
+		if (last_search_pat() == NULL)
+		{
+		    emsg(_(e_no_previous_regular_expression));
+		    goto uniqend;
+		}
+		regmatch.regprog = vim_regcomp(last_search_pat(), RE_MAGIC);
+	    }
+	    else
+		regmatch.regprog = vim_regcomp(p + 1, RE_MAGIC);
+	    if (regmatch.regprog == NULL)
+		goto uniqend;
+	    p = s;		// continue after the regexp
+	    regmatch.rm_ic = p_ic;
+	}
+	else
+	{
+	    semsg(_(e_invalid_argument_str), p);
+	    goto uniqend;
+	}
+    }
+
+    // Scan the range once to find the length of the longest line, which is
+    // used below to size "sortbuf1".  No per-line data is stored here and
+    // the pointer assigned to "s" is not used inside this loop.
+    // NOTE(review): "got_int" is tested here without a breakcheck call in
+    // this loop, unlike the deletion loop below - confirm that an interrupt
+    // can actually be noticed at this point.
+    for (lnum = eap->line1; lnum <= eap->line2; ++lnum)
+    {
+	s = ml_get(lnum);
+	len = ml_get_len(lnum);
+	if (maxlen < len)
+	    maxlen = len;
+
+	if (got_int)
+	    goto uniqend;
+    }
+
+    // Allocate a buffer that can hold the longest line.
+    sortbuf1 = alloc(maxlen + 1);
+    if (sortbuf1 == NULL)
+	goto uniqend;
+
+    // Walk the range and delete lines according to the selected mode.
+    int match_continue = FALSE;		// previous line was a match
+    int next_is_unmatch = FALSE;	// force "is_match" FALSE next time
+    int is_match;			// compared text equals "sortbuf1"
+    linenr_T done_lnum = eap->line1 - 1; // only used in the "!" mode
+    linenr_T delete_lnum = 0;		// line to delete now, 0 for none
+    for (linenr_T i = 0; i < count; ++i) // "i" and "count" change on delete
+    {
+	linenr_T get_lnum = eap->line1 + i;
+
+	s = ml_get(get_lnum);
+	len = ml_get_len(get_lnum);
+
+	start_col = 0;			// default: compare the whole line
+	end_col = len;
+	if (regmatch.regprog != NULL && vim_regexec(&regmatch, s, 0))
+	{
+	    if (sort_rx)		// "r": compare the matched text only
+	    {
+		start_col = (colnr_T)(regmatch.startp[0] - s);
+		end_col = (colnr_T)(regmatch.endp[0] - s);
+	    }
+	    else			// compare what follows the match
+		start_col = (colnr_T)(regmatch.endp[0] - s);
+	}
+	else
+	    if (regmatch.regprog != NULL)
+		end_col = 0;		// no match: the line is not cut
+	if (end_col > 0)		// temporarily end the text at end_col
+	{
+	    save_c = s[end_col];
+	    s[end_col] = NUL;
+	}
+
+	is_match = i > 0 ? !string_compare(&s[start_col], sortbuf1) : FALSE;
+	delete_lnum = 0;
+	if (next_is_unmatch)		// requested by the previous iteration
+	{
+	    is_match = FALSE;
+	    next_is_unmatch = FALSE;
+	}
+
+	if (!keep_only_unique && !keep_only_not_unique) // default mode
+	{
+	    if (is_match)
+		delete_lnum = get_lnum;
+	    else
+		STRCPY(sortbuf1, &s[start_col]); // remember for next compare
+	}
+	else if (keep_only_not_unique)	// ":uniq!"
+	{
+	    if (is_match)
+	    {
+		done_lnum = get_lnum - 1;
+		delete_lnum = get_lnum;
+		match_continue = TRUE;
+	    }
+	    else
+	    {
+		if (i > 0 && !match_continue && get_lnum - 1 > done_lnum)
+		{
+		    delete_lnum = get_lnum - 1;	// previous line had no match
+		    next_is_unmatch = TRUE;
+		}
+		else if (i >= count - 1)	// last line of the range
+		    delete_lnum = get_lnum;
+		match_continue = FALSE;
+		STRCPY(sortbuf1, &s[start_col]);
+	    }
+	}
+	else // keep_only_unique: the "u" flag
+	{
+	    if (is_match)
+	    {
+		if (!match_continue)
+		    delete_lnum = get_lnum - 1;	// first match: drop previous
+		else
+		    delete_lnum = get_lnum;
+		match_continue = TRUE;
+	    }
+	    else
+	    {
+		if (i == 0 && match_continue)
+		    delete_lnum = get_lnum;
+		match_continue = FALSE;
+		STRCPY(sortbuf1, &s[start_col]);
+	    }
+	}
+
+	if (end_col > 0)		// put back the replaced character
+	    s[end_col] = save_c;
+
+	if (delete_lnum > 0)
+	{
+	    ml_delete(delete_lnum);
+	    i -= get_lnum - delete_lnum + 1; // with ++i: resume at deleted slot
+	    count--;
+	    deleted++;
+	    change_occurred = TRUE;
+	}
+
+	fast_breakcheck();
+	if (got_int)
+	    goto uniqend;
+    }
+
+    // Adjust marks for deleted lines and prepare for displaying.
+    mark_adjust(eap->line2 - deleted, eap->line2, (long)MAXLNUM, -deleted);
+    msgmore(-deleted);
+
+    if (change_occurred)
+	changed_lines(eap->line1, 0, eap->line2 + 1, -deleted);
+
+    curwin->w_cursor.lnum = eap->line1;	// cursor to first line of the range
+    beginline(BL_WHITE | BL_FIX);
+
+uniqend:
+    vim_free(sortbuf1);
+    vim_regfree(regmatch.regprog);
+    if (got_int)
+	emsg(_(e_interrupted));
+}
+
 /*
 * :move command - move lines line1-line2 to line dest
 *