forked from aniani/vim
patch 8.2.1893: fuzzy matching does not support multiple words
Problem: Fuzzy matching does not support multiple words.
Solution: Add support for matching white space separated words. (Yegappan
Lakshmanan, closes #7163)
This commit is contained in:
171
src/search.c
171
src/search.c
@@ -4203,16 +4203,16 @@ the_end:
|
||||
* Ported from the lib_fts library authored by Forrest Smith.
|
||||
* https://github.com/forrestthewoods/lib_fts/tree/master/code
|
||||
*
|
||||
* Blog describing the algorithm:
|
||||
* The following blog describes the fuzzy matching algorithm:
|
||||
* https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
|
||||
*
|
||||
* Each matching string is assigned a score. The following factors are checked:
|
||||
* Matched letter
|
||||
* Unmatched letter
|
||||
* Consecutively matched letters
|
||||
* Proximity to start
|
||||
* Letter following a separator (space, underscore)
|
||||
* Uppercase letter following lowercase (aka CamelCase)
|
||||
* - Matched letter
|
||||
* - Unmatched letter
|
||||
* - Consecutively matched letters
|
||||
* - Proximity to start
|
||||
* - Letter following a separator (space, underscore)
|
||||
* - Uppercase letter following lowercase (aka CamelCase)
|
||||
*
|
||||
* Matched letters are good. Unmatched letters are bad. Matching near the start
|
||||
* is good. Matching the first letter in the middle of a phrase is good.
|
||||
@@ -4222,16 +4222,17 @@ the_end:
|
||||
* File paths are different from file names. File extensions may be ignorable.
|
||||
* Single words care about consecutive matches but not separators or camel
|
||||
* case.
|
||||
* Score starts at 0
|
||||
* Score starts at 100
|
||||
* Matched letter: +0 points
|
||||
* Unmatched letter: -1 point
|
||||
* Consecutive match bonus: +5 points
|
||||
* Separator bonus: +10 points
|
||||
* Camel case bonus: +10 points
|
||||
* Unmatched leading letter: -3 points (max: -9)
|
||||
* Consecutive match bonus: +15 points
|
||||
* First letter bonus: +15 points
|
||||
* Separator bonus: +30 points
|
||||
* Camel case bonus: +30 points
|
||||
* Unmatched leading letter: -5 points (max: -15)
|
||||
*
|
||||
* There is some nuance to this. Scores don’t have an intrinsic meaning. The
|
||||
* score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
|
||||
* score range isn’t 0 to 100. It’s roughly [50, 150]. Longer words have a
|
||||
* lower minimum score due to unmatched letter penalty. Longer search patterns
|
||||
* have a higher maximum score due to match bonuses.
|
||||
*
|
||||
@@ -4247,6 +4248,7 @@ the_end:
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int idx; // used for stable sort
|
||||
listitem_T *item;
|
||||
int score;
|
||||
list_T *lmatchpos;
|
||||
@@ -4267,6 +4269,8 @@ typedef struct
|
||||
#define MAX_LEADING_LETTER_PENALTY -15
|
||||
// penalty for every letter that doesn't match
|
||||
#define UNMATCHED_LETTER_PENALTY -1
|
||||
// penalty for gap in matching positions (-2 * k)
|
||||
#define GAP_PENALTY -2
|
||||
// Score for a string that doesn't fuzzy match the pattern
|
||||
#define SCORE_NONE -9999
|
||||
|
||||
@@ -4319,6 +4323,8 @@ fuzzy_match_compute_score(
|
||||
// Sequential
|
||||
if (currIdx == (prevIdx + 1))
|
||||
score += SEQUENTIAL_BONUS;
|
||||
else
|
||||
score += GAP_PENALTY * (currIdx - prevIdx);
|
||||
}
|
||||
|
||||
// Check for bonuses based on neighbor character value
|
||||
@@ -4334,7 +4340,7 @@ fuzzy_match_compute_score(
|
||||
while (sidx < currIdx)
|
||||
{
|
||||
neighbor = (*mb_ptr2char)(p);
|
||||
(void)mb_ptr2char_adv(&p);
|
||||
MB_PTR_ADV(p);
|
||||
sidx++;
|
||||
}
|
||||
curr = (*mb_ptr2char)(p);
|
||||
@@ -4362,6 +4368,10 @@ fuzzy_match_compute_score(
|
||||
return score;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform a recursive search for fuzzy matching 'fuzpat' in 'str'.
|
||||
* Return the number of matching characters.
|
||||
*/
|
||||
static int
|
||||
fuzzy_match_recursive(
|
||||
char_u *fuzpat,
|
||||
@@ -4386,11 +4396,11 @@ fuzzy_match_recursive(
|
||||
// Count recursions
|
||||
++*recursionCount;
|
||||
if (*recursionCount >= FUZZY_MATCH_RECURSION_LIMIT)
|
||||
return FALSE;
|
||||
return 0;
|
||||
|
||||
// Detect end of strings
|
||||
if (*fuzpat == '\0' || *str == '\0')
|
||||
return FALSE;
|
||||
return 0;
|
||||
|
||||
// Loop through fuzpat and str looking for a match
|
||||
first_match = TRUE;
|
||||
@@ -4411,7 +4421,7 @@ fuzzy_match_recursive(
|
||||
|
||||
// Supplied matches buffer was too short
|
||||
if (nextMatch >= maxMatches)
|
||||
return FALSE;
|
||||
return 0;
|
||||
|
||||
// "Copy-on-Write" srcMatches into matches
|
||||
if (first_match && srcMatches)
|
||||
@@ -4444,12 +4454,12 @@ fuzzy_match_recursive(
|
||||
// Advance
|
||||
matches[nextMatch++] = strIdx;
|
||||
if (has_mbyte)
|
||||
(void)mb_ptr2char_adv(&fuzpat);
|
||||
MB_PTR_ADV(fuzpat);
|
||||
else
|
||||
++fuzpat;
|
||||
}
|
||||
if (has_mbyte)
|
||||
(void)mb_ptr2char_adv(&str);
|
||||
MB_PTR_ADV(str);
|
||||
else
|
||||
++str;
|
||||
strIdx++;
|
||||
@@ -4469,12 +4479,12 @@ fuzzy_match_recursive(
|
||||
// Recursive score is better than "this"
|
||||
memcpy(matches, bestRecursiveMatches, maxMatches * sizeof(matches[0]));
|
||||
*outScore = bestRecursiveScore;
|
||||
return TRUE;
|
||||
return nextMatch;
|
||||
}
|
||||
else if (matched)
|
||||
return TRUE; // "this" score is better than recursive
|
||||
return nextMatch; // "this" score is better than recursive
|
||||
|
||||
return FALSE; // no match
|
||||
return 0; // no match
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4485,45 +4495,110 @@ fuzzy_match_recursive(
|
||||
* Scores values have no intrinsic meaning. Possible score range is not
|
||||
* normalized and varies with pattern.
|
||||
* Recursion is limited internally (default=10) to prevent degenerate cases
|
||||
* (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
|
||||
* (pat_arg="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
|
||||
* Uses char_u for match indices. Therefore patterns are limited to MAXMATCHES
|
||||
* characters.
|
||||
*
|
||||
* Returns TRUE if 'fuzpat' matches 'str'. Also returns the match score in
|
||||
* Returns TRUE if 'pat_arg' matches 'str'. Also returns the match score in
|
||||
* 'outScore' and the matching character positions in 'matches'.
|
||||
*/
|
||||
static int
|
||||
fuzzy_match(
|
||||
char_u *str,
|
||||
char_u *fuzpat,
|
||||
char_u *pat_arg,
|
||||
int matchseq,
|
||||
int *outScore,
|
||||
matchidx_T *matches,
|
||||
int maxMatches)
|
||||
{
|
||||
int recursionCount = 0;
|
||||
int len = MB_CHARLEN(str);
|
||||
char_u *save_pat;
|
||||
char_u *pat;
|
||||
char_u *p;
|
||||
int complete = FALSE;
|
||||
int score = 0;
|
||||
int numMatches = 0;
|
||||
int matchCount;
|
||||
|
||||
*outScore = 0;
|
||||
|
||||
return fuzzy_match_recursive(fuzpat, str, 0, outScore, str, len, NULL,
|
||||
matches, maxMatches, 0, &recursionCount);
|
||||
save_pat = vim_strsave(pat_arg);
|
||||
if (save_pat == NULL)
|
||||
return FALSE;
|
||||
pat = save_pat;
|
||||
p = pat;
|
||||
|
||||
// Try matching each word in 'pat_arg' in 'str'
|
||||
while (TRUE)
|
||||
{
|
||||
if (matchseq)
|
||||
complete = TRUE;
|
||||
else
|
||||
{
|
||||
// Extract one word from the pattern (separated by space)
|
||||
p = skipwhite(p);
|
||||
if (*p == NUL)
|
||||
break;
|
||||
pat = p;
|
||||
while (*p != NUL && !VIM_ISWHITE(PTR2CHAR(p)))
|
||||
{
|
||||
if (has_mbyte)
|
||||
MB_PTR_ADV(p);
|
||||
else
|
||||
++p;
|
||||
}
|
||||
if (*p == NUL) // processed all the words
|
||||
complete = TRUE;
|
||||
*p = NUL;
|
||||
}
|
||||
|
||||
score = 0;
|
||||
recursionCount = 0;
|
||||
matchCount = fuzzy_match_recursive(pat, str, 0, &score, str, len, NULL,
|
||||
matches + numMatches, maxMatches - numMatches,
|
||||
0, &recursionCount);
|
||||
if (matchCount == 0)
|
||||
{
|
||||
numMatches = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
// Accumulate the match score and the number of matches
|
||||
*outScore += score;
|
||||
numMatches += matchCount;
|
||||
|
||||
if (complete)
|
||||
break;
|
||||
|
||||
// try matching the next word
|
||||
++p;
|
||||
}
|
||||
|
||||
vim_free(save_pat);
|
||||
return numMatches != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the fuzzy matches in the descending order of the match score.
|
||||
* For items with same score, retain the order using the index (stable sort)
|
||||
*/
|
||||
static int
|
||||
fuzzy_item_compare(const void *s1, const void *s2)
|
||||
fuzzy_match_item_compare(const void *s1, const void *s2)
|
||||
{
|
||||
int v1 = ((fuzzyItem_T *)s1)->score;
|
||||
int v2 = ((fuzzyItem_T *)s2)->score;
|
||||
int idx1 = ((fuzzyItem_T *)s1)->idx;
|
||||
int idx2 = ((fuzzyItem_T *)s2)->idx;
|
||||
|
||||
return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
|
||||
return v1 == v2 ? (idx1 - idx2) : v1 > v2 ? -1 : 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fuzzy search the string 'str' in a list of 'items' and return the matching
|
||||
* strings in 'fmatchlist'.
|
||||
* If 'matchseq' is TRUE, then for multi-word search strings, match all the
|
||||
* words in sequence.
|
||||
* If 'items' is a list of strings, then search for 'str' in the list.
|
||||
* If 'items' is a list of dicts, then either use 'key' to lookup the string
|
||||
* for each item or use 'item_cb' Funcref function to get the string.
|
||||
@@ -4531,9 +4606,10 @@ fuzzy_item_compare(const void *s1, const void *s2)
|
||||
* matches for each item.
|
||||
*/
|
||||
static void
|
||||
match_fuzzy(
|
||||
fuzzy_match_in_list(
|
||||
list_T *items,
|
||||
char_u *str,
|
||||
int matchseq,
|
||||
char_u *key,
|
||||
callback_T *item_cb,
|
||||
int retmatchpos,
|
||||
@@ -4561,6 +4637,7 @@ match_fuzzy(
|
||||
char_u *itemstr;
|
||||
typval_T rettv;
|
||||
|
||||
ptrs[i].idx = i;
|
||||
ptrs[i].item = li;
|
||||
ptrs[i].score = SCORE_NONE;
|
||||
itemstr = NULL;
|
||||
@@ -4593,25 +4670,34 @@ match_fuzzy(
|
||||
}
|
||||
|
||||
if (itemstr != NULL
|
||||
&& fuzzy_match(itemstr, str, &score, matches,
|
||||
&& fuzzy_match(itemstr, str, matchseq, &score, matches,
|
||||
sizeof(matches) / sizeof(matches[0])))
|
||||
{
|
||||
// Copy the list of matching positions in itemstr to a list, if
|
||||
// 'retmatchpos' is set.
|
||||
if (retmatchpos)
|
||||
{
|
||||
int j;
|
||||
int strsz;
|
||||
int j = 0;
|
||||
char_u *p;
|
||||
|
||||
ptrs[i].lmatchpos = list_alloc();
|
||||
if (ptrs[i].lmatchpos == NULL)
|
||||
goto done;
|
||||
strsz = MB_CHARLEN(str);
|
||||
for (j = 0; j < strsz; j++)
|
||||
|
||||
p = str;
|
||||
while (*p != NUL)
|
||||
{
|
||||
if (list_append_number(ptrs[i].lmatchpos,
|
||||
matches[j]) == FAIL)
|
||||
goto done;
|
||||
if (!VIM_ISWHITE(PTR2CHAR(p)))
|
||||
{
|
||||
if (list_append_number(ptrs[i].lmatchpos,
|
||||
matches[j]) == FAIL)
|
||||
goto done;
|
||||
j++;
|
||||
}
|
||||
if (has_mbyte)
|
||||
MB_PTR_ADV(p);
|
||||
else
|
||||
++p;
|
||||
}
|
||||
}
|
||||
ptrs[i].score = score;
|
||||
@@ -4627,7 +4713,7 @@ match_fuzzy(
|
||||
|
||||
// Sort the list by the descending order of the match score
|
||||
qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
|
||||
fuzzy_item_compare);
|
||||
fuzzy_match_item_compare);
|
||||
|
||||
// For matchfuzzy(), return a list of matched strings.
|
||||
// ['str1', 'str2', 'str3']
|
||||
@@ -4687,6 +4773,7 @@ do_fuzzymatch(typval_T *argvars, typval_T *rettv, int retmatchpos)
|
||||
callback_T cb;
|
||||
char_u *key = NULL;
|
||||
int ret;
|
||||
int matchseq = FALSE;
|
||||
|
||||
CLEAR_POINTER(&cb);
|
||||
|
||||
@@ -4737,6 +4824,8 @@ do_fuzzymatch(typval_T *argvars, typval_T *rettv, int retmatchpos)
|
||||
return;
|
||||
}
|
||||
}
|
||||
if ((di = dict_find(d, (char_u *)"matchseq", -1)) != NULL)
|
||||
matchseq = TRUE;
|
||||
}
|
||||
|
||||
// get the fuzzy matches
|
||||
@@ -4762,8 +4851,8 @@ do_fuzzymatch(typval_T *argvars, typval_T *rettv, int retmatchpos)
|
||||
goto done;
|
||||
}
|
||||
|
||||
match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]), key,
|
||||
&cb, retmatchpos, rettv->vval.v_list);
|
||||
fuzzy_match_in_list(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
|
||||
matchseq, key, &cb, retmatchpos, rettv->vval.v_list);
|
||||
|
||||
done:
|
||||
free_callback(&cb);
|
||||
|
||||
Reference in New Issue
Block a user