New function 'string-distance'

* src/fns.c (Fstring_distance): New primitive.
(syms_of_fns): Defsubr it.

* test/lisp/subr-tests.el (subr-tests--string-distance): New test.

* etc/NEWS: Mention 'string-distance'.
This commit is contained in:
Chen Bin 2018-04-20 00:38:29 +10:00 committed by Eli Zaretskii
parent 4bc74dac28
commit c6e6503900
3 changed files with 83 additions and 0 deletions

View file

@ -534,6 +534,9 @@ manual for more details.
+++
** New function assoc-delete-all.
** New function string-distance to calculate Levenshtein distance
between two strings.
** 'print-quoted' now defaults to t, so if you want to see
(quote x) instead of 'x you will have to bind it to nil where applicable.

View file

@ -153,6 +153,67 @@ If STRING is multibyte, this may be greater than the length of STRING. */)
return make_number (SBYTES (string));
}
DEFUN ("string-distance", Fstring_distance, Sstring_distance, 2, 3, 0,
doc: /* Return Levenshtein distance between STRING1 and STRING2.
If BYTECOMPARE is nil, compare character of strings.
If BYTECOMPARE is t, compare byte of strings.
Case is significant, but text properties are ignored. */)
(Lisp_Object string1, Lisp_Object string2, Lisp_Object bytecompare)
{
CHECK_STRING (string1);
CHECK_STRING (string2);
bool use_byte_compare = !NILP (bytecompare)
|| (!STRING_MULTIBYTE (string1) && !STRING_MULTIBYTE (string2));
ptrdiff_t len1 = use_byte_compare? SBYTES (string1) : SCHARS (string1);
ptrdiff_t len2 = use_byte_compare? SBYTES (string2) : SCHARS (string2);
ptrdiff_t x, y, lastdiag, olddiag;
USE_SAFE_ALLOCA;
ptrdiff_t *column = SAFE_ALLOCA ((len1 + 1) * sizeof (ptrdiff_t));
for (y = 1; y <= len1; y++)
column[y] = y;
if (use_byte_compare)
{
char *s1 = SSDATA (string1);
char *s2 = SSDATA (string2);
for (x = 1; x <= len2; x++)
{
column[0] = x;
for (y = 1, lastdiag = x - 1; y <= len1; y++)
{
olddiag = column[y];
column[y] = min (min (column[y] + 1, column[y-1] + 1), lastdiag + (s1[y-1] == s2[x-1]? 0 : 1));
lastdiag = olddiag;
}
}
}
else
{
int c1, c2;
ptrdiff_t i1, i1_byte, i2 = 0, i2_byte = 0;
for (x = 1; x <= len2; x++)
{
column[0] = x;
FETCH_STRING_CHAR_ADVANCE (c2, string2, i2, i2_byte);
i1 = i1_byte = 0;
for (y = 1, lastdiag = x - 1; y <= len1; y++)
{
olddiag = column[y];
FETCH_STRING_CHAR_ADVANCE (c1, string1, i1, i1_byte);
column[y] = min (min (column[y] + 1, column[y-1] + 1), lastdiag + (c1 == c2? 0 : 1));
lastdiag = olddiag;
}
}
}
SAFE_FREE ();
return make_number (column[len1]);
}
DEFUN ("string-equal", Fstring_equal, Sstring_equal, 2, 2, 0,
doc: /* Return t if two strings have identical contents.
Case is significant, but text properties are ignored.
@ -5226,6 +5287,7 @@ this variable. */);
defsubr (&Slength);
defsubr (&Ssafe_length);
defsubr (&Sstring_bytes);
defsubr (&Sstring_distance);
defsubr (&Sstring_equal);
defsubr (&Scompare_strings);
defsubr (&Sstring_lessp);

View file

@ -281,6 +281,24 @@ indirectly `mapbacktrace'."
(should (equal (string-match-p "\\`[[:blank:]]\\'" "\u3000") 0))
(should-not (string-match-p "\\`[[:blank:]]\\'" "\N{LINE SEPARATOR}")))
(ert-deftest subr-tests--string-distance ()
"Test `string-distance' behavior."
;; ASCII characters are always fine
(should (equal 1 (string-distance "heelo" "hello")))
(should (equal 2 (string-distance "aeelo" "hello")))
(should (equal 0 (string-distance "ab" "ab" t)))
(should (equal 1 (string-distance "ab" "abc" t)))
;; string containing hanzi character, compare by byte
(should (equal 6 (string-distance "ab" "ab我她" t)))
(should (equal 3 (string-distance "ab" "a我b" t)))
(should (equal 3 (string-distance "" "" t)))
;; string containing hanzi character, compare by character
(should (equal 2 (string-distance "ab" "ab我她")))
(should (equal 1 (string-distance "ab" "a我b")))
(should (equal 1 (string-distance "" ""))))
(ert-deftest subr-tests--dolist--wrong-number-of-args ()
"Test that `dolist' doesn't accept wrong types or length of SPEC,
cf. Bug#25477."