Skip to content

Add grapheme_levenshtein function. #18087

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ PHP NEWS
. Added Locale::isRightToLeft to check if a locale is written right to left.
(David Carlier)
. Added null bytes presence in locale inputs for Locale class. (David Carlier)
. Added grapheme_levenshtein() function. (Yuya Hamada)

- MySQLi:
. Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice).
Expand Down
2 changes: 2 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,8 @@ PHP 8.5 UPGRADE NOTES
- Intl:
. Added locale_is_right_to_left/Locale::isRightToLeft, returns true if
the locale is written right to left (after its enrichment with likely subtags).
. Added grapheme_levenshtein() function.
RFC: https://wiki.php.net/rfc/grapheme_levenshtein

- Pdo\Sqlite:
. Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of
Expand Down
215 changes: 215 additions & 0 deletions ext/intl/grapheme/grapheme_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
ubrk_close(bi);
}

PHP_FUNCTION(grapheme_levenshtein)
{
zend_string *string1, *string2;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;

ZEND_PARSE_PARAMETERS_START(2, 5)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
ZEND_PARSE_PARAMETERS_END();

if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

zend_long c0, c1, c2;
zend_long retval;
size_t i2;
char *pstr1, *pstr2;

UChar *ustring1 = NULL;
UChar *ustring2 = NULL;

int32_t ustring1_len = 0;
int32_t ustring2_len = 0;

UErrorCode ustatus = U_ZERO_ERROR;

/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}

pstr1 = ZSTR_VAL(string1);
pstr2 = ZSTR_VAL(string2);

intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring1);
RETURN_FALSE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason that false is being returned on failure, rather than throwing an exception?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, Userland can uses intl_get_error_message function after this function if this error. Therefore, this block is returns false.

}

intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring2);
efree(ustring1);
RETURN_FALSE;
}

UBreakIterator *bi1, *bi2;

int32_t strlen_1, strlen_2;
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);

if (strlen_1 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_2 * cost_ins);
}
if (strlen_2 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_1 * cost_del);
}

unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
bi1 = grapheme_get_break_iterator(u_break_iterator_buffer1, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi1);
RETURN_FALSE;
}

bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}

ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
UCollator *collator = ucol_open("", &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
ucol_close(collator);
RETURN_FALSE;
}

zend_long *p1, *p2, *tmp;
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);

for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}

int32_t current1 = 0;
int32_t current2 = 0;
int32_t pos1 = 0;
int32_t pos2 = 0;

while (true) {
current1 = ubrk_current(bi1);
pos1 = ubrk_next(bi1);
if (pos1 == UBRK_DONE) {
break;
}
p2[0] = p1[0] + cost_del;
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
current2 = ubrk_current(bi2);
pos2 = ubrk_next(bi2);
if (pos2 == UBRK_DONE) {
break;
}
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
c0 = p1[i2];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated line

} else {
c0 = p1[i2] + cost_rep;
}
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + 1] = c0;
}
ubrk_first(bi2);
tmp = p1;
p1 = p2;
p2 = tmp;
}

ucol_close(collator);

ubrk_close(bi1);
ubrk_close(bi2);

efree(ustring1);
efree(ustring2);

retval = p1[strlen_2];

efree(p1);
efree(p2);
RETURN_LONG(retval);
}

/* }}} */
2 changes: 2 additions & 0 deletions ext/intl/php_intl.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =

function grapheme_str_split(string $string, int $length = 1): array|false {}

/**
 * Computes the Levenshtein distance between two strings, compared grapheme
 * cluster by grapheme cluster. Each cost must be greater than 0; returns
 * false when an input string cannot be converted to UTF-16.
 */
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}

/** @param int $next */
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

Expand Down
12 changes: 11 additions & 1 deletion ext/intl/php_intl_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading