From d6e39c0c81453f6c569b81ea62d0d9d279e8f406 Mon Sep 17 00:00:00 2001 From: wltsmrz Date: Mon, 16 Nov 2020 11:57:14 +0000 Subject: [PATCH] Skip stemming strings with non-ascii chars --- src/stemmer.carp | 6 +++--- src/string_add.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/stemmer.carp b/src/stemmer.carp index 499b8de..83cb537 100644 --- a/src/stemmer.carp +++ b/src/stemmer.carp @@ -5,7 +5,7 @@ (register to-lower (Fn [&String] String) "String_to_lower_") (defn trim-from [x j] - (String.slice x 0 (+ (utf8-length x) j))) + (String.slice x 0 (+ (String.length x) j))) (defn replace-from [x j r] (String.append &(trim-from x j) r)) @@ -219,7 +219,7 @@ x)) (defn stem [x] - (cond (< (String.length x) 3) + (cond (or (< (String.length x) 3) (not (= (String.length x) (utf8-length x)))) @x (=> x (to-lower) @@ -234,5 +234,5 @@ ))) (defn stem-cstr [x] - (cstr &(stem &(String.from-cstr x)))) + (String.cstr &(stem &(String.from-cstr x)))) ) diff --git a/src/string_add.h b/src/string_add.h index 69eeac6..3cf0caf 100644 --- a/src/string_add.h +++ b/src/string_add.h @@ -9,4 +9,3 @@ String String_to_lower_(const String* p) { while (i--) *(r + i) = tolower(*(r + i)); return r; } -