Skip to content

Commit

Permalink
Organize
Browse files Browse the repository at this point in the history
  • Loading branch information
wltsmrz committed Nov 16, 2020
1 parent d6e39c0 commit 9baccba
Show file tree
Hide file tree
Showing 7 changed files with 245 additions and 246 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# carp_stemmer

Fully compliant [Porter stemmer](https://tartarus.org/martin/PorterStemmer/) in carp
[Porter stemmer](https://tartarus.org/martin/PorterStemmer/) in Carp. Tested
against 23531-word list from official site.

```clojure
(load "https://github.com/wltsmrz/carp_stemmer@v0.2.0")
(load "https://github.com/wltsmrz/carp_stemmer@v0.3.0")

(defn main []
(IO.println &(PorterStemmer.stem "greetings")))
(IO.println &(Stemmer.stem "greetings")))
```
239 changes: 238 additions & 1 deletion main.carp
Original file line number Diff line number Diff line change
@@ -1 +1,238 @@
(load "src/stemmer.carp")

(defmodule Stemmer
(relative-include "src/string_extras.h")
(register utf8-length (Fn [&String] Int) "String_utf8len_")
(register to-lower (Fn [&String] String) "String_to_lower_")

(defn trim-from [x j]
(String.slice x 0 (+ (String.length x) j)))

(defn replace-from [x j r]
(String.append &(trim-from x j) r))

(defn consonant? [x i]
(let [char (String.char-at x i)]
(case char
\a false
\e false
\i false
\o false
\u false
\y (or (= i 0) (not (consonant? x (Int.dec i))))
true)))

(defn cv-form [x j]
(let-do [form [] prev \X]
(for [i 0 (+ (String.length x) j)]
(do (cond (consonant? x i)
(when (or (= prev \X) (= prev \V))
(Array.push-back! &form \C))
(when (or (= prev \X) (= prev \C))
(Array.push-back! &form \V)))
(set! prev (Array.unsafe-last &form))))
form))

(defn m [x j]
(let-do [n 0 form (cv-form x j)]
(for [i 0 (Int.dec (Array.length &form))]
(when (and
(= \V @(Array.unsafe-nth &form i))
(= \C @(Array.unsafe-nth &form (Int.inc i))))
(set! n (Int.inc n))))
n))

(defn m0 [x n] (> (m x n) 0))

(defn m1 [x n] (> (m x n) 1))

(defn contains-vowel-from? [x j]
(let-do [res false]
(for [i 0 (+ (String.length x) j)]
(when (not (consonant? x i))
(do (set! res true) (break))))
res))

(defn ends-with-cc? [x]
(let [j (Int.dec (String.length x))]
(and*
(>= j 1)
(= (String.char-at x j) (String.char-at x (Int.dec j)))
(consonant? x j))))

(defn cvc? [x]
(let [j (Int.dec (String.length x))]
(and*
(>= j 2)
(consonant? x j)
(not (consonant? x (- j 1)))
(and (consonant? x (- j 2))
(and*
(not (= \w (String.char-at x j)))
(not (= \x (String.char-at x j)))
(not (= \y (String.char-at x j))))))))

(defn step-1a [x]
(cond
(not (String.ends-with? &x "s")) x
(String.ends-with? &x "sses") (trim-from &x -2)
(String.ends-with? &x "ies") (trim-from &x -2)
(not (String.ends-with? &x "ss")) (trim-from &x -1)
x))

(defn step-1bi [x]
(cond
(or*
(String.ends-with? &x "at")
(String.ends-with? &x "bl")
(String.ends-with? &x "iz"))
(append &x "e")
(and*
(ends-with-cc? &x)
(not (String.ends-with? &x "l"))
(not (String.ends-with? &x "s"))
(not (String.ends-with? &x "z")))
(trim-from &x -1)
(and (= (m &x 0) 1) (cvc? &x))
(append &x "e")
x))

(defn step-1b [x]
(cond
(String.ends-with? &x "eed")
(if (m0 &x -3) (trim-from &x -1) x)
(and (String.ends-with? &x "ed") (contains-vowel-from? &x -2))
(step-1bi (trim-from &x -2))
(and (String.ends-with? &x "ing") (contains-vowel-from? &x -3))
(step-1bi (trim-from &x -3))
x))

(defn step-1c [x]
(cond (and (String.ends-with? &x "y") (contains-vowel-from? &x -1))
(replace-from &x -1 "i")
x))

(defn step-2 [x]
(case (String.char-at &x (- (String.length &x) 2))
\a (cond
(String.ends-with? &x "ational") (if (m0 &x -7) (replace-from &x -7 "ate") x)
(String.ends-with? &x "tional") (if (m0 &x -6) (replace-from &x -6 "tion") x) x)
\c (cond
(String.ends-with? &x "enci") (if (m0 &x -4) (replace-from &x -4 "ence") x)
(String.ends-with? &x "anci") (if (m0 &x -4) (replace-from &x -4 "ance") x) x)
\e (cond
(String.ends-with? &x "izer") (if (m0 &x -4) (replace-from &x -4 "ize") x) x)
\l (cond
(String.ends-with? &x "bli") (if (m0 &x -3) (replace-from &x -3 "ble") x)
(String.ends-with? &x "alli") (if (m0 &x -4) (replace-from &x -4 "al") x)
(String.ends-with? &x "entli") (if (m0 &x -5) (replace-from &x -5 "ent") x)
(String.ends-with? &x "eli") (if (m0 &x -3) (replace-from &x -3 "e") x)
(String.ends-with? &x "ousli") (if (m0 &x -5) (replace-from &x -5 "ous") x) x)
\o (cond
(String.ends-with? &x "ization") (if (m0 &x -7) (replace-from &x -7 "ize") x)
(String.ends-with? &x "ation") (if (m0 &x -5) (replace-from &x -5 "ate") x)
(String.ends-with? &x "ator") (if (m0 &x -4) (replace-from &x -4 "ate") x) x)
\s (cond
(String.ends-with? &x "alism") (if (m0 &x -5) (replace-from &x -5 "ize") x)
(String.ends-with? &x "iveness") (if (m0 &x -7) (replace-from &x -7 "ive") x)
(String.ends-with? &x "fulness") (if (m0 &x -7) (replace-from &x -7 "ful") x)
(String.ends-with? &x "ousness") (if (m0 &x -7) (replace-from &x -7 "ous") x) x)
\t (cond
(String.ends-with? &x "aliti") (if (m0 &x -5) (replace-from &x -5 "al") x)
(String.ends-with? &x "iviti") (if (m0 &x -5) (replace-from &x -5 "ive") x)
(String.ends-with? &x "biliti") (if (m0 &x -6) (replace-from &x -6 "ble") x) x)
\g (cond
(String.ends-with? &x "logi") (if (m0 &x -4) (replace-from &x -4 "log") x) x)

x))

(defn step-3 [x]
(case (String.char-at &x (- (String.length &x) 1))
\e (cond
(String.ends-with? &x "icate") (if (m0 &x -5) (replace-from &x -5 "ic") x)
(String.ends-with? &x "ative") (if (m0 &x -5) (replace-from &x -5 "") x)
(String.ends-with? &x "alize") (if (m0 &x -5) (replace-from &x -5 "al") x) x)
\i (cond
(String.ends-with? &x "iciti") (if (m0 &x -5) (replace-from &x -5 "ic") x) x)
\l (cond
(String.ends-with? &x "ical") (if (m0 &x -4) (replace-from &x -4 "ic") x)
(String.ends-with? &x "ful") (if (m0 &x -3) (replace-from &x -3 "") x) x)
\s (cond
(String.ends-with? &x "ness") (if (m0 &x -4) (replace-from &x -4 "") x) x)
x))

(defn step-4 [x]
(case (String.char-at &x (- (String.length &x) 2))
\a (cond
(String.ends-with? &x "al") (if (m1 &x -2) (trim-from &x -2) x) x)
\c (cond
(String.ends-with? &x "ance") (if (m1 &x -4) (trim-from &x -4) x)
(String.ends-with? &x "ence") (if (m1 &x -4) (trim-from &x -4) x) x)
\e (cond
(String.ends-with? &x "er") (if (m1 &x -2) (trim-from &x -2) x) x)
\i (cond
(String.ends-with? &x "ic") (if (m1 &x -2) (trim-from &x -2) x) x)
\l (cond
(String.ends-with? &x "able") (if (m1 &x -4) (trim-from &x -4) x)
(String.ends-with? &x "ible") (if (m1 &x -4) (trim-from &x -4) x) x)
\n (cond
(String.ends-with? &x "ant") (if (m1 &x -3) (trim-from &x -3) x)
(String.ends-with? &x "ement") (if (m1 &x -5) (trim-from &x -5) x)
(String.ends-with? &x "ment") (if (m1 &x -4) (trim-from &x -4) x)
(String.ends-with? &x "ent") (if (m1 &x -3) (trim-from &x -3) x) x)
\o (cond
(String.ends-with? &x "ion") (if
(and (m1 &x -3)
(or
(= \s (String.char-at &x (- (String.length &x) 4)))
(= \t (String.char-at &x (- (String.length &x) 4)))))
(trim-from &x -3) x)
(String.ends-with? &x "ou") (if (m1 &x -2) (trim-from &x -2) x) x)
\s (cond
(String.ends-with? &x "ism") (if (m1 &x -3) (trim-from &x -3) x) x)
\t (cond
(String.ends-with? &x "ate") (if (m1 &x -3) (trim-from &x -3) x)
(String.ends-with? &x "iti") (if (m1 &x -3) (trim-from &x -3) x) x)
\u (cond
(String.ends-with? &x "ous") (if (m1 &x -3) (trim-from &x -3) x) x)
\v (cond
(String.ends-with? &x "ive") (if (m1 &x -3) (trim-from &x -3) x) x)
\z (cond
(String.ends-with? &x "ize") (if (m1 &x -3) (trim-from &x -3) x) x)
x))

(defn step-5a [x]
(cond (String.ends-with? &x "e")
(let [mn (m &x 0)]
(cond
(> mn 1) (trim-from &x -1)
(and (= mn 1) (not (cvc? &(trim-from &x -1)))) (trim-from &x -1)
x))
x))

(defn step-5b [x]
(cond
(and*
(String.ends-with? &x "l")
(> (m &x 0) 1)
(ends-with-cc? &x))
(trim-from &x -1)
x))

(defn stem [x]
(cond (or (< (String.length x) 3) (not (= (String.length x) (utf8-length x))))
@x
(=> x
(to-lower)
(step-1a)
(step-1b)
(step-1c)
(step-2)
(step-3)
(step-4)
(step-5a)
(step-5b)
)))

(defn stem-cstr [x]
(String.cstr &(stem &(String.from-cstr x))))
)
Loading

0 comments on commit 9baccba

Please sign in to comment.