From 2e252eb1761bac9ee40055a8ab3ad7ecabfc8570 Mon Sep 17 00:00:00 2001 From: Dusan Malusev Date: Thu, 18 Jul 2024 11:18:06 +0200 Subject: [PATCH] Remove JFlex fully Signed-off-by: Dusan Malusev --- build.xml | 36 +- .../sasi/analyzer/DelimiterAnalyzer.java | 111 ------ .../analyzer/DelimiterTokenizingOptions.java | 71 ---- .../sasi/analyzer/SUPPLEMENTARY.jflex-macro | 143 -------- .../sasi/analyzer/StandardTokenizerImpl.jflex | 220 ------------ .../analyzer/StandardTokenizerInterface.java | 65 ---- .../analyzer/StandardTokenizerOptions.java | 273 --------------- .../sasi/analyzer/filter/StemmerFactory.java | 102 ------ .../sasi/analyzer/filter/StemmingFilters.java | 46 --- .../sasi/analyzer/filter/StopWordFactory.java | 100 ------ .../sasi/analyzer/filter/StopWordFilters.java | 42 --- .../cassandra/index/sasi/SASIIndexTest.java | 3 - .../sasi/analyzer/DelimiterAnalyzerTest.java | 317 ------------------ .../sasi/analyzer/StandardAnalyzerTest.java | 227 ------------- 14 files changed, 1 insertion(+), 1755 deletions(-) delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzer.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterTokenizingOptions.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java delete mode 100644 src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java delete mode 100644 test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java delete mode 100644 test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java diff --git a/build.xml b/build.xml index 24107ac14b..35c55928d1 100644 --- a/build.xml +++ b/build.xml @@ -241,14 +241,6 @@ - - - - - - - - @@ -389,27 +381,6 @@ - - - - - - - - - - - - - - - - - - @@ -909,7 +875,7 @@ - diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzer.java b/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzer.java deleted file mode 100644 index 05dfedc6c4..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzer.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.nio.CharBuffer; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - -import com.google.common.annotations.Beta; -import com.google.common.base.Preconditions; - -import org.apache.cassandra.db.marshal.AbstractType; -import org.apache.cassandra.db.marshal.AsciiType; -import org.apache.cassandra.db.marshal.UTF8Type; -import org.apache.cassandra.utils.AbstractIterator; - -@Beta -public class DelimiterAnalyzer extends AbstractAnalyzer -{ - - private static final Map, Charset> VALID_ANALYZABLE_TYPES = new HashMap, Charset>() - {{ - put(UTF8Type.instance, StandardCharsets.UTF_8); - put(AsciiType.instance, StandardCharsets.US_ASCII); - }}; - - private char delimiter; - private Charset charset; - private Iterator iter; - - public DelimiterAnalyzer() - { - } - - @Override - public ByteBuffer next() - { - return iter.next(); - } - - public void init(Map options, AbstractType validator) - { - DelimiterTokenizingOptions tokenizingOptions = DelimiterTokenizingOptions.buildFromMap(options); - delimiter = tokenizingOptions.getDelimiter(); - charset = VALID_ANALYZABLE_TYPES.get(validator); - } - - public boolean hasNext() - { - return iter.hasNext(); - } - - public void reset(ByteBuffer input) - { - Preconditions.checkNotNull(input); - final CharBuffer cb = charset.decode(input); - - this.iter = new AbstractIterator() { - protected ByteBuffer computeNext() { - - if (!cb.hasRemaining()) - return endOfData(); - - CharBuffer readahead = cb.duplicate(); - // loop until we see the next delimiter character, or reach end of data - boolean readaheadRemaining; - while ((readaheadRemaining = readahead.hasRemaining()) && readahead.get() != delimiter); - - char[] chars = new char[readahead.position() - cb.position() - (readaheadRemaining ? 1 : 0)]; - cb.get(chars); - Preconditions.checkState(!cb.hasRemaining() || cb.get() == delimiter); - - return 0 < chars.length - ? charset.encode(CharBuffer.wrap(chars)) - // blank partition keys not permitted, ref ConcurrentRadixTree.putIfAbsent(..) - : computeNext(); - } - }; - } - - @Override - public boolean isTokenizing() - { - return true; - } - - @Override - public boolean isCompatibleWith(AbstractType validator) - { - return VALID_ANALYZABLE_TYPES.containsKey(validator); - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterTokenizingOptions.java b/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterTokenizingOptions.java deleted file mode 100644 index c2c8ef7d53..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/DelimiterTokenizingOptions.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.util.Map; - -/** Simple tokenizer based on a specified delimiter (rather than whitespace). - */ -public class DelimiterTokenizingOptions -{ - public static final String DELIMITER = "delimiter"; - - private final char delimiter; - - private DelimiterTokenizingOptions(char delimiter) - { - this.delimiter = delimiter; - } - - char getDelimiter() - { - return delimiter; - } - - private static class OptionsBuilder - { - private char delimiter = ','; - - public DelimiterTokenizingOptions build() - { - return new DelimiterTokenizingOptions(delimiter); - } - } - - static DelimiterTokenizingOptions buildFromMap(Map optionsMap) - { - OptionsBuilder optionsBuilder = new OptionsBuilder(); - - for (Map.Entry entry : optionsMap.entrySet()) - { - switch (entry.getKey()) - { - case DELIMITER: - { - String value = entry.getValue(); - if (1 != value.length()) - throw new IllegalArgumentException(String.format("Only single character delimiters supported, was %s", value)); - - optionsBuilder.delimiter = entry.getValue().charAt(0); - break; - } - } - } - return optionsBuilder.build(); - } -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro b/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro deleted file mode 100644 index f5bf68e254..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// Generated using ICU4J 52.1.0.0 -// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros - - -ALetterSupp = ( - ([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]) - | ([\ud81a][\uDC00-\uDE38]) - | ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]) - | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]) - | ([\ud80d][\uDC00-\uDC2E]) - | ([\ud80c][\uDC00-\uDFFF]) - | ([\ud809][\uDC00-\uDC62]) - | ([\ud808][\uDC00-\uDF6E]) - | ([\ud805][\uDE80-\uDEAA]) - | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]) - | ([\ud801][\uDC00-\uDC9D]) - | ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]) - | ([\ud803][\uDC00-\uDC48]) - | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]) -) -FormatSupp = ( - ([\ud804][\uDCBD]) - | ([\ud834][\uDD73-\uDD7A]) - | ([\udb40][\uDC01\uDC20-\uDC7F]) -) -NumericSupp = ( - ([\ud805][\uDEC0-\uDEC9]) - | ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9]) - | ([\ud835][\uDFCE-\uDFFF]) - | ([\ud801][\uDCA0-\uDCA9]) -) -ExtendSupp = ( - ([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92]) - | ([\ud805][\uDEAB-\uDEB7]) - | ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0]) - | ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]) - | ([\ud800][\uDDFD]) - | ([\udb40][\uDD00-\uDDEF]) - | ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F]) -) -KatakanaSupp = ( - ([\ud82c][\uDC00]) -) -MidLetterSupp = ( - [] -) -MidNumSupp = ( - [] -) -MidNumLetSupp = ( - [] -) -ExtendNumLetSupp = ( - [] -) -ExtendNumLetSupp = ( - [] -) -ComplexContextSupp = ( - [] -) -HanSupp = ( - ([\ud87e][\uDC00-\uDE1D]) - | ([\ud86b][\uDC00-\uDFFF]) - | ([\ud86a][\uDC00-\uDFFF]) - | ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF]) - | ([\ud868][\uDC00-\uDFFF]) - | ([\ud86e][\uDC00-\uDC1D]) - | ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF]) - | ([\ud86c][\uDC00-\uDFFF]) - | ([\ud863][\uDC00-\uDFFF]) - | ([\ud862][\uDC00-\uDFFF]) - | ([\ud861][\uDC00-\uDFFF]) - | ([\ud860][\uDC00-\uDFFF]) - | ([\ud867][\uDC00-\uDFFF]) - | ([\ud866][\uDC00-\uDFFF]) - | ([\ud865][\uDC00-\uDFFF]) - | ([\ud864][\uDC00-\uDFFF]) - | ([\ud858][\uDC00-\uDFFF]) - | ([\ud859][\uDC00-\uDFFF]) - | ([\ud85a][\uDC00-\uDFFF]) - | ([\ud85b][\uDC00-\uDFFF]) - | ([\ud85c][\uDC00-\uDFFF]) - | ([\ud85d][\uDC00-\uDFFF]) - | ([\ud85e][\uDC00-\uDFFF]) - | ([\ud85f][\uDC00-\uDFFF]) - | ([\ud850][\uDC00-\uDFFF]) - | ([\ud851][\uDC00-\uDFFF]) - | ([\ud852][\uDC00-\uDFFF]) - | ([\ud853][\uDC00-\uDFFF]) - | ([\ud854][\uDC00-\uDFFF]) - | ([\ud855][\uDC00-\uDFFF]) - | ([\ud856][\uDC00-\uDFFF]) - | ([\ud857][\uDC00-\uDFFF]) - | ([\ud849][\uDC00-\uDFFF]) - | ([\ud848][\uDC00-\uDFFF]) - | ([\ud84b][\uDC00-\uDFFF]) - | ([\ud84a][\uDC00-\uDFFF]) - | ([\ud84d][\uDC00-\uDFFF]) - | ([\ud84c][\uDC00-\uDFFF]) - | ([\ud84f][\uDC00-\uDFFF]) - | ([\ud84e][\uDC00-\uDFFF]) - | ([\ud841][\uDC00-\uDFFF]) - | ([\ud840][\uDC00-\uDFFF]) - | ([\ud843][\uDC00-\uDFFF]) - | ([\ud842][\uDC00-\uDFFF]) - | ([\ud845][\uDC00-\uDFFF]) - | ([\ud844][\uDC00-\uDFFF]) - | ([\ud847][\uDC00-\uDFFF]) - | ([\ud846][\uDC00-\uDFFF]) -) -HiraganaSupp = ( - ([\ud83c][\uDE00]) - | ([\ud82c][\uDC01]) -) -SingleQuoteSupp = ( - [] -) -DoubleQuoteSupp = ( - [] -) -HebrewLetterSupp = ( - [] -) -RegionalIndicatorSupp = ( - ([\ud83c][\uDDE6-\uDDFF]) -) diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex deleted file mode 100644 index 86c645101d..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex +++ /dev/null @@ -1,220 +0,0 @@ -package org.apache.cassandra.index.sasi.analyzer; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Arrays; - -/** - * This class implements Word Break rules from the Unicode Text Segmentation - * algorithm, as specified in - * Unicode Standard Annex #29. - *

- * Tokens produced are of the following types: - *

    - *
  • <ALPHANUM>: A sequence of alphabetic and numeric characters
  • - *
  • <NUM>: A number
  • - *
  • <SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast - * Asian languages, including Thai, Lao, Myanmar, and Khmer
  • - *
  • <IDEOGRAPHIC>: A single CJKV ideographic character
  • - *
  • <HIRAGANA>: A single hiragana character
  • - *
  • <KATAKANA>: A sequence of katakana characters
  • - *
  • <HANGUL>: A sequence of Hangul characters
  • - *
- */ -%% - -%unicode 6.3 -%integer -%final -%public -%class StandardTokenizerImpl -%implements StandardTokenizerInterface -%function getNextToken -%char -%buffer 4096 - -%include SUPPLEMENTARY.jflex-macro -ALetter = (\p{WB:ALetter} | {ALetterSupp}) -Format = (\p{WB:Format} | {FormatSupp}) -Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp}) -Extend = (\p{WB:Extend} | {ExtendSupp}) -Katakana = (\p{WB:Katakana} | {KatakanaSupp}) -MidLetter = (\p{WB:MidLetter} | {MidLetterSupp}) -MidNum = (\p{WB:MidNum} | {MidNumSupp}) -MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp}) -ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp}) -ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp}) -Han = (\p{Script:Han} | {HanSupp}) -Hiragana = (\p{Script:Hiragana} | {HiraganaSupp}) -SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp}) -DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp}) -HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp}) -RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp}) -HebrewOrALetter = ({HebrewLetter} | {ALetter}) - -// UAX#29 WB4. X (Extend | Format)* --> X -// -HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})* -HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})* -NumericEx = {Numeric} ({Format} | {Extend})* -KatakanaEx = {Katakana} ({Format} | {Extend})* -MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})* -MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})* -ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})* -HanEx = {Han} ({Format} | {Extend})* -HiraganaEx = {Hiragana} ({Format} | {Extend})* -SingleQuoteEx = {SingleQuote} ({Format} | {Extend})* -DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})* -HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})* -RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})* - - -%{ - /** Alphanumeric sequences */ - public static final int WORD_TYPE = StandardAnalyzer.TokenType.ALPHANUM.value; - - /** Numbers */ - public static final int NUMERIC_TYPE = StandardAnalyzer.TokenType.NUM.value; - - /** - * Chars in class \p{Line_Break = Complex_Context} are from South East Asian - * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept - * together as as a single token rather than broken up, because the logic - * required to break them at word boundaries is too complex for UAX#29. - *

- * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA - */ - public static final int SOUTH_EAST_ASIAN_TYPE = StandardAnalyzer.TokenType.SOUTHEAST_ASIAN.value; - - public static final int IDEOGRAPHIC_TYPE = StandardAnalyzer.TokenType.IDEOGRAPHIC.value; - - public static final int HIRAGANA_TYPE = StandardAnalyzer.TokenType.HIRAGANA.value; - - public static final int KATAKANA_TYPE = StandardAnalyzer.TokenType.KATAKANA.value; - - public static final int HANGUL_TYPE = StandardAnalyzer.TokenType.HANGUL.value; - - public final long yychar() - { - return yychar; - } - - public String getText() - { - return String.valueOf(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); - } - - public char[] getArray() - { - return Arrays.copyOfRange(zzBuffer, zzStartRead, zzMarkedPos); - } - - public byte[] getBytes() - { - return getText().getBytes(); - } - -%} - -%% - -// UAX#29 WB1. sot ÷ -// WB2. ÷ eot -// -<> { return StandardAnalyzer.TokenType.EOF.value; } - -// UAX#29 WB8. Numeric × Numeric -// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric -// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric -// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet -// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) -// -{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}* - { return NUMERIC_TYPE; } - -// subset of the below for typing purposes only! -{HangulEx}+ - { return HANGUL_TYPE; } - -{KatakanaEx}+ - { return KATAKANA_TYPE; } - -// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) -// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) -// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter) -// WB7a. Hebrew_Letter × Single_Quote -// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter -// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter -// WB9. (ALetter | Hebrew_Letter) × Numeric -// WB10. Numeric × (ALetter | Hebrew_Letter) -// WB13. Katakana × Katakana -// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet -// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) -// -{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* - | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} ) - | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* - | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )* - )+ - ) -({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* - | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} ) - | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* - | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )* - )+ - ) -)* -{ExtendNumLetEx}* - { return WORD_TYPE; } - - -// From UAX #29: -// -// [C]haracters with the Line_Break property values of Contingent_Break (CB), -// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word -// boundary property values based on criteria outside of the scope of this -// annex. That means that satisfactory treatment of languages like Chinese -// or Thai requires special handling. -// -// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break} -// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER. -// -// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context} -// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer, -// Lao, etc.) are kept together. This grammar does the same below. -// -// See also the Unicode Line Breaking Algorithm: -// -// http://www.unicode.org/reports/tr14/#SA -// -{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; } - -// UAX#29 WB14. Any ÷ Any -// -{HanEx} { return IDEOGRAPHIC_TYPE; } -{HiraganaEx} { return HIRAGANA_TYPE; } - - -// UAX#29 WB3. CR × LF -// WB3a. (Newline | CR | LF) ÷ -// WB3b. ÷ (Newline | CR | LF) -// WB13c. Regional_Indicator × Regional_Indicator -// WB14. Any ÷ Any -// -{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^] - { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java deleted file mode 100644 index f8b6bf773e..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.io.IOException; -import java.io.Reader; - -/** - * Internal interface for supporting versioned grammars. - */ -public interface StandardTokenizerInterface -{ - - String getText(); - - char[] getArray(); - - byte[] getBytes(); - - /** - * Returns the current position. - */ - long yychar(); - - /** - * Returns the length of the matched text region. - */ - int yylength(); - - /** - * Resumes scanning until the next regular expression is matched, - * the end of input is encountered or an I/O-Error occurs. - * - * @return the next token, {@link #YYEOF} on end of stream - * @exception java.io.IOException if any I/O-Error occurs - */ - int getNextToken() throws IOException; - - /** - * Resets the scanner to read from a new input stream. - * Does not close the old reader. - * - * All internal variables are reset, the old input stream - * cannot be reused (internal buffer is discarded and lost). - * Lexical state is set to ZZ_INITIAL. - * - * @param reader the new input stream - */ - void yyreset(Reader reader); -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java deleted file mode 100644 index da44f0ad7b..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.util.Locale; -import java.util.Map; - -/** - * Various options for controlling tokenization and enabling - * or disabling features - */ -public class StandardTokenizerOptions -{ - public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming"; - public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words"; - public static final String TOKENIZATION_LOCALE = "tokenization_locale"; - public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase"; - public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase"; - - public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; - public static final int DEFAULT_MIN_TOKEN_LENGTH = 0; - - private boolean stemTerms; - private boolean ignoreStopTerms; - private Locale locale; - private boolean caseSensitive; - private boolean allTermsToUpperCase; - private boolean allTermsToLowerCase; - private int minTokenLength; - private int maxTokenLength; - - public boolean shouldStemTerms() - { - return stemTerms; - } - - public void setStemTerms(boolean stemTerms) - { - this.stemTerms = stemTerms; - } - - public boolean shouldIgnoreStopTerms() - { - return ignoreStopTerms; - } - - public void setIgnoreStopTerms(boolean ignoreStopTerms) - { - this.ignoreStopTerms = ignoreStopTerms; - } - - public Locale getLocale() - { - return locale; - } - - public void setLocale(Locale locale) - { - this.locale = locale; - } - - public boolean isCaseSensitive() - { - return caseSensitive; - } - - public void setCaseSensitive(boolean caseSensitive) - { - this.caseSensitive = caseSensitive; - } - - public boolean shouldUpperCaseTerms() - { - return allTermsToUpperCase; - } - - public void setAllTermsToUpperCase(boolean allTermsToUpperCase) - { - this.allTermsToUpperCase = allTermsToUpperCase; - } - - public boolean shouldLowerCaseTerms() - { - return allTermsToLowerCase; - } - - public void setAllTermsToLowerCase(boolean allTermsToLowerCase) - { - this.allTermsToLowerCase = allTermsToLowerCase; - } - - public int getMinTokenLength() - { - return minTokenLength; - } - - public void setMinTokenLength(int minTokenLength) - { - this.minTokenLength = minTokenLength; - } - - public int getMaxTokenLength() - { - return maxTokenLength; - } - - public void setMaxTokenLength(int maxTokenLength) - { - this.maxTokenLength = maxTokenLength; - } - - public static class OptionsBuilder - { - private boolean stemTerms; - private boolean ignoreStopTerms; - private Locale locale; - private boolean caseSensitive; - private boolean allTermsToUpperCase; - private boolean allTermsToLowerCase; - private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH; - private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; - - public OptionsBuilder() - { - } - - public OptionsBuilder stemTerms(boolean stemTerms) - { - this.stemTerms = stemTerms; - return this; - } - - public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms) - { - this.ignoreStopTerms = ignoreStopTerms; - return this; - } - - public OptionsBuilder useLocale(Locale locale) - { - this.locale = locale; - return this; - } - - public OptionsBuilder caseSensitive(boolean caseSensitive) - { - this.caseSensitive = caseSensitive; - return this; - } - - public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase) - { - this.allTermsToUpperCase = allTermsToUpperCase; - return this; - } - - public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase) - { - this.allTermsToLowerCase = allTermsToLowerCase; - return this; - } - - /** - * Set the min allowed token length. Any token shorter - * than this is skipped. - */ - public OptionsBuilder minTokenLength(int minTokenLength) - { - if (minTokenLength < 1) - throw new IllegalArgumentException("minTokenLength must be greater than zero"); - this.minTokenLength = minTokenLength; - return this; - } - - /** - * Set the max allowed token length. Any token longer - * than this is skipped. - */ - public OptionsBuilder maxTokenLength(int maxTokenLength) - { - if (maxTokenLength < 1) - throw new IllegalArgumentException("maxTokenLength must be greater than zero"); - this.maxTokenLength = maxTokenLength; - return this; - } - - public StandardTokenizerOptions build() - { - if(allTermsToLowerCase && allTermsToUpperCase) - throw new IllegalArgumentException("Options to normalize terms cannot be " + - "both uppercase and lowercase at the same time"); - - StandardTokenizerOptions options = new StandardTokenizerOptions(); - options.setIgnoreStopTerms(ignoreStopTerms); - options.setStemTerms(stemTerms); - options.setLocale(locale); - options.setCaseSensitive(caseSensitive); - options.setAllTermsToLowerCase(allTermsToLowerCase); - options.setAllTermsToUpperCase(allTermsToUpperCase); - options.setMinTokenLength(minTokenLength); - options.setMaxTokenLength(maxTokenLength); - return options; - } - } - - public static StandardTokenizerOptions buildFromMap(Map optionsMap) - { - OptionsBuilder optionsBuilder = new OptionsBuilder(); - - for (Map.Entry entry : optionsMap.entrySet()) - { - switch(entry.getKey()) - { - case TOKENIZATION_ENABLE_STEMMING: - { - boolean bool = Boolean.parseBoolean(entry.getValue()); - optionsBuilder = optionsBuilder.stemTerms(bool); - break; - } - case TOKENIZATION_SKIP_STOP_WORDS: - { - boolean bool = Boolean.parseBoolean(entry.getValue()); - optionsBuilder = optionsBuilder.ignoreStopTerms(bool); - break; - } - case TOKENIZATION_LOCALE: - { - Locale locale = new Locale(entry.getValue()); - optionsBuilder = optionsBuilder.useLocale(locale); - break; - } - case TOKENIZATION_NORMALIZE_UPPERCASE: - { - boolean bool = Boolean.parseBoolean(entry.getValue()); - optionsBuilder = optionsBuilder.alwaysUpperCaseTerms(bool); - break; - } - case TOKENIZATION_NORMALIZE_LOWERCASE: - { - boolean bool = Boolean.parseBoolean(entry.getValue()); - optionsBuilder = optionsBuilder.alwaysLowerCaseTerms(bool); - break; - } - default: - { - } - } - } - return optionsBuilder.build(); - } - - public static StandardTokenizerOptions getDefaultOptions() - { - return new OptionsBuilder() - .ignoreStopTerms(true).alwaysLowerCaseTerms(true) - .stemTerms(false).useLocale(Locale.ENGLISH).build(); - } -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java deleted file mode 100644 index ae232db21d..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer.filter; - -import java.lang.reflect.Constructor; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import org.tartarus.snowball.SnowballStemmer; -import org.tartarus.snowball.ext.*; - -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Returns a SnowballStemmer instance appropriate for - * a given language - */ -public class StemmerFactory -{ - private static final Logger logger = LoggerFactory.getLogger(StemmerFactory.class); - private static final LoadingCache> STEMMER_CONSTRUCTOR_CACHE = CacheBuilder.newBuilder() - .build(new CacheLoader>() - { - public Constructor load(Class aClass) throws Exception - { - try - { - return aClass.getConstructor(); - } - catch (Exception e) - { - logger.error("Failed to get stemmer constructor", e); - } - return null; - } - }); - - private static final Map SUPPORTED_LANGUAGES; - - static - { - SUPPORTED_LANGUAGES = new HashMap<>(); - SUPPORTED_LANGUAGES.put("de", germanStemmer.class); - SUPPORTED_LANGUAGES.put("da", danishStemmer.class); - SUPPORTED_LANGUAGES.put("es", spanishStemmer.class); - SUPPORTED_LANGUAGES.put("en", englishStemmer.class); - SUPPORTED_LANGUAGES.put("fl", finnishStemmer.class); - SUPPORTED_LANGUAGES.put("fr", frenchStemmer.class); - SUPPORTED_LANGUAGES.put("hu", hungarianStemmer.class); - SUPPORTED_LANGUAGES.put("it", italianStemmer.class); - SUPPORTED_LANGUAGES.put("nl", dutchStemmer.class); - SUPPORTED_LANGUAGES.put("no", norwegianStemmer.class); - SUPPORTED_LANGUAGES.put("pt", portugueseStemmer.class); - SUPPORTED_LANGUAGES.put("ro", romanianStemmer.class); - SUPPORTED_LANGUAGES.put("ru", russianStemmer.class); - SUPPORTED_LANGUAGES.put("sv", swedishStemmer.class); - SUPPORTED_LANGUAGES.put("tr", turkishStemmer.class); - } - - public static SnowballStemmer getStemmer(Locale locale) - { - if (locale == null) - return null; - - String rootLang = locale.getLanguage().substring(0, 2); - try - { - Class clazz = SUPPORTED_LANGUAGES.get(rootLang); - if(clazz == null) - return null; - Constructor ctor = STEMMER_CONSTRUCTOR_CACHE.get(clazz); - return (SnowballStemmer) ctor.newInstance(); - } - catch (Exception e) - { - logger.debug("Failed to create new SnowballStemmer instance " + - "for language [{}]", locale.getLanguage(), e); - } - return null; - } -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java deleted file mode 100644 index cb840a8705..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer.filter; - -import java.util.Locale; - -import org.tartarus.snowball.SnowballStemmer; - -/** - * Filters for performing Stemming on tokens - */ -public class StemmingFilters -{ - public static class DefaultStemmingFilter extends FilterPipelineTask - { - private SnowballStemmer stemmer; - - public DefaultStemmingFilter(Locale locale) - { - stemmer = StemmerFactory.getStemmer(locale); - } - - public String process(String input) throws Exception - { - if (input == null || stemmer == null) - return input; - stemmer.setCurrent(input); - return (stemmer.stem()) ? stemmer.getCurrent() : input; - } - } -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java deleted file mode 100644 index 8ec02e0053..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer.filter; - -import java.io.BufferedReader; -import java.io.File; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Locale; -import java.util.Set; -import java.util.concurrent.ExecutionException; - -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Provides a list of Stop Words for a given language - */ -public class StopWordFactory -{ - private static final Logger logger = LoggerFactory.getLogger(StopWordFactory.class); - - private static final String DEFAULT_RESOURCE_EXT = "_ST.txt"; - private static final String DEFAULT_RESOURCE_PREFIX = StopWordFactory.class.getPackage() - .getName().replace(".", File.separator); - private static final Set SUPPORTED_LANGUAGES = new HashSet<>( - Arrays.asList("ar","bg","cs","de","en","es","fi","fr","hi","hu","it", - "pl","pt","ro","ru","sv")); - - private static final LoadingCache> STOP_WORDS_CACHE = CacheBuilder.newBuilder() - .build(new CacheLoader>() - { - public Set load(String s) - { - return getStopWordsFromResource(s); - } - }); - - public static Set getStopWordsForLanguage(Locale locale) - { - if (locale == null) - return null; - - String rootLang = locale.getLanguage().substring(0, 2); - try - { - return (!SUPPORTED_LANGUAGES.contains(rootLang)) ? null : STOP_WORDS_CACHE.get(rootLang); - } - catch (ExecutionException e) - { - logger.error("Failed to populate Stop Words Cache for language [{}]", locale.getLanguage(), e); - return null; - } - } - - private static Set getStopWordsFromResource(String language) - { - Set stopWords = new HashSet<>(); - String resourceName = DEFAULT_RESOURCE_PREFIX + File.separator + language + DEFAULT_RESOURCE_EXT; - try (InputStream is = StopWordFactory.class.getClassLoader().getResourceAsStream(resourceName); - BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) - { - String line; - while ((line = r.readLine()) != null) - { - //skip comments (lines starting with # char) - if(line.charAt(0) == '#') - continue; - stopWords.add(line.trim()); - } - } - catch (Exception e) - { - logger.error("Failed to retrieve Stop Terms resource for language [{}]", language, e); - } - return stopWords; - } -} diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java deleted file mode 100644 index 4ae849c1f4..0000000000 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer.filter; - -import java.util.Locale; -import java.util.Set; - -/** - * Filter implementations for input matching Stop Words - */ -public class StopWordFilters -{ - public static class DefaultStopWordFilter extends FilterPipelineTask - { - private Set stopWords = null; - - public DefaultStopWordFilter(Locale locale) - { - this.stopWords = StopWordFactory.getStopWordsForLanguage(locale); - } - - public String process(String input) throws Exception - { - return (stopWords != null && stopWords.contains(input)) ? null : input; - } - } -} diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java index c82aafd99d..cb8c3f77d2 100644 --- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java @@ -62,7 +62,6 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sasi.analyzer.AbstractAnalyzer; -import org.apache.cassandra.index.sasi.analyzer.DelimiterAnalyzer; import org.apache.cassandra.index.sasi.analyzer.NoOpAnalyzer; import org.apache.cassandra.index.sasi.analyzer.NonTokenizingAnalyzer; import org.apache.cassandra.index.sasi.conf.ColumnIndex; @@ -2538,9 +2537,7 @@ public void testAnalyzerValidation() new HashMap, List>() {{ - put(StandardAnalyzer.class, textColumns); put(NonTokenizingAnalyzer.class, textColumns); - put(DelimiterAnalyzer.class, textColumns); put(NoOpAnalyzer.class, allColumns); }} .forEach((analyzer, supportedColumns) -> { diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java deleted file mode 100644 index f5f007f855..0000000000 --- a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; - -import org.apache.cassandra.config.ColumnDefinition; -import org.apache.cassandra.db.marshal.Int32Type; -import org.apache.cassandra.db.marshal.SetType; -import org.apache.cassandra.db.marshal.UTF8Type; -import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.commons.io.IOUtils; - -import org.junit.Assert; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class DelimiterAnalyzerTest -{ - - @Test - public void caseSensitiveAnalizer() throws Exception - { - DelimiterAnalyzer analyzer = new DelimiterAnalyzer(); - - analyzer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - String testString = "Nip it in the bud"; - ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes()); - analyzer.reset(toAnalyze); - StringBuilder output = new StringBuilder(); - while (analyzer.hasNext()) - output.append(ByteBufferUtil.string(analyzer.next()) + (analyzer.hasNext() ? ' ' : "")); - - Assert.assertEquals(testString, output.toString()); - Assert.assertFalse(testString.toLowerCase().equals(output.toString())); - } - - @Test - public void testBlankEntries() throws Exception - { - DelimiterAnalyzer analyzer = new DelimiterAnalyzer(); - - analyzer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, ","); - }}, - UTF8Type.instance); - - String testString = ",Nip,,,,it,,,in,,the,bud,,,"; - ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes()); - analyzer.reset(toAnalyze); - StringBuilder output = new StringBuilder(); - while (analyzer.hasNext()) - output.append(ByteBufferUtil.string(analyzer.next()) + (analyzer.hasNext() ? ',' : "")); - - Assert.assertEquals("Nip,it,in,the,bud", output.toString()); - Assert.assertFalse(testString.toLowerCase().equals(output.toString())); - } - - @Test(expected = ConfigurationException.class) - public void ensureIncompatibleInputOnCollectionTypeSkipped() - { - new DelimiterAnalyzer().validate(Collections.emptyMap(), - ColumnDefinition.regularDef("a", "b", "c", SetType.getInstance(UTF8Type.instance, true))); - } - - @Test(expected = ConfigurationException.class) - public void ensureIncompatibleInputSkipped() - { - new DelimiterAnalyzer().validate(Collections.emptyMap(), - ColumnDefinition.regularDef("a", "b", "c", Int32Type.instance)); - } - - @Test - public void testTokenizationLoremIpsum() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/lorem_ipsum.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(69, tokens.size()); - - } - - @Test - public void testTokenizationJaJp1() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_1.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, "。"); - }}, - UTF8Type.instance); - - tokenizer.reset(bb); - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(4, tokens.size()); - } - - @Test - public void testTokenizationJaJp2() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_2.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, "。"); - }}, - UTF8Type.instance); - - tokenizer.reset(bb); - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(2, tokens.size()); - } - - @Test - public void testTokenizationRuRu1() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ru_ru_1.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(447, tokens.size()); - } - - @Test - public void testTokenizationZnTw1() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/zn_tw_1.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(403, tokens.size()); - } - - @Test - public void testTokenizationAdventuresOfHuckFinn() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(104594, tokens.size()); - } - - @Test - public void testWorldCities() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/world_cities_a.csv"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, ","); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(122265, tokens.size()); - } - - @Test - public void tokenizeDomainNamesAndUrls() throws Exception - { - ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray( - DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/top_visited_domains.txt"))); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - tokenizer.reset(bb); - - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(12, tokens.size()); - } - - @Test - public void testReuseAndResetTokenizerInstance() throws Exception - { - List bbToTokenize = new ArrayList<>(); - bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes())); - - DelimiterAnalyzer tokenizer = new DelimiterAnalyzer(); - - tokenizer.init( - new HashMap() - {{ - put(DelimiterTokenizingOptions.DELIMITER, " "); - }}, - UTF8Type.instance); - - List tokens = new ArrayList<>(); - for (ByteBuffer bb : bbToTokenize) - { - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - } - assertEquals(26, tokens.size()); - } - -} diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java deleted file mode 100644 index 7a88a3dc9f..0000000000 --- a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.index.sasi.analyzer; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -import org.junit.Test; - -import org.apache.cassandra.serializers.UTF8Serializer; - -import static org.junit.Assert.assertEquals; - -public class StandardAnalyzerTest -{ - @Test - public void testTokenizationAscii() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/apache_license_header.txt"); - - StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder() - .maxTokenLength(5).build(); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(options); - - List tokens = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(67, tokens.size()); - } - - @Test - public void testTokenizationLoremIpsum() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/lorem_ipsum.txt"); - - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - - List tokens = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(62, tokens.size()); - - } - - @Test - public void testTokenizationJaJp1() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/ja_jp_1.txt"); - - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - - tokenizer.reset(is); - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(210, tokens.size()); - } - - @Test - public void testTokenizationJaJp2() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/ja_jp_2.txt"); - - StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true) - .ignoreStopTerms(true).alwaysLowerCaseTerms(true).build(); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(options); - - tokenizer.reset(is); - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(57, tokens.size()); - } - - @Test - public void testTokenizationRuRu1() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/ru_ru_1.txt"); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - - List tokens = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(456, tokens.size()); - } - - @Test - public void testTokenizationZnTw1() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/zn_tw_1.txt"); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - - List tokens = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(963, tokens.size()); - } - - @Test - public void testTokenizationAdventuresOfHuckFinn() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt"); - - StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true) - .ignoreStopTerms(true).useLocale(Locale.ENGLISH) - .alwaysLowerCaseTerms(true).build(); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(options); - - List tokens = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(37739, tokens.size()); - } - - @Test - public void testSkipStopWordBeforeStemmingFrench() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt"); - - StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true) - .ignoreStopTerms(true).useLocale(Locale.FRENCH) - .alwaysLowerCaseTerms(true).build(); - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(options); - - List tokens = new ArrayList<>(); - List words = new ArrayList<>(); - tokenizer.reset(is); - while (tokenizer.hasNext()) - { - final ByteBuffer nextToken = tokenizer.next(); - tokens.add(nextToken); - words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate())); - } - - assertEquals(4, tokens.size()); - assertEquals("dans", words.get(0)); - assertEquals("plui", words.get(1)); - assertEquals("chanson", words.get(2)); - assertEquals("connu", words.get(3)); - } - - @Test - public void tokenizeDomainNamesAndUrls() throws Exception - { - InputStream is = StandardAnalyzerTest.class.getClassLoader() - .getResourceAsStream("tokenization/top_visited_domains.txt"); - - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - tokenizer.reset(is); - - List tokens = new ArrayList<>(); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - - assertEquals(15, tokens.size()); - } - - @Test - public void testReuseAndResetTokenizerInstance() throws Exception - { - List bbToTokenize = new ArrayList<>(); - bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes())); - bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes())); - - StandardAnalyzer tokenizer = new StandardAnalyzer(); - tokenizer.init(StandardTokenizerOptions.getDefaultOptions()); - - List tokens = new ArrayList<>(); - for (ByteBuffer bb : bbToTokenize) - { - tokenizer.reset(bb); - while (tokenizer.hasNext()) - tokens.add(tokenizer.next()); - } - assertEquals(10, tokens.size()); - } -}