- for (Map.Entry<String, String> entry : optionsMap.entrySet())
- {
- switch (entry.getKey())
- {
- case DELIMITER:
- {
- String value = entry.getValue();
- if (1 != value.length())
- throw new IllegalArgumentException(String.format("Only single character delimiters supported, was %s", value));
-
- optionsBuilder.delimiter = entry.getValue().charAt(0);
- break;
- }
- }
- }
- return optionsBuilder.build();
- }
-}
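Note: the fragment above is evidently the tail of the deleted DelimiterTokenizingOptions.buildFromMap, which accepted only single-character delimiters (the DELIMITER key appears in the tests below). A minimal standalone sketch of that validation; the helper class and the literal "delimiter" key are illustrative, not part of the deleted code:

```java
import java.util.Map;

// Illustrative sketch (not the deleted class): reject any "delimiter"
// option value that is not exactly one character, as the switch above does.
final class DelimiterOptionSketch
{
    static char parseDelimiter(Map<String, String> optionsMap)
    {
        String value = optionsMap.get("delimiter"); // cf. DelimiterTokenizingOptions.DELIMITER
        if (value == null || value.length() != 1)
            throw new IllegalArgumentException(
                String.format("Only single character delimiters supported, was %s", value));
        return value.charAt(0);
    }
}
```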
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro b/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro
deleted file mode 100644
index f5bf68e254..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/SUPPLEMENTARY.jflex-macro
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-// Generated using ICU4J 52.1.0.0
-// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
-
-
-ALetterSupp = (
- ([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
- | ([\ud81a][\uDC00-\uDE38])
- | ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
- | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
- | ([\ud80d][\uDC00-\uDC2E])
- | ([\ud80c][\uDC00-\uDFFF])
- | ([\ud809][\uDC00-\uDC62])
- | ([\ud808][\uDC00-\uDF6E])
- | ([\ud805][\uDE80-\uDEAA])
- | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
- | ([\ud801][\uDC00-\uDC9D])
- | ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
- | ([\ud803][\uDC00-\uDC48])
- | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
-)
-FormatSupp = (
- ([\ud804][\uDCBD])
- | ([\ud834][\uDD73-\uDD7A])
- | ([\udb40][\uDC01\uDC20-\uDC7F])
-)
-NumericSupp = (
- ([\ud805][\uDEC0-\uDEC9])
- | ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
- | ([\ud835][\uDFCE-\uDFFF])
- | ([\ud801][\uDCA0-\uDCA9])
-)
-ExtendSupp = (
- ([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
- | ([\ud805][\uDEAB-\uDEB7])
- | ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
- | ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
- | ([\ud800][\uDDFD])
- | ([\udb40][\uDD00-\uDDEF])
- | ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
-)
-KatakanaSupp = (
- ([\ud82c][\uDC00])
-)
-MidLetterSupp = (
- []
-)
-MidNumSupp = (
- []
-)
-MidNumLetSupp = (
- []
-)
-ExtendNumLetSupp = (
- []
-)
-ExtendNumLetSupp = (
- []
-)
-ComplexContextSupp = (
- []
-)
-HanSupp = (
- ([\ud87e][\uDC00-\uDE1D])
- | ([\ud86b][\uDC00-\uDFFF])
- | ([\ud86a][\uDC00-\uDFFF])
- | ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
- | ([\ud868][\uDC00-\uDFFF])
- | ([\ud86e][\uDC00-\uDC1D])
- | ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
- | ([\ud86c][\uDC00-\uDFFF])
- | ([\ud863][\uDC00-\uDFFF])
- | ([\ud862][\uDC00-\uDFFF])
- | ([\ud861][\uDC00-\uDFFF])
- | ([\ud860][\uDC00-\uDFFF])
- | ([\ud867][\uDC00-\uDFFF])
- | ([\ud866][\uDC00-\uDFFF])
- | ([\ud865][\uDC00-\uDFFF])
- | ([\ud864][\uDC00-\uDFFF])
- | ([\ud858][\uDC00-\uDFFF])
- | ([\ud859][\uDC00-\uDFFF])
- | ([\ud85a][\uDC00-\uDFFF])
- | ([\ud85b][\uDC00-\uDFFF])
- | ([\ud85c][\uDC00-\uDFFF])
- | ([\ud85d][\uDC00-\uDFFF])
- | ([\ud85e][\uDC00-\uDFFF])
- | ([\ud85f][\uDC00-\uDFFF])
- | ([\ud850][\uDC00-\uDFFF])
- | ([\ud851][\uDC00-\uDFFF])
- | ([\ud852][\uDC00-\uDFFF])
- | ([\ud853][\uDC00-\uDFFF])
- | ([\ud854][\uDC00-\uDFFF])
- | ([\ud855][\uDC00-\uDFFF])
- | ([\ud856][\uDC00-\uDFFF])
- | ([\ud857][\uDC00-\uDFFF])
- | ([\ud849][\uDC00-\uDFFF])
- | ([\ud848][\uDC00-\uDFFF])
- | ([\ud84b][\uDC00-\uDFFF])
- | ([\ud84a][\uDC00-\uDFFF])
- | ([\ud84d][\uDC00-\uDFFF])
- | ([\ud84c][\uDC00-\uDFFF])
- | ([\ud84f][\uDC00-\uDFFF])
- | ([\ud84e][\uDC00-\uDFFF])
- | ([\ud841][\uDC00-\uDFFF])
- | ([\ud840][\uDC00-\uDFFF])
- | ([\ud843][\uDC00-\uDFFF])
- | ([\ud842][\uDC00-\uDFFF])
- | ([\ud845][\uDC00-\uDFFF])
- | ([\ud844][\uDC00-\uDFFF])
- | ([\ud847][\uDC00-\uDFFF])
- | ([\ud846][\uDC00-\uDFFF])
-)
-HiraganaSupp = (
- ([\ud83c][\uDE00])
- | ([\ud82c][\uDC01])
-)
-SingleQuoteSupp = (
- []
-)
-DoubleQuoteSupp = (
- []
-)
-HebrewLetterSupp = (
- []
-)
-RegionalIndicatorSupp = (
- ([\ud83c][\uDDE6-\uDDFF])
-)
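Each macro above encodes supplementary (non-BMP) code points as UTF-16 surrogate pairs, e.g. ([\ud83c][\uDDE6-\uDDFF]) for the regional indicators, because JFlex 1.x matches 16-bit code units rather than code points. A small sketch of the correspondence for the first regional indicator, using only java.lang.Character:

```java
// Sketch: a supplementary code point maps to the surrogate pair used in the
// macros above; RegionalIndicatorSupp pairs high surrogate 0xD83C with the
// low-surrogate range 0xDDE6-0xDDFF.
public class SurrogatePairDemo
{
    public static void main(String[] args)
    {
        char[] units = Character.toChars(0x1F1E6); // REGIONAL INDICATOR SYMBOL LETTER A
        System.out.printf("U+1F1E6 -> \\u%04X \\u%04X%n", (int) units[0], (int) units[1]);
        // prints: U+1F1E6 -> \uD83C \uDDE6
    }
}
```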
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex
deleted file mode 100644
index 86c645101d..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerImpl.jflex
+++ /dev/null
@@ -1,220 +0,0 @@
-package org.apache.cassandra.index.sasi.analyzer;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Arrays;
-
-/**
- * This class implements Word Break rules from the Unicode Text Segmentation
- * algorithm, as specified in
- * Unicode Standard Annex #29.
- *
- * Tokens produced are of the following types:
- *
- * - <ALPHANUM>: A sequence of alphabetic and numeric characters
- * - <NUM>: A number
- * - <SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
- * Asian languages, including Thai, Lao, Myanmar, and Khmer
- * - <IDEOGRAPHIC>: A single CJKV ideographic character
- * - <HIRAGANA>: A single hiragana character
- * - <KATAKANA>: A sequence of katakana characters
- * - <HANGUL>: A sequence of Hangul characters
- *
- */
-%%
-
-%unicode 6.3
-%integer
-%final
-%public
-%class StandardTokenizerImpl
-%implements StandardTokenizerInterface
-%function getNextToken
-%char
-%buffer 4096
-
-%include SUPPLEMENTARY.jflex-macro
-ALetter = (\p{WB:ALetter} | {ALetterSupp})
-Format = (\p{WB:Format} | {FormatSupp})
-Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
-Extend = (\p{WB:Extend} | {ExtendSupp})
-Katakana = (\p{WB:Katakana} | {KatakanaSupp})
-MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
-MidNum = (\p{WB:MidNum} | {MidNumSupp})
-MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
-ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
-ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
-Han = (\p{Script:Han} | {HanSupp})
-Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
-SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
-DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
-HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
-RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
-HebrewOrALetter = ({HebrewLetter} | {ALetter})
-
-// UAX#29 WB4. X (Extend | Format)* --> X
-//
-HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
-HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
-NumericEx = {Numeric} ({Format} | {Extend})*
-KatakanaEx = {Katakana} ({Format} | {Extend})*
-MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
-MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
-ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
-HanEx = {Han} ({Format} | {Extend})*
-HiraganaEx = {Hiragana} ({Format} | {Extend})*
-SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
-DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
-HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
-RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
-
-
-%{
- /** Alphanumeric sequences */
- public static final int WORD_TYPE = StandardAnalyzer.TokenType.ALPHANUM.value;
-
- /** Numbers */
- public static final int NUMERIC_TYPE = StandardAnalyzer.TokenType.NUM.value;
-
- /**
- * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
- * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
- * together as a single token rather than broken up, because the logic
- * required to break them at word boundaries is too complex for UAX#29.
- *
- * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
- */
- public static final int SOUTH_EAST_ASIAN_TYPE = StandardAnalyzer.TokenType.SOUTHEAST_ASIAN.value;
-
- public static final int IDEOGRAPHIC_TYPE = StandardAnalyzer.TokenType.IDEOGRAPHIC.value;
-
- public static final int HIRAGANA_TYPE = StandardAnalyzer.TokenType.HIRAGANA.value;
-
- public static final int KATAKANA_TYPE = StandardAnalyzer.TokenType.KATAKANA.value;
-
- public static final int HANGUL_TYPE = StandardAnalyzer.TokenType.HANGUL.value;
-
- public final long yychar()
- {
- return yychar;
- }
-
- public String getText()
- {
- return String.valueOf(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
- }
-
- public char[] getArray()
- {
- return Arrays.copyOfRange(zzBuffer, zzStartRead, zzMarkedPos);
- }
-
- public byte[] getBytes()
- {
- return getText().getBytes();
- }
-
-%}
-
-%%
-
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
-//
-<<EOF>> { return StandardAnalyzer.TokenType.EOF.value; }
-
-// UAX#29 WB8. Numeric × Numeric
-// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
-// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
- { return NUMERIC_TYPE; }
-
-// subset of the below for typing purposes only!
-{HangulEx}+
- { return HANGUL_TYPE; }
-
-{KatakanaEx}+
- { return KATAKANA_TYPE; }
-
-// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
-// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
-// WB7a. Hebrew_Letter × Single_Quote
-// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
-// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
-// WB9. (ALetter | Hebrew_Letter) × Numeric
-// WB10. Numeric × (ALetter | Hebrew_Letter)
-// WB13. Katakana × Katakana
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
- )+
- )
-({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
- )+
- )
-)*
-{ExtendNumLetEx}*
- { return WORD_TYPE; }
-
-
-// From UAX #29:
-//
-// [C]haracters with the Line_Break property values of Contingent_Break (CB),
-// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
-// boundary property values based on criteria outside of the scope of this
-// annex. That means that satisfactory treatment of languages like Chinese
-// or Thai requires special handling.
-//
-// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
-// property: U+FFFC, the OBJECT REPLACEMENT CHARACTER.
-//
-// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
-// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
-// Lao, etc.) are kept together. This grammar does the same below.
-//
-// See also the Unicode Line Breaking Algorithm:
-//
-// http://www.unicode.org/reports/tr14/#SA
-//
-{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
-
-// UAX#29 WB14. Any ÷ Any
-//
-{HanEx} { return IDEOGRAPHIC_TYPE; }
-{HiraganaEx} { return HIRAGANA_TYPE; }
-
-
-// UAX#29 WB3. CR × LF
-// WB3a. (Newline | CR | LF) ÷
-// WB3b. ÷ (Newline | CR | LF)
-// WB13c. Regional_Indicator × Regional_Indicator
-// WB14. Any ÷ Any
-//
-{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
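As an aside, the JDK ships an analogous, ICU-derived implementation of the same UAX#29 word-boundary rules the grammar above encodes; a hedged way to see rules like WB6/WB7 (apostrophes) and WB11/WB12 (decimal points) in action without the generated scanner:

```java
import java.text.BreakIterator;
import java.util.Locale;

// Analogy, not the generated scanner: java.text.BreakIterator applies
// UAX#29 word segmentation comparable to the grammar above.
public class Uax29Demo
{
    public static void main(String[] args)
    {
        BreakIterator wb = BreakIterator.getWordInstance(Locale.ROOT);
        String text = "can't stop 3.14";
        wb.setText(text);
        for (int start = wb.first(), end = wb.next(); end != BreakIterator.DONE; start = end, end = wb.next())
            System.out.println("[" + text.substring(start, end) + "]");
        // "can't" and "3.14" each typically survive as a single word.
    }
}
```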
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java
deleted file mode 100644
index f8b6bf773e..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerInterface.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer;
-
-import java.io.IOException;
-import java.io.Reader;
-
-/**
- * Internal interface for supporting versioned grammars.
- */
-public interface StandardTokenizerInterface
-{
-
- String getText();
-
- char[] getArray();
-
- byte[] getBytes();
-
- /**
- * Returns the current position.
- */
- long yychar();
-
- /**
- * Returns the length of the matched text region.
- */
- int yylength();
-
- /**
- * Resumes scanning until the next regular expression is matched,
- * the end of input is encountered or an I/O-Error occurs.
- *
- * @return the next token, {@link #YYEOF} on end of stream
- * @exception java.io.IOException if any I/O-Error occurs
- */
- int getNextToken() throws IOException;
-
- /**
- * Resets the scanner to read from a new input stream.
- * Does not close the old reader.
- *
- * All internal variables are reset, the old input stream
- * cannot be reused (internal buffer is discarded and lost).
- * Lexical state is set to ZZ_INITIAL.
- *
- * @param reader the new input stream
- */
- void yyreset(Reader reader);
-}
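For context, implementations of this interface are driven by a pull loop; a hedged sketch of a typical consumer, using the EOF sentinel returned by the <<EOF>> rule in the grammar above (the helper itself is illustrative):

```java
import java.io.IOException;
import java.io.Reader;

// Illustrative driver: pull token types until the scanner reports EOF.
final class ScannerDriverSketch
{
    static void drain(StandardTokenizerInterface scanner, Reader input) throws IOException
    {
        scanner.yyreset(input);
        int type;
        while ((type = scanner.getNextToken()) != StandardAnalyzer.TokenType.EOF.value)
        {
            String term = scanner.getText(); // text of the current match
            long offset = scanner.yychar();  // stream offset of the match
            // hand (type, term, offset) to downstream filters here
        }
    }
}
```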
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java b/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java
deleted file mode 100644
index da44f0ad7b..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer;
-
-import java.util.Locale;
-import java.util.Map;
-
-/**
- * Various options for controlling tokenization and enabling
- * or disabling features
- */
-public class StandardTokenizerOptions
-{
- public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
- public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
- public static final String TOKENIZATION_LOCALE = "tokenization_locale";
- public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
- public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";
-
- public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
- public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;
-
- private boolean stemTerms;
- private boolean ignoreStopTerms;
- private Locale locale;
- private boolean caseSensitive;
- private boolean allTermsToUpperCase;
- private boolean allTermsToLowerCase;
- private int minTokenLength;
- private int maxTokenLength;
-
- public boolean shouldStemTerms()
- {
- return stemTerms;
- }
-
- public void setStemTerms(boolean stemTerms)
- {
- this.stemTerms = stemTerms;
- }
-
- public boolean shouldIgnoreStopTerms()
- {
- return ignoreStopTerms;
- }
-
- public void setIgnoreStopTerms(boolean ignoreStopTerms)
- {
- this.ignoreStopTerms = ignoreStopTerms;
- }
-
- public Locale getLocale()
- {
- return locale;
- }
-
- public void setLocale(Locale locale)
- {
- this.locale = locale;
- }
-
- public boolean isCaseSensitive()
- {
- return caseSensitive;
- }
-
- public void setCaseSensitive(boolean caseSensitive)
- {
- this.caseSensitive = caseSensitive;
- }
-
- public boolean shouldUpperCaseTerms()
- {
- return allTermsToUpperCase;
- }
-
- public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
- {
- this.allTermsToUpperCase = allTermsToUpperCase;
- }
-
- public boolean shouldLowerCaseTerms()
- {
- return allTermsToLowerCase;
- }
-
- public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
- {
- this.allTermsToLowerCase = allTermsToLowerCase;
- }
-
- public int getMinTokenLength()
- {
- return minTokenLength;
- }
-
- public void setMinTokenLength(int minTokenLength)
- {
- this.minTokenLength = minTokenLength;
- }
-
- public int getMaxTokenLength()
- {
- return maxTokenLength;
- }
-
- public void setMaxTokenLength(int maxTokenLength)
- {
- this.maxTokenLength = maxTokenLength;
- }
-
- public static class OptionsBuilder
- {
- private boolean stemTerms;
- private boolean ignoreStopTerms;
- private Locale locale;
- private boolean caseSensitive;
- private boolean allTermsToUpperCase;
- private boolean allTermsToLowerCase;
- private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- public OptionsBuilder()
- {
- }
-
- public OptionsBuilder stemTerms(boolean stemTerms)
- {
- this.stemTerms = stemTerms;
- return this;
- }
-
- public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
- {
- this.ignoreStopTerms = ignoreStopTerms;
- return this;
- }
-
- public OptionsBuilder useLocale(Locale locale)
- {
- this.locale = locale;
- return this;
- }
-
- public OptionsBuilder caseSensitive(boolean caseSensitive)
- {
- this.caseSensitive = caseSensitive;
- return this;
- }
-
- public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
- {
- this.allTermsToUpperCase = allTermsToUpperCase;
- return this;
- }
-
- public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
- {
- this.allTermsToLowerCase = allTermsToLowerCase;
- return this;
- }
-
- /**
- * Set the min allowed token length. Any token shorter
- * than this is skipped.
- */
- public OptionsBuilder minTokenLength(int minTokenLength)
- {
- if (minTokenLength < 1)
- throw new IllegalArgumentException("minTokenLength must be greater than zero");
- this.minTokenLength = minTokenLength;
- return this;
- }
-
- /**
- * Set the max allowed token length. Any token longer
- * than this is skipped.
- */
- public OptionsBuilder maxTokenLength(int maxTokenLength)
- {
- if (maxTokenLength < 1)
- throw new IllegalArgumentException("maxTokenLength must be greater than zero");
- this.maxTokenLength = maxTokenLength;
- return this;
- }
-
- public StandardTokenizerOptions build()
- {
- if(allTermsToLowerCase && allTermsToUpperCase)
- throw new IllegalArgumentException("Options to normalize terms cannot be " +
- "both uppercase and lowercase at the same time");
-
- StandardTokenizerOptions options = new StandardTokenizerOptions();
- options.setIgnoreStopTerms(ignoreStopTerms);
- options.setStemTerms(stemTerms);
- options.setLocale(locale);
- options.setCaseSensitive(caseSensitive);
- options.setAllTermsToLowerCase(allTermsToLowerCase);
- options.setAllTermsToUpperCase(allTermsToUpperCase);
- options.setMinTokenLength(minTokenLength);
- options.setMaxTokenLength(maxTokenLength);
- return options;
- }
- }
-
- public static StandardTokenizerOptions buildFromMap(Map<String, String> optionsMap)
- {
- OptionsBuilder optionsBuilder = new OptionsBuilder();
-
- for (Map.Entry<String, String> entry : optionsMap.entrySet())
- {
- switch(entry.getKey())
- {
- case TOKENIZATION_ENABLE_STEMMING:
- {
- boolean bool = Boolean.parseBoolean(entry.getValue());
- optionsBuilder = optionsBuilder.stemTerms(bool);
- break;
- }
- case TOKENIZATION_SKIP_STOP_WORDS:
- {
- boolean bool = Boolean.parseBoolean(entry.getValue());
- optionsBuilder = optionsBuilder.ignoreStopTerms(bool);
- break;
- }
- case TOKENIZATION_LOCALE:
- {
- Locale locale = new Locale(entry.getValue());
- optionsBuilder = optionsBuilder.useLocale(locale);
- break;
- }
- case TOKENIZATION_NORMALIZE_UPPERCASE:
- {
- boolean bool = Boolean.parseBoolean(entry.getValue());
- optionsBuilder = optionsBuilder.alwaysUpperCaseTerms(bool);
- break;
- }
- case TOKENIZATION_NORMALIZE_LOWERCASE:
- {
- boolean bool = Boolean.parseBoolean(entry.getValue());
- optionsBuilder = optionsBuilder.alwaysLowerCaseTerms(bool);
- break;
- }
- default:
- {
- }
- }
- }
- return optionsBuilder.build();
- }
-
- public static StandardTokenizerOptions getDefaultOptions()
- {
- return new OptionsBuilder()
- .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
- .stemTerms(false).useLocale(Locale.ENGLISH).build();
- }
-}
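For reference, the deleted options class could be configured either through the fluent builder or, from string index options, through buildFromMap; a minimal usage sketch producing roughly equivalent results both ways:

```java
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

// Sketch: two ways the removed StandardTokenizerOptions was constructed.
final class TokenizerOptionsSketch
{
    public static void main(String[] args)
    {
        StandardTokenizerOptions viaBuilder = new StandardTokenizerOptions.OptionsBuilder()
            .stemTerms(true)
            .ignoreStopTerms(true)
            .useLocale(Locale.ENGLISH)
            .alwaysLowerCaseTerms(true)
            .build();

        Map<String, String> raw = new HashMap<>();
        raw.put(StandardTokenizerOptions.TOKENIZATION_ENABLE_STEMMING, "true");
        raw.put(StandardTokenizerOptions.TOKENIZATION_SKIP_STOP_WORDS, "true");
        raw.put(StandardTokenizerOptions.TOKENIZATION_NORMALIZE_LOWERCASE, "true");
        raw.put(StandardTokenizerOptions.TOKENIZATION_LOCALE, "en");
        StandardTokenizerOptions viaMap = StandardTokenizerOptions.buildFromMap(raw);
    }
}
```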
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
deleted file mode 100644
index ae232db21d..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer.filter;
-
-import java.lang.reflect.Constructor;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-import org.tartarus.snowball.SnowballStemmer;
-import org.tartarus.snowball.ext.*;
-
-import com.google.common.cache.CacheBuilder;
-import com.google.common.cache.CacheLoader;
-import com.google.common.cache.LoadingCache;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Returns a SnowballStemmer instance appropriate for
- * a given language
- */
-public class StemmerFactory
-{
- private static final Logger logger = LoggerFactory.getLogger(StemmerFactory.class);
- private static final LoadingCache<Class, Constructor<?>> STEMMER_CONSTRUCTOR_CACHE = CacheBuilder.newBuilder()
- .build(new CacheLoader<Class, Constructor<?>>()
- {
- public Constructor<?> load(Class aClass) throws Exception
- {
- try
- {
- return aClass.getConstructor();
- }
- catch (Exception e)
- {
- logger.error("Failed to get stemmer constructor", e);
- }
- return null;
- }
- });
-
- private static final Map<String, Class> SUPPORTED_LANGUAGES;
-
- static
- {
- SUPPORTED_LANGUAGES = new HashMap<>();
- SUPPORTED_LANGUAGES.put("de", germanStemmer.class);
- SUPPORTED_LANGUAGES.put("da", danishStemmer.class);
- SUPPORTED_LANGUAGES.put("es", spanishStemmer.class);
- SUPPORTED_LANGUAGES.put("en", englishStemmer.class);
- SUPPORTED_LANGUAGES.put("fl", finnishStemmer.class);
- SUPPORTED_LANGUAGES.put("fr", frenchStemmer.class);
- SUPPORTED_LANGUAGES.put("hu", hungarianStemmer.class);
- SUPPORTED_LANGUAGES.put("it", italianStemmer.class);
- SUPPORTED_LANGUAGES.put("nl", dutchStemmer.class);
- SUPPORTED_LANGUAGES.put("no", norwegianStemmer.class);
- SUPPORTED_LANGUAGES.put("pt", portugueseStemmer.class);
- SUPPORTED_LANGUAGES.put("ro", romanianStemmer.class);
- SUPPORTED_LANGUAGES.put("ru", russianStemmer.class);
- SUPPORTED_LANGUAGES.put("sv", swedishStemmer.class);
- SUPPORTED_LANGUAGES.put("tr", turkishStemmer.class);
- }
-
- public static SnowballStemmer getStemmer(Locale locale)
- {
- if (locale == null)
- return null;
-
- String rootLang = locale.getLanguage().substring(0, 2);
- try
- {
- Class clazz = SUPPORTED_LANGUAGES.get(rootLang);
- if(clazz == null)
- return null;
- Constructor<?> ctor = STEMMER_CONSTRUCTOR_CACHE.get(clazz);
- return (SnowballStemmer) ctor.newInstance();
- }
- catch (Exception e)
- {
- logger.debug("Failed to create new SnowballStemmer instance " +
- "for language [{}]", locale.getLanguage(), e);
- }
- return null;
- }
-}
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
deleted file mode 100644
index cb840a8705..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer.filter;
-
-import java.util.Locale;
-
-import org.tartarus.snowball.SnowballStemmer;
-
-/**
- * Filters for performing Stemming on tokens
- */
-public class StemmingFilters
-{
- public static class DefaultStemmingFilter extends FilterPipelineTask<String, String>
- {
- private SnowballStemmer stemmer;
-
- public DefaultStemmingFilter(Locale locale)
- {
- stemmer = StemmerFactory.getStemmer(locale);
- }
-
- public String process(String input) throws Exception
- {
- if (input == null || stemmer == null)
- return input;
- stemmer.setCurrent(input);
- return (stemmer.stem()) ? stemmer.getCurrent() : input;
- }
- }
-}
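A brief usage sketch of the filter above: with the English Snowball stemmer, "running" should reduce to "run", and null input passes through unchanged:

```java
import java.util.Locale;

// Sketch: exercising the deleted stemming filter directly.
final class StemmingFilterSketch
{
    public static void main(String[] args) throws Exception
    {
        StemmingFilters.DefaultStemmingFilter filter =
            new StemmingFilters.DefaultStemmingFilter(Locale.ENGLISH);
        System.out.println(filter.process("running")); // "run"
        System.out.println(filter.process(null));      // null passes through
    }
}
```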
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java
deleted file mode 100644
index 8ec02e0053..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFactory.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer.filter;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.concurrent.ExecutionException;
-
-import com.google.common.cache.CacheBuilder;
-import com.google.common.cache.CacheLoader;
-import com.google.common.cache.LoadingCache;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Provides a list of Stop Words for a given language
- */
-public class StopWordFactory
-{
- private static final Logger logger = LoggerFactory.getLogger(StopWordFactory.class);
-
- private static final String DEFAULT_RESOURCE_EXT = "_ST.txt";
- private static final String DEFAULT_RESOURCE_PREFIX = StopWordFactory.class.getPackage()
- .getName().replace(".", File.separator);
- private static final Set<String> SUPPORTED_LANGUAGES = new HashSet<>(
- Arrays.asList("ar","bg","cs","de","en","es","fi","fr","hi","hu","it",
- "pl","pt","ro","ru","sv"));
-
- private static final LoadingCache<String, Set<String>> STOP_WORDS_CACHE = CacheBuilder.newBuilder()
- .build(new CacheLoader<String, Set<String>>()
- {
- public Set<String> load(String s)
- {
- return getStopWordsFromResource(s);
- }
- });
-
- public static Set<String> getStopWordsForLanguage(Locale locale)
- {
- if (locale == null)
- return null;
-
- String rootLang = locale.getLanguage().substring(0, 2);
- try
- {
- return (!SUPPORTED_LANGUAGES.contains(rootLang)) ? null : STOP_WORDS_CACHE.get(rootLang);
- }
- catch (ExecutionException e)
- {
- logger.error("Failed to populate Stop Words Cache for language [{}]", locale.getLanguage(), e);
- return null;
- }
- }
-
- private static Set<String> getStopWordsFromResource(String language)
- {
- Set<String> stopWords = new HashSet<>();
- String resourceName = DEFAULT_RESOURCE_PREFIX + File.separator + language + DEFAULT_RESOURCE_EXT;
- try (InputStream is = StopWordFactory.class.getClassLoader().getResourceAsStream(resourceName);
- BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)))
- {
- String line;
- while ((line = r.readLine()) != null)
- {
- //skip comments (lines starting with # char)
- if(line.charAt(0) == '#')
- continue;
- stopWords.add(line.trim());
- }
- }
- catch (Exception e)
- {
- logger.error("Failed to retrieve Stop Terms resource for language [{}]", language, e);
- }
- return stopWords;
- }
-}
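The stop-word lists above are classpath resources named after the two-letter language code. A sketch of the name derived for English; note the deleted code joins with File.separator, while classpath resource names always use '/', so resolution is only portable where the two agree:

```java
import java.io.File;

// Sketch: for locale "en" the factory looks up a resource like
// org/apache/cassandra/index/sasi/analyzer/filter/en_ST.txt
// (on platforms where File.separator is "/").
final class StopWordResourceSketch
{
    public static void main(String[] args)
    {
        String prefix = "org.apache.cassandra.index.sasi.analyzer.filter"
                        .replace(".", File.separator);
        System.out.println(prefix + File.separator + "en" + "_ST.txt");
    }
}
```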
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java
deleted file mode 100644
index 4ae849c1f4..0000000000
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StopWordFilters.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer.filter;
-
-import java.util.Locale;
-import java.util.Set;
-
-/**
- * Filter implementations for input matching Stop Words
- */
-public class StopWordFilters
-{
- public static class DefaultStopWordFilter extends FilterPipelineTask<String, String>
- {
- private Set<String> stopWords = null;
-
- public DefaultStopWordFilter(Locale locale)
- {
- this.stopWords = StopWordFactory.getStopWordsForLanguage(locale);
- }
-
- public String process(String input) throws Exception
- {
- return (stopWords != null && stopWords.contains(input)) ? null : input;
- }
- }
-}
diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
index c82aafd99d..cb8c3f77d2 100644
--- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
@@ -62,7 +62,6 @@
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.sasi.analyzer.AbstractAnalyzer;
-import org.apache.cassandra.index.sasi.analyzer.DelimiterAnalyzer;
import org.apache.cassandra.index.sasi.analyzer.NoOpAnalyzer;
import org.apache.cassandra.index.sasi.analyzer.NonTokenizingAnalyzer;
import org.apache.cassandra.index.sasi.conf.ColumnIndex;
@@ -2538,9 +2537,7 @@ public void testAnalyzerValidation()
new HashMap<Class<? extends AbstractAnalyzer>, List<ColumnDefinition>>()
{{
- put(StandardAnalyzer.class, textColumns);
put(NonTokenizingAnalyzer.class, textColumns);
- put(DelimiterAnalyzer.class, textColumns);
put(NoOpAnalyzer.class, allColumns);
}}
.forEach((analyzer, supportedColumns) -> {
diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
deleted file mode 100644
index f5f007f855..0000000000
--- a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.SetType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.commons.io.IOUtils;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-
-public class DelimiterAnalyzerTest
-{
-
- @Test
- public void caseSensitiveAnalizer() throws Exception
- {
- DelimiterAnalyzer analyzer = new DelimiterAnalyzer();
-
- analyzer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- String testString = "Nip it in the bud";
- ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes());
- analyzer.reset(toAnalyze);
- StringBuilder output = new StringBuilder();
- while (analyzer.hasNext())
- output.append(ByteBufferUtil.string(analyzer.next()) + (analyzer.hasNext() ? ' ' : ""));
-
- Assert.assertEquals(testString, output.toString());
- Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
- }
-
- @Test
- public void testBlankEntries() throws Exception
- {
- DelimiterAnalyzer analyzer = new DelimiterAnalyzer();
-
- analyzer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, ",");
- }},
- UTF8Type.instance);
-
- String testString = ",Nip,,,,it,,,in,,the,bud,,,";
- ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes());
- analyzer.reset(toAnalyze);
- StringBuilder output = new StringBuilder();
- while (analyzer.hasNext())
- output.append(ByteBufferUtil.string(analyzer.next()) + (analyzer.hasNext() ? ',' : ""));
-
- Assert.assertEquals("Nip,it,in,the,bud", output.toString());
- Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
- }
-
- @Test(expected = ConfigurationException.class)
- public void ensureIncompatibleInputOnCollectionTypeSkipped()
- {
- new DelimiterAnalyzer().validate(Collections.emptyMap(),
- ColumnDefinition.regularDef("a", "b", "c", SetType.getInstance(UTF8Type.instance, true)));
- }
-
- @Test(expected = ConfigurationException.class)
- public void ensureIncompatibleInputSkipped()
- {
- new DelimiterAnalyzer().validate(Collections.emptyMap(),
- ColumnDefinition.regularDef("a", "b", "c", Int32Type.instance));
- }
-
- @Test
- public void testTokenizationLoremIpsum() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/lorem_ipsum.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(69, tokens.size());
-
- }
-
- @Test
- public void testTokenizationJaJp1() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_1.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, "。");
- }},
- UTF8Type.instance);
-
- tokenizer.reset(bb);
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(4, tokens.size());
- }
-
- @Test
- public void testTokenizationJaJp2() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_2.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, "。");
- }},
- UTF8Type.instance);
-
- tokenizer.reset(bb);
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(2, tokens.size());
- }
-
- @Test
- public void testTokenizationRuRu1() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ru_ru_1.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(447, tokens.size());
- }
-
- @Test
- public void testTokenizationZnTw1() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/zn_tw_1.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(403, tokens.size());
- }
-
- @Test
- public void testTokenizationAdventuresOfHuckFinn() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(104594, tokens.size());
- }
-
- @Test
- public void testWorldCities() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/world_cities_a.csv")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, ",");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(122265, tokens.size());
- }
-
- @Test
- public void tokenizeDomainNamesAndUrls() throws Exception
- {
- ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
- DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/top_visited_domains.txt")));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- tokenizer.reset(bb);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(12, tokens.size());
- }
-
- @Test
- public void testReuseAndResetTokenizerInstance() throws Exception
- {
- List<ByteBuffer> bbToTokenize = new ArrayList<>();
- bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes()));
-
- DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
-
- tokenizer.init(
- new HashMap<String, String>()
- {{
- put(DelimiterTokenizingOptions.DELIMITER, " ");
- }},
- UTF8Type.instance);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- for (ByteBuffer bb : bbToTokenize)
- {
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
- }
- assertEquals(26, tokens.size());
- }
-
-}
diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
deleted file mode 100644
index 7a88a3dc9f..0000000000
--- a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sasi.analyzer;
-
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-import org.junit.Test;
-
-import org.apache.cassandra.serializers.UTF8Serializer;
-
-import static org.junit.Assert.assertEquals;
-
-public class StandardAnalyzerTest
-{
- @Test
- public void testTokenizationAscii() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/apache_license_header.txt");
-
- StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder()
- .maxTokenLength(5).build();
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(options);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(67, tokens.size());
- }
-
- @Test
- public void testTokenizationLoremIpsum() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/lorem_ipsum.txt");
-
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(62, tokens.size());
-
- }
-
- @Test
- public void testTokenizationJaJp1() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/ja_jp_1.txt");
-
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
-
- tokenizer.reset(is);
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(210, tokens.size());
- }
-
- @Test
- public void testTokenizationJaJp2() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/ja_jp_2.txt");
-
- StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
- .ignoreStopTerms(true).alwaysLowerCaseTerms(true).build();
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(options);
-
- tokenizer.reset(is);
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(57, tokens.size());
- }
-
- @Test
- public void testTokenizationRuRu1() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/ru_ru_1.txt");
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(456, tokens.size());
- }
-
- @Test
- public void testTokenizationZnTw1() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/zn_tw_1.txt");
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(963, tokens.size());
- }
-
- @Test
- public void testTokenizationAdventuresOfHuckFinn() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt");
-
- StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
- .ignoreStopTerms(true).useLocale(Locale.ENGLISH)
- .alwaysLowerCaseTerms(true).build();
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(options);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(37739, tokens.size());
- }
-
- @Test
- public void testSkipStopWordBeforeStemmingFrench() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt");
-
- StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
- .ignoreStopTerms(true).useLocale(Locale.FRENCH)
- .alwaysLowerCaseTerms(true).build();
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(options);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- List<String> words = new ArrayList<>();
- tokenizer.reset(is);
- while (tokenizer.hasNext())
- {
- final ByteBuffer nextToken = tokenizer.next();
- tokens.add(nextToken);
- words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate()));
- }
-
- assertEquals(4, tokens.size());
- assertEquals("dans", words.get(0));
- assertEquals("plui", words.get(1));
- assertEquals("chanson", words.get(2));
- assertEquals("connu", words.get(3));
- }
-
- @Test
- public void tokenizeDomainNamesAndUrls() throws Exception
- {
- InputStream is = StandardAnalyzerTest.class.getClassLoader()
- .getResourceAsStream("tokenization/top_visited_domains.txt");
-
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
- tokenizer.reset(is);
-
- List<ByteBuffer> tokens = new ArrayList<>();
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
-
- assertEquals(15, tokens.size());
- }
-
- @Test
- public void testReuseAndResetTokenizerInstance() throws Exception
- {
- List<ByteBuffer> bbToTokenize = new ArrayList<>();
- bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes()));
- bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes()));
-
- StandardAnalyzer tokenizer = new StandardAnalyzer();
- tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
-
- List<ByteBuffer> tokens = new ArrayList<>();
- for (ByteBuffer bb : bbToTokenize)
- {
- tokenizer.reset(bb);
- while (tokenizer.hasNext())
- tokens.add(tokenizer.next());
- }
- assertEquals(10, tokens.size());
- }
-}