From 258668b1fc65639ca80db48701907c919e17362d Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Mon, 21 Oct 2024 16:07:30 +0100 Subject: [PATCH 01/15] adding arabic language analyzer Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 109 ++++++++++++++++++ .../index.md} | 5 +- _analyzers/supported-analyzers/index.md | 4 +- 3 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 _analyzers/language-analyzers/arabic.md rename _analyzers/{language-analyzers.md => language-analyzers/index.md} (95%) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md new file mode 100644 index 0000000000..194e42b66b --- /dev/null +++ b/_analyzers/language-analyzers/arabic.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Arabic +parent: Language analyzers +nav_order: 10 +--- + +# Arabic analyzer + +The built-in `arabic` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic" + } + } + } +} +``` +{% include copy-curl.html %} + +## Arabic analyzer internals + +The `arabic` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- decimal_digit (general) +- stop (arabic) +- normalization (arabic) +- keywords (arabic) +- stemmer (arabic) + +## Custom Arabic analyzer + +You can create custom Arabic analyzer using the following command: + +```json +PUT /arabic-index +{ + "settings": { + "analysis": { + "filter": { + "arabic_stop": { + "type": "stop", + "stopwords": "_arabic_" + }, + "arabic_stemmer": { + "type": "stemmer", + "language": "arabic" + }, + "arabic_normalization": { + "type": "arabic_normalization" + }, + "decimal_digit": { + "type": "decimal_digit" + } + }, + "analyzer": { + "arabic_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "arabic_normalization", + "decimal_digit", + "arabic_stop", + 
"arabic_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +If you want to prevent certain words from stemming, you can add a `keyword_marker` token filter to mark list of words as keywords and add it to list of filters in analyzer. + +```json +"arabic_stemmer": { + ... +}, +"arabic_keywords": { + "type": "keyword_marker", + "keywords": ["بتن"] +}, +"arabic_normalization": { + ... +}, +``` + + diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers/index.md similarity index 95% rename from _analyzers/language-analyzers.md rename to _analyzers/language-analyzers/index.md index ca4ba320dd..9d5c634cd3 100644 --- a/_analyzers/language-analyzers.md +++ b/_analyzers/language-analyzers/index.md @@ -3,8 +3,9 @@ layout: default title: Language analyzers nav_order: 100 parent: Analyzers -redirect_from: - - /query-dsl/analyzers/language-analyzers/ +has_children: true +has_toc: false + --- # Language analyzers diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md index af6ce6c3a6..682f20acac 100644 --- a/_analyzers/supported-analyzers/index.md +++ b/_analyzers/supported-analyzers/index.md @@ -24,9 +24,9 @@ Analyzer | Analysis performed | Analyzer output **Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] **Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] **Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] +[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] **Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. ## Language analyzers -OpenSearch supports analyzers for various languages. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). \ No newline at end of file +OpenSearch supports analyzers for various languages. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/). \ No newline at end of file From 37b845a005044ca0e56a001b2d142548a381ecb3 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Mon, 21 Oct 2024 11:26:09 -0400 Subject: [PATCH 02/15] Add grandparent to arabic analyzer Signed-off-by: Fanit Kolchina --- _analyzers/language-analyzers/arabic.md | 1 + _analyzers/language-analyzers/index.md | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 194e42b66b..81dcba269f 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -2,6 +2,7 @@ layout: default title: Arabic parent: Language analyzers +grand_parent: Analyzers nav_order: 10 --- diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 9d5c634cd3..3760a77be0 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -5,7 +5,6 @@ nav_order: 100 parent: Analyzers has_children: true has_toc: false - --- # Language analyzers From 436bd3ea304a2794811a551fe9f2e41852be0127 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Tue, 22 Oct 2024 14:22:40 +0100 Subject: [PATCH 03/15] adding more details Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 88 ++++++++++++++++++++---- _analyzers/language-analyzers/index.md | 89 ++++++++++++++++++++++++- 2 files changed, 164 insertions(+), 13 deletions(-) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 
81dcba269f..bc092aa03b 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -25,6 +25,25 @@ PUT /arabic-index ``` {% include copy-curl.html %} +You can also use `stem_exclusion` with any language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"arabic", + "stem_exclusion":["authority","authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + ## Arabic analyzer internals The `arabic` analyzer is build using the following: @@ -90,21 +109,66 @@ PUT /arabic-index ``` {% include copy-curl.html %} -## Stem exclusion +## Generated tokens -If you want to prevent certain words from stemming, you can add a `keyword_marker` token filter to mark list of words as keywords and add it to list of filters in analyzer. +Use the following request to examine the tokens generated using the analyzer: ```json -"arabic_stemmer": { - ... -}, -"arabic_keywords": { - "type": "keyword_marker", - "keywords": ["بتن"] -}, -"arabic_normalization": { - ... -}, +POST /arabic-index/_analyze +{ + "field": "content", + "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." 
+} ``` +{% include copy-curl.html %} +The response contains the generated tokens: +```json +{ + "tokens": [ + { + "token": "طلاب", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "يدرس", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "جامع", + "start_offset": 17, + "end_offset": 25, + "type": "", + "position": 3 + }, + { + "token": "عرب", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "ارقامهم", + "start_offset": 35, + "end_offset": 42, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 3760a77be0..21d4bc08ad 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -41,4 +41,91 @@ PUT my-index } ``` - +## stem_exclusion + +The `stem_exclusion` feature can be applied to many language analyzers by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. 
+ +## Example stem_exclusion + +You can use the following command to configure `stem_exclusion`: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"english", + "stem_exclusion": ["manager", "management"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Following languages support `stem_exclusion`: + +- arabic +- armenian +- basque, +- bengali +- bulgarian +- catalan +- czech +- dutch +- english +- finnish +- french +- galician +- german +- hindi +- hungarian +- indonesian +- irish +- italian +- latvian +- lithuanian +- norwegian +- portuguese +- romanian +- russian +- sorani +- spanish +- swedish +- turkish + + +## stem_exclusion with custom analyzer + +All language analyzers are made up from tokenizers and token filters specific to the particular language. If you want to implement a custom version of the language analyzer with `stem_exclusion`, you need to configure `keyword_marker` token filter and list the necessary words in `keywords` parameter, see the following example: + +```json +PUT index_with_keyword_marker_analyzer +{ + "settings": { + "analysis": { + "filter": { + "protected_keywords_filter": { + "type": "keyword_marker", + "keywords": ["Apple", "OpenSearch"] + } + }, + "analyzer": { + "custom_english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "protected_keywords_filter", + "english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} From 93308e4fb73d0d4b306546e20e2e76a2ab29e792 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 30 Oct 2024 11:45:37 +0000 Subject: [PATCH 04/15] adding armenian language analyzer Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 11 +- _analyzers/language-analyzers/armenian.md | 132 ++++++++++++++++++++++ 2 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 _analyzers/language-analyzers/armenian.md diff 
--git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index bc092aa03b..7f18454a26 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -25,17 +25,19 @@ PUT /arabic-index ``` {% include copy-curl.html %} -You can also use `stem_exclusion` with any language analyzer using the following command: +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: ```json -PUT index_with_stem_exclusion_english_analyzer +PUT index_with_stem_exclusion_arabic { "settings": { "analysis": { "analyzer": { - "stem_exclusion_english_analyzer":{ + "stem_exclusion_arabic_analyzer":{ "type":"arabic", - "stem_exclusion":["authority","authorization"] + "stem_exclusion":["تكنولوجيا","سلطة "] } } } @@ -51,6 +53,7 @@ The `arabic` analyzer is build using the following: Tokenizer: `standard` Token Filters: +- lowercase (general) - decimal_digit (general) - stop (arabic) - normalization (arabic) diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md new file mode 100644 index 0000000000..9bdc316e2e --- /dev/null +++ b/_analyzers/language-analyzers/armenian.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Armenian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 20 +--- + +# Armenian analyzer + +The built-in `armenian` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_armenian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_armenian_analyzer": { + "type": "armenian", + "stem_exclusion": ["բարև", "խաղաղություն"] + } + } + } + } 
+}
+```
+{% include copy-curl.html %}
+
+## Armenian analyzer internals
+
+The `armenian` analyzer is built using the following:
+
+Tokenizer: `standard`
+
+Token Filters:
+- lowercase (general)
+- stop (armenian)
+- keywords (armenian)
+- stemmer (armenian)
+
+## Custom Armenian analyzer
+
+You can create a custom Armenian analyzer using the following command:
+
+```json
+PUT /armenian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "armenian_stop": {
+          "type": "stop",
+          "stopwords": "_armenian_"
+        },
+        "armenian_stemmer": {
+          "type": "stemmer",
+          "language": "armenian"
+        }
+      },
+      "analyzer": {
+        "armenian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "armenian_stop",
+            "armenian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "armenian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET index_with_stem_exclusion_armenian_analyzer/_analyze
+{
+  "analyzer": "stem_exclusion_armenian_analyzer",
+  "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "","position": 0},
+    {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "","position": 1},
+    {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "","position": 3},
+    {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "","position": 5},
+    {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "","position": 7},
+    {"token": "օր","start_offset": 49,"end_offset": 51,"type": "","position": 8},
+    {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "","position": 10}
+  ]
+}
+```
\ No newline at end of file

From d416dd4ed45ff0ca57c943a49a8934913a31991f Mon Sep 17 00:00:00 2001
From: Anton Rubin Date: Wed, 30 Oct 2024 12:25:36 +0000 Subject: [PATCH 05/15] adding basque bengali and brazilian language analyzers Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/basque.md | 132 ++++++++++++++++++++ _analyzers/language-analyzers/bengali.md | 137 +++++++++++++++++++++ _analyzers/language-analyzers/brazilian.md | 132 ++++++++++++++++++++ _analyzers/language-analyzers/index.md | 5 +- 4 files changed, 404 insertions(+), 2 deletions(-) create mode 100644 _analyzers/language-analyzers/basque.md create mode 100644 _analyzers/language-analyzers/bengali.md create mode 100644 _analyzers/language-analyzers/brazilian.md diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md new file mode 100644 index 0000000000..b48fc378fa --- /dev/null +++ b/_analyzers/language-analyzers/basque.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Basque +parent: Language analyzers +grand_parent: Analyzers +nav_order: 30 +--- + +# Basque analyzer + +The built-in `basque` analyzer can be applied to a text field using the following command: + +```json +PUT /basque-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_basque_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_basque_analyzer": { + "type": "basque", + "stem_exclusion": ["autoritate", "baldintza"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Basque analyzer internals + +The `basque` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase (general) +- stop (basque) +- keywords (basque) +- stemmer (basque) + +## Custom Basque analyzer + +You can create custom Basque analyzer using the following command: + +```json +PUT /basque-index +{ + 
"settings": { + "analysis": { + "filter": { + "basque_stop": { + "type": "stop", + "stopwords": "_basque_" + }, + "basque_stemmer": { + "type": "stemmer", + "language": "basque" + } + }, + "analyzer": { + "basque_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "basque_stop", + "basque_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /basque-index/_analyze +{ + "field": "content", + "text": "Ikasleek euskal unibertsitateetan ikasten dute. Haien zenbakiak 123456 dira." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ikasle","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "euskal","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "unibertsi","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "ikas","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "haien","start_offset": 48,"end_offset": 53,"type": "","position": 5}, + {"token": "zenba","start_offset": 54,"end_offset": 63,"type": "","position": 6}, + {"token": "123456","start_offset": 64,"end_offset": 70,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md new file mode 100644 index 0000000000..011082b068 --- /dev/null +++ b/_analyzers/language-analyzers/bengali.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Bengali +parent: Language analyzers +grand_parent: Analyzers +nav_order: 40 +--- + +# Bengali analyzer + +The built-in `bengali` analyzer can be applied to a text field using the following command: + +```json +PUT /bengali-index +{ + 
"mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bengali_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bengali_analyzer": { + "type": "bengali", + "stem_exclusion": ["কর্তৃপক্ষ", "অনুমোদন"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bengali analyzer internals + +The `bengali` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase (general) +- decimal_digit (general) +- indic_normalization +- normalization (bengali) +- stop (bengali) +- keywords (bengali) +- stemmer (bengali) + +## Custom Bengali analyzer + +You can create custom Bengali analyzer using the following command: + +```json +PUT /bengali-index +{ + "settings": { + "analysis": { + "filter": { + "bengali_stop": { + "type": "stop", + "stopwords": "_bengali_" + }, + "bengali_stemmer": { + "type": "stemmer", + "language": "bengali" + } + }, + "analyzer": { + "bengali_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "indic_normalization", + "bengali_normalization", + "bengali_stop", + "bengali_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bengali-index/_analyze +{ + "field": "content", + "text": "ছাত্ররা বিশ্ববিদ্যালয়ে পড়াশোনা করে। তাদের নম্বরগুলি ১২৩৪৫৬।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ছাত্র","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": 
"বিসসবিদালয়","start_offset": 8,"end_offset": 23,"type": "","position": 1}, + {"token": "পরাসোন","start_offset": 24,"end_offset": 32,"type": "","position": 2}, + {"token": "তা","start_offset": 38,"end_offset": 43,"type": "","position": 4}, + {"token": "নমমর","start_offset": 44,"end_offset": 53,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md new file mode 100644 index 0000000000..073166d149 --- /dev/null +++ b/_analyzers/language-analyzers/brazilian.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Brazilian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 50 +--- + +# Brazilian analyzer + +The built-in `brazilian` analyzer can be applied to a text field using the following command: + +```json +PUT /brazilian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_brazilian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_brazilian_analyzer": { + "type": "brazilian", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Brazilian analyzer internals + +The `brazilian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase (general) +- stop (brazilian) +- keywords (brazilian) +- stemmer (brazilian) + +## Custom Brazilian analyzer + +You can create custom Brazilian analyzer using the following command: + +```json +PUT /brazilian-index +{ + "settings": { + "analysis": { + "filter": { + "brazilian_stop": { + "type": "stop", + "stopwords": "_brazilian_" + }, + "brazilian_stemmer": { + 
"type": "stemmer", + "language": "brazilian" + } + }, + "analyzer": { + "brazilian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "brazilian_stop", + "brazilian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /brazilian-index/_analyze +{ + "field": "content", + "text": "Estudantes estudam em universidades brasileiras. Seus números são 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estudant","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "estud","start_offset": 11,"end_offset": 18,"type": "","position": 1}, + {"token": "univers","start_offset": 22,"end_offset": 35,"type": "","position": 3}, + {"token": "brasileir","start_offset": 36,"end_offset": 47,"type": "","position": 4}, + {"token": "numer","start_offset": 54,"end_offset": 61,"type": "","position": 6}, + {"token": "sao","start_offset": 62,"end_offset": 65,"type": "","position": 7}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 21d4bc08ad..8b032e205b 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -69,9 +69,10 @@ PUT index_with_stem_exclusion_english_analyzer Following languages support `stem_exclusion`: - arabic -- armenian -- basque, +- armenian +- basque - bengali +- brazilian - bulgarian - catalan - czech From 2e4f01d18dca255fe0a48e3fb989c6c223dfb8ef Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 30 Oct 2024 15:34:35 +0000 Subject: [PATCH 06/15] adding bulgarian catalan 
and cjk language analyzers Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 4 +- _analyzers/language-analyzers/armenian.md | 2 +- _analyzers/language-analyzers/basque.md | 2 +- _analyzers/language-analyzers/bengali.md | 4 +- _analyzers/language-analyzers/brazilian.md | 2 +- _analyzers/language-analyzers/bulgarian.md | 132 +++++++++++++++++++ _analyzers/language-analyzers/catalan.md | 138 ++++++++++++++++++++ _analyzers/language-analyzers/cjk.md | 142 +++++++++++++++++++++ _analyzers/language-analyzers/index.md | 1 + 9 files changed, 420 insertions(+), 7 deletions(-) create mode 100644 _analyzers/language-analyzers/bulgarian.md create mode 100644 _analyzers/language-analyzers/catalan.md create mode 100644 _analyzers/language-analyzers/cjk.md diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 7f18454a26..913414c8a3 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -53,8 +53,8 @@ The `arabic` analyzer is build using the following: Tokenizer: `standard` Token Filters: -- lowercase (general) -- decimal_digit (general) +- lowercase +- decimal_digit - stop (arabic) - normalization (arabic) - keywords (arabic) diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index 9bdc316e2e..a5ce7d8526 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -53,7 +53,7 @@ The `armenian` analyzer is build using the following: Tokenizer: `standard` Token Filters: -- lowercase (general) +- lowercase - stop (armenian) - keywords (armenian) - stemmer (armenian) diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index b48fc378fa..7eac4cde82 100644 --- a/_analyzers/language-analyzers/basque.md +++ b/_analyzers/language-analyzers/basque.md @@ -53,7 +53,7 @@ The `basque` analyzer is build using the following: Tokenizer: `standard` 
Token Filters: -- lowercase (general) +- lowercase - stop (basque) - keywords (basque) - stemmer (basque) diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index 011082b068..d3df7f8417 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -53,8 +53,8 @@ The `bengali` analyzer is build using the following: Tokenizer: `standard` Token Filters: -- lowercase (general) -- decimal_digit (general) +- lowercase +- decimal_digit - indic_normalization - normalization (bengali) - stop (bengali) diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index 073166d149..b3b9c7cdb8 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -53,7 +53,7 @@ The `brazilian` analyzer is build using the following: Tokenizer: `standard` Token Filters: -- lowercase (general) +- lowercase - stop (brazilian) - keywords (brazilian) - stemmer (brazilian) diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md new file mode 100644 index 0000000000..1d74f66c49 --- /dev/null +++ b/_analyzers/language-analyzers/bulgarian.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Bulgarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 60 +--- + +# Bulgarian analyzer + +The built-in `bulgarian` analyzer can be applied to a text field using the following command: + +```json +PUT /bulgarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bulgarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bulgarian_analyzer": { + "type": "bulgarian", + "stem_exclusion": 
["авторитет", "одобрение"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Bulgarian analyzer internals
+
+The `bulgarian` analyzer is built using the following:
+
+Tokenizer: `standard`
+
+Token Filters:
+- lowercase
+- stop (bulgarian)
+- keywords (bulgarian)
+- stemmer (bulgarian)
+
+## Custom Bulgarian analyzer
+
+You can create a custom Bulgarian analyzer using the following command:
+
+```json
+PUT /bulgarian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "bulgarian_stop": {
+          "type": "stop",
+          "stopwords": "_bulgarian_"
+        },
+        "bulgarian_stemmer": {
+          "type": "stemmer",
+          "language": "bulgarian"
+        }
+      },
+      "analyzer": {
+        "bulgarian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "bulgarian_stop",
+            "bulgarian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "bulgarian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /bulgarian-index/_analyze
+{
+  "field": "content",
+  "text": "Студентите учат в българските университети. Техните номера са 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "студент","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "учат","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "българск","start_offset": 18,"end_offset": 29,"type": "","position": 3}, + {"token": "университят","start_offset": 30,"end_offset": 42,"type": "","position": 4}, + {"token": "техн","start_offset": 44,"end_offset": 51,"type": "","position": 5}, + {"token": "номер","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md new file mode 100644 index 0000000000..bc072f8bd9 --- /dev/null +++ b/_analyzers/language-analyzers/catalan.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Catalan +parent: Language analyzers +grand_parent: Analyzers +nav_order: 70 +--- + +# Catalan analyzer + +The built-in `catalan` analyzer can be applied to a text field using the following command: + +```json +PUT /catalan-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_catalan_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_catalan_analyzer": { + "type": "catalan", + "stem_exclusion": ["autoritat", "aprovació"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Catalan analyzer internals + +The `catalan` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- elision (catalan) +- lowercase +- stop (catalan) +- keywords (catalan) +- stemmer (catalan) + +## Custom 
Catalan analyzer + +You can create custom Catalan analyzer using the following command: + +```json +PUT /catalan-index +{ + "settings": { + "analysis": { + "filter": { + "catalan_stop": { + "type": "stop", + "stopwords": "_catalan_" + }, + "catalan_elision": { + "type": "elision", + "articles": [ "d", "l", "m", "n", "s", "t"], + "articles_case": true + }, + "catalan_stemmer": { + "type": "stemmer", + "language": "catalan" + } + }, + "analyzer": { + "catalan_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "catalan_elision", + "lowercase", + "catalan_stop", + "catalan_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /catalan-index/_analyze +{ + "field": "content", + "text": "Els estudiants estudien a les universitats catalanes. Els seus números són 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 4,"end_offset": 14,"type": "","position": 1}, + {"token": "estud","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "univer","start_offset": 30,"end_offset": 42,"type": "","position": 5}, + {"token": "catalan","start_offset": 43,"end_offset": 52,"type": "","position": 6}, + {"token": "numer","start_offset": 63,"end_offset": 70,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md new file mode 100644 index 0000000000..111adb423b --- /dev/null +++ b/_analyzers/language-analyzers/cjk.md @@ -0,0 +1,142 @@ +--- +layout: default +title: CJK +parent: Language analyzers +grand_parent: Analyzers +nav_order: 80 +--- + +# CJK analyzer + +The built-in `cjk` analyzer can be applied to a text field using the following command: + +```json +PUT /cjk-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_cjk_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_cjk_analyzer": { + "type": "cjk", + "stem_exclusion": ["example", "words"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## CJK analyzer internals + +The `cjk` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- cjk_width +- lowercase +- cjk_bigram +- stop (similar to english) + +## Custom CJK analyzer + +You can create custom CJK analyzer using the following command: + +```json +PUT /cjk-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": 
{ + "type": "stop", + "stopwords": [ + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] + } + }, + "analyzer": { + "cjk_custom_analyzer": { + "tokenizer": "standard", + "filter": [ + "cjk_width", + "lowercase", + "cjk_bigram", + "english_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk-index/_analyze +{ + "field": "content", + "text": "学生们在中国、日本和韩国的大学学习。123456" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "学生","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "生们","start_offset": 1,"end_offset": 3,"type": "","position": 1}, + {"token": "们在","start_offset": 2,"end_offset": 4,"type": "","position": 2}, + {"token": "在中","start_offset": 3,"end_offset": 5,"type": "","position": 3}, + {"token": "中国","start_offset": 4,"end_offset": 6,"type": "","position": 4}, + {"token": "日本","start_offset": 7,"end_offset": 9,"type": "","position": 5}, + {"token": "本和","start_offset": 8,"end_offset": 10,"type": "","position": 6}, + {"token": "和韩","start_offset": 9,"end_offset": 11,"type": "","position": 7}, + {"token": "韩国","start_offset": 10,"end_offset": 12,"type": "","position": 8}, + {"token": "国的","start_offset": 11,"end_offset": 13,"type": "","position": 9}, + {"token": "的大","start_offset": 12,"end_offset": 14,"type": "","position": 10}, + {"token": "大学","start_offset": 13,"end_offset": 15,"type": "","position": 11}, + {"token": "学学","start_offset": 14,"end_offset": 16,"type": "","position": 12}, + {"token": "学习","start_offset": 
15,"end_offset": 17,"type": "","position": 13}, + {"token": "123456","start_offset": 18,"end_offset": 24,"type": "","position": 14} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 8b032e205b..2e15c32b86 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -75,6 +75,7 @@ Following languages support `stem_exclusion`: - brazilian - bulgarian - catalan +- cjk - czech - dutch - english From 8a1052de56413c01db71b024cfd20061febdd265 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 30 Oct 2024 16:40:40 +0000 Subject: [PATCH 07/15] adding czech,danish,dutch,english,estonian,finnish,french and galician analyzer docs Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 13 +- _analyzers/language-analyzers/armenian.md | 11 +- _analyzers/language-analyzers/basque.md | 11 +- _analyzers/language-analyzers/bengali.md | 13 +- _analyzers/language-analyzers/brazilian.md | 11 +- _analyzers/language-analyzers/bulgarian.md | 11 +- _analyzers/language-analyzers/catalan.md | 13 +- _analyzers/language-analyzers/cjk.md | 2 +- _analyzers/language-analyzers/czech.md | 172 +++++++++++++++++++++ _analyzers/language-analyzers/danish.md | 172 +++++++++++++++++++++ _analyzers/language-analyzers/dutch.md | 148 ++++++++++++++++++ _analyzers/language-analyzers/english.md | 143 +++++++++++++++++ _analyzers/language-analyzers/estonian.md | 139 +++++++++++++++++ _analyzers/language-analyzers/finnish.md | 137 ++++++++++++++++ _analyzers/language-analyzers/french.md | 148 ++++++++++++++++++ _analyzers/language-analyzers/galician.md | 138 +++++++++++++++++ _analyzers/language-analyzers/index.md | 1 + 17 files changed, 1258 insertions(+), 25 deletions(-) create mode 100644 _analyzers/language-analyzers/czech.md create mode 100644 _analyzers/language-analyzers/danish.md create mode 100644 _analyzers/language-analyzers/dutch.md create mode 100644 
_analyzers/language-analyzers/english.md create mode 100644 _analyzers/language-analyzers/estonian.md create mode 100644 _analyzers/language-analyzers/finnish.md create mode 100644 _analyzers/language-analyzers/french.md create mode 100644 _analyzers/language-analyzers/galician.md diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 913414c8a3..b15d7ee58d 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -55,10 +55,10 @@ Tokenizer: `standard` Token Filters: - lowercase - decimal_digit -- stop (arabic) -- normalization (arabic) -- keywords (arabic) -- stemmer (arabic) +- stop (Arabic) +- normalization (Arabic) +- keywords +- stemmer (Arabic) ## Custom Arabic analyzer @@ -83,6 +83,10 @@ PUT /arabic-index }, "decimal_digit": { "type": "decimal_digit" + }, + "arabic_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -94,6 +98,7 @@ PUT /arabic-index "arabic_normalization", "decimal_digit", "arabic_stop", + "arabic_keywords", "arabic_stemmer" ] } diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index a5ce7d8526..1324e39420 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -54,9 +54,9 @@ Tokenizer: `standard` Token Filters: - lowercase -- stop (armenian) -- keywords (armenian) -- stemmer (armenian) +- stop (Armenian) +- keywords +- stemmer (Armenian) ## Custom Armenian analyzer @@ -75,6 +75,10 @@ PUT /armenian-index "armenian_stemmer": { "type": "stemmer", "language": "armenian" + }, + "armenian_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -84,6 +88,7 @@ PUT /armenian-index "filter": [ "lowercase", "armenian_stop", + "armenian_keywords", "armenian_stemmer" ] } diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index 7eac4cde82..bab4ffa0fe 100644 --- 
a/_analyzers/language-analyzers/basque.md +++ b/_analyzers/language-analyzers/basque.md @@ -54,9 +54,9 @@ Tokenizer: `standard` Token Filters: - lowercase -- stop (basque) -- keywords (basque) -- stemmer (basque) +- stop (Basque) +- keywords +- stemmer (Basque) ## Custom Basque analyzer @@ -75,6 +75,10 @@ PUT /basque-index "basque_stemmer": { "type": "stemmer", "language": "basque" + }, + "basque_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -84,6 +88,7 @@ PUT /basque-index "filter": [ "lowercase", "basque_stop", + "basque_keywords", "basque_stemmer" ] } diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index d3df7f8417..72132e8e91 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -56,10 +56,10 @@ Token Filters: - lowercase - decimal_digit - indic_normalization -- normalization (bengali) -- stop (bengali) -- keywords (bengali) -- stemmer (bengali) +- normalization (Bengali) +- stop (Bengali) +- keywords +- stemmer (Bengali) ## Custom Bengali analyzer @@ -78,6 +78,10 @@ PUT /bengali-index "bengali_stemmer": { "type": "stemmer", "language": "bengali" + }, + "bengali_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -90,6 +94,7 @@ PUT /bengali-index "indic_normalization", "bengali_normalization", "bengali_stop", + "bengali_keywords", "bengali_stemmer" ] } diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index b3b9c7cdb8..b905773bbb 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -54,9 +54,9 @@ Tokenizer: `standard` Token Filters: - lowercase -- stop (brazilian) -- keywords (brazilian) -- stemmer (brazilian) +- stop (Brazilian) +- keywords +- stemmer (Brazilian) ## Custom Brazilian analyzer @@ -75,6 +75,10 @@ PUT /brazilian-index "brazilian_stemmer": { "type": "stemmer", "language": "brazilian" + 
}, + "brazilian_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -84,6 +88,7 @@ PUT /brazilian-index "filter": [ "lowercase", "brazilian_stop", + "brazilian_keywords", "brazilian_stemmer" ] } diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md index 1d74f66c49..d924a81afc 100644 --- a/_analyzers/language-analyzers/bulgarian.md +++ b/_analyzers/language-analyzers/bulgarian.md @@ -54,9 +54,9 @@ Tokenizer: `standard` Token Filters: - lowercase -- stop (bulgarian) -- keywords (bulgarian) -- stemmer (bulgarian) +- stop (Bulgarian) +- keywords +- stemmer (Bulgarian) ## Custom Bulgarian analyzer @@ -75,6 +75,10 @@ PUT /bulgarian-index "bulgarian_stemmer": { "type": "stemmer", "language": "bulgarian" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -84,6 +88,7 @@ PUT /bulgarian-index "filter": [ "lowercase", "bulgarian_stop", + "bulgarian_keywords", "bulgarian_stemmer" ] } diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md index bc072f8bd9..b1df91ce20 100644 --- a/_analyzers/language-analyzers/catalan.md +++ b/_analyzers/language-analyzers/catalan.md @@ -53,11 +53,11 @@ The `catalan` analyzer is build using the following: Tokenizer: `standard` Token Filters: -- elision (catalan) +- elision (Catalan) - lowercase -- stop (catalan) -- keywords (catalan) -- stemmer (catalan) +- stop (Catalan) +- keywords +- stemmer (Catalan) ## Custom Catalan analyzer @@ -81,6 +81,10 @@ PUT /catalan-index "catalan_stemmer": { "type": "stemmer", "language": "catalan" + }, + "catalan_keywords": { + "type": "keyword_marker", + "keywords": [] } }, "analyzer": { @@ -91,6 +95,7 @@ PUT /catalan-index "catalan_elision", "lowercase", "catalan_stop", + "catalan_keywords", "catalan_stemmer" ] } diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md index 111adb423b..e66b222062 100644 --- 
a/_analyzers/language-analyzers/cjk.md +++ b/_analyzers/language-analyzers/cjk.md @@ -56,7 +56,7 @@ Token Filters: - cjk_width - lowercase - cjk_bigram -- stop (similar to english) +- stop (similar to English) ## Custom CJK analyzer diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md new file mode 100644 index 0000000000..f0a2ac6482 --- /dev/null +++ b/_analyzers/language-analyzers/czech.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Czech +parent: Language analyzers +grand_parent: Analyzers +nav_order: 90 +--- + +# Czech analyzer + +The built-in `czech` analyzer can be applied to a text field using the following command: + +```json +PUT /czech-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_czech_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_czech_analyzer": { + "type": "czech", + "stem_exclusion": ["autorita", "schválení"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Czech analyzer internals + +The `czech` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Czech) +- keyword +- stemmer (Czech) + +## Custom Czech analyzer + +You can create custom Czech analyzer using the following command: + +```json +PUT /czech-index +{ + "settings": { + "analysis": { + "filter": { + "czech_stop": { + "type": "stop", + "stopwords": "_czech_" + }, + "czech_stemmer": { + "type": "stemmer", + "language": "czech" + }, + "czech_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "czech_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "czech_stop", + "czech_keywords", + "czech_stemmer" + ] + } + } + } + }, + 
"mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /czech-index/_analyze +{ + "field": "content", + "text": "Studenti studují na českých univerzitách. Jejich čísla jsou 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "studuj", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "česk", + "start_offset": 20, + "end_offset": 27, + "type": "", + "position": 3 + }, + { + "token": "univerzit", + "start_offset": 28, + "end_offset": 40, + "type": "", + "position": 4 + }, + { + "token": "čísl", + "start_offset": 49, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md new file mode 100644 index 0000000000..3f974d5e0f --- /dev/null +++ b/_analyzers/language-analyzers/danish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Danish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 100 +--- + +# Danish analyzer + +The built-in `danish` analyzer can be applied to a text field using the following command: + +```json +PUT /danish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_danish_analyzer +{ + "settings": { + "analysis": { + "analyzer": 
{ + "stem_exclusion_danish_analyzer": { + "type": "danish", + "stem_exclusion": ["autoritet", "godkendelse"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Danish analyzer internals + +The `danish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Danish) +- keyword +- stemmer (Danish) + +## Custom Danish analyzer + +You can create custom Danish analyzer using the following command: + +```json +PUT /danish-index +{ + "settings": { + "analysis": { + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + }, + "danish_stemmer": { + "type": "stemmer", + "language": "danish" + }, + "danish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "danish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "danish_keywords", + "danish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /danish-index/_analyze +{ + "field": "content", + "text": "Studerende studerer på de danske universiteter. Deres numre er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stud", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "stud", + "start_offset": 11, + "end_offset": 19, + "type": "", + "position": 1 + }, + { + "token": "dansk", + "start_offset": 26, + "end_offset": 32, + "type": "", + "position": 4 + }, + { + "token": "universitet", + "start_offset": 33, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numr", + "start_offset": 54, + "end_offset": 59, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 63, + "end_offset": 69, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md new file mode 100644 index 0000000000..e96c05d147 --- /dev/null +++ b/_analyzers/language-analyzers/dutch.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Dutch +parent: Language analyzers +grand_parent: Analyzers +nav_order: 110 +--- + +# Dutch analyzer + +The built-in `dutch` analyzer can be applied to a text field using the following command: + +```json +PUT /dutch-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_dutch_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_dutch_analyzer": { + "type": "dutch", + "stem_exclusion": ["autoriteit", "goedkeuring"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Dutch analyzer internals + +The `dutch` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Dutch) +- keyword +- stemmer_override +- stemmer (Dutch) + +## Custom Dutch analyzer + +You 
can create custom Dutch analyzer using the following command: + +```json +PUT /dutch-index +{ + "settings": { + "analysis": { + "filter": { + "dutch_stop": { + "type": "stop", + "stopwords": "_dutch_" + }, + "dutch_stemmer": { + "type": "stemmer", + "language": "dutch" + }, + "dutch_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "dutch_override": { + "type": "stemmer_override", + "rules": [ + "fiets=>fiets", + "bromfiets=>bromfiets", + "ei=>eier", + "kind=>kinder" + ] + } + }, + "analyzer": { + "dutch_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "dutch_stop", + "dutch_keywords", + "dutch_override", + "dutch_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /dutch-index/_analyze +{ + "field": "content", + "text": "De studenten studeren in Nederland en bezoeken Amsterdam. Hun nummers zijn 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 3,"end_offset": 12,"type": "","position": 1}, + {"token": "studer","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "nederland","start_offset": 25,"end_offset": 34,"type": "","position": 4}, + {"token": "bezoek","start_offset": 38,"end_offset": 46,"type": "","position": 6}, + {"token": "amsterdam","start_offset": 47,"end_offset": 56,"type": "","position": 7}, + {"token": "nummer","start_offset": 62,"end_offset": 69,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md new file mode 100644 index 0000000000..4c3dff5bbc --- /dev/null +++ b/_analyzers/language-analyzers/english.md @@ -0,0 +1,143 @@ +--- +layout: default +title: English +parent: Language analyzers +grand_parent: Analyzers +nav_order: 120 +--- + +# English analyzer + +The built-in `english` analyzer can be applied to a text field using the following command: + +```json +PUT /english-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer": { + "type": "english", + "stem_exclusion": ["authority", "authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## English analyzer internals + +The `english` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- stemmer (possessive_english) +- lowercase +- stop (English) +- keyword +- stemmer (English) + +## 
Custom English analyzer + +You can create custom English analyzer using the following command: + +```json +PUT /english-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "english_stemmer": { + "type": "stemmer", + "language": "english" + }, + "english_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english" + } + }, + "analyzer": { + "english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_possessive_stemmer", + "lowercase", + "english_stop", + "english_keywords", + "english_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /english-index/_analyze +{ + "field": "content", + "text": "The students study in the USA and work at NASA. Their numbers are 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studi","start_offset": 13,"end_offset": 18,"type": "","position": 2}, + {"token": "usa","start_offset": 26,"end_offset": 29,"type": "","position": 5}, + {"token": "work","start_offset": 34,"end_offset": 38,"type": "","position": 7}, + {"token": "nasa","start_offset": 42,"end_offset": 46,"type": "","position": 9}, + {"token": "number","start_offset": 54,"end_offset": 61,"type": "","position": 11}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md new file mode 100644 index 0000000000..6b5afa2271 --- /dev/null +++ b/_analyzers/language-analyzers/estonian.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Estonian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 130 +--- + +# Estonian analyzer + +The built-in `estonian` analyzer can be applied to a text field using the following command: + +```json +PUT /estonian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_estonian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_estonian_analyzer": { + "type": "estonian", + "stem_exclusion": ["autoriteet", "kinnitus"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Estonian analyzer internals + +The `estonian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Estonian) +- keyword +- stemmer (Estonian) + +## Custom Estonian analyzer + +You 
can create custom Estonian analyzer using the following command: + +```json +PUT /estonian-index +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "estonian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /estonian-index/_analyze +{ + "field": "content", + "text": "Õpilased õpivad Tallinnas ja Eesti ülikoolides. Nende numbrid on 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "õpilase","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "õpi","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "tallinna","start_offset": 16,"end_offset": 25,"type": "","position": 2}, + {"token": "eesti","start_offset": 29,"end_offset": 34,"type": "","position": 4}, + {"token": "ülikooli","start_offset": 35,"end_offset": 46,"type": "","position": 5}, + {"token": "nende","start_offset": 48,"end_offset": 53,"type": "","position": 6}, + {"token": "numbri","start_offset": 54,"end_offset": 61,"type": "","position": 7}, + {"token": "on","start_offset": 62,"end_offset": 64,"type": "","position": 8}, + {"token": "123456","start_offset": 65,"end_offset": 71,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md new file mode 100644 
index 0000000000..ccc1534b2f --- /dev/null +++ b/_analyzers/language-analyzers/finnish.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Finnish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 140 +--- + +# Finnish analyzer + +The built-in `finnish` analyzer can be applied to a text field using the following command: + +```json +PUT /finnish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_finnish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_finnish_analyzer": { + "type": "finnish", + "stem_exclusion": ["valta", "hyväksyntä"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Finnish analyzer internals + +The `finnish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Finnish) +- keyword +- stemmer (Finnish) + +## Custom Finnish analyzer + +You can create custom Finnish analyzer using the following command: + +```json +PUT /finnish-index +{ + "settings": { + "analysis": { + "filter": { + "finnish_stop": { + "type": "stop", + "stopwords": "_finnish_" + }, + "finnish_stemmer": { + "type": "stemmer", + "language": "finnish" + }, + "finnish_keywords": { + "type": "keyword_marker", + "keywords": ["Helsinki", "Suomi"] + } + }, + "analyzer": { + "finnish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "finnish_stop", + "finnish_keywords", + "finnish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST 
/finnish-index/_analyze +{ + "field": "content", + "text": "Opiskelijat opiskelevat Helsingissä ja Suomen yliopistoissa. Heidän numeronsa ovat 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "opiskelij","start_offset": 0,"end_offset": 11,"type": "","position": 0}, + {"token": "opiskelev","start_offset": 12,"end_offset": 23,"type": "","position": 1}, + {"token": "helsing","start_offset": 24,"end_offset": 35,"type": "","position": 2}, + {"token": "suome","start_offset": 39,"end_offset": 45,"type": "","position": 4}, + {"token": "yliopisto","start_offset": 46,"end_offset": 59,"type": "","position": 5}, + {"token": "numero","start_offset": 68,"end_offset": 77,"type": "","position": 7}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md new file mode 100644 index 0000000000..730a2066d4 --- /dev/null +++ b/_analyzers/language-analyzers/french.md @@ -0,0 +1,148 @@ +--- +layout: default +title: French +parent: Language analyzers +grand_parent: Analyzers +nav_order: 150 +--- + +# French analyzer + +The built-in `french` analyzer can be applied to a text field using the following command: + +```json +PUT /french-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_french_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_french_analyzer": { + "type": "french", + "stem_exclusion": ["autorité", "acceptation"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## French analyzer internals + +The `french` analyzer is build using the following: + 
+Tokenizer: `standard` + +Token Filters: +- elision (French) +- lowercase +- stop (French) +- keyword +- stemmer (French) + +## Custom French analyzer + +You can create custom French analyzer using the following command: + +```json +PUT /french-index +{ + "settings": { + "analysis": { + "filter": { + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "french_elision": { + "type": "elision", + "articles_case": true, + "articles": [ + "l", "m", "t", "qu", "n", "s", + "j", "d", "c", "jusqu", "quoiqu", + "lorsqu", "puisqu" + ] + }, + "french_stemmer": { + "type": "stemmer", + "language": "light_french" + }, + "french_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "french_elision", + "lowercase", + "french_stop", + "french_keywords", + "french_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french-index/_analyze +{ + "field": "content", + "text": "Les étudiants étudient à Paris et dans les universités françaises. Leurs numéros sont 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "etudiant","start_offset": 4,"end_offset": 13,"type": "","position": 1},
+    {"token": "etudient","start_offset": 14,"end_offset": 22,"type": "","position": 2},
+    {"token": "pari","start_offset": 25,"end_offset": 30,"type": "","position": 4},
+    {"token": "universit","start_offset": 43,"end_offset": 54,"type": "","position": 8},
+    {"token": "francais","start_offset": 55,"end_offset": 65,"type": "","position": 9},
+    {"token": "numero","start_offset": 73,"end_offset": 80,"type": "","position": 11},
+    {"token": "123456","start_offset": 86,"end_offset": 92,"type": "","position": 13}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md
new file mode 100644
index 0000000000..e0f833e13d
--- /dev/null
+++ b/_analyzers/language-analyzers/galician.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: Galician
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 160
+---
+
+# Galician analyzer
+
+The built-in `galician` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /galician-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can also use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_galician_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_galician_analyzer": {
+          "type": "galician",
+          "stem_exclusion": ["autoridade", "aceptación"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Galician analyzer internals
+
+The `galician` analyzer is built using the following:
+
+Tokenizer: `standard`
+
+Token Filters:
+- lowercase
+- stop (Galician)
+- keyword
+- stemmer (Galician)
+
+## Custom Galician
analyzer + +You can create custom Galician analyzer using the following command: + +```json +PUT /galician-index +{ + "settings": { + "analysis": { + "filter": { + "galician_stop": { + "type": "stop", + "stopwords": "_galician_" + }, + "galician_stemmer": { + "type": "stemmer", + "language": "galician" + }, + "galician_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "galician_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "galician_stop", + "galician_keywords", + "galician_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "galician_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /galician-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudan en Santiago e nas universidades galegas. Os seus números son 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 3,"end_offset": 13,"type": "","position": 1}, + {"token": "estud","start_offset": 14,"end_offset": 21,"type": "","position": 2}, + {"token": "santiag","start_offset": 25,"end_offset": 33,"type": "","position": 4}, + {"token": "univers","start_offset": 40,"end_offset": 53,"type": "","position": 7}, + {"token": "galeg","start_offset": 54,"end_offset": 61,"type": "","position": 8}, + {"token": "numer","start_offset": 71,"end_offset": 78,"type": "","position": 11}, + {"token": "son","start_offset": 79,"end_offset": 82,"type": "","position": 12}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 2e15c32b86..c69337f3a9 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -77,6 +77,7 @@ Following languages support `stem_exclusion`: - catalan - cjk - czech +- danish - dutch - english - finnish From 15c0f8c7c35a386c7350110c4533487c2ea9d86a Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 31 Oct 2024 14:21:33 +0000 Subject: [PATCH 08/15] adding german,greek,hindi,hungarian,indonesian,irish,italian,latvian,lithuanian,norwegian and persion laguage analyzer docs Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/german.md | 174 +++++++++++++++++++ _analyzers/language-analyzers/greek.md | 139 +++++++++++++++ _analyzers/language-analyzers/hindi.md | 178 ++++++++++++++++++++ _analyzers/language-analyzers/hungarian.md | 172 +++++++++++++++++++ _analyzers/language-analyzers/indonesian.md | 172 +++++++++++++++++++ _analyzers/language-analyzers/irish.md | 157 +++++++++++++++++ _analyzers/language-analyzers/italian.md | 148 ++++++++++++++++ _analyzers/language-analyzers/latvian.md | 148 ++++++++++++++++ 
_analyzers/language-analyzers/lithuanian.md | 136 +++++++++++++++ _analyzers/language-analyzers/norwegian.md | 137 +++++++++++++++ _analyzers/language-analyzers/persian.md | 142 ++++++++++++++++ 11 files changed, 1703 insertions(+) create mode 100644 _analyzers/language-analyzers/german.md create mode 100644 _analyzers/language-analyzers/greek.md create mode 100644 _analyzers/language-analyzers/hindi.md create mode 100644 _analyzers/language-analyzers/hungarian.md create mode 100644 _analyzers/language-analyzers/indonesian.md create mode 100644 _analyzers/language-analyzers/irish.md create mode 100644 _analyzers/language-analyzers/italian.md create mode 100644 _analyzers/language-analyzers/latvian.md create mode 100644 _analyzers/language-analyzers/lithuanian.md create mode 100644 _analyzers/language-analyzers/norwegian.md create mode 100644 _analyzers/language-analyzers/persian.md diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md new file mode 100644 index 0000000000..3076fea57c --- /dev/null +++ b/_analyzers/language-analyzers/german.md @@ -0,0 +1,174 @@ +--- +layout: default +title: German +parent: Language analyzers +grand_parent: Analyzers +nav_order: 170 +--- + +# German analyzer + +The built-in `german` analyzer can be applied to a text field using the following command: + +```json +PUT /german-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_german_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_german_analyzer": { + "type": "german", + "stem_exclusion": ["Autorität", "Genehmigung"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## German analyzer internals + +The `german` analyzer is build using the following: + 
+Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (German) +- keyword +- normalization (German) +- stemmer (German) + +## Custom German analyzer + +You can create custom German analyzer using the following command: + +```json +PUT /german-index +{ + "settings": { + "analysis": { + "filter": { + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "german_stemmer": { + "type": "stemmer", + "language": "light_german" + }, + "german_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "german_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_stop", + "german_keywords", + "german_normalization", + "german_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german-index/_analyze +{ + "field": "content", + "text": "Die Studenten studieren an den deutschen Universitäten. Ihre Nummern sind 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "studi", + "start_offset": 14, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "deutsch", + "start_offset": 31, + "end_offset": 40, + "type": "", + "position": 5 + }, + { + "token": "universitat", + "start_offset": 41, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "numm", + "start_offset": 61, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "123456", + "start_offset": 74, + "end_offset": 80, + "type": "", + "position": 10 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md new file mode 100644 index 0000000000..01735581ca --- /dev/null +++ b/_analyzers/language-analyzers/greek.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Greek +parent: Language analyzers +grand_parent: Analyzers +nav_order: 180 +--- + +# Greek analyzer + +The built-in `greek` analyzer can be applied to a text field using the following command: + +```json +PUT /greek-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_greek_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_greek_analyzer": { + "type": "greek", + "stem_exclusion": ["αρχή", "έγκριση"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Greek analyzer internals + +The `greek` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Greek) +- keyword +- stemmer (Greek) + +## Custom Greek analyzer + +You can create custom 
Greek analyzer using the following command: + +```json +PUT /greek-index +{ + "settings": { + "analysis": { + "filter": { + "greek_stop": { + "type": "stop", + "stopwords": "_greek_" + }, + "greek_stemmer": { + "type": "stemmer", + "language": "greek" + }, + "greek_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "greek_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "greek_stop", + "greek_keywords", + "greek_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /greek-index/_analyze +{ + "field": "content", + "text": "Οι φοιτητές σπουδάζουν στα ελληνικά πανεπιστήμια. Οι αριθμοί τους είναι 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "φοιτητές","start_offset": 3,"end_offset": 11,"type": "","position": 1}, + {"token": "σπουδάζ","start_offset": 12,"end_offset": 22,"type": "","position": 2}, + {"token": "στα","start_offset": 23,"end_offset": 26,"type": "","position": 3}, + {"token": "ελληνικά","start_offset": 27,"end_offset": 35,"type": "","position": 4}, + {"token": "πανεπιστήμ","start_offset": 36,"end_offset": 48,"type": "","position": 5}, + {"token": "αριθμοί","start_offset": 53,"end_offset": 60,"type": "","position": 7}, + {"token": "τους","start_offset": 61,"end_offset": 65,"type": "","position": 8}, + {"token": "είνα","start_offset": 66,"end_offset": 71,"type": "","position": 9}, + {"token": "123456","start_offset": 72,"end_offset": 78,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md new file mode 100644 index 0000000000..b2812edd49 --- /dev/null 
+++ b/_analyzers/language-analyzers/hindi.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Hindi +parent: Language analyzers +grand_parent: Analyzers +nav_order: 190 +--- + +# Hindi analyzer + +The built-in `hindi` analyzer can be applied to a text field using the following command: + +```json +PUT /hindi-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hindi_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hindi_analyzer": { + "type": "hindi", + "stem_exclusion": ["अधिकार", "अनुमोदन"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hindi analyzer internals + +The `hindi` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- decimal_digit +- keyword +- normalization (indic) +- normalization (Hindi) +- stop (Hindi) +- stemmer (Hindi) + +## Custom Hindi analyzer + +You can create custom Hindi analyzer using the following command: + +```json +PUT /hindi-index +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hindi_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "hindi_keywords", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST 
/hindi-index/_analyze +{ + "field": "content", + "text": "छात्र भारतीय विश्वविद्यालयों में पढ़ते हैं। उनके नंबर १२३४५६ हैं।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "छातर", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "भारतिय", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "विशवविदयालय", + "start_offset": 13, + "end_offset": 28, + "type": "", + "position": 2 + }, + { + "token": "पढ", + "start_offset": 33, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "नंबर", + "start_offset": 49, + "end_offset": 53, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 54, + "end_offset": 60, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md new file mode 100644 index 0000000000..7e32ead084 --- /dev/null +++ b/_analyzers/language-analyzers/hungarian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Hungarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 200 +--- + +# Hungarian analyzer + +The built-in `hungarian` analyzer can be applied to a text field using the following command: + +```json +PUT /hungarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hungarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hungarian_analyzer": { + "type": "hungarian", + "stem_exclusion": ["hatalom", "jóváhagyás"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hungarian analyzer internals + +The `hungarian` analyzer is build using 
the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Hungarian) +- keyword +- stemmer (Hungarian) + +## Custom Hungarian analyzer + +You can create custom Hungarian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hungarian-index/_analyze +{ + "field": "content", + "text": "A diákok a magyar egyetemeken tanulnak. A számaik 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "diák", + "start_offset": 2, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "magyar", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "egyetem", + "start_offset": 18, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "tanul", + "start_offset": 30, + "end_offset": 38, + "type": "", + "position": 5 + }, + { + "token": "szám", + "start_offset": 42, + "end_offset": 49, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md new file mode 100644 index 0000000000..b4b567c588 --- /dev/null +++ b/_analyzers/language-analyzers/indonesian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Indonesian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Indonesian analyzer + +The built-in `indonesian` analyzer can be applied to a text field using the following command: + +```json +PUT /indonesian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "indonesian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_indonesian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_indonesian_analyzer": { + "type": "indonesian", + "stem_exclusion": ["otoritas", "persetujuan"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Indonesian analyzer internals + +The `indonesian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Indonesian) +- keyword +- 
stemmer (Indonesian)
+
+## Custom Indonesian analyzer
+
+You can create a custom Indonesian analyzer using the following command:
+
+```json
+PUT /indonesian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "indonesian_stop": {
+          "type": "stop",
+          "stopwords": "_indonesian_"
+        },
+        "indonesian_stemmer": {
+          "type": "stemmer",
+          "language": "indonesian"
+        },
+        "indonesian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "indonesian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "indonesian_stop",
+            "indonesian_keywords",
+            "indonesian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "indonesian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /indonesian-index/_analyze
+{
+  "field": "content",
+  "text": "Mahasiswa belajar di universitas Indonesia. Nomor mereka adalah 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "mahasiswa", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "ajar", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "universitas", + "start_offset": 21, + "end_offset": 32, + "type": "", + "position": 3 + }, + { + "token": "indonesia", + "start_offset": 33, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "nomor", + "start_offset": 44, + "end_offset": 49, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 64, + "end_offset": 70, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md new file mode 100644 index 0000000000..03fde20c3c --- /dev/null +++ b/_analyzers/language-analyzers/irish.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Irish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Irish analyzer + +The built-in `irish` analyzer can be applied to a text field using the following command: + +```json +PUT /irish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_irish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_irish_analyzer": { + "type": "irish", + "stem_exclusion": ["údarás", "faomhadh"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Irish analyzer internals + +The `irish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- hyphenation (Irish) +- elision (Irish) +- lowercase (Irish) +- stop (Irish) +- keyword +- stemmer (Irish) + 
+## Custom Irish analyzer + +You can create custom Irish analyzer using the following command: + +```json +PUT /irish-index +{ + "settings": { + "analysis": { + "filter": { + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" + }, + "irish_elision": { + "type": "elision", + "articles": [ "d", "m", "b" ], + "articles_case": true + }, + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "irish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "irish_hyphenation", + "irish_elision", + "irish_lowercase", + "irish_stop", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /irish-index/_analyze +{ + "field": "content", + "text": "Tá mic léinn ag staidéar in ollscoileanna na hÉireann. Is iad a gcuid uimhreacha ná 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "tá","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "mic","start_offset": 3,"end_offset": 6,"type": "","position": 1}, + {"token": "léinn","start_offset": 7,"end_offset": 12,"type": "","position": 2}, + {"token": "staidéar","start_offset": 16,"end_offset": 24,"type": "","position": 4}, + {"token": "ollscoileanna","start_offset": 28,"end_offset": 41,"type": "","position": 6}, + {"token": "héireann","start_offset": 45,"end_offset": 53,"type": "","position": 8}, + {"token": "cuid","start_offset": 64,"end_offset": 69,"type": "","position": 12}, + {"token": "uimhreacha","start_offset": 70,"end_offset": 80,"type": "","position": 13}, + {"token": "123456","start_offset": 84,"end_offset": 90,"type": "","position": 15} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md new file mode 100644 index 0000000000..636f58fcc8 --- /dev/null +++ b/_analyzers/language-analyzers/italian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Italian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 220 +--- + +# Italian analyzer + +The built-in `italian` analyzer can be applied to a text field using the following command: + +```json +PUT /italian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_italian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_italian_analyzer": { + "type": "italian", + "stem_exclusion": ["autorità", "approvazione"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Italian analyzer internals + +The `italian` analyzer is build 
using the following: + +Tokenizer: `standard` + +Token Filters: +- elision (Italian) +- lowercase +- stop (Italian) +- keyword +- stemmer (Italian) + +## Custom Italian analyzer + +You can create custom Italian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /italian-index/_analyze +{ + "field": "content", + "text": "Gli studenti studiano nelle università italiane. I loro numeri sono 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studian","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "universit","start_offset": 28,"end_offset": 38,"type": "","position": 4}, + {"token": "italian","start_offset": 39,"end_offset": 47,"type": "","position": 5}, + {"token": "numer","start_offset": 56,"end_offset": 62,"type": "","position": 8}, + {"token": "123456","start_offset": 68,"end_offset": 74,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md new file mode 100644 index 0000000000..ecdc4b2f51 --- /dev/null +++ b/_analyzers/language-analyzers/latvian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Latvian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Latvian analyzer + +The built-in `latvian` analyzer can be applied to a text field using the following command: + +```json +PUT /latvian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "latvian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_latvian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_latvian_analyzer": { + "type": "latvian", + "stem_exclusion": ["autoritāte", "apstiprinājums"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Latvian analyzer internals + +The `latvian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Latvian) +- keyword +- stemmer (Latvian) + +## Custom Latvian analyzer + +You can create custom Latvian analyzer using the following command: + +```json +PUT 
/latvian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "latvian_stop": {
+          "type": "stop",
+          "stopwords": "_latvian_"
+        },
+        "latvian_stemmer": {
+          "type": "stemmer",
+          "language": "latvian"
+        },
+        "latvian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "latvian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "latvian_stop",
+            "latvian_keywords",
+            "latvian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "latvian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /latvian-index/_analyze
+{
+  "field": "content",
+  "text": "Studenti mācās Latvijas universitātēs. Viņu numuri ir 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "māc","start_offset": 9,"end_offset": 14,"type": "","position": 1}, + {"token": "latvij","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "universitāt","start_offset": 24,"end_offset": 37,"type": "","position": 3}, + {"token": "vin","start_offset": 39,"end_offset": 43,"type": "","position": 4}, + {"token": "numur","start_offset": 44,"end_offset": 50,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md new file mode 100644 index 0000000000..123e01139e --- /dev/null +++ b/_analyzers/language-analyzers/lithuanian.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Lithuanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Lithuanian analyzer + +The built-in `lithuanian` analyzer can be applied to a text field using the following command: + +```json +PUT /lithuanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_lithuanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_lithuanian_analyzer": { + "type": "lithuanian", + "stem_exclusion": ["autoritetas", "patvirtinimas"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Lithuanian analyzer internals + +The `lithuanian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Lithuanian) +- keyword +- stemmer (Lithuanian) + 
+## Custom Lithuanian analyzer + +You can create custom Lithuanian analyzer using the following command: + +```json +PUT /lithuanian-index +{ + "settings": { + "analysis": { + "filter": { + "lithuanian_stop": { + "type": "stop", + "stopwords": "_lithuanian_" + }, + "lithuanian_stemmer": { + "type": "stemmer", + "language": "lithuanian" + }, + "lithuanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "lithuanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "lithuanian_stop", + "lithuanian_keywords", + "lithuanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /lithuanian-index/_analyze +{ + "field": "content", + "text": "Studentai mokosi Lietuvos universitetuose. Jų numeriai yra 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "mok","start_offset": 10,"end_offset": 16,"type": "","position": 1}, + {"token": "lietuv","start_offset": 17,"end_offset": 25,"type": "","position": 2}, + {"token": "universitet","start_offset": 26,"end_offset": 41,"type": "","position": 3}, + {"token": "num","start_offset": 46,"end_offset": 54,"type": "","position": 5}, + {"token": "123456","start_offset": 59,"end_offset": 65,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md new file mode 100644 index 0000000000..33d8e01f7f --- /dev/null +++ b/_analyzers/language-analyzers/norwegian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Norwegian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 240 +--- + +# Norwegian analyzer + +The built-in `norwegian` analyzer can be applied to a text field using the following command: + +```json +PUT /norwegian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_norwegian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_norwegian_analyzer": { + "type": "norwegian", + "stem_exclusion": ["autoritet", "godkjenning"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Norwegian analyzer internals + +The `norwegian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Norwegian) +- keyword +- stemmer (Norwegian) + +## Custom Norwegian analyzer + +You can create custom Norwegian analyzer using the following 
command: + +```json +PUT /norwegian-index +{ + "settings": { + "analysis": { + "filter": { + "norwegian_stop": { + "type": "stop", + "stopwords": "_norwegian_" + }, + "norwegian_stemmer": { + "type": "stemmer", + "language": "norwegian" + }, + "norwegian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "norwegian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "norwegian_stop", + "norwegian_keywords", + "norwegian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /norwegian-index/_analyze +{ + "field": "content", + "text": "Studentene studerer ved norske universiteter. Deres nummer er 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "studer","start_offset": 11,"end_offset": 19,"type": "","position": 1}, + {"token": "norsk","start_offset": 24,"end_offset": 30,"type": "","position": 3}, + {"token": "universitet","start_offset": 31,"end_offset": 44,"type": "","position": 4}, + {"token": "numm","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md new file mode 100644 index 0000000000..5693b9e045 --- /dev/null +++ b/_analyzers/language-analyzers/persian.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Persian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 250 +--- + +# Persian analyzer + +The built-in `persian` analyzer can be 
applied to a text field using the following command:
+
+```json
+PUT /persian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "persian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can also use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_persian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_persian_analyzer": {
+          "type": "persian",
+          "stem_exclusion": ["حکومت", "تأیید"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Persian analyzer internals
+
+The `persian` analyzer is build using the following:
+
+Tokenizer: `standard`
+
+Token Filters:
+- lowercase
+- decimal_digit
+- normalization (Arabic)
+- normalization (Persian)
+- keyword
+- stop (Persian)
+
+## Custom Persian analyzer
+
+You can create custom Persian analyzer using the following command:
+
+```json
+PUT /persian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "persian_stop": {
+          "type": "stop",
+          "stopwords": "_persian_"
+        },
+        "persian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "char_filter": {
+        "null_width_replace_with_space": {
+          "type": "mapping",
+          "mappings": [ "\\u200C=>\\u0020"]
+        }
+      },
+      "analyzer": {
+        "persian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "char_filter": [ "null_width_replace_with_space" ],
+          "filter": [
+            "lowercase",
+            "decimal_digit",
+            "arabic_normalization",
+            "persian_normalization",
+            "persian_stop"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /persian-index/_analyze
+{
+  "field": "content",
+  "text": "دانشجویان در دانشگاه‌های ایرانی تحصیل می‌کنند. شماره‌های آن‌ها ۱۲۳۴۵۶ است."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "دانشجويان","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "دانشگاه","start_offset": 13,"end_offset": 20,"type": "","position": 2}, + {"token": "ايراني","start_offset": 25,"end_offset": 31,"type": "","position": 4}, + {"token": "تحصيل","start_offset": 32,"end_offset": 37,"type": "","position": 5}, + {"token": "شماره","start_offset": 47,"end_offset": 52,"type": "","position": 8}, + {"token": "123456","start_offset": 63,"end_offset": 69,"type": "","position": 12} + ] +} +``` \ No newline at end of file From 2b2845f417918a7292707b5820c784037ff32f75 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Fri, 1 Nov 2024 10:50:01 +0000 Subject: [PATCH 09/15] adding portuguese,romanian,russian,sorani,spanish,swedish,thai and turkish language analyzer docs Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 2 +- _analyzers/language-analyzers/armenian.md | 2 +- _analyzers/language-analyzers/basque.md | 2 +- _analyzers/language-analyzers/bengali.md | 2 +- _analyzers/language-analyzers/brazilian.md | 2 +- _analyzers/language-analyzers/bulgarian.md | 2 +- _analyzers/language-analyzers/catalan.md | 2 +- _analyzers/language-analyzers/cjk.md | 2 +- _analyzers/language-analyzers/czech.md | 2 +- _analyzers/language-analyzers/danish.md | 2 +- _analyzers/language-analyzers/dutch.md | 2 +- _analyzers/language-analyzers/english.md | 2 +- _analyzers/language-analyzers/estonian.md | 2 +- _analyzers/language-analyzers/finnish.md | 2 +- _analyzers/language-analyzers/french.md | 2 +- _analyzers/language-analyzers/galician.md | 2 +- _analyzers/language-analyzers/german.md | 2 +- _analyzers/language-analyzers/greek.md | 2 +- _analyzers/language-analyzers/hindi.md | 2 +- _analyzers/language-analyzers/hungarian.md | 2 +- _analyzers/language-analyzers/indonesian.md | 2 +- _analyzers/language-analyzers/irish.md | 2 +- 
_analyzers/language-analyzers/italian.md | 2 +- _analyzers/language-analyzers/latvian.md | 2 +- _analyzers/language-analyzers/lithuanian.md | 2 +- _analyzers/language-analyzers/norwegian.md | 2 +- _analyzers/language-analyzers/persian.md | 4 +- _analyzers/language-analyzers/portuguese.md | 172 ++++++++++++++++++++ _analyzers/language-analyzers/romanian.md | 172 ++++++++++++++++++++ _analyzers/language-analyzers/russian.md | 172 ++++++++++++++++++++ _analyzers/language-analyzers/sorani.md | 168 +++++++++++++++++++ _analyzers/language-analyzers/spanish.md | 172 ++++++++++++++++++++ _analyzers/language-analyzers/swedish.md | 172 ++++++++++++++++++++ _analyzers/language-analyzers/thai.md | 132 +++++++++++++++ _analyzers/language-analyzers/turkish.md | 143 ++++++++++++++++ 35 files changed, 1332 insertions(+), 27 deletions(-) create mode 100644 _analyzers/language-analyzers/portuguese.md create mode 100644 _analyzers/language-analyzers/romanian.md create mode 100644 _analyzers/language-analyzers/russian.md create mode 100644 _analyzers/language-analyzers/sorani.md create mode 100644 _analyzers/language-analyzers/spanish.md create mode 100644 _analyzers/language-analyzers/swedish.md create mode 100644 _analyzers/language-analyzers/thai.md create mode 100644 _analyzers/language-analyzers/turkish.md diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index b15d7ee58d..2bbfb81140 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -52,7 +52,7 @@ The `arabic` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - decimal_digit - stop (Arabic) diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index 1324e39420..9355a49d05 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -52,7 +52,7 @@ The `armenian` analyzer is build using the 
following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Armenian) - keywords diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index bab4ffa0fe..ada0b95cf5 100644 --- a/_analyzers/language-analyzers/basque.md +++ b/_analyzers/language-analyzers/basque.md @@ -52,7 +52,7 @@ The `basque` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Basque) - keywords diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index 72132e8e91..ec3f7f0ac5 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -52,7 +52,7 @@ The `bengali` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - decimal_digit - indic_normalization diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index b905773bbb..3e6eb3f89d 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -52,7 +52,7 @@ The `brazilian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Brazilian) - keywords diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md index d924a81afc..682430717f 100644 --- a/_analyzers/language-analyzers/bulgarian.md +++ b/_analyzers/language-analyzers/bulgarian.md @@ -52,7 +52,7 @@ The `bulgarian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Bulgarian) - keywords diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md index b1df91ce20..7a2c2e690b 100644 --- a/_analyzers/language-analyzers/catalan.md +++ b/_analyzers/language-analyzers/catalan.md @@ -52,7 +52,7 @@ The `catalan` analyzer is build using the following: 
Tokenizer: `standard` -Token Filters: +Token filters: - elision (Catalan) - lowercase - stop (Catalan) diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md index e66b222062..8547a3156f 100644 --- a/_analyzers/language-analyzers/cjk.md +++ b/_analyzers/language-analyzers/cjk.md @@ -52,7 +52,7 @@ The `cjk` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - cjk_width - lowercase - cjk_bigram diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md index f0a2ac6482..b7725920e3 100644 --- a/_analyzers/language-analyzers/czech.md +++ b/_analyzers/language-analyzers/czech.md @@ -52,7 +52,7 @@ The `czech` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Czech) - keyword diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md index 3f974d5e0f..652aedb3b7 100644 --- a/_analyzers/language-analyzers/danish.md +++ b/_analyzers/language-analyzers/danish.md @@ -52,7 +52,7 @@ The `danish` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Danish) - keyword diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md index e96c05d147..5a4153702d 100644 --- a/_analyzers/language-analyzers/dutch.md +++ b/_analyzers/language-analyzers/dutch.md @@ -52,7 +52,7 @@ The `dutch` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Dutch) - keyword diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md index 4c3dff5bbc..e266681030 100644 --- a/_analyzers/language-analyzers/english.md +++ b/_analyzers/language-analyzers/english.md @@ -52,7 +52,7 @@ The `english` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - stemmer (possessive_english) - 
lowercase - stop (English) diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md index 6b5afa2271..d67c88d3b2 100644 --- a/_analyzers/language-analyzers/estonian.md +++ b/_analyzers/language-analyzers/estonian.md @@ -52,7 +52,7 @@ The `estonian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Estonian) - keyword diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md index ccc1534b2f..73c4eade5e 100644 --- a/_analyzers/language-analyzers/finnish.md +++ b/_analyzers/language-analyzers/finnish.md @@ -52,7 +52,7 @@ The `finnish` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Finnish) - keyword diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md index 730a2066d4..574be37ab5 100644 --- a/_analyzers/language-analyzers/french.md +++ b/_analyzers/language-analyzers/french.md @@ -52,7 +52,7 @@ The `french` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - elision (French) - lowercase - stop (French) diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md index e0f833e13d..75c789f1c2 100644 --- a/_analyzers/language-analyzers/galician.md +++ b/_analyzers/language-analyzers/galician.md @@ -52,7 +52,7 @@ The `galician` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (French) - keyword diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md index 3076fea57c..ed9bb19229 100644 --- a/_analyzers/language-analyzers/german.md +++ b/_analyzers/language-analyzers/german.md @@ -52,7 +52,7 @@ The `german` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (German) - keyword diff --git 
a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md index 01735581ca..94b9e5dddb 100644 --- a/_analyzers/language-analyzers/greek.md +++ b/_analyzers/language-analyzers/greek.md @@ -52,7 +52,7 @@ The `greek` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Greek) - keyword diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md index b2812edd49..14964bfa4d 100644 --- a/_analyzers/language-analyzers/hindi.md +++ b/_analyzers/language-analyzers/hindi.md @@ -52,7 +52,7 @@ The `hindi` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - decimal_digit - keyword diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md index 7e32ead084..f1851edf95 100644 --- a/_analyzers/language-analyzers/hungarian.md +++ b/_analyzers/language-analyzers/hungarian.md @@ -52,7 +52,7 @@ The `hungarian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Hungarian) - keyword diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md index b4b567c588..feeef6254e 100644 --- a/_analyzers/language-analyzers/indonesian.md +++ b/_analyzers/language-analyzers/indonesian.md @@ -52,7 +52,7 @@ The `indonesian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Indonesian) - keyword diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md index 03fde20c3c..b914ba6b21 100644 --- a/_analyzers/language-analyzers/irish.md +++ b/_analyzers/language-analyzers/irish.md @@ -52,7 +52,7 @@ The `irish` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - hyphenation (Irish) - elision (Irish) - lowercase (Irish) diff --git 
a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md index 636f58fcc8..11113635ca 100644 --- a/_analyzers/language-analyzers/italian.md +++ b/_analyzers/language-analyzers/italian.md @@ -52,7 +52,7 @@ The `italian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - elision (Italian) - lowercase - stop (Italian) diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md index ecdc4b2f51..820cb252b8 100644 --- a/_analyzers/language-analyzers/latvian.md +++ b/_analyzers/language-analyzers/latvian.md @@ -52,7 +52,7 @@ The `latvian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Latvian) - keyword diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md index 123e01139e..55a94c9c1e 100644 --- a/_analyzers/language-analyzers/lithuanian.md +++ b/_analyzers/language-analyzers/lithuanian.md @@ -52,7 +52,7 @@ The `lithuanian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Lithuanian) - keyword diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md index 33d8e01f7f..92fbd9231e 100644 --- a/_analyzers/language-analyzers/norwegian.md +++ b/_analyzers/language-analyzers/norwegian.md @@ -52,7 +52,7 @@ The `norwegian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Token filters: - lowercase - stop (Norwegian) - keyword diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md index 5693b9e045..57ea1ea796 100644 --- a/_analyzers/language-analyzers/persian.md +++ b/_analyzers/language-analyzers/persian.md @@ -52,7 +52,9 @@ The `persian` analyzer is build using the following: Tokenizer: `standard` -Token Filters: +Char filter: `mapping` + +Token filters: - lowercase - decimal_digit 
- normalization (Arabic) diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md new file mode 100644 index 0000000000..eb7b959c0b --- /dev/null +++ b/_analyzers/language-analyzers/portuguese.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Portuguese +parent: Language analyzers +grand_parent: Analyzers +nav_order: 260 +--- + +# Portuguese analyzer + +The built-in `portuguese` analyzer can be applied to a text field using the following command: + +```json +PUT /portuguese-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_portuguese_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_portuguese_analyzer": { + "type": "portuguese", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Portuguese analyzer internals + +The `portuguese` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Portuguese) +- keyword +- stemmer (Portuguese) + +## Custom Portuguese analyzer + +You can create custom Portuguese analyzer using the following command: + +```json +PUT /portuguese-index +{ + "settings": { + "analysis": { + "filter": { + "portuguese_stop": { + "type": "stop", + "stopwords": "_portuguese_" + }, + "portuguese_stemmer": { + "type": "stemmer", + "language": "light_portuguese" + }, + "portuguese_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "portuguese_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "portuguese_stop", + "portuguese_keywords", + "portuguese_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + 
"analyzer": "portuguese_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /portuguese-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudam nas universidades brasileiras. Seus números são 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudant", + "start_offset": 3, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "estudam", + "start_offset": 14, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 26, + "end_offset": 39, + "type": "", + "position": 4 + }, + { + "token": "brasileir", + "start_offset": 40, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 58, + "end_offset": 65, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 70, + "end_offset": 76, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md new file mode 100644 index 0000000000..9b5c909665 --- /dev/null +++ b/_analyzers/language-analyzers/romanian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Romanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 270 +--- + +# Romanian analyzer + +The built-in `romanian` analyzer can be applied to a text field using the following command: + +```json +PUT /romanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_romanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + 
"stem_exclusion_romanian_analyzer": { + "type": "romanian", + "stem_exclusion": ["autoritate", "aprobat"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Romanian analyzer internals + +The `romanian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Romanian) +- keyword +- stemmer (Romanian) + +## Custom Romanian analyzer + +You can create custom Romanian analyzer using the following command: + +```json +PUT /romanian-index +{ + "settings": { + "analysis": { + "filter": { + "romanian_stop": { + "type": "stop", + "stopwords": "_romanian_" + }, + "romanian_stemmer": { + "type": "stemmer", + "language": "romanian" + }, + "romanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "romanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "romanian_stop", + "romanian_keywords", + "romanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /romanian-index/_analyze +{ + "field": "content", + "text": "Studenții învață la universitățile din România. Numerele lor sunt 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "studenț", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "învaț", + "start_offset": 10, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "universităț", + "start_offset": 20, + "end_offset": 34, + "type": "", + "position": 3 + }, + { + "token": "român", + "start_offset": 39, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 48, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 66, + "end_offset": 72, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md new file mode 100644 index 0000000000..9552bce9da --- /dev/null +++ b/_analyzers/language-analyzers/russian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Russian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 280 +--- + +# Russian analyzer + +The built-in `russian` analyzer can be applied to a text field using the following command: + +```json +PUT /russian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_russian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_russian_analyzer": { + "type": "russian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Russian analyzer internals + +The `russian` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Russian) +- keyword +- stemmer (Russian) + +## Custom Russian 
analyzer + +You can create custom Russian analyzer using the following command: + +```json +PUT /russian-index +{ + "settings": { + "analysis": { + "filter": { + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "russian_stemmer": { + "type": "stemmer", + "language": "russian" + }, + "russian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "russian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "russian_stop", + "russian_keywords", + "russian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /russian-index/_analyze +{ + "field": "content", + "text": "Студенты учатся в университетах России. Их номера 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "студент", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "учат", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "университет", + "start_offset": 18, + "end_offset": 31, + "type": "", + "position": 3 + }, + { + "token": "росс", + "start_offset": 32, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "номер", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 7 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md new file mode 100644 index 0000000000..df44ea5f27 --- /dev/null +++ b/_analyzers/language-analyzers/sorani.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Sorani +parent: 
Language analyzers +grand_parent: Analyzers +nav_order: 290 +--- + +# Sorani analyzer + +The built-in `sorani` analyzer can be applied to a text field using the following command: + +```json +PUT /sorani-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_sorani_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_sorani_analyzer": { + "type": "sorani", + "stem_exclusion": ["مؤسسه", "اجازه"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Sorani analyzer internals + +The `sorani` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- normalization (Sorani) +- lowercase +- decimal_digit +- stop (Sorani) +- keyword +- stemmer (Sorani) + +## Custom Sorani analyzer + +You can create custom Sorani analyzer using the following command: + +```json +PUT /sorani-index +{ + "settings": { + "analysis": { + "filter": { + "sorani_stop": { + "type": "stop", + "stopwords": "_sorani_" + }, + "sorani_stemmer": { + "type": "stemmer", + "language": "sorani" + }, + "sorani_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "sorani_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "sorani_stop", + "sorani_keywords", + "sorani_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /sorani-index/_analyze +{ + "field": "content", + "text": "خوێندنی فەرمی لە هەولێرەوە. ژمارەکان ١٢٣٤٥٦." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "خوێندن", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "فەرم", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "هەولێر", + "start_offset": 17, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "ژمار", + "start_offset": 28, + "end_offset": 36, + "type": "", + "position": 4 + }, + { + "token": "123456", + "start_offset": 37, + "end_offset": 43, + "type": "", + "position": 5 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md new file mode 100644 index 0000000000..98ded27b83 --- /dev/null +++ b/_analyzers/language-analyzers/spanish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Spanish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 300 +--- + +# Spanish analyzer + +The built-in `spanish` analyzer can be applied to a text field using the following command: + +```json +PUT /spanish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_spanish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_spanish_analyzer": { + "type": "spanish", + "stem_exclusion": ["autoridad", "aprobación"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Spanish analyzer internals + +The `spanish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Spanish) +- keyword +- stemmer (Spanish) + +## Custom Spanish analyzer + +You can create custom Spanish analyzer using the following command: + +```json +PUT /spanish-index 
+{ + "settings": { + "analysis": { + "filter": { + "spanish_stop": { + "type": "stop", + "stopwords": "_spanish_" + }, + "spanish_stemmer": { + "type": "stemmer", + "language": "light_spanish" + }, + "spanish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "spanish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "spanish_stop", + "spanish_keywords", + "spanish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /spanish-index/_analyze +{ + "field": "content", + "text": "Los estudiantes estudian en universidades españolas. Sus números son 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudiant", + "start_offset": 4, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "estudian", + "start_offset": 16, + "end_offset": 24, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 28, + "end_offset": 41, + "type": "", + "position": 4 + }, + { + "token": "español", + "start_offset": 42, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 57, + "end_offset": 64, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 69, + "end_offset": 75, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md new file mode 100644 index 0000000000..67decf2344 --- /dev/null +++ b/_analyzers/language-analyzers/swedish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Swedish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 310 +--- + +# 
Swedish analyzer + +The built-in `swedish` analyzer can be applied to a text field using the following command: + +```json +PUT /swedish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_swedish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_swedish_analyzer": { + "type": "swedish", + "stem_exclusion": ["myndighet", "godkännande"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Swedish analyzer internals + +The `swedish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- lowercase +- stop (Swedish) +- keyword +- stemmer (Swedish) + +## Custom Swedish analyzer + +You can create custom Swedish analyzer using the following command: + +```json +PUT /swedish-index +{ + "settings": { + "analysis": { + "filter": { + "swedish_stop": { + "type": "stop", + "stopwords": "_swedish_" + }, + "swedish_stemmer": { + "type": "stemmer", + "language": "swedish" + }, + "swedish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "swedish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "swedish_stop", + "swedish_keywords", + "swedish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /swedish-index/_analyze +{ + "field": "content", + "text": "Studenter studerar vid svenska universitet. Deras nummer är 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "studer", + "start_offset": 10, + "end_offset": 18, + "type": "", + "position": 1 + }, + { + "token": "svensk", + "start_offset": 23, + "end_offset": 30, + "type": "", + "position": 3 + }, + { + "token": "universitet", + "start_offset": 31, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "numm", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md new file mode 100644 index 0000000000..f251067dc0 --- /dev/null +++ b/_analyzers/language-analyzers/thai.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Thai +parent: Language analyzers +grand_parent: Analyzers +nav_order: 320 +--- + +# Thai analyzer + +The built-in `thai` analyzer can be applied to a text field using the following command: + +```json +PUT /thai-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_thai_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_thai_analyzer": { + "type": "thai", + "stem_exclusion": ["อำนาจ", "การอนุมัติ"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Thai analyzer internals + +The `thai` analyzer is build using the following: + +Tokenizer: `thai` + +Token Filters: +- lowercase +- decimal_digit +- stop (Thai) +- keyword + +## Custom Thai analyzer + +You can create custom Thai analyzer using the 
following command: + +```json +PUT /thai-index +{ + "settings": { + "analysis": { + "filter": { + "thai_stop": { + "type": "stop", + "stopwords": "_thai_" + }, + "thai_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "thai_analyzer": { + "tokenizer": "thai", + "filter": [ + "lowercase", + "decimal_digit", + "thai_stop", + "thai_keywords" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai-index/_analyze +{ + "field": "content", + "text": "นักเรียนกำลังศึกษาอยู่ที่มหาวิทยาลัยไทย หมายเลข 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "นักเรียน","start_offset": 0,"end_offset": 8,"type": "word","position": 0}, + {"token": "กำลัง","start_offset": 8,"end_offset": 13,"type": "word","position": 1}, + {"token": "ศึกษา","start_offset": 13,"end_offset": 18,"type": "word","position": 2}, + {"token": "มหาวิทยาลัย","start_offset": 25,"end_offset": 36,"type": "word","position": 5}, + {"token": "ไทย","start_offset": 36,"end_offset": 39,"type": "word","position": 6}, + {"token": "หมายเลข","start_offset": 40,"end_offset": 47,"type": "word","position": 7}, + {"token": "123456","start_offset": 48,"end_offset": 54,"type": "word","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md new file mode 100644 index 0000000000..9255682322 --- /dev/null +++ b/_analyzers/language-analyzers/turkish.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Turkish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 330 +--- + +# Turkish analyzer + +The built-in `turkish` analyzer can be applied to a text field using the following 
command: + +```json +PUT /turkish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can also use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_turkish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_turkish_analyzer": { + "type": "turkish", + "stem_exclusion": ["otorite", "onay"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Turkish analyzer internals + +The `turkish` analyzer is build using the following: + +Tokenizer: `standard` + +Token Filters: +- apostrophe +- lowercase (Turkish) +- stop (Turkish) +- keyword +- stemmer (Turkish) + +## Custom Turkish analyzer + +You can create custom Turkish analyzer using the following command: + +```json +PUT /turkish-index +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "turkish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /turkish-index/_analyze +{ + "field": "content", + "text": "Öğrenciler Türk üniversitelerinde öğrenim görüyor. Numara 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "öğrenci","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "türk","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "üniversite","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "öğre","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "görüyor","start_offset": 42,"end_offset": 49,"type": "","position": 4}, + {"token": "numar","start_offset": 51,"end_offset": 57,"type": "","position": 5}, + {"token": "123456","start_offset": 58,"end_offset": 64,"type": "","position": 6} + ] +} +``` \ No newline at end of file From 3fc50b1498325c351f6c416654703970ecb6122b Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 7 Nov 2024 11:38:06 +0000 Subject: [PATCH 10/15] Apply suggestions from code review Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: AntonEliatra --- _analyzers/language-analyzers/arabic.md | 10 +++++----- _analyzers/language-analyzers/armenian.md | 2 +- _analyzers/language-analyzers/basque.md | 2 +- _analyzers/language-analyzers/bengali.md | 2 +- _analyzers/language-analyzers/brazilian.md | 2 +- _analyzers/language-analyzers/bulgarian.md | 2 +- _analyzers/language-analyzers/catalan.md | 2 +- _analyzers/language-analyzers/cjk.md | 2 +- _analyzers/language-analyzers/czech.md | 2 +- _analyzers/language-analyzers/danish.md | 2 +- _analyzers/language-analyzers/dutch.md | 2 +- _analyzers/language-analyzers/english.md | 2 +- _analyzers/language-analyzers/estonian.md | 2 +- _analyzers/language-analyzers/finnish.md | 2 +- _analyzers/language-analyzers/french.md | 2 +- _analyzers/language-analyzers/galician.md | 2 +- _analyzers/language-analyzers/german.md | 2 +- _analyzers/language-analyzers/greek.md | 2 +- _analyzers/language-analyzers/hindi.md | 2 +- _analyzers/language-analyzers/hungarian.md | 2 +- 
_analyzers/language-analyzers/index.md | 14 +++++++------- _analyzers/language-analyzers/indonesian.md | 2 +- _analyzers/language-analyzers/irish.md | 2 +- _analyzers/language-analyzers/italian.md | 2 +- _analyzers/language-analyzers/latvian.md | 2 +- _analyzers/language-analyzers/lithuanian.md | 2 +- _analyzers/language-analyzers/norwegian.md | 2 +- _analyzers/language-analyzers/persian.md | 2 +- _analyzers/language-analyzers/portuguese.md | 2 +- _analyzers/language-analyzers/romanian.md | 2 +- _analyzers/language-analyzers/russian.md | 2 +- _analyzers/language-analyzers/sorani.md | 2 +- _analyzers/language-analyzers/spanish.md | 2 +- _analyzers/language-analyzers/swedish.md | 2 +- _analyzers/language-analyzers/thai.md | 2 +- _analyzers/language-analyzers/turkish.md | 2 +- 36 files changed, 46 insertions(+), 46 deletions(-) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 2bbfb81140..64671d3b0c 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -27,7 +27,7 @@ PUT /arabic-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_arabic @@ -48,11 +48,11 @@ PUT index_with_stem_exclusion_arabic ## Arabic analyzer internals -The `arabic` analyzer is build using the following: +The `arabic` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: +- Token filters: - lowercase - decimal_digit - stop (Arabic) @@ -62,7 +62,7 @@ Token filters: ## Custom Arabic analyzer -You can create custom Arabic analyzer using the following command: +You can create a custom Arabic analyzer using the following command: ```json PUT /arabic-index diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index 
9355a49d05..38810533e1 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -27,7 +27,7 @@ PUT /arabic-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_armenian_analyzer diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index ada0b95cf5..47e71b43e3 100644 --- a/_analyzers/language-analyzers/basque.md +++ b/_analyzers/language-analyzers/basque.md @@ -27,7 +27,7 @@ PUT /basque-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_basque_analyzer diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index ec3f7f0ac5..8107ce7dfe 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -27,7 +27,7 @@ PUT /bengali-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_bengali_analyzer diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index 3e6eb3f89d..925e559ac0 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -27,7 +27,7 @@ PUT /brazilian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_brazilian_analyzer diff --git 
a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md index 682430717f..496ac086b7 100644 --- a/_analyzers/language-analyzers/bulgarian.md +++ b/_analyzers/language-analyzers/bulgarian.md @@ -27,7 +27,7 @@ PUT /bulgarian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_bulgarian_analyzer diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md index 7a2c2e690b..d6a9b1c8b2 100644 --- a/_analyzers/language-analyzers/catalan.md +++ b/_analyzers/language-analyzers/catalan.md @@ -27,7 +27,7 @@ PUT /catalan-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_catalan_analyzer diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md index 8547a3156f..31dc917e99 100644 --- a/_analyzers/language-analyzers/cjk.md +++ b/_analyzers/language-analyzers/cjk.md @@ -27,7 +27,7 @@ PUT /cjk-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_cjk_analyzer diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md index b7725920e3..3c1fbd9c9c 100644 --- a/_analyzers/language-analyzers/czech.md +++ b/_analyzers/language-analyzers/czech.md @@ -27,7 +27,7 @@ PUT /czech-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT 
index_with_stem_exclusion_czech_analyzer diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md index 652aedb3b7..c9ccc0c01e 100644 --- a/_analyzers/language-analyzers/danish.md +++ b/_analyzers/language-analyzers/danish.md @@ -27,7 +27,7 @@ PUT /danish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_danish_analyzer diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md index 5a4153702d..90ce69fa09 100644 --- a/_analyzers/language-analyzers/dutch.md +++ b/_analyzers/language-analyzers/dutch.md @@ -27,7 +27,7 @@ PUT /dutch-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_dutch_analyzer diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md index e266681030..fda095b912 100644 --- a/_analyzers/language-analyzers/english.md +++ b/_analyzers/language-analyzers/english.md @@ -27,7 +27,7 @@ PUT /english-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_english_analyzer diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md index d67c88d3b2..01961c2d46 100644 --- a/_analyzers/language-analyzers/estonian.md +++ b/_analyzers/language-analyzers/estonian.md @@ -27,7 +27,7 @@ PUT /estonian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language 
analyzer using the following command: ```json PUT index_with_stem_exclusion_estonian_analyzer diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md index 73c4eade5e..3ac753e5ea 100644 --- a/_analyzers/language-analyzers/finnish.md +++ b/_analyzers/language-analyzers/finnish.md @@ -27,7 +27,7 @@ PUT /finnish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_finnish_analyzer diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md index 574be37ab5..278bfbb333 100644 --- a/_analyzers/language-analyzers/french.md +++ b/_analyzers/language-analyzers/french.md @@ -27,7 +27,7 @@ PUT /french-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_french_analyzer diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md index 75c789f1c2..515717bf3f 100644 --- a/_analyzers/language-analyzers/galician.md +++ b/_analyzers/language-analyzers/galician.md @@ -27,7 +27,7 @@ PUT /galician-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_galician_analyzer diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md index ed9bb19229..1e679aca0a 100644 --- a/_analyzers/language-analyzers/german.md +++ b/_analyzers/language-analyzers/german.md @@ -27,7 +27,7 @@ PUT /german-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the 
following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_german_analyzer diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md index 94b9e5dddb..4b44d7014c 100644 --- a/_analyzers/language-analyzers/greek.md +++ b/_analyzers/language-analyzers/greek.md @@ -27,7 +27,7 @@ PUT /greek-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_greek_analyzer diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md index 14964bfa4d..b1fdabb2b6 100644 --- a/_analyzers/language-analyzers/hindi.md +++ b/_analyzers/language-analyzers/hindi.md @@ -27,7 +27,7 @@ PUT /hindi-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_hindi_analyzer diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md index f1851edf95..83330eb708 100644 --- a/_analyzers/language-analyzers/hungarian.md +++ b/_analyzers/language-analyzers/hungarian.md @@ -27,7 +27,7 @@ PUT /hungarian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_hungarian_analyzer diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index c69337f3a9..639ff28502 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -41,13 +41,13 @@ PUT my-index } ``` -## stem_exclusion +## Stem exclusion -The 
`stem_exclusion` feature can be applied to many language analyzers by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. +You can apply stem exclusion to many language analyzers by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. -## Example stem_exclusion +## Stem exclusion example -You can use the following command to configure `stem_exclusion`: +Use the following request to configure `stem_exclusion`: ```json PUT index_with_stem_exclusion_english_analyzer @@ -66,7 +66,7 @@ PUT index_with_stem_exclusion_english_analyzer ``` {% include copy-curl.html %} -Following languages support `stem_exclusion`: +The following languages support stem exclusion: - arabic - armenian @@ -101,9 +101,9 @@ Following languages support `stem_exclusion`: - turkish -## stem_exclusion with custom analyzer +## Stem exclusion with custom analyzers -All language analyzers are made up from tokenizers and token filters specific to the particular language. If you want to implement a custom version of the language analyzer with `stem_exclusion`, you need to configure `keyword_marker` token filter and list the necessary words in `keywords` parameter, see the following example: +All language analyzers consist of tokenizers and token filters specific to the particular language. 
If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter: ```json PUT index_with_keyword_marker_analyzer diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md index feeef6254e..73b551cd9a 100644 --- a/_analyzers/language-analyzers/indonesian.md +++ b/_analyzers/language-analyzers/indonesian.md @@ -27,7 +27,7 @@ PUT /indonesian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_indonesian_analyzer diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md index b914ba6b21..b4e25e57c8 100644 --- a/_analyzers/language-analyzers/irish.md +++ b/_analyzers/language-analyzers/irish.md @@ -27,7 +27,7 @@ PUT /irish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_irish_analyzer diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md index 11113635ca..1fc1063efd 100644 --- a/_analyzers/language-analyzers/italian.md +++ b/_analyzers/language-analyzers/italian.md @@ -27,7 +27,7 @@ PUT /italian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_italian_analyzer diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md index 820cb252b8..620f694c23 100644 --- a/_analyzers/language-analyzers/latvian.md +++ 
b/_analyzers/language-analyzers/latvian.md @@ -27,7 +27,7 @@ PUT /latvian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_latvian_analyzer diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md index 55a94c9c1e..6d67dc2262 100644 --- a/_analyzers/language-analyzers/lithuanian.md +++ b/_analyzers/language-analyzers/lithuanian.md @@ -27,7 +27,7 @@ PUT /lithuanian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_lithuanian_analyzer diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md index 92fbd9231e..5a00a27924 100644 --- a/_analyzers/language-analyzers/norwegian.md +++ b/_analyzers/language-analyzers/norwegian.md @@ -27,7 +27,7 @@ PUT /norwegian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_norwegian_analyzer diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md index 57ea1ea796..1a335dd483 100644 --- a/_analyzers/language-analyzers/persian.md +++ b/_analyzers/language-analyzers/persian.md @@ -27,7 +27,7 @@ PUT /persian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_persian_analyzer diff --git a/_analyzers/language-analyzers/portuguese.md 
b/_analyzers/language-analyzers/portuguese.md index eb7b959c0b..301d043f7d 100644 --- a/_analyzers/language-analyzers/portuguese.md +++ b/_analyzers/language-analyzers/portuguese.md @@ -27,7 +27,7 @@ PUT /portuguese-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_portuguese_analyzer diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md index 9b5c909665..6795d01a4d 100644 --- a/_analyzers/language-analyzers/romanian.md +++ b/_analyzers/language-analyzers/romanian.md @@ -27,7 +27,7 @@ PUT /romanian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_romanian_analyzer diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md index 9552bce9da..3a305ee051 100644 --- a/_analyzers/language-analyzers/russian.md +++ b/_analyzers/language-analyzers/russian.md @@ -27,7 +27,7 @@ PUT /russian-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_russian_analyzer diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md index df44ea5f27..760b7e46c6 100644 --- a/_analyzers/language-analyzers/sorani.md +++ b/_analyzers/language-analyzers/sorani.md @@ -27,7 +27,7 @@ PUT /sorani-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT 
index_with_stem_exclusion_sorani_analyzer diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md index 98ded27b83..a20d0fa509 100644 --- a/_analyzers/language-analyzers/spanish.md +++ b/_analyzers/language-analyzers/spanish.md @@ -27,7 +27,7 @@ PUT /spanish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_spanish_analyzer diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md index 67decf2344..f70a0dbca1 100644 --- a/_analyzers/language-analyzers/swedish.md +++ b/_analyzers/language-analyzers/swedish.md @@ -27,7 +27,7 @@ PUT /swedish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_swedish_analyzer diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md index f251067dc0..78c3d1250d 100644 --- a/_analyzers/language-analyzers/thai.md +++ b/_analyzers/language-analyzers/thai.md @@ -27,7 +27,7 @@ PUT /thai-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language analyzer using the following command: ```json PUT index_with_stem_exclusion_thai_analyzer diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md index 9255682322..14a6f8e9b6 100644 --- a/_analyzers/language-analyzers/turkish.md +++ b/_analyzers/language-analyzers/turkish.md @@ -27,7 +27,7 @@ PUT /turkish-index ## Stem exclusion -You can also use `stem_exclusion` with this language analyzer using the following command: +You can use `stem_exclusion` with this language 
analyzer using the following command: ```json PUT index_with_stem_exclusion_turkish_analyzer From 26fbb9b5a7f156e3eb0f403b6dd04e05fa4570fa Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 7 Nov 2024 12:34:36 +0000 Subject: [PATCH 11/15] updating as per pr review Signed-off-by: Anton Rubin --- _analyzers/language-analyzers/arabic.md | 12 ++-- _analyzers/language-analyzers/armenian.md | 14 ++-- _analyzers/language-analyzers/basque.md | 14 ++-- _analyzers/language-analyzers/bengali.md | 24 +++---- _analyzers/language-analyzers/brazilian.md | 14 ++-- _analyzers/language-analyzers/bulgarian.md | 14 ++-- _analyzers/language-analyzers/catalan.md | 16 ++--- _analyzers/language-analyzers/cjk.md | 14 ++-- _analyzers/language-analyzers/czech.md | 14 ++-- _analyzers/language-analyzers/danish.md | 14 ++-- _analyzers/language-analyzers/dutch.md | 16 ++--- _analyzers/language-analyzers/english.md | 16 ++--- _analyzers/language-analyzers/estonian.md | 14 ++-- _analyzers/language-analyzers/finnish.md | 14 ++-- _analyzers/language-analyzers/french.md | 16 ++--- _analyzers/language-analyzers/galician.md | 14 ++-- _analyzers/language-analyzers/german.md | 16 ++--- _analyzers/language-analyzers/greek.md | 14 ++-- _analyzers/language-analyzers/hindi.md | 24 +++---- _analyzers/language-analyzers/hungarian.md | 14 ++-- _analyzers/language-analyzers/index.md | 76 ++++++++++----------- _analyzers/language-analyzers/indonesian.md | 14 ++-- _analyzers/language-analyzers/irish.md | 18 ++--- _analyzers/language-analyzers/italian.md | 16 ++--- _analyzers/language-analyzers/latvian.md | 14 ++-- _analyzers/language-analyzers/lithuanian.md | 14 ++-- _analyzers/language-analyzers/norwegian.md | 14 ++-- _analyzers/language-analyzers/persian.md | 20 +++--- _analyzers/language-analyzers/portuguese.md | 14 ++-- _analyzers/language-analyzers/romanian.md | 14 ++-- _analyzers/language-analyzers/russian.md | 14 ++-- _analyzers/language-analyzers/sorani.md | 18 ++--- 
_analyzers/language-analyzers/spanish.md | 14 ++-- _analyzers/language-analyzers/swedish.md | 14 ++-- _analyzers/language-analyzers/thai.md | 14 ++-- _analyzers/language-analyzers/turkish.md | 16 ++--- 36 files changed, 306 insertions(+), 306 deletions(-) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index 64671d3b0c..b6508827ff 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -53,12 +53,12 @@ The `arabic` analyzer is built using the following components: - Tokenizer: `standard` - Token filters: -- lowercase -- decimal_digit -- stop (Arabic) -- normalization (Arabic) -- keywords -- stemmer (Arabic) + - lowercase + - decimal_digit + - stop (Arabic) + - normalization (Arabic) + - keywords + - stemmer (Arabic) ## Custom Arabic analyzer diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index 38810533e1..1338fd38ed 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_armenian_analyzer ## Armenian analyzer internals -The `armenian` analyzer is build using the following: +The `armenian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Armenian) -- keywords -- stemmer (Armenian) +- Token filters: + - lowercase + - stop (Armenian) + - keywords + - stemmer (Armenian) ## Custom Armenian analyzer diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index 47e71b43e3..6613bc343b 100644 --- a/_analyzers/language-analyzers/basque.md +++ b/_analyzers/language-analyzers/basque.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_basque_analyzer ## Basque analyzer internals -The `basque` analyzer is build using the following: +The `basque` analyzer is built using the following components: -Tokenizer: `standard` 
+- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Basque) -- keywords -- stemmer (Basque) +- Token filters: + - lowercase + - stop (Basque) + - keywords + - stemmer (Basque) ## Custom Basque analyzer diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index 8107ce7dfe..e1c53fd387 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -48,18 +48,18 @@ PUT index_with_stem_exclusion_bengali_analyzer ## Bengali analyzer internals -The `bengali` analyzer is build using the following: - -Tokenizer: `standard` - -Token filters: -- lowercase -- decimal_digit -- indic_normalization -- normalization (Bengali) -- stop (Bengali) -- keywords -- stemmer (Bengali) +The `bengali` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - indic_normalization + - normalization (Bengali) + - stop (Bengali) + - keywords + - stemmer (Bengali) ## Custom Bengali analyzer diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index 925e559ac0..eae04b03d4 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_brazilian_analyzer ## Brazilian analyzer internals -The `brazilian` analyzer is build using the following: +The `brazilian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Brazilian) -- keywords -- stemmer (Brazilian) +- Token filters: + - lowercase + - stop (Brazilian) + - keywords + - stemmer (Brazilian) ## Custom Brazilian analyzer diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md index 496ac086b7..0ac726cba2 100644 --- a/_analyzers/language-analyzers/bulgarian.md +++ b/_analyzers/language-analyzers/bulgarian.md @@ -48,15 
+48,15 @@ PUT index_with_stem_exclusion_bulgarian_analyzer ## Bulgarian analyzer internals -The `bulgarian` analyzer is build using the following: +The `bulgarian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Bulgarian) -- keywords -- stemmer (Bulgarian) +- Token filters: + - lowercase + - stop (Bulgarian) + - keywords + - stemmer (Bulgarian) ## Custom Bulgarian analyzer diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md index d6a9b1c8b2..4727aed9a2 100644 --- a/_analyzers/language-analyzers/catalan.md +++ b/_analyzers/language-analyzers/catalan.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_catalan_analyzer ## Catalan analyzer internals -The `catalan` analyzer is build using the following: +The `catalan` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- elision (Catalan) -- lowercase -- stop (Catalan) -- keywords -- stemmer (Catalan) +- Token filters: + - elision (Catalan) + - lowercase + - stop (Catalan) + - keywords + - stemmer (Catalan) ## Custom Catalan analyzer diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md index 31dc917e99..3968113e6e 100644 --- a/_analyzers/language-analyzers/cjk.md +++ b/_analyzers/language-analyzers/cjk.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_cjk_analyzer ## CJK analyzer internals -The `cjk` analyzer is build using the following: +The `cjk` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- cjk_width -- lowercase -- cjk_bigram -- stop (similar to English) +- Token filters: + - cjk_width + - lowercase + - cjk_bigram + - stop (similar to English) ## Custom CJK analyzer diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md index 3c1fbd9c9c..12381472a5 100644 --- 
a/_analyzers/language-analyzers/czech.md +++ b/_analyzers/language-analyzers/czech.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_czech_analyzer ## Czech analyzer internals -The `czech` analyzer is build using the following: +The `czech` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Czech) -- keyword -- stemmer (Czech) +- Token filters: + - lowercase + - stop (Czech) + - keyword + - stemmer (Czech) ## Custom Czech analyzer diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md index c9ccc0c01e..7a5e53f11f 100644 --- a/_analyzers/language-analyzers/danish.md +++ b/_analyzers/language-analyzers/danish.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_danish_analyzer ## Danish analyzer internals -The `danish` analyzer is build using the following: +The `danish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Danish) -- keyword -- stemmer (Danish) +- Token filters: + - lowercase + - stop (Danish) + - keyword + - stemmer (Danish) ## Custom Danish analyzer diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md index 90ce69fa09..334a93f5b0 100644 --- a/_analyzers/language-analyzers/dutch.md +++ b/_analyzers/language-analyzers/dutch.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_dutch_analyzer ## Dutch analyzer internals -The `dutch` analyzer is build using the following: +The `dutch` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Dutch) -- keyword -- stemmer_override -- stemmer (Dutch) +- Token filters: + - lowercase + - stop (Dutch) + - keyword + - stemmer_override + - stemmer (Dutch) ## Custom Dutch analyzer diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md index 
fda095b912..46a6a20961 100644 --- a/_analyzers/language-analyzers/english.md +++ b/_analyzers/language-analyzers/english.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_english_analyzer ## English analyzer internals -The `english` analyzer is build using the following: +The `english` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- stemmer (possessive_english) -- lowercase -- stop (English) -- keyword -- stemmer (English) +- Token filters: + - stemmer (possessive_english) + - lowercase + - stop (English) + - keyword + - stemmer (English) ## Custom English analyzer diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md index 01961c2d46..49411ddf96 100644 --- a/_analyzers/language-analyzers/estonian.md +++ b/_analyzers/language-analyzers/estonian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_estonian_analyzer ## Estonian analyzer internals -The `estonian` analyzer is build using the following: +The `estonian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Estonian) -- keyword -- stemmer (Estonian) +- Token filters: + - lowercase + - stop (Estonian) + - keyword + - stemmer (Estonian) ## Custom Estonian analyzer diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md index 3ac753e5ea..f39a53adf9 100644 --- a/_analyzers/language-analyzers/finnish.md +++ b/_analyzers/language-analyzers/finnish.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_finnish_analyzer ## Finnish analyzer internals -The `finnish` analyzer is build using the following: +The `finnish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Finnish) -- keyword -- stemmer (Finnish) +- Token filters: + - lowercase + - stop (Finnish) + - keyword + - stemmer (Finnish) ## 
Custom Finnish analyzer diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md index 278bfbb333..fd1c9e7687 100644 --- a/_analyzers/language-analyzers/french.md +++ b/_analyzers/language-analyzers/french.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_french_analyzer ## French analyzer internals -The `french` analyzer is build using the following: +The `french` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- elision (French) -- lowercase -- stop (French) -- keyword -- stemmer (French) +- Token filters: + - elision (French) + - lowercase + - stop (French) + - keyword + - stemmer (French) ## Custom French analyzer diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md index 515717bf3f..d4fd176b87 100644 --- a/_analyzers/language-analyzers/galician.md +++ b/_analyzers/language-analyzers/galician.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_galician_analyzer ## Galician analyzer internals -The `galician` analyzer is build using the following: +The `galician` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (French) -- keyword -- stemmer (French) +- Token filters: + - lowercase + - stop (Galician) + - keyword + - stemmer (Galician) ## Custom Galician analyzer diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md index 1e679aca0a..d6859381e9 100644 --- a/_analyzers/language-analyzers/german.md +++ b/_analyzers/language-analyzers/german.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_german_analyzer ## German analyzer internals -The `german` analyzer is build using the following: +The `german` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (German) -- keyword -- normalization (German) -- stemmer 
(German) +- Token filters: + - lowercase + - stop (German) + - keyword + - normalization (German) + - stemmer (German) ## Custom German analyzer diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md index 4b44d7014c..fcc1be8c86 100644 --- a/_analyzers/language-analyzers/greek.md +++ b/_analyzers/language-analyzers/greek.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_greek_analyzer ## Greek analyzer internals -The `greek` analyzer is build using the following: +The `greek` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Greek) -- keyword -- stemmer (Greek) +- Token filters: + - lowercase + - stop (Greek) + - keyword + - stemmer (Greek) ## Custom Greek analyzer diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md index b1fdabb2b6..d9920008b1 100644 --- a/_analyzers/language-analyzers/hindi.md +++ b/_analyzers/language-analyzers/hindi.md @@ -48,18 +48,18 @@ PUT index_with_stem_exclusion_hindi_analyzer ## Hindi analyzer internals -The `hindi` analyzer is build using the following: - -Tokenizer: `standard` - -Token filters: -- lowercase -- decimal_digit -- keyword -- normalization (indic) -- normalization (Hindi) -- stop (Hindi) -- stemmer (Hindi) +The `hindi` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - keyword + - normalization (indic) + - normalization (Hindi) + - stop (Hindi) + - stemmer (Hindi) ## Custom Hindi analyzer diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md index 83330eb708..601b5d3968 100644 --- a/_analyzers/language-analyzers/hungarian.md +++ b/_analyzers/language-analyzers/hungarian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_hungarian_analyzer ## Hungarian analyzer internals -The `hungarian` analyzer is build using the following: +The 
`hungarian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Hungarian) -- keyword -- stemmer (Hungarian) +- Token filters: + - lowercase + - stop (Hungarian) + - keyword + - stemmer (Hungarian) ## Custom Hungarian analyzer diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 639ff28502..afe9d82452 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -4,7 +4,9 @@ title: Language analyzers nav_order: 100 parent: Analyzers has_children: true -has_toc: false +has_toc: true +redirect_from: + - /query-dsl/analyzers/language-analyzers/ --- # Language analyzers @@ -20,14 +22,14 @@ To use the analyzer when you map an index, specify the value within your query. #### Example request -The following query specifies the `french` language analyzer for the index `my-index`: +The following query specifies index `my-index` with `content` field configured as multi-field and sub-field named `french` is configured with `french` language analyzer: ```json PUT my-index { "mappings": { "properties": { - "text": { + "content": { "type": "text", "fields": { "french": { @@ -40,10 +42,42 @@ PUT my-index } } ``` +{% include copy-curl.html %} + +Default `french` analyzer can also be configured for the entire index using the following query: + +```json +PUT my-index +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "french" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} ## Stem exclusion -You can apply stem exclusion to many language analyzers by providing a list of lowercase words that should be excluded from stemming. 
Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. +You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. ## Stem exclusion example @@ -66,40 +100,6 @@ PUT index_with_stem_exclusion_english_analyzer ``` {% include copy-curl.html %} -The following languages support stem exclusion: - -- arabic -- armenian -- basque -- bengali -- brazilian -- bulgarian -- catalan -- cjk -- czech -- danish -- dutch -- english -- finnish -- french -- galician -- german -- hindi -- hungarian -- indonesian -- irish -- italian -- latvian -- lithuanian -- norwegian -- portuguese -- romanian -- russian -- sorani -- spanish -- swedish -- turkish - ## Stem exclusion with custom analyzers diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md index 73b551cd9a..920319082a 100644 --- a/_analyzers/language-analyzers/indonesian.md +++ b/_analyzers/language-analyzers/indonesian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_indonesian_analyzer ## Indonesian analyzer internals -The `indonesian` analyzer is build using the following: +The `indonesian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Indonesian) -- keyword -- stemmer (Indonesian) +- Token filters: + - lowercase + - stop (Indonesian) + - keyword + - stemmer (Indonesian) ## Custom Indonesian analyzer diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md index b4e25e57c8..606a81a10e 100644 --- a/_analyzers/language-analyzers/irish.md +++ b/_analyzers/language-analyzers/irish.md @@ -48,17 +48,17 @@ PUT index_with_stem_exclusion_irish_analyzer ## Irish analyzer internals -The `irish` 
analyzer is build using the following: +The `irish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- hyphenation (Irish) -- elision (Irish) -- lowercase (Irish) -- stop (Irish) -- keyword -- stemmer (Irish) +- Token filters: + - hyphenation (Irish) + - elision (Irish) + - lowercase (Irish) + - stop (Irish) + - keyword + - stemmer (Irish) ## Custom Irish analyzer diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md index 1fc1063efd..6cf423fe67 100644 --- a/_analyzers/language-analyzers/italian.md +++ b/_analyzers/language-analyzers/italian.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_italian_analyzer ## Italian analyzer internals -The `italian` analyzer is build using the following: +The `italian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- elision (Italian) -- lowercase -- stop (Italian) -- keyword -- stemmer (Italian) +- Token filters: + - elision (Italian) + - lowercase + - stop (Italian) + - keyword + - stemmer (Italian) ## Custom Italian analyzer diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md index 620f694c23..edb8c5e95e 100644 --- a/_analyzers/language-analyzers/latvian.md +++ b/_analyzers/language-analyzers/latvian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_latvian_analyzer ## Latvian analyzer internals -The `latvian` analyzer is build using the following: +The `latvian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Latvian) -- keyword -- stemmer (Latvian) +- Token filters: + - lowercase + - stop (Latvian) + - keyword + - stemmer (Latvian) ## Custom Latvian analyzer diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md index 6d67dc2262..7f2da59101 100644 --- 
a/_analyzers/language-analyzers/lithuanian.md +++ b/_analyzers/language-analyzers/lithuanian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_lithuanian_analyzer ## Lithuanian analyzer internals -The `lithuanian` analyzer is build using the following: +The `lithuanian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Lithuanian) -- keyword -- stemmer (Lithuanian) +- Token filters: + - lowercase + - stop (Lithuanian) + - keyword + - stemmer (Lithuanian) ## Custom Lithuanian analyzer diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md index 5a00a27924..171da8ad51 100644 --- a/_analyzers/language-analyzers/norwegian.md +++ b/_analyzers/language-analyzers/norwegian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_norwegian_analyzer ## Norwegian analyzer internals -The `norwegian` analyzer is build using the following: +The `norwegian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token filters: -- lowercase -- stop (Norwegian) -- keyword -- stemmer (Norwegian) +- Token filters: + - lowercase + - stop (Norwegian) + - keyword + - stemmer (Norwegian) ## Custom Norwegian analyzer diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md index 1a335dd483..d6018ccaab 100644 --- a/_analyzers/language-analyzers/persian.md +++ b/_analyzers/language-analyzers/persian.md @@ -48,19 +48,19 @@ PUT index_with_stem_exclusion_persian_analyzer ## Persian analyzer internals -The `persian` analyzer is build using the following: +The `persian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Char filter: `mapping` +- Char filter: `mapping` -Token filters: -- lowercase -- decimal_digit -- normalization (Arabic) -- normalization (Persian) -- keyword -- stemmer (Norwegian) +- Token filters: + - lowercase + - 
decimal_digit + - normalization (Arabic) + - normalization (Persian) + - keyword + - stemmer (Persian) ## Custom Persian analyzer diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md index 301d043f7d..9a752dc4c6 100644 --- a/_analyzers/language-analyzers/portuguese.md +++ b/_analyzers/language-analyzers/portuguese.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_portuguese_analyzer ## Portuguese analyzer internals -The `portuguese` analyzer is build using the following: +The `portuguese` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- lowercase -- stop (Portuguese) -- keyword -- stemmer (Portuguese) +- Token Filters: + - lowercase + - stop (Portuguese) + - keyword + - stemmer (Portuguese) ## Custom Portuguese analyzer diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md index 6795d01a4d..bffe26288b 100644 --- a/_analyzers/language-analyzers/romanian.md +++ b/_analyzers/language-analyzers/romanian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_romanian_analyzer ## Romanian analyzer internals -The `romanian` analyzer is build using the following: +The `romanian` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- lowercase -- stop (Romanian) -- keyword -- stemmer (Romanian) +- Token Filters: + - lowercase + - stop (Romanian) + - keyword + - stemmer (Romanian) ## Custom Romanian analyzer diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md index 3a305ee051..ac9ae0d72a 100644 --- a/_analyzers/language-analyzers/russian.md +++ b/_analyzers/language-analyzers/russian.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_russian_analyzer ## Russian analyzer internals -The `russian` analyzer is build using the following: +The `russian` analyzer is built using the following components: 
-Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- lowercase -- stop (Russian) -- keyword -- stemmer (Russian) +- Token Filters: + - lowercase + - stop (Russian) + - keyword + - stemmer (Russian) ## Custom Russian analyzer diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md index 760b7e46c6..8c31c3ef1e 100644 --- a/_analyzers/language-analyzers/sorani.md +++ b/_analyzers/language-analyzers/sorani.md @@ -48,17 +48,17 @@ PUT index_with_stem_exclusion_sorani_analyzer ## Sorani analyzer internals -The `sorani` analyzer is build using the following: +The `sorani` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- normalization (Sorani) -- lowercase -- decimal_digit -- stop (Sorani) -- keyword -- stemmer (Sorani) +- Token Filters: + - normalization (Sorani) + - lowercase + - decimal_digit + - stop (Sorani) + - keyword + - stemmer (Sorani) ## Custom Sorani analyzer diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md index a20d0fa509..3a1573d291 100644 --- a/_analyzers/language-analyzers/spanish.md +++ b/_analyzers/language-analyzers/spanish.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_spanish_analyzer ## Spanish analyzer internals -The `spanish` analyzer is build using the following: +The `spanish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- lowercase -- stop (Spanish) -- keyword -- stemmer (Spanish) +- Token Filters: + - lowercase + - stop (Spanish) + - keyword + - stemmer (Spanish) ## Custom Spanish analyzer diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md index f70a0dbca1..9aadc9bc60 100644 --- a/_analyzers/language-analyzers/swedish.md +++ b/_analyzers/language-analyzers/swedish.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_swedish_analyzer ## Swedish analyzer 
internals -The `swedish` analyzer is build using the following: +The `swedish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- lowercase -- stop (Swedish) -- keyword -- stemmer (Swedish) +- Token Filters: + - lowercase + - stop (Swedish) + - keyword + - stemmer (Swedish) ## Custom Swedish analyzer diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md index 78c3d1250d..5f33554cf7 100644 --- a/_analyzers/language-analyzers/thai.md +++ b/_analyzers/language-analyzers/thai.md @@ -48,15 +48,15 @@ PUT index_with_stem_exclusion_thai_analyzer ## Thai analyzer internals -The `thai` analyzer is build using the following: +The `thai` analyzer is built using the following components: -Tokenizer: `thai` +- Tokenizer: `thai` -Token Filters: -- lowercase -- decimal_digit -- stop (Thai) -- keyword +- Token Filters: + - lowercase + - decimal_digit + - stop (Thai) + - keyword ## Custom Thai analyzer diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md index 14a6f8e9b6..9e9b31acbc 100644 --- a/_analyzers/language-analyzers/turkish.md +++ b/_analyzers/language-analyzers/turkish.md @@ -48,16 +48,16 @@ PUT index_with_stem_exclusion_turkish_analyzer ## Turkish analyzer internals -The `turkish` analyzer is build using the following: +The `turkish` analyzer is built using the following components: -Tokenizer: `standard` +- Tokenizer: `standard` -Token Filters: -- apostrophe -- lowercase (Turkish) -- stop (Turkish) -- keyword -- stemmer (Turkish) +- Token Filters: + - apostrophe + - lowercase (Turkish) + - stop (Turkish) + - keyword + - stemmer (Turkish) ## Custom Turkish analyzer From 5172a0d9f3254e9601a6a1f70aff13ab66af73c8 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 7 Nov 2024 12:46:20 +0000 Subject: [PATCH 12/15] fixing broken link Signed-off-by: Anton Rubin --- _analyzers/supported-analyzers/index.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md index fef0b4d34e..43e41b8d6a 100644 --- a/_analyzers/supported-analyzers/index.md +++ b/_analyzers/supported-analyzers/index.md @@ -29,7 +29,7 @@ Analyzer | Analysis performed | Analyzer output ## Language analyzers -OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). +OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index). ## Additional analyzers From 719ef66f6b5d0cfbac247dd9a0f79a6389f3b19a Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 12 Nov 2024 16:51:06 +0000 Subject: [PATCH 13/15] Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: AntonEliatra --- _analyzers/language-analyzers/arabic.md | 2 +- _analyzers/language-analyzers/armenian.md | 4 ++-- _analyzers/language-analyzers/basque.md | 4 ++-- _analyzers/language-analyzers/bengali.md | 4 ++-- _analyzers/language-analyzers/brazilian.md | 4 ++-- _analyzers/language-analyzers/bulgarian.md | 4 ++-- _analyzers/language-analyzers/catalan.md | 4 ++-- _analyzers/language-analyzers/cjk.md | 2 +- _analyzers/language-analyzers/czech.md | 2 +- _analyzers/language-analyzers/danish.md | 2 +- _analyzers/language-analyzers/dutch.md | 2 +- _analyzers/language-analyzers/english.md | 2 +- _analyzers/language-analyzers/estonian.md | 2 +- _analyzers/language-analyzers/finnish.md | 2 +- _analyzers/language-analyzers/french.md | 2 +- _analyzers/language-analyzers/galician.md | 2 +- _analyzers/language-analyzers/german.md | 2 +- _analyzers/language-analyzers/greek.md | 2 +- _analyzers/language-analyzers/hindi.md | 2 +- _analyzers/language-analyzers/hungarian.md | 2 +- _analyzers/language-analyzers/index.md | 12 ++++++------ _analyzers/language-analyzers/indonesian.md | 2 
+- _analyzers/language-analyzers/irish.md | 2 +- _analyzers/language-analyzers/italian.md | 2 +- _analyzers/language-analyzers/latvian.md | 2 +- _analyzers/language-analyzers/lithuanian.md | 2 +- _analyzers/language-analyzers/norwegian.md | 2 +- _analyzers/language-analyzers/persian.md | 2 +- _analyzers/language-analyzers/portuguese.md | 4 ++-- _analyzers/language-analyzers/romanian.md | 4 ++-- _analyzers/language-analyzers/russian.md | 4 ++-- _analyzers/language-analyzers/sorani.md | 4 ++-- _analyzers/language-analyzers/spanish.md | 4 ++-- _analyzers/language-analyzers/swedish.md | 4 ++-- _analyzers/language-analyzers/thai.md | 4 ++-- _analyzers/language-analyzers/turkish.md | 4 ++-- 36 files changed, 55 insertions(+), 55 deletions(-) diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md index b6508827ff..e61c684cbb 100644 --- a/_analyzers/language-analyzers/arabic.md +++ b/_analyzers/language-analyzers/arabic.md @@ -57,7 +57,7 @@ The `arabic` analyzer is built using the following components: - decimal_digit - stop (Arabic) - normalization (Arabic) - - keywords + - keyword - stemmer (Arabic) ## Custom Arabic analyzer diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md index 1338fd38ed..9bd0549c80 100644 --- a/_analyzers/language-analyzers/armenian.md +++ b/_analyzers/language-analyzers/armenian.md @@ -55,12 +55,12 @@ The `armenian` analyzer is built using the following components: - Token filters: - lowercase - stop (Armenian) - - keywords + - keyword - stemmer (Armenian) ## Custom Armenian analyzer -You can create custom Armenian analyzer using the following command: +You can create a custom Armenian analyzer using the following command: ```json PUT /armenian-index diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md index 6613bc343b..e73510cc66 100644 --- a/_analyzers/language-analyzers/basque.md +++ 
b/_analyzers/language-analyzers/basque.md @@ -55,12 +55,12 @@ The `basque` analyzer is built using the following components: - Token filters: - lowercase - stop (Basque) - - keywords + - keyword - stemmer (Basque) ## Custom Basque analyzer -You can create custom Basque analyzer using the following command: +You can create a custom Basque analyzer using the following command: ```json PUT /basque-index diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md index e1c53fd387..af913a01ef 100644 --- a/_analyzers/language-analyzers/bengali.md +++ b/_analyzers/language-analyzers/bengali.md @@ -58,12 +58,12 @@ The `bengali` analyzer is built using the following components: - indic_normalization - normalization (Bengali) - stop (Bengali) - - keywords + - keyword - stemmer (Bengali) ## Custom Bengali analyzer -You can create custom Bengali analyzer using the following command: +You can create a custom Bengali analyzer using the following command: ```json PUT /bengali-index diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md index eae04b03d4..67db2b92bc 100644 --- a/_analyzers/language-analyzers/brazilian.md +++ b/_analyzers/language-analyzers/brazilian.md @@ -55,12 +55,12 @@ The `brazilian` analyzer is built using the following components: - Token filters: - lowercase - stop (Brazilian) - - keywords + - keyword - stemmer (Brazilian) ## Custom Brazilian analyzer -You can create custom Brazilian analyzer using the following command: +You can create a custom Brazilian analyzer using the following command: ```json PUT /brazilian-index diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md index 0ac726cba2..42d5794e18 100644 --- a/_analyzers/language-analyzers/bulgarian.md +++ b/_analyzers/language-analyzers/bulgarian.md @@ -55,12 +55,12 @@ The `bulgarian` analyzer is built using the following components: - Token filters: - lowercase - stop 
(Bulgarian) - - keywords + - keyword - stemmer (Bulgarian) ## Custom Bulgarian analyzer -You can create custom Bulgarian analyzer using the following command: +You can create a custom Bulgarian analyzer using the following command: ```json PUT /bulgarian-index diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md index 4727aed9a2..89762da094 100644 --- a/_analyzers/language-analyzers/catalan.md +++ b/_analyzers/language-analyzers/catalan.md @@ -56,12 +56,12 @@ The `catalan` analyzer is built using the following components: - elision (Catalan) - lowercase - stop (Catalan) - - keywords + - keyword - stemmer (Catalan) ## Custom Catalan analyzer -You can create custom Catalan analyzer using the following command: +You can create a custom Catalan analyzer using the following command: ```json PUT /catalan-index diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md index 3968113e6e..aed7e6da22 100644 --- a/_analyzers/language-analyzers/cjk.md +++ b/_analyzers/language-analyzers/cjk.md @@ -60,7 +60,7 @@ The `cjk` analyzer is built using the following components: ## Custom CJK analyzer -You can create custom CJK analyzer using the following command: +You can create a custom CJK analyzer using the following command: ```json PUT /cjk-index diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md index 12381472a5..c1778cd0f4 100644 --- a/_analyzers/language-analyzers/czech.md +++ b/_analyzers/language-analyzers/czech.md @@ -60,7 +60,7 @@ The `czech` analyzer is built using the following components: ## Custom Czech analyzer -You can create custom Czech analyzer using the following command: +You can create a custom Czech analyzer using the following command: ```json PUT /czech-index diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md index 7a5e53f11f..b5ee1b0e97 100644 --- a/_analyzers/language-analyzers/danish.md +++ 
b/_analyzers/language-analyzers/danish.md @@ -60,7 +60,7 @@ The `danish` analyzer is built using the following components: ## Custom Danish analyzer -You can create custom Danish analyzer using the following command: +You can create a custom Danish analyzer using the following command: ```json PUT /danish-index diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md index 334a93f5b0..0259707d78 100644 --- a/_analyzers/language-analyzers/dutch.md +++ b/_analyzers/language-analyzers/dutch.md @@ -61,7 +61,7 @@ The `dutch` analyzer is built using the following components: ## Custom Dutch analyzer -You can create custom Dutch analyzer using the following command: +You can create a custom Dutch analyzer using the following command: ```json PUT /dutch-index diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md index 46a6a20961..2d0b600312 100644 --- a/_analyzers/language-analyzers/english.md +++ b/_analyzers/language-analyzers/english.md @@ -61,7 +61,7 @@ The `english` analyzer is built using the following components: ## Custom English analyzer -You can create custom English analyzer using the following command: +You can create a custom English analyzer using the following command: ```json PUT /english-index diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md index 49411ddf96..a4cb664f18 100644 --- a/_analyzers/language-analyzers/estonian.md +++ b/_analyzers/language-analyzers/estonian.md @@ -60,7 +60,7 @@ The `estonian` analyzer is built using the following components: ## Custom Estonian analyzer -You can create custom Estonian analyzer using the following command: +You can create a custom Estonian analyzer using the following command: ```json PUT /estonian-index diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md index f39a53adf9..6f559650d2 100644 --- a/_analyzers/language-analyzers/finnish.md +++ 
b/_analyzers/language-analyzers/finnish.md @@ -60,7 +60,7 @@ The `finnish` analyzer is built using the following components: ## Custom Finnish analyzer -You can create custom Finnish analyzer using the following command: +You can create a custom Finnish analyzer using the following command: ```json PUT /finnish-index diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md index fd1c9e7687..64e7ab5415 100644 --- a/_analyzers/language-analyzers/french.md +++ b/_analyzers/language-analyzers/french.md @@ -61,7 +61,7 @@ The `french` analyzer is built using the following components: ## Custom French analyzer -You can create custom French analyzer using the following command: +You can create a custom French analyzer using the following command: ```json PUT /french-index diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md index d4fd176b87..00338b23a7 100644 --- a/_analyzers/language-analyzers/galician.md +++ b/_analyzers/language-analyzers/galician.md @@ -60,7 +60,7 @@ The `galician` analyzer is built using the following components: ## Custom Galician analyzer -You can create custom Galician analyzer using the following command: +You can create a custom Galician analyzer using the following command: ```json PUT /galician-index diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md index d6859381e9..4071ef5378 100644 --- a/_analyzers/language-analyzers/german.md +++ b/_analyzers/language-analyzers/german.md @@ -61,7 +61,7 @@ The `german` analyzer is built using the following components: ## Custom German analyzer -You can create custom German analyzer using the following command: +You can create a custom German analyzer using the following command: ```json PUT /german-index diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md index fcc1be8c86..2446b1e2d6 100644 --- a/_analyzers/language-analyzers/greek.md +++ 
b/_analyzers/language-analyzers/greek.md @@ -60,7 +60,7 @@ The `greek` analyzer is built using the following components: ## Custom Greek analyzer -You can create custom Greek analyzer using the following command: +You can create a custom Greek analyzer using the following command: ```json PUT /greek-index diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md index d9920008b1..93f2eea319 100644 --- a/_analyzers/language-analyzers/hindi.md +++ b/_analyzers/language-analyzers/hindi.md @@ -63,7 +63,7 @@ The `hindi` analyzer is built using the following components: ## Custom Hindi analyzer -You can create custom Hindi analyzer using the following command: +You can create a custom Hindi analyzer using the following command: ```json PUT /hindi-index diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md index 601b5d3968..d115c5d29c 100644 --- a/_analyzers/language-analyzers/hungarian.md +++ b/_analyzers/language-analyzers/hungarian.md @@ -60,7 +60,7 @@ The `hungarian` analyzer is built using the following components: ## Custom Hungarian analyzer -You can create custom Hungarian analyzer using the following command: +You can create a custom Hungarian analyzer using the following command: ```json PUT /hungarian-index diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index afe9d82452..17c3cb613b 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -12,9 +12,9 @@ redirect_from: # Language analyzers OpenSearch supports the following language analyzers: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, 
`turkish`, and `thai`. +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. -To use the analyzer when you map an index, specify the value within your query. For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: +To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: ```json "analyzer": "french" @@ -22,7 +22,7 @@ To use the analyzer when you map an index, specify the value within your query. #### Example request -The following query specifies index `my-index` with `content` field configured as multi-field and sub-field named `french` is configured with `french` language analyzer: +The following query specifies an index `my-index` with the `content` field configured as multi-field, and a sub-field named `french` is configured with the `french` language analyzer: ```json PUT my-index @@ -44,7 +44,7 @@ PUT my-index ``` {% include copy-curl.html %} -Default `french` analyzer can also be configured for the entire index using the following query: +The default `french` analyzer can also be configured for the entire index using the following query: ```json PUT my-index @@ -77,7 +77,7 @@ PUT my-index ## Stem exclusion -You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring they are not stemmed. 
+You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring that they are not stemmed. ## Stem exclusion example @@ -103,7 +103,7 @@ PUT index_with_stem_exclusion_english_analyzer ## Stem exclusion with custom analyzers -All language analyzers consist of tokenizers and token filters specific to the particular language. If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter: +All language analyzers consist of tokenizers and token filters specific to a particular language. If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter: ```json PUT index_with_keyword_marker_analyzer diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md index 920319082a..5c3d430b3a 100644 --- a/_analyzers/language-analyzers/indonesian.md +++ b/_analyzers/language-analyzers/indonesian.md @@ -60,7 +60,7 @@ The `indonesian` analyzer is built using the following components: ## Custom Indonesian analyzer -You can create custom Indonesian analyzer using the following command: +You can create a custom Indonesian analyzer using the following command: ```json PUT /hungarian-index diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md index 606a81a10e..3e1535d134 100644 --- a/_analyzers/language-analyzers/irish.md +++ b/_analyzers/language-analyzers/irish.md @@ -62,7 +62,7 @@ The `irish` analyzer is built using the following components: ## Custom Irish analyzer -You can create custom Irish analyzer using the following command: +You 
can create a custom Irish analyzer using the following command: ```json PUT /irish-index diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md index 6cf423fe67..190056d63c 100644 --- a/_analyzers/language-analyzers/italian.md +++ b/_analyzers/language-analyzers/italian.md @@ -61,7 +61,7 @@ The `italian` analyzer is built using the following components: ## Custom Italian analyzer -You can create custom Italian analyzer using the following command: +You can create a custom Italian analyzer using the following command: ```json PUT /italian-index diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md index edb8c5e95e..2301759763 100644 --- a/_analyzers/language-analyzers/latvian.md +++ b/_analyzers/language-analyzers/latvian.md @@ -60,7 +60,7 @@ The `latvian` analyzer is built using the following components: ## Custom Latvian analyzer -You can create custom Latvian analyzer using the following command: +You can create a custom Latvian analyzer using the following command: ```json PUT /italian-index diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md index 7f2da59101..ca5966c54e 100644 --- a/_analyzers/language-analyzers/lithuanian.md +++ b/_analyzers/language-analyzers/lithuanian.md @@ -60,7 +60,7 @@ The `lithuanian` analyzer is built using the following components: ## Custom Lithuanian analyzer -You can create custom Lithuanian analyzer using the following command: +You can create a custom Lithuanian analyzer using the following command: ```json PUT /lithuanian-index diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md index 171da8ad51..cfb04eebf3 100644 --- a/_analyzers/language-analyzers/norwegian.md +++ b/_analyzers/language-analyzers/norwegian.md @@ -60,7 +60,7 @@ The `norwegian` analyzer is built using the following components: ## Custom Norwegian analyzer -You can create custom 
Norwegian analyzer using the following command: +You can create a custom Norwegian analyzer using the following command: ```json PUT /norwegian-index diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md index d6018ccaab..40b38656fd 100644 --- a/_analyzers/language-analyzers/persian.md +++ b/_analyzers/language-analyzers/persian.md @@ -64,7 +64,7 @@ The `persian` analyzer is built using the following components: ## Custom Persian analyzer -You can create custom Persian analyzer using the following command: +You can create a custom Persian analyzer using the following command: ```json PUT /persian-index diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md index 9a752dc4c6..166ffa0010 100644 --- a/_analyzers/language-analyzers/portuguese.md +++ b/_analyzers/language-analyzers/portuguese.md @@ -52,7 +52,7 @@ The `portuguese` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - lowercase - stop (Portuguese) - keyword @@ -60,7 +60,7 @@ The `portuguese` analyzer is built using the following components: ## Custom Portuguese analyzer -You can create custom Portuguese analyzer using the following command: +You can create a custom Portuguese analyzer using the following command: ```json PUT /portuguese-index diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md index bffe26288b..cad0953385 100644 --- a/_analyzers/language-analyzers/romanian.md +++ b/_analyzers/language-analyzers/romanian.md @@ -52,7 +52,7 @@ The `romanian` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - lowercase - stop (Romanian) - keyword @@ -60,7 +60,7 @@ The `romanian` analyzer is built using the following components: ## Custom Romanian analyzer -You can create custom Romanian analyzer using the following command: +You can create a custom Romanian 
analyzer using the following command: ```json PUT /romanian-index diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md index ac9ae0d72a..bd57ba0b27 100644 --- a/_analyzers/language-analyzers/russian.md +++ b/_analyzers/language-analyzers/russian.md @@ -52,7 +52,7 @@ The `russian` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - lowercase - stop (Russian) - keyword @@ -60,7 +60,7 @@ The `russian` analyzer is built using the following components: ## Custom Russian analyzer -You can create custom Russian analyzer using the following command: +You can create a custom Russian analyzer using the following command: ```json PUT /russian-index diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md index 8c31c3ef1e..f71d43c481 100644 --- a/_analyzers/language-analyzers/sorani.md +++ b/_analyzers/language-analyzers/sorani.md @@ -52,7 +52,7 @@ The `sorani` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - normalization (Sorani) - lowercase - decimal_digit @@ -62,7 +62,7 @@ The `sorani` analyzer is built using the following components: ## Custom Sorani analyzer -You can create custom Sorani analyzer using the following command: +You can create a custom Sorani analyzer using the following command: ```json PUT /sorani-index diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md index 3a1573d291..8a0d8fad3c 100644 --- a/_analyzers/language-analyzers/spanish.md +++ b/_analyzers/language-analyzers/spanish.md @@ -52,7 +52,7 @@ The `spanish` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - lowercase - stop (Spanish) - keyword @@ -60,7 +60,7 @@ The `spanish` analyzer is built using the following components: ## Custom Spanish analyzer -You can create custom Spanish analyzer 
using the following command: +You can create a custom Spanish analyzer using the following command: ```json PUT /spanish-index diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md index 9aadc9bc60..9da595f12e 100644 --- a/_analyzers/language-analyzers/swedish.md +++ b/_analyzers/language-analyzers/swedish.md @@ -52,7 +52,7 @@ The `swedish` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - lowercase - stop (Swedish) - keyword @@ -60,7 +60,7 @@ The `swedish` analyzer is built using the following components: ## Custom Swedish analyzer -You can create custom Swedish analyzer using the following command: +You can create a custom Swedish analyzer using the following command: ```json PUT /swedish-index diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md index 5f33554cf7..e4daa1f0be 100644 --- a/_analyzers/language-analyzers/thai.md +++ b/_analyzers/language-analyzers/thai.md @@ -52,7 +52,7 @@ The `thai` analyzer is built using the following components: - Tokenizer: `thai` -- Token Filters: +- Token filters: - lowercase - decimal_digit - stop (Thai) @@ -60,7 +60,7 @@ The `thai` analyzer is built using the following components: ## Custom Thai analyzer -You can create custom Thai analyzer using the following command: +You can create a custom Thai analyzer using the following command: ```json PUT /thai-index diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md index 9e9b31acbc..fb36c5413c 100644 --- a/_analyzers/language-analyzers/turkish.md +++ b/_analyzers/language-analyzers/turkish.md @@ -52,7 +52,7 @@ The `turkish` analyzer is built using the following components: - Tokenizer: `standard` -- Token Filters: +- Token filters: - apostrophe - lowercase (Turkish) - stop (Turkish) @@ -61,7 +61,7 @@ The `turkish` analyzer is built using the following components: ## Custom Turkish analyzer -You 
can create custom Turkish analyzer using the following command: +You can create a custom Turkish analyzer using the following command: ```json PUT /turkish-index From e29f6900e13eb6c16b57457e880647dbd4ffd58c Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:19:32 -0500 Subject: [PATCH 14/15] Update _analyzers/language-analyzers/index.md Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> From 3a429eb4968a8d44510f88784dbda5d5fee6e3a0 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 14 Nov 2024 16:25:03 -0500 Subject: [PATCH 15/15] Add redirect to index page Signed-off-by: Fanit Kolchina --- _analyzers/language-analyzers/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index 17c3cb613b..89a4a42254 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -7,6 +7,7 @@ has_children: true has_toc: true redirect_from: - /query-dsl/analyzers/language-analyzers/ + - /analyzers/language-analyzers/ --- # Language analyzers