adding language analyzers #8591

Merged
44 changes: 0 additions & 44 deletions _analyzers/language-analyzers.md

This file was deleted.

182 changes: 182 additions & 0 deletions _analyzers/language-analyzers/arabic.md
@@ -0,0 +1,182 @@
---
layout: default
title: Arabic
parent: Language analyzers
grand_parent: Analyzers
nav_order: 10
---

# Arabic analyzer

The built-in `arabic` analyzer can be applied to a text field using the following command:

```json
PUT /arabic-index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "arabic"
      }
    }
  }
}
```
{% include copy-curl.html %}
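
If you only want to inspect the output of the built-in analyzer without creating an index, you can call the `_analyze` API directly. The following request is a minimal sketch; the sample sentence is illustrative:

```json
POST /_analyze
{
  "analyzer": "arabic",
  "text": "الطلاب يدرسون في الجامعات العربية"
}
```
{% include copy-curl.html %}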

## Stem exclusion

You can also use `stem_exclusion` with this language analyzer by providing a list of words that should not be stemmed, as shown in the following command:

```json
PUT index_with_stem_exclusion_arabic
{
  "settings": {
    "analysis": {
      "analyzer": {
        "stem_exclusion_arabic_analyzer": {
          "type": "arabic",
          "stem_exclusion": ["تكنولوجيا", "سلطة"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}
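
To check the effect of the `stem_exclusion` list, you can analyze a sample phrase against the new index. This is a minimal sketch with illustrative text; terms that appear in the exclusion list should be returned without stemming:

```json
GET index_with_stem_exclusion_arabic/_analyze
{
  "analyzer": "stem_exclusion_arabic_analyzer",
  "text": "تكنولوجيا المعلومات تتطور بسرعة"
}
```
{% include copy-curl.html %}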

## Arabic analyzer internals

The `arabic` analyzer is built using the following components:

- Tokenizer: `standard`
- Token filters:
  - lowercase
  - decimal_digit
  - stop (Arabic)
  - normalization (Arabic)
  - keywords
  - stemmer (Arabic)

## Custom Arabic analyzer

You can create a custom Arabic analyzer using the following command:

```json
PUT /arabic-index
{
  "settings": {
    "analysis": {
      "filter": {
        "arabic_stop": {
          "type": "stop",
          "stopwords": "_arabic_"
        },
        "arabic_stemmer": {
          "type": "stemmer",
          "language": "arabic"
        },
        "arabic_normalization": {
          "type": "arabic_normalization"
        },
        "decimal_digit": {
          "type": "decimal_digit"
        },
        "arabic_keywords": {
          "type": "keyword_marker",
          "keywords": []
        }
      },
      "analyzer": {
        "arabic_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "arabic_normalization",
            "decimal_digit",
            "arabic_stop",
            "arabic_keywords",
            "arabic_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "arabic_analyzer"
      }
    }
  }
}
```
{% include copy-curl.html %}

## Generated tokens

Use the following request to examine the tokens generated using the analyzer:

```json
POST /arabic-index/_analyze
{
  "field": "content",
  "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦."
}
```
{% include copy-curl.html %}

The response contains the generated tokens:

```json
{
  "tokens": [
    {
      "token": "طلاب",
      "start_offset": 0,
      "end_offset": 6,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "يدرس",
      "start_offset": 7,
      "end_offset": 13,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "جامع",
      "start_offset": 17,
      "end_offset": 25,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "عرب",
      "start_offset": 26,
      "end_offset": 33,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "ارقامهم",
      "start_offset": 35,
      "end_offset": 42,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "123456",
      "start_offset": 43,
      "end_offset": 49,
      "type": "<NUM>",
      "position": 6
    }
  ]
}
```
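
Because the analyzer stems both indexed text and query text, different inflections that reduce to the same stem match each other. The following is a minimal sketch that assumes the `arabic-index` mapping above; the document and query text are illustrative:

```json
PUT /arabic-index/_doc/1
{
  "content": "الطلاب يدرسون في الجامعات العربية"
}
```
{% include copy-curl.html %}

```json
GET /arabic-index/_search
{
  "query": {
    "match": {
      "content": "الجامعة"
    }
  }
}
```
{% include copy-curl.html %}

The singular query term `الجامعة` should match the plural `الجامعات` in the document because both are reduced to the stem `جامع`.
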
137 changes: 137 additions & 0 deletions _analyzers/language-analyzers/armenian.md
@@ -0,0 +1,137 @@
---
layout: default
title: Armenian
parent: Language analyzers
grand_parent: Analyzers
nav_order: 20
---

# Armenian analyzer

The built-in `armenian` analyzer can be applied to a text field using the following command:

```json
PUT /armenian-index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "armenian"
      }
    }
  }
}
```
{% include copy-curl.html %}

## Stem exclusion

You can also use `stem_exclusion` with this language analyzer by providing a list of words that should not be stemmed, as shown in the following command:

```json
PUT index_with_stem_exclusion_armenian_analyzer
{
  "settings": {
    "analysis": {
      "analyzer": {
        "stem_exclusion_armenian_analyzer": {
          "type": "armenian",
          "stem_exclusion": ["բարև", "խաղաղություն"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

## Armenian analyzer internals

The `armenian` analyzer is built using the following components:

- Tokenizer: `standard`
- Token filters:
  - lowercase
  - stop (Armenian)
  - keywords
  - stemmer (Armenian)

## Custom Armenian analyzer

You can create a custom Armenian analyzer using the following command:

```json
PUT /armenian-index
{
  "settings": {
    "analysis": {
      "filter": {
        "armenian_stop": {
          "type": "stop",
          "stopwords": "_armenian_"
        },
        "armenian_stemmer": {
          "type": "stemmer",
          "language": "armenian"
        },
        "armenian_keywords": {
          "type": "keyword_marker",
          "keywords": []
        }
      },
      "analyzer": {
        "armenian_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "armenian_stop",
            "armenian_keywords",
            "armenian_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "armenian_analyzer"
      }
    }
  }
}
```
{% include copy-curl.html %}
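
To verify the custom analyzer, you can analyze a sample phrase against the `content` field of `armenian-index`. This is a minimal sketch; the sample text is illustrative:

```json
POST /armenian-index/_analyze
{
  "field": "content",
  "text": "մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել"
}
```
{% include copy-curl.html %}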

## Generated tokens

Use the following request to examine the tokens generated using the analyzer:

```json
GET index_with_stem_exclusion_armenian_analyzer/_analyze
{
  "analyzer": "stem_exclusion_armenian_analyzer",
  "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել"
}
```
{% include copy-curl.html %}

The response contains the generated tokens:

```json
{
  "tokens": [
    {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "<ALPHANUM>","position": 0},
    {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "<ALPHANUM>","position": 1},
    {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "<ALPHANUM>","position": 3},
    {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "<ALPHANUM>","position": 5},
    {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "<ALPHANUM>","position": 7},
    {"token": "օր","start_offset": 49,"end_offset": 51,"type": "<ALPHANUM>","position": 8},
    {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "<ALPHANUM>","position": 10}
  ]
}
```