diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..4dd42773
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,5 @@
+FROM opensearchproject/opensearch:2.11.0
+
+# Analysis plugins needed by the custom analyzers: kuromoji (Japanese), smartcn (Chinese), ICU (normalization).
+# Installed in a single RUN so the image gets one additional layer instead of three.
+RUN /usr/share/opensearch/bin/opensearch-plugin install --batch \
+    analysis-kuromoji \
+    analysis-smartcn \
+    analysis-icu
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 58a1c25c..d1ff2a71 100644
--- a/pom.xml
+++ b/pom.xml
@@ -33,6 +33,7 @@
true
3.2.2
2.23.0
+ <version.docker.plugin>0.43.4</version.docker.plugin>
@@ -246,6 +247,33 @@
${format.skip}
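+ <plugin>
+ <groupId>io.fabric8</groupId>
+ <artifactId>docker-maven-plugin</artifactId>
+ <version>${version.docker.plugin}</version>
+ <configuration>
+ <images>
+ <image>
+ <name>opensearch-custom-plugin</name>
+ <build>
+ <dockerFile>${project.basedir}/Dockerfile</dockerFile>
+ <tags>
+ <tag>2.11.0</tag>
+ </tags>
+ </build>
+ </image>
+ </images>
+ </configuration>
+ <executions>
+ <execution>
+ <id>build</id>
+ <phase>initialize</phase>
+ <goals>
+ <goal>build</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>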
diff --git a/src/main/java/io/quarkus/search/app/SearchService.java b/src/main/java/io/quarkus/search/app/SearchService.java
index 757c9800..026bad50 100644
--- a/src/main/java/io/quarkus/search/app/SearchService.java
+++ b/src/main/java/io/quarkus/search/app/SearchService.java
@@ -16,6 +16,7 @@
import org.hibernate.Length;
import org.hibernate.search.engine.search.common.BooleanOperator;
+import org.hibernate.search.engine.search.predicate.dsl.SimpleQueryFlag;
import org.hibernate.search.mapper.orm.session.SearchSession;
import org.jboss.resteasy.reactive.RestQuery;
@@ -79,6 +80,8 @@ public SearchResult search(@RestQuery @DefaultValue(QuarkusVersi
.field(localizedField("summary_autocomplete", language)).boost(0.5f)
.field(localizedField("fullContent_autocomplete", language)).boost(0.1f)
.matching(q)
+ // Only enable the AND/OR operators of the simple query string syntax; see: https://github.com/elastic/elasticsearch/issues/39905
+ .flags(SimpleQueryFlag.AND, SimpleQueryFlag.OR)
.defaultOperator(BooleanOperator.AND))
.should(f.match().field("origin").matching("quarkus").boost(50.0f))
.should(f.not(f.match().field("topics").matching("compatibility"))
diff --git a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java
index 7afb603c..dcd1718a 100644
--- a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java
+++ b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java
@@ -1,5 +1,7 @@
package io.quarkus.search.app.hibernate;
+import java.util.EnumSet;
+
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.engine.backend.analysis.AnalyzerNames;
@@ -23,23 +25,84 @@ public static String autocompleteAnalyzer(Language language) {
+ /**
+ * Defines the default and autocomplete analyzers for the English-configured languages (en/es/pt),
+ * Japanese and Chinese, plus the lowercase-only {@code SORT} normalizer.
+ */
@Override
public void configure(ElasticsearchAnalysisConfigurationContext context) {
- // just to have something for app to start correctly:
- for (Language language : Language.values()) {
+ // for en/es/pt we are going to use the same english configuration since guides are not translated
+ EnumSet<Language> englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH);
+ for (Language language : englishLanguages) {
+ SharedFilters result = sharedFilters(context, language);
+
+ // default:
context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("standard")
- .tokenFilters("lowercase", "asciifolding", "stemmer")
+ .tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(),
+ result.regularStemmer())
.charFilters("html_strip");
- String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code);
+
+ // autocomplete:
context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("standard")
- .tokenFilters("lowercase", "asciifolding", "stemmer", autocompleteEdgeNgram)
+ .tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(),
+ result.regularStemmer(), result.autocompleteEdgeNgram())
.charFilters("html_strip");
- context.tokenFilter(autocompleteEdgeNgram)
- .type("edge_ngram")
- .param("min_gram", 1)
- .param("max_gram", 10);
}
+
+ // japanese
+ // https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-analyzer.html
+ SharedFilters japanese = sharedFilters(context, Language.JAPANESE);
+ context.analyzer(defaultAnalyzer(Language.JAPANESE)).custom()
+ .tokenizer("kuromoji_tokenizer")
+ .tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase",
+ "asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer())
+ .charFilters("icu_normalizer", "html_strip");
+
+ // autocomplete (same char filters as the default analyzer above, so both normalize text identically):
+ context.analyzer(autocompleteAnalyzer(Language.JAPANESE)).custom()
+ .tokenizer("kuromoji_tokenizer")
+ .tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase",
+ "asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer(),
+ japanese.autocompleteEdgeNgram())
+ .charFilters("icu_normalizer", "html_strip");
+
+ // chinese
+ // https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
+ SharedFilters chinese = sharedFilters(context, Language.CHINESE);
+ context.analyzer(defaultAnalyzer(Language.CHINESE)).custom()
+ .tokenizer("smartcn_tokenizer")
+ .tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(),
+ chinese.regularStemmer())
+ .charFilters("html_strip");
+
+ // autocomplete:
+ context.analyzer(autocompleteAnalyzer(Language.CHINESE)).custom()
+ .tokenizer("smartcn_tokenizer")
+ .tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(),
+ chinese.regularStemmer(), chinese.autocompleteEdgeNgram())
+ .charFilters("html_strip");
+
context.normalizer(SORT).custom()
.tokenFilters("lowercase");
}
+
+ /**
+ * Registers the stop-word, stemmer, possessive-stemmer and edge-ngram token filters for the given
+ * language and returns the names they were registered under.
+ * All four definitions use the same (English) parameters; only the filter names carry the language code.
+ *
+ * @param context the analysis configuration context the filters are registered on
+ * @param language the language whose code is appended to each filter name
+ * @return the names of the four registered token filters
+ */
+ private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
+ String stopName = "stop_" + language.code;
+ String stemmerName = "stemmer_" + language.code;
+ String possessiveName = "possessive_stemmer_" + language.code;
+ String edgeNgramName = "autocomplete_edge_ngram_" + language.code;
+
+ context.tokenFilter(stopName).type("stop").param("stopwords", "_english_");
+ context.tokenFilter(stemmerName).type("stemmer").param("language", "english");
+ context.tokenFilter(possessiveName).type("stemmer").param("language", "possessive_english");
+ context.tokenFilter(edgeNgramName).type("edge_ngram").param("min_gram", 1).param("max_gram", 10);
+
+ return new SharedFilters(stopName, stemmerName, possessiveName, edgeNgramName);
+ }
+
+ /**
+ * Names of the token filters registered for one language: stop words, regular stemmer,
+ * possessive stemmer and autocomplete edge-ngram.
+ */
+ private record SharedFilters(
+ String stop,
+ String regularStemmer,
+ String possessiveStemmer,
+ String autocompleteEdgeNgram) {
+ }
}
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
index cba27b11..894a0e0b 100644
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@@ -28,7 +28,7 @@ quarkus.datasource.jdbc.url=jdbc:h2:mem:searchquarkusio
quarkus.hibernate-orm.database.generation=drop-and-create
## Hibernate Search
quarkus.hibernate-search-orm.elasticsearch.version=opensearch:2.11
-quarkus.elasticsearch.devservices.image-name=docker.io/opensearchproject/opensearch:2.11.0
+quarkus.elasticsearch.devservices.image-name=opensearch-custom-plugin:2.11.0
## We need to apply a custom OpenSearch mapping to exclude very large fields from the _source
quarkus.hibernate-search-orm.elasticsearch.schema-management.mapping-file=indexes/mapping-template.json
quarkus.hibernate-search-orm.elasticsearch.schema-management.settings-file=indexes/settings-template.json
diff --git a/src/test/java/io/quarkus/search/app/SearchServiceTest.java b/src/test/java/io/quarkus/search/app/SearchServiceTest.java
index fd8e1f46..e677cd62 100644
--- a/src/test/java/io/quarkus/search/app/SearchServiceTest.java
+++ b/src/test/java/io/quarkus/search/app/SearchServiceTest.java
@@ -187,8 +187,8 @@ private static List relevance() {
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.HIBERNATE_ORM_PANACHE,
- GuideRef.HIBERNATE_ORM,
GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN,
+ GuideRef.HIBERNATE_ORM,
GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive"
GuideRef.SPRING_DATA_JPA)),
Arguments.of("jpa", GuideRef.urls(
@@ -342,8 +342,8 @@ void language() {
.then()
.statusCode(200)
.extract().body().as(SEARCH_RESULT_SEARCH_HITS);
- assertThat(result.hits()).extracting(GuideSearchHit::title).contains(
- "Hibernate搜索指南");
+ assertThat(result.hits()).extracting(GuideSearchHit::title)
+ .contains("Hibernate搜索指南");
}
private static ThrowingConsumer hitsHaveCorrectWordHighlighted(AtomicInteger matches, String word,