diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..4dd42773 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,5 @@ +FROM opensearchproject/opensearch:2.11.0 + +RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-kuromoji +RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-smartcn +RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-icu \ No newline at end of file diff --git a/pom.xml b/pom.xml index 58a1c25c..d1ff2a71 100644 --- a/pom.xml +++ b/pom.xml @@ -33,6 +33,7 @@ true 3.2.2 2.23.0 + 0.43.4 @@ -246,6 +247,33 @@ ${format.skip} + + io.fabric8 + docker-maven-plugin + ${version.docker.plugin} + + + + opensearch-custom-plugin + + ${project.basedir}/Dockerfile + + 2.11.0 + + + + + + + + build + initialize + + build + + + + diff --git a/src/main/java/io/quarkus/search/app/SearchService.java b/src/main/java/io/quarkus/search/app/SearchService.java index 757c9800..026bad50 100644 --- a/src/main/java/io/quarkus/search/app/SearchService.java +++ b/src/main/java/io/quarkus/search/app/SearchService.java @@ -16,6 +16,7 @@ import org.hibernate.Length; import org.hibernate.search.engine.search.common.BooleanOperator; +import org.hibernate.search.engine.search.predicate.dsl.SimpleQueryFlag; import org.hibernate.search.mapper.orm.session.SearchSession; import org.jboss.resteasy.reactive.RestQuery; @@ -79,6 +80,8 @@ public SearchResult search(@RestQuery @DefaultValue(QuarkusVersi .field(localizedField("summary_autocomplete", language)).boost(0.5f) .field(localizedField("fullContent_autocomplete", language)).boost(0.1f) .matching(q) + // See: https://github.com/elastic/elasticsearch/issues/39905 + .flags(SimpleQueryFlag.AND, SimpleQueryFlag.OR) .defaultOperator(BooleanOperator.AND)) .should(f.match().field("origin").matching("quarkus").boost(50.0f)) .should(f.not(f.match().field("topics").matching("compatibility")) diff --git a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java 
b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java index 7afb603c..dcd1718a 100644 --- a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java +++ b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java @@ -1,5 +1,7 @@ package io.quarkus.search.app.hibernate; +import java.util.EnumSet; + import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext; import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer; import org.hibernate.search.engine.backend.analysis.AnalyzerNames; @@ -23,23 +25,84 @@ public static String autocompleteAnalyzer(Language language) { @Override public void configure(ElasticsearchAnalysisConfigurationContext context) { - // just to have something for app to start correctly: - for (Language language : Language.values()) { + // for en/es/pt we are going to use the same english configuration since guides are not translated + EnumSet englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH); + for (Language language : englishLanguages) { + SharedFilters result = sharedFilters(context, language); + + // default: context.analyzer(defaultAnalyzer(language)).custom() .tokenizer("standard") - .tokenFilters("lowercase", "asciifolding", "stemmer") + .tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(), + result.regularStemmer()) .charFilters("html_strip"); - String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code); + + // autocomplete: context.analyzer(autocompleteAnalyzer(language)).custom() .tokenizer("standard") - .tokenFilters("lowercase", "asciifolding", "stemmer", autocompleteEdgeNgram) + .tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(), + result.regularStemmer(), result.autocompleteEdgeNgram()) .charFilters("html_strip"); - context.tokenFilter(autocompleteEdgeNgram) - .type("edge_ngram") - 
.param("min_gram", 1) - .param("max_gram", 10); } + + // japanese + // https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-analyzer.html + SharedFilters japanese = sharedFilters(context, Language.JAPANESE); + context.analyzer(defaultAnalyzer(Language.JAPANESE)).custom() + .tokenizer("kuromoji_tokenizer") + .tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase", + "asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer()) + .charFilters("icu_normalizer", "html_strip"); + + // autocomplete: same char filters as the default japanese analyzer, so both normalize text identically + context.analyzer(autocompleteAnalyzer(Language.JAPANESE)).custom() + .tokenizer("kuromoji_tokenizer") + .tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase", + "asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer(), + japanese.autocompleteEdgeNgram()) + .charFilters("icu_normalizer", "html_strip"); + + // chinese + // https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html + SharedFilters chinese = sharedFilters(context, Language.CHINESE); + context.analyzer(defaultAnalyzer(Language.CHINESE)).custom() + .tokenizer("smartcn_tokenizer") + .tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(), + chinese.regularStemmer()) + .charFilters("html_strip"); + + // autocomplete: + context.analyzer(autocompleteAnalyzer(Language.CHINESE)).custom() + .tokenizer("smartcn_tokenizer") + .tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(), + chinese.regularStemmer(), chinese.autocompleteEdgeNgram()) + .charFilters("html_strip"); + + context.normalizer(SORT).custom() .tokenFilters("lowercase"); } + + private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) { + String stop = "stop_%s".formatted(language.code); + String
regularStemmer = "stemmer_%s".formatted(language.code); + String possessiveStemmer = "possessive_stemmer_%s".formatted(language.code); + String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code); + context.tokenFilter(stop) + .type("stop") + .param("stopwords", "_english_"); + context.tokenFilter(regularStemmer) + .type("stemmer") + .param("language", "english"); + context.tokenFilter(possessiveStemmer) + .type("stemmer") + .param("language", "possessive_english"); + context.tokenFilter(autocompleteEdgeNgram) + .type("edge_ngram") + .param("min_gram", 1) + .param("max_gram", 10); + return new SharedFilters(stop, regularStemmer, possessiveStemmer, autocompleteEdgeNgram); + } + + private record SharedFilters(String stop, String regularStemmer, String possessiveStemmer, String autocompleteEdgeNgram) { + } } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index cba27b11..894a0e0b 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -28,7 +28,7 @@ quarkus.datasource.jdbc.url=jdbc:h2:mem:searchquarkusio quarkus.hibernate-orm.database.generation=drop-and-create ## Hibernate Search quarkus.hibernate-search-orm.elasticsearch.version=opensearch:2.11 -quarkus.elasticsearch.devservices.image-name=docker.io/opensearchproject/opensearch:2.11.0 +quarkus.elasticsearch.devservices.image-name=opensearch-custom-plugin:2.11.0 ## We need to apply a custom OpenSearch mapping to exclude very large fields from the _source quarkus.hibernate-search-orm.elasticsearch.schema-management.mapping-file=indexes/mapping-template.json quarkus.hibernate-search-orm.elasticsearch.schema-management.settings-file=indexes/settings-template.json diff --git a/src/test/java/io/quarkus/search/app/SearchServiceTest.java b/src/test/java/io/quarkus/search/app/SearchServiceTest.java index fd8e1f46..e677cd62 100644 --- a/src/test/java/io/quarkus/search/app/SearchServiceTest.java +++ 
b/src/test/java/io/quarkus/search/app/SearchServiceTest.java @@ -187,8 +187,8 @@ private static List relevance() { GuideRef.HIBERNATE_REACTIVE, GuideRef.HIBERNATE_REACTIVE_PANACHE, GuideRef.HIBERNATE_ORM_PANACHE, - GuideRef.HIBERNATE_ORM, GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN, + GuideRef.HIBERNATE_ORM, GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive" GuideRef.SPRING_DATA_JPA)), Arguments.of("jpa", GuideRef.urls( @@ -342,8 +342,8 @@ void language() { .then() .statusCode(200) .extract().body().as(SEARCH_RESULT_SEARCH_HITS); - assertThat(result.hits()).extracting(GuideSearchHit::title).contains( - "Hibernate"); + assertThat(result.hits()).extracting(GuideSearchHit::title) + .contains("Hibernate搜索指南"); } private static ThrowingConsumer hitsHaveCorrectWordHighlighted(AtomicInteger matches, String word,