Skip to content

Commit

Permalink
Customize analyzers for various languages
Browse files Browse the repository at this point in the history
  • Loading branch information
marko-bekhta committed Dec 7, 2023
1 parent 0990eb2 commit 9273875
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 13 deletions.
5 changes: 5 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Custom OpenSearch image: the stock 2.11.0 distribution plus the analysis
# plugins referenced by the application's analyzers (kuromoji tokenizer/filters,
# smartcn tokenizer/filters, and the ICU normalizer).
FROM opensearchproject/opensearch:2.11.0

# Install all plugins in one RUN so they end up in a single image layer;
# `&&` aborts the build as soon as any installation fails.
RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-kuromoji \
    && /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-smartcn \
    && /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-icu
28 changes: 28 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<skipITs>true</skipITs>
<surefire-plugin.version>3.2.2</surefire-plugin.version>
<version.formatter.plugin>2.23.0</version.formatter.plugin>
<version.docker.plugin>0.43.4</version.docker.plugin>
</properties>
<dependencyManagement>
<dependencies>
Expand Down Expand Up @@ -246,6 +247,33 @@
<skip>${format.skip}</skip>
</configuration>
</plugin>
<!--
  Builds the local `opensearch-custom-plugin` Docker image from the Dockerfile in the
  project base directory. The image name and tag declared here are the ones referenced by
  `quarkus.elasticsearch.devservices.image-name` in application.properties, and the tag
  mirrors the OpenSearch version of the Dockerfile's base image; keep all three in sync.
-->
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<version>${version.docker.plugin}</version>
<configuration>
<images>
<image>
<name>opensearch-custom-plugin</name>
<build>
<dockerFile>${project.basedir}/Dockerfile</dockerFile>
<tags>
<!-- Same version as the base image in the Dockerfile. -->
<tag>2.11.0</tag>
</tags>
</build>
</image>
</images>
</configuration>
<executions>
<execution>
<id>build</id>
<!-- The `build` goal is bound to the `initialize` phase, so the image is built as part of the Maven lifecycle. -->
<phase>initialize</phase>
<goals>
<goal>build</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
3 changes: 3 additions & 0 deletions src/main/java/io/quarkus/search/app/SearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import org.hibernate.Length;
import org.hibernate.search.engine.search.common.BooleanOperator;
import org.hibernate.search.engine.search.predicate.dsl.SimpleQueryFlag;
import org.hibernate.search.mapper.orm.session.SearchSession;

import org.jboss.resteasy.reactive.RestQuery;
Expand Down Expand Up @@ -79,6 +80,8 @@ public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersi
.field(localizedField("summary_autocomplete", language)).boost(0.5f)
.field(localizedField("fullContent_autocomplete", language)).boost(0.1f)
.matching(q)
// See: https://github.com/elastic/elasticsearch/issues/39905
.flags(SimpleQueryFlag.AND, SimpleQueryFlag.OR)
.defaultOperator(BooleanOperator.AND))
.should(f.match().field("origin").matching("quarkus").boost(50.0f))
.should(f.not(f.match().field("topics").matching("compatibility"))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package io.quarkus.search.app.hibernate;

import java.util.EnumSet;

import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.engine.backend.analysis.AnalyzerNames;
Expand All @@ -23,23 +25,84 @@ public static String autocompleteAnalyzer(Language language) {

/**
 * Registers the analysis definitions on the given context: for each language a "default"
 * analyzer and an "autocomplete" analyzer (named via {@code defaultAnalyzer(language)} and
 * {@code autocompleteAnalyzer(language)}), plus the {@code SORT} normalizer.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so lines from before and
 * after the change appear interleaved (two {@code for} headers, two {@code .tokenFilters(...)}
 * calls per analyzer, a local {@code autocompleteEdgeNgram} filter definition that duplicates
 * the one in {@code sharedFilters}). Resolve against the real file before editing the code.
 *
 * @param context the analysis configuration context on which analyzers, token filters and
 *        normalizers are defined
 */
@Override
public void configure(ElasticsearchAnalysisConfigurationContext context) {
// just to have something for app to start correctly:
for (Language language : Language.values()) {
// for en/es/pt we are going to use the same english configuration since guides are not translated
EnumSet<Language> englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH);
for (Language language : englishLanguages) {
// Registers the per-language token filters and returns their names for use below.
SharedFilters result = sharedFilters(context, language);

// default: standard tokenizer, HTML stripped, then possessive stemming, lowercasing,
// ASCII folding, stop-word removal and regular stemming.
context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters("lowercase", "asciifolding", "stemmer")
.tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(),
result.regularStemmer())
.charFilters("html_strip");
String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code);

// autocomplete: same chain as the default analyzer, with an edge n-gram filter appended last.
context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters("lowercase", "asciifolding", "stemmer", autocompleteEdgeNgram)
.tokenFilters(result.possessiveStemmer(), "lowercase", "asciifolding", result.stop(),
result.regularStemmer(), result.autocompleteEdgeNgram())
.charFilters("html_strip");
context.tokenFilter(autocompleteEdgeNgram)
.type("edge_ngram")
.param("min_gram", 1)
.param("max_gram", 10);
}

// japanese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-analyzer.html
// Uses the kuromoji tokenizer and filters together with the shared (English-configured) filters;
// presumably relies on the analysis-kuromoji and analysis-icu plugins being installed -- confirm.
SharedFilters japanese = sharedFilters(context, Language.JAPANESE);
context.analyzer(defaultAnalyzer(Language.JAPANESE)).custom()
.tokenizer("kuromoji_tokenizer")
.tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase",
"asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer())
.charFilters("icu_normalizer", "html_strip");

// autocomplete:
// NOTE(review): unlike the Japanese default analyzer above, this one does not apply the
// "icu_normalizer" char filter -- confirm whether the difference is intentional.
context.analyzer(autocompleteAnalyzer(Language.JAPANESE)).custom()
.tokenizer("kuromoji_tokenizer")
.tokenFilters("kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(), "lowercase",
"asciifolding", "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer(),
japanese.autocompleteEdgeNgram())
.charFilters("html_strip");

// chinese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
// Uses the smartcn tokenizer and stop filter together with the shared (English-configured) filters;
// presumably relies on the analysis-smartcn plugin being installed -- confirm.
SharedFilters chinese = sharedFilters(context, Language.CHINESE);
context.analyzer(defaultAnalyzer(Language.CHINESE)).custom()
.tokenizer("smartcn_tokenizer")
.tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(),
chinese.regularStemmer())
.charFilters("html_strip");

// autocomplete: same chain as the Chinese default analyzer, with the edge n-gram filter appended last.
context.analyzer(autocompleteAnalyzer(Language.CHINESE)).custom()
.tokenizer("smartcn_tokenizer")
.tokenFilters(chinese.possessiveStemmer(), "lowercase", "asciifolding", "smartcn_stop", chinese.stop(),
chinese.regularStemmer(), chinese.autocompleteEdgeNgram())
.charFilters("html_strip");

// Normalizer registered under the SORT name: lowercases the value.
context.normalizer(SORT).custom()
.tokenFilters("lowercase");
}

/**
 * Defines the token filters that every language's analyzers have in common and returns
 * the names they were registered under.
 *
 * <p>Each filter name is suffixed with {@code language.code}. Whatever the language, the
 * filters themselves are configured for English: {@code _english_} stop words, the
 * {@code english} stemmer and the {@code possessive_english} stemmer. The edge n-gram
 * filter produces grams of length 1 to 10.
 *
 * @param context the analysis configuration context the filters are defined on
 * @param language the language whose code is used as the filter-name suffix
 * @return the names of the four filters that were defined
 */
private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
    // Work out all the names up front so each definition below reads off the same record
    // that is handed back to the caller.
    SharedFilters names = new SharedFilters(
            "stop_%s".formatted(language.code),
            "stemmer_%s".formatted(language.code),
            "possessive_stemmer_%s".formatted(language.code),
            "autocomplete_edge_ngram_%s".formatted(language.code));

    context.tokenFilter(names.stop()).type("stop").param("stopwords", "_english_");
    context.tokenFilter(names.regularStemmer()).type("stemmer").param("language", "english");
    context.tokenFilter(names.possessiveStemmer()).type("stemmer").param("language", "possessive_english");
    context.tokenFilter(names.autocompleteEdgeNgram()).type("edge_ngram")
            .param("min_gram", 1)
            .param("max_gram", 10);

    return names;
}

/**
 * Names of the token filters defined by {@code sharedFilters} for one language.
 *
 * @param stop name of the stop-word filter
 * @param regularStemmer name of the regular stemmer filter
 * @param possessiveStemmer name of the possessive stemmer filter
 * @param autocompleteEdgeNgram name of the edge n-gram filter used by autocomplete analyzers
 */
private record SharedFilters(
        String stop,
        String regularStemmer,
        String possessiveStemmer,
        String autocompleteEdgeNgram) {
}
}
2 changes: 1 addition & 1 deletion src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ quarkus.datasource.jdbc.url=jdbc:h2:mem:searchquarkusio
quarkus.hibernate-orm.database.generation=drop-and-create
## Hibernate Search
quarkus.hibernate-search-orm.elasticsearch.version=opensearch:2.11
quarkus.elasticsearch.devservices.image-name=docker.io/opensearchproject/opensearch:2.11.0
quarkus.elasticsearch.devservices.image-name=opensearch-custom-plugin:2.11.0
## We need to apply a custom OpenSearch mapping to exclude very large fields from the _source
quarkus.hibernate-search-orm.elasticsearch.schema-management.mapping-file=indexes/mapping-template.json
quarkus.hibernate-search-orm.elasticsearch.schema-management.settings-file=indexes/settings-template.json
Expand Down
6 changes: 3 additions & 3 deletions src/test/java/io/quarkus/search/app/SearchServiceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ private static List<Arguments> relevance() {
GuideRef.HIBERNATE_REACTIVE,
GuideRef.HIBERNATE_REACTIVE_PANACHE,
GuideRef.HIBERNATE_ORM_PANACHE,
GuideRef.HIBERNATE_ORM,
GuideRef.HIBERNATE_ORM_PANACHE_KOTLIN,
GuideRef.HIBERNATE_ORM,
GuideRef.DUPLICATED_CONTEXT, // contains "Hibernate Reactive"
GuideRef.SPRING_DATA_JPA)),
Arguments.of("jpa", GuideRef.urls(
Expand Down Expand Up @@ -342,8 +342,8 @@ void language() {
.then()
.statusCode(200)
.extract().body().as(SEARCH_RESULT_SEARCH_HITS);
assertThat(result.hits()).extracting(GuideSearchHit::title).contains(
"Hibernate<span class=\"highlighted\"></span><span class=\"highlighted\">索</span><span class=\"highlighted\">指</span><span class=\"highlighted\">南</span>");
assertThat(result.hits()).extracting(GuideSearchHit::title)
.contains("Hibernate<span class=\"highlighted\">搜索</span><span class=\"highlighted\">指南</span>");
}

private static ThrowingConsumer<String> hitsHaveCorrectWordHighlighted(AtomicInteger matches, String word,
Expand Down

0 comments on commit 9273875

Please sign in to comment.