diff --git a/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala b/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala
index 4f7e69b..980af8e 100644
--- a/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala
+++ b/lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala
@@ -104,10 +104,9 @@ sealed abstract class AnalyzerBuilder private[lucene] (config: Config) {
   /** Build the Analyzer wrapped inside a Resource. */
   def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer]
 
-  /** Directly construct a tokenizing function
-    */
+  /** Build a tokenizing function that uses the Analyzer and collects tokens in a vector */
   def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
-    build.map(a => Tokenizer.vectorTokenizer(a))
+    Tokenizer.vectorTokenizer(build)
 
   private[lucene] def mkFromStandardTokenizer[F[_]](
       config: Config
diff --git a/lucene/src/main/scala/textmogrify/lucene/AnalyzerResource.scala b/lucene/src/main/scala/textmogrify/lucene/AnalyzerResource.scala
index 9db0be4..8340364 100644
--- a/lucene/src/main/scala/textmogrify/lucene/AnalyzerResource.scala
+++ b/lucene/src/main/scala/textmogrify/lucene/AnalyzerResource.scala
@@ -26,12 +26,4 @@ object AnalyzerResource {
     */
   def fromAnalyzer[F[_]](analyzer: => Analyzer)(implicit F: Sync[F]): Resource[F, Analyzer] =
     Resource.make(F.delay(analyzer))(analyzer => F.delay(analyzer.close()))
-
-  /** Construct a tokenizing function directly from an Analyzer
-    */
-  def tokenizer[F[_]](
-      analyzer: => Analyzer
-  )(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
-    fromAnalyzer(analyzer)
-      .map(a => Tokenizer.vectorTokenizer(a))
 }
diff --git a/lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala b/lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala
index a7487cd..0ec8bce 100644
--- a/lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala
+++ b/lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala
@@ -16,6 +16,7 @@
 
 package textmogrify.lucene
 
+import cats.effect.Resource
 import cats.effect.kernel.Sync
 import scala.collection.mutable.ArrayBuffer
 import java.io.StringReader
@@ -27,8 +28,10 @@ object Tokenizer {
   /** Build a tokenizing function that runs its input through the Analyzer and collects
    * all tokens into a `Vector`
    */
-  def vectorTokenizer[F[_]](analyzer: Analyzer)(implicit F: Sync[F]): String => F[Vector[String]] =
-    (s: String) =>
+  def vectorTokenizer[F[_]](
+      analyzer: Resource[F, Analyzer]
+  )(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
+    analyzer.map { analyzer => (s: String) =>
       F.delay {
         val ts = analyzer.tokenStream("textmogrify-field", new StringReader(s))
         val termAtt = ts.addAttribute(classOf[CharTermAttribute])
@@ -42,4 +45,5 @@ object Tokenizer {
         ts.close()
         arr.toVector
       }
+    }
 }
diff --git a/lucene/src/test/scala/textmogrify/lucene/AnalyzerResourceSuite.scala b/lucene/src/test/scala/textmogrify/lucene/AnalyzerResourceSuite.scala
index 237cb53..13101c7 100644
--- a/lucene/src/test/scala/textmogrify/lucene/AnalyzerResourceSuite.scala
+++ b/lucene/src/test/scala/textmogrify/lucene/AnalyzerResourceSuite.scala
@@ -24,16 +24,18 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer
 class AnalyzerResourceSuite extends CatsEffectSuite {
 
   test("tokenizer should work") {
-    val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
-    val actual = analyzer.use { f =>
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       f("Hello my name is Neeko")
     }
     assertIO(actual, Vector("hello", "my", "name", "neeko"))
   }
 
   test("tokenizer should yield a func that can be used multiple times") {
-    val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
-    val actual = analyzer.use { f =>
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       for {
         v1 <- f("Hello my name is Neeko")
         v2 <- f("I enjoy jumping on counters")
@@ -50,14 +52,15 @@ class AnalyzerResourceSuite extends CatsEffectSuite {
     import org.apache.lucene.analysis.LowerCaseFilter
     import org.apache.lucene.analysis.Analyzer
 
-    val stemmer = AnalyzerResource.tokenizer[IO](new Analyzer {
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new Analyzer {
       protected def createComponents(fieldName: String): TokenStreamComponents = {
         val source = new StandardTokenizer()
         val tokens = new LowerCaseFilter(source)
         new TokenStreamComponents(source, new PorterStemFilter(tokens))
       }
     })
-    val actual = stemmer.use { f =>
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       for {
         v1 <- f("Hello my name is Neeko")
         v2 <- f("I enjoy jumping on counters")