Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Bump Lucene to 8.11.3 and update lucene-gosen #10810

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,10 @@ public long getTotalTokenCount() {
try {
RegexpQuery query = new RegexpQuery(new Term("totalTokenCount", ".*"));
TopDocs docs = luceneSearcher.searcher.search(query, 1000); // Integer.MAX_VALUE might cause OOE on wrong index
if (docs.totalHits == 0) {
if (docs.totalHits.value == 0) {
throw new RuntimeException("Expected 'totalTokenCount' meta documents not found in 1grams index: " + luceneSearcher.directory);
} else if (docs.totalHits > 1000) {
throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits + " in " + luceneSearcher.directory);
} else if (docs.totalHits.value > 1000) {
throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits.value + " in " + luceneSearcher.directory);
} else {
long result = 0;
for (ScoreDoc scoreDoc : docs.scoreDocs) {
Expand Down Expand Up @@ -194,9 +194,9 @@ private long getCount(Term term, LuceneSearcher luceneSearcher) {
long result = 0;
try {
TopDocs docs = luceneSearcher.searcher.search(new TermQuery(term), 2000);
if (docs.totalHits > 2000) {
if (docs.totalHits.value > 2000) {
throw new RuntimeException("More than 2000 matches for '" + term + "' not supported for performance reasons: " +
docs.totalHits + " matches in " + luceneSearcher.directory);
docs.totalHits.value + " matches in " + luceneSearcher.directory);
}
for (ScoreDoc scoreDoc : docs.scoreDocs) {
String countStr = luceneSearcher.reader.document(scoreDoc.doc).get("count");
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException {
try (FSDirectory directory = FSDirectory.open(dir.toPath());
IndexReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms ngrams = fields.terms("ngram");
TermsEnum iterator = ngrams.iterator();
BytesRef next;
int i = 0;
while ((next = iterator.next()) != null) {
String term = next.utf8ToString();
if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
//System.out.println("ignore: " + term);
continue;
}
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
if (topDocs.totalHits == 0) {
throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
} else if (topDocs.totalHits == 1) {
int docId = topDocs.scoreDocs[0].doc;
Document document = reader.document(docId);
Long count = Long.parseLong(document.get("count"));
//System.out.println(term + " -> " + count);
totalCount += count;
if (++i % 10_000 == 0) {
System.out.println(i + " ... " + totalCount);
for (String field : FieldInfos.getIndexedFields(reader)) {
Terms ngrams = MultiTerms.getTerms(reader, field);
TermsEnum iterator = ngrams.iterator();
BytesRef next;
int i = 0;
while ((next = iterator.next()) != null) {
String term = next.utf8ToString();
if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
//System.out.println("ignore: " + term);
continue;
}
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
if (topDocs.totalHits.value == 0) {
throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value);
} else if (topDocs.totalHits.value == 1) {
int docId = topDocs.scoreDocs[0].doc;
Document document = reader.document(docId);
Long count = Long.parseLong(document.get("count"));
//System.out.println(term + " -> " + count);
totalCount += count;
if (++i % 10_000 == 0) {
System.out.println(i + " ... " + totalCount);
}
} else {
throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
} else {
throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* LanguageTool, a natural language style checker
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
Expand Down Expand Up @@ -95,13 +95,8 @@ private Document getDoc(String ngram, long count) {
}

@NotNull
private LongField getCountField(long count) {
FieldType fieldType = new FieldType();
fieldType.setStored(true);
fieldType.setOmitNorms(true);
fieldType.setNumericType(FieldType.NumericType.LONG);
fieldType.setDocValuesType(DocValuesType.NUMERIC);
return new LongField("count", count, fieldType);
private LongPoint getCountField(long count) {
return new LongPoint("count", count);
}
miurahr marked this conversation as resolved.
Show resolved Hide resolved

private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
if (newReader != null) {
reader = newReader;
}*/
index.reader = DirectoryReader.open(index.indexWriter, true);
index.reader = DirectoryReader.open(index.indexWriter, true, true);
miurahr marked this conversation as resolved.
Show resolved Hide resolved
index.searcher = new IndexSearcher(index.reader);
for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
Term ngram = new Term("ngram", entry.getKey());
TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
//System.out.println(ngram + " ==> " + topDocs.totalHits);
if (topDocs.totalHits == 0) {
if (topDocs.totalHits.value == 0) {
Document doc = getDoc(entry.getKey(), entry.getValue());
index.indexWriter.addDocument(doc);
} else if (topDocs.totalHits == 1) {
} else if (topDocs.totalHits.value == 1) {
int docNumber = topDocs.scoreDocs[0].doc;
Document document = index.reader.document(docNumber);
long oldCount = Long.parseLong(document.getField("count").stringValue());
Expand All @@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
// would probably be faster, but we currently rely on the count being a common field:
//indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
} else if (topDocs.totalHits > 1) {
} else if (topDocs.totalHits.value > 1) {
throw new RuntimeException("Got more than one hit for: " + ngram);
}
//System.out.println(" " + entry.getKey() + " -> " + entry.getValue());
Expand All @@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) {
}

@NotNull
private LongField getCountField(long count) {
FieldType fieldType = new FieldType();
fieldType.setStored(true);
fieldType.setOmitNorms(true);
fieldType.setNumericType(FieldType.NumericType.LONG);
fieldType.setDocValuesType(DocValuesType.NUMERIC);
return new LongField("count", count, fieldType);
private LongPoint getCountField(long count) {
return new LongPoint("count", count);
miurahr marked this conversation as resolved.
Show resolved Hide resolved
}

private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
Expand Down Expand Up @@ -269,7 +264,7 @@ static class LuceneLiveIndex {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
directory = FSDirectory.open(dir.toPath());
indexWriter = new IndexWriter(directory, config);
reader = DirectoryReader.open(indexWriter, false);
reader = DirectoryReader.open(indexWriter, false, false);
miurahr marked this conversation as resolved.
Show resolved Hide resolved
searcher = new IndexSearcher(reader);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

/**
* Prototype to find potential upper-only phrases like "Persischer Golf".
Expand All @@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms("ngram");
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
while ((next = termsEnum.next()) != null) {
String term = next.utf8ToString();
count++;
//term = "persischer Golf"; // for testing
String[] parts = term.split(" ");
boolean useful = true;
int lcCount = 0;
List<String> ucParts = new ArrayList<>();
for (String part : parts) {
if (part.length() < MIN_TERM_LEN) {
useful = false;
break;
for (String field: FieldInfos.getIndexedFields(reader)) {
miurahr marked this conversation as resolved.
Show resolved Hide resolved
Terms terms = MultiTerms.getTerms(reader, field);
TermsEnum termsEnum = terms.iterator();
miurahr marked this conversation as resolved.
Show resolved Hide resolved
int count = 0;
BytesRef next;
while ((next = termsEnum.next()) != null) {
String term = next.utf8ToString();
count++;
//term = "persischer Golf"; // for testing
String[] parts = term.split(" ");
boolean useful = true;
int lcCount = 0;
List<String> ucParts = new ArrayList<>();
for (String part : parts) {
if (part.length() < MIN_TERM_LEN) {
useful = false;
break;
}
String uc = StringTools.uppercaseFirstChar(part);
if (!part.equals(uc)) {
lcCount++;
}
ucParts.add(uc);
}
String uc = StringTools.uppercaseFirstChar(part);
if (!part.equals(uc)) {
lcCount++;
if (!useful || lcCount == 0 || lcCount == 2) {
continue;
}
ucParts.add(uc);
}
if (!useful || lcCount == 0 || lcCount == 2) {
continue;
}
String uppercase = String.join(" ", ucParts);
if (term.equals(uppercase)){
continue;
}
long thisCount = getOccurrenceCount(reader, searcher, term);
long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
if (count % 10_000 == 0) {
System.err.println(count + " @ " + term);
}
if (thisCount > LIMIT || thisUpperCount > LIMIT) {
if (thisUpperCount > thisCount) {
if (isRelevant(lt, term)) {
float factor = (float)thisUpperCount / thisCount;
System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
String uppercase = String.join(" ", ucParts);
if (term.equals(uppercase)) {
continue;
}
long thisCount = getOccurrenceCount(reader, searcher, term);
long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
if (count % 10_000 == 0) {
System.err.println(count + " @ " + term);
}
if (thisCount > LIMIT || thisUpperCount > LIMIT) {
if (thisUpperCount > thisCount) {
if (isRelevant(lt, term)) {
float factor = (float) thisUpperCount / thisCount;
System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
}
}
}
}
Expand All @@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept

private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException {
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
if (topDocs.totalHits == 0) {
if (topDocs.totalHits.value == 0) {
return 0;
}
int docId = topDocs.scoreDocs[0].doc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms terms = MultiTerms.getTerms(reader, "ngram");
long max = 0;
String maxTerm = "";
Terms terms = fields.terms("ngram");
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
Expand All @@ -71,5 +70,5 @@ public static void main(String[] args) throws IOException {
}
System.out.println("Max: " + max + " for " + maxTerm);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException {
String ngramIndexDir = args[0];
FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms("ngram");
Terms terms = MultiTerms.getTerms(reader, "ngram");
TermsEnum termsEnum = terms.iterator();
int i = 0;
int needed = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
package org.languagetool.dev.bigdata;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
Expand Down
2 changes: 1 addition & 1 deletion languagetool-language-modules/ja/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

<dependencies>
<dependency>
<groupId>com.github.lucene-gosen</groupId>
<groupId>org.omegat.lucene</groupId>
<artifactId>lucene-gosen</artifactId>
<classifier>ipadic</classifier>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
*/
package org.languagetool.dev;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -112,8 +111,7 @@ private void dumpOccurrences(Set<String> tokens) throws IOException {

private TermsEnum getIterator() throws IOException {
LuceneSearcher luceneSearcher = getLuceneSearcher(3);
Fields fields = MultiFields.getFields(luceneSearcher.getReader());
Terms terms = fields.terms("ngram");
Terms terms = MultiTerms.getTerms(luceneSearcher.getReader(), "ngram");
return terms.iterator();
}

Expand Down
2 changes: 1 addition & 1 deletion languagetool-wikipedia/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

<properties>
<!-- we have to repeat it here as otherwise Grails project languagetool-community-website will fail: -->
<lucene.version>5.5.5</lucene.version>
<lucene.version>8.11.3</lucene.version>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Codebase verification

Deprecated Lucene API Usage Detected

Updating to Lucene 8.11.3 may introduce compatibility issues, because the codebase still uses Lucene APIs that were deprecated or removed after 5.5.5.

Areas to Address:

  • Deprecated Methods:

    • DirectoryReader.open is used in multiple files:
      • languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
      • languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
      • languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
      • [List continues as per script output]
  • Existing TODO Comments:

    • languagetool-core/src/main/java/org/languagetool/languagemodel/BaseLanguageModel.java
      • // TODO: lm.getCount("_START_") returns 0 for Google data -- see getCount(String) in LuceneLanguageModel

Recommendations:

  1. Update Deprecated APIs:

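    • Replace DirectoryReader.open overloads that are no longer available with their Lucene 8.11.3 equivalents, e.g. DirectoryReader.open(IndexWriter, boolean, boolean) in place of the two-argument IndexWriter form.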
  2. Address TODOs:

    • Resolve the TODO comment to ensure the getCount method functions correctly with the updated Lucene version.
  3. Comprehensive Testing:

    • After making the necessary changes, perform thorough testing to ensure that all Lucene-dependent functionalities work as expected with the new version.
🔗 Analysis chain

LGTM! Verify compatibility with the updated Lucene version.

The Lucene version update from 5.5.5 to 8.11.3 aligns with the PR objectives. This is a significant version jump, which may introduce breaking changes or new features.

To ensure smooth integration, please verify the following:

  1. Compatibility of all Lucene-dependent code with version 8.11.3.
  2. Update of any Lucene-related documentation or comments in the codebase.
  3. Successful build and all tests passing with the new version.

Run the following script (reproduced under "Scripts executed" below) to check for potential compatibility issues:

This script will help identify areas that might need attention due to the Lucene version update.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for potential Lucene compatibility issues

# Test: Search for Lucene imports and usages
echo "Searching for Lucene imports and usages:"
rg --type java "import org\.apache\.lucene" -A 5

# Test: Check for any TODO or FIXME comments related to Lucene
echo "Checking for TODO or FIXME comments related to Lucene:"
rg --type java "(TODO|FIXME).*Lucene"

# Test: Look for deprecated Lucene API usage
echo "Checking for potentially deprecated Lucene API usage:"
rg --type java "MultiFields\.getFields|DirectoryReader\.open|LongField"

Length of output: 45771

</properties>

<dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
package org.languagetool.dev.dumpcheck;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
*/
package org.languagetool.dev.index;

import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
Expand All @@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer {
private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length!

private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);

Expand Down Expand Up @@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException {
while(true) {
if(this.bufferIndex >= this.dataLen) {
this.offset += this.dataLen;
this.charUtils.fill(this.ioBuffer, this.input);
CharacterUtils.fill(this.ioBuffer, this.input);
if(this.ioBuffer.getLength() == 0) {
this.dataLen = 0;
if(length <= 0) {
Expand All @@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException {
this.bufferIndex = 0;
}

int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex);
int charCount = Character.charCount(c);
this.bufferIndex += charCount;
if(this.isTokenChar(c)) {
Expand Down
Loading
Loading