forked from langchain4j/langchain4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Add Oracle document store classes #38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# connection settings | ||
ORACLE_JDBC_URL=jdbc:oracle:thin:@//<host>:1521/<service> | ||
ORACLE_JDBC_USER=<user> | ||
ORACLE_JDBC_PASSWORD=<password> | ||
|
||
# sample files | ||
# sample PDF and text files | ||
DEMO_DS_PDF_FILE=</path/to/file.pdf> | ||
DEMO_DS_TEXT_FILE=</path/to/file.txt> | ||
# sample directory of files | ||
DEMO_DS_DIR=<dir> | ||
# sample table | ||
DEMO_DS_OWNER=<user> | ||
DEMO_DS_TABLE=<table> | ||
DEMO_DS_COLUMN=<column> | ||
|
||
# proxy for REST calls | ||
DEMO_PROXY=<host>:<port> | ||
|
||
# ONNX model | ||
DEMO_ONNX_DIR=<dir_alias> | ||
DEMO_ONNX_FILE=<file> | ||
DEMO_ONNX_MODEL=<name> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
170 changes: 170 additions & 0 deletions
170
...racle/src/main/java/dev/langchain4j/data/document/loader/oracle/OracleDocumentLoader.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
package dev.langchain4j.data.document.loader.oracle; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
||
import dev.langchain4j.data.document.Document; | ||
import dev.langchain4j.data.document.Metadata; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.sql.Blob; | ||
import java.sql.Connection; | ||
import java.sql.PreparedStatement; | ||
import java.sql.ResultSet; | ||
import java.sql.SQLException; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class OracleDocumentLoader { | ||
|
||
private static final Logger log = LoggerFactory.getLogger(OracleDocumentLoader.class); | ||
|
||
private final Connection conn; | ||
|
||
public OracleDocumentLoader(Connection conn) { | ||
this.conn = conn; | ||
} | ||
|
||
public List<Document> loadDocuments(String pref) { | ||
List<Document> documents = new ArrayList<>(); | ||
|
||
try { | ||
ObjectMapper mapper = new ObjectMapper(); | ||
JsonNode rootNode = mapper.readTree(pref); | ||
JsonNode fileNode = rootNode.path("file"); | ||
JsonNode dirNode = rootNode.path("dir"); | ||
JsonNode ownerNode = rootNode.path("owner"); | ||
JsonNode tableNode = rootNode.path("tablename"); | ||
JsonNode colNode = rootNode.path("colname"); | ||
|
||
if (fileNode.textValue() != null) { | ||
String filename = fileNode.textValue(); | ||
Document doc = loadDocument(filename, pref); | ||
if (doc != null) { | ||
documents.add(doc); | ||
} | ||
} else if (dirNode.textValue() != null) { | ||
String dir = dirNode.textValue(); | ||
Path root = Paths.get(dir); | ||
Files.walk(root).forEach(path -> { | ||
if (path.toFile().isFile()) { | ||
Document doc = loadDocument(path.toFile().toString(), pref); | ||
if (doc != null) { | ||
documents.add(doc); | ||
} | ||
} | ||
}); | ||
} else if (colNode.textValue() != null) { | ||
String column = colNode.textValue(); | ||
|
||
String table = tableNode.textValue(); | ||
String owner = ownerNode.textValue(); | ||
if (table == null) { | ||
throw new RuntimeException("Missing table in preference"); | ||
} | ||
if (owner == null) { | ||
throw new RuntimeException("Missing owner in preference"); | ||
} | ||
|
||
documents.addAll(loadDocuments(owner, table, column, pref)); | ||
} else { | ||
throw new RuntimeException("Invalid preference"); | ||
} | ||
} catch (IOException | RuntimeException ex) { | ||
String message = ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need to catch the RuntimeException, and I think we should re-throw the exception if we can't recover from the IOException: https://www.baeldung.com/java-exceptions |
||
log.warn("Failed to load '{}': {}", pref, message); | ||
} | ||
return documents; | ||
} | ||
|
||
private Document loadDocument(String filename, String pref) { | ||
Document document = null; | ||
|
||
try { | ||
byte[] bytes = Files.readAllBytes(Paths.get(filename)); | ||
|
||
String query = "select dbms_vector_chain.utl_to_text(?, json(?)) text, dbms_vector_chain.utl_to_text(?, json('{\"plaintext\": \"false\"}')) metadata from dual"; | ||
|
||
try (PreparedStatement stmt = conn.prepareStatement(query)) { | ||
Blob blob = conn.createBlob(); | ||
blob.setBytes(1, bytes); | ||
|
||
stmt.setBlob(1, blob); | ||
stmt.setObject(2, pref); | ||
stmt.setBlob(3, blob); | ||
|
||
try (ResultSet rs = stmt.executeQuery()) { | ||
while (rs.next()) { | ||
String text = rs.getString("text"); | ||
String html = rs.getString("metadata"); | ||
|
||
Metadata metadata = getMetadata(html); | ||
Path path = Paths.get(filename); | ||
metadata.put(Document.FILE_NAME, path.getFileName().toString()); | ||
metadata.put(Document.ABSOLUTE_DIRECTORY_PATH, path.getParent().toString()); | ||
document = new Document(text, metadata); | ||
} | ||
} | ||
} | ||
} catch (IOException | SQLException e) { | ||
String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage(); | ||
log.warn("Failed to load '{}': {}", filename, message); | ||
} | ||
|
||
return document; | ||
} | ||
|
||
private List<Document> loadDocuments(String owner, String table, String column, String pref) { | ||
List<Document> documents = new ArrayList<>(); | ||
|
||
String query = String.format("select dbms_vector_chain.utl_to_text(t.%s, json(?)) text, dbms_vector_chain.utl_to_text(t.%s, json('{\"plaintext\": \"false\"}')) metadata from %s.%s t", | ||
column, column, owner, table); | ||
try { | ||
try (PreparedStatement stmt = conn.prepareStatement(query)) { | ||
stmt.setObject(1, pref); | ||
try (ResultSet rs = stmt.executeQuery()) { | ||
while (rs.next()) { | ||
String text = rs.getString("text"); | ||
String html = rs.getString("metadata"); | ||
|
||
Metadata metadata = getMetadata(html); | ||
Document doc = new Document(text, metadata); | ||
documents.add(doc); | ||
} | ||
} | ||
} | ||
} catch (SQLException e) { | ||
String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage(); | ||
log.warn("Failed to load '{}': {}", column, message); | ||
} | ||
|
||
return documents; | ||
} | ||
|
||
private static Metadata getMetadata(String html) { | ||
Metadata metadata = new Metadata(); | ||
|
||
org.jsoup.nodes.Document doc = Jsoup.parse(html); | ||
Elements metaTags = doc.getElementsByTag("meta"); | ||
for (Element metaTag : metaTags) { | ||
String name = metaTag.attr("name"); | ||
if (name.isEmpty()) { | ||
continue; | ||
} | ||
String content = metaTag.attr("content"); | ||
metadata.put(name, content); | ||
} | ||
|
||
return metadata; | ||
} | ||
} |
13 changes: 13 additions & 0 deletions
13
langchain4j-oracle/src/main/java/dev/langchain4j/data/document/splitter/oracle/Chunk.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package dev.langchain4j.data.document.splitter.oracle; | ||
|
||
/**
 * Plain data holder for one chunk record.
 *
 * <p>The fields are public and use snake_case names; OracleDocumentSplitter
 * binds each JSON row returned by {@code dbms_vector_chain.utl_to_chunks} to
 * this class with Jackson and reads {@link #chunk_data} from it.
 */
public class Chunk {

    /** Identifier of the chunk (presumably its sequence number - confirm against the database output). */
    public int chunk_id;

    /** Offset of the chunk (presumably within the source text - confirm). */
    public int chunk_offset;

    /** Length of the chunk (units not visible here - confirm). */
    public int chunk_length;

    /** Text content of the chunk. */
    public String chunk_data;

    /** Creates an empty chunk; all fields start at their Java defaults. */
    public Chunk() {
    }
}
101 changes: 101 additions & 0 deletions
101
...e/src/main/java/dev/langchain4j/data/document/splitter/oracle/OracleDocumentSplitter.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package dev.langchain4j.data.document.splitter.oracle; | ||
|
||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
||
import dev.langchain4j.data.document.Document; | ||
import dev.langchain4j.data.document.DocumentSplitter; | ||
import dev.langchain4j.data.document.Metadata; | ||
import dev.langchain4j.data.segment.TextSegment; | ||
|
||
import java.io.IOException; | ||
import java.sql.Connection; | ||
import java.sql.PreparedStatement; | ||
import java.sql.ResultSet; | ||
import java.sql.SQLException; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class OracleDocumentSplitter implements DocumentSplitter { | ||
|
||
private static final Logger log = LoggerFactory.getLogger(OracleDocumentSplitter.class); | ||
|
||
private static final String INDEX = "index"; | ||
|
||
private final Connection conn; | ||
private final String pref; | ||
|
||
public OracleDocumentSplitter(Connection conn, String pref) { | ||
this.conn = conn; | ||
this.pref = pref; | ||
} | ||
|
||
@Override | ||
public List<TextSegment> split(Document document) { | ||
List<TextSegment> segments = new ArrayList<>(); | ||
String[] parts = split(document.text()); | ||
int index = 0; | ||
for (String part : parts) { | ||
segments.add(createSegment(part, document, index)); | ||
index++; | ||
} | ||
return segments; | ||
} | ||
|
||
@Override | ||
public List<TextSegment> splitAll(List<Document> list) { | ||
return DocumentSplitter.super.splitAll(list); | ||
} | ||
|
||
/** | ||
* Splits the provided text into parts. Implementation API. | ||
* | ||
* @param content The text to be split. | ||
* @return An array of parts. | ||
*/ | ||
public String[] split(String content) { | ||
|
||
List<String> strArr = new ArrayList<>(); | ||
|
||
try { | ||
String query = "select t.column_value as data from dbms_vector_chain.utl_to_chunks(?, json(?)) t"; | ||
try (PreparedStatement stmt = conn.prepareStatement(query)) { | ||
stmt.setObject(1, content); | ||
stmt.setObject(2, pref); | ||
try (ResultSet rs = stmt.executeQuery()) { | ||
while (rs.next()) { | ||
String text = rs.getString("data"); | ||
|
||
ObjectMapper mapper = new ObjectMapper(); | ||
Chunk chunk = mapper.readValue(text, Chunk.class); | ||
strArr.add(chunk.chunk_data); | ||
} | ||
} | ||
} | ||
} catch (IOException | SQLException ex) { | ||
String message = ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage(); | ||
log.warn("Failed to split '{}': {}", pref, message); | ||
} | ||
|
||
return strArr.toArray(new String[strArr.size()]); | ||
} | ||
|
||
/** | ||
* Creates a new {@link TextSegment} from the provided text and document. | ||
* | ||
* <p> | ||
* The segment inherits all metadata from the document. The segment also | ||
* includes an "index" metadata key representing the segment position within | ||
* the document. | ||
* | ||
* @param text The text of the segment. | ||
* @param document The document to which the segment belongs. | ||
* @param index The index of the segment within the document. | ||
*/ | ||
static TextSegment createSegment(String text, Document document, int index) { | ||
Metadata metadata = document.metadata().copy().put(INDEX, String.valueOf(index)); | ||
return TextSegment.from(text, metadata); | ||
} | ||
} |
11 changes: 11 additions & 0 deletions
11
langchain4j-oracle/src/main/java/dev/langchain4j/model/oracle/Embedding.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package dev.langchain4j.model.oracle; | ||
|
||
/**
 * Plain data holder for one embedding record.
 *
 * <p>The fields are public and use snake_case names.
 * NOTE(review): presumably these mirror the keys of a JSON row produced by the
 * database so a JSON mapper can bind to them directly - confirm against the caller.
 */
public class Embedding {

    /** Identifier of the embedding. */
    public int embed_id;

    /** Data associated with the embedding (presumably the embedded text - confirm). */
    public String embed_data;

    /** The vector, held as a string (exact format not visible here - confirm). */
    public String embed_vector;

    /** Creates an empty embedding; all fields start at their Java defaults. */
    public Embedding() {
    }
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should add more information here, and it's probably more of an InvalidParameterException: https://docs.oracle.com/javase/8/docs/api/java/security/InvalidParameterException.html