Skip to content

Add Oracle document store classes #38

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions langchain4j-oracle/.env-example
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# connection settings
ORACLE_JDBC_URL=jdbc:oracle:thin:@//<host>:1521/<service>
ORACLE_JDBC_USER=<user>
ORACLE_JDBC_PASSWORD=<password>

# sample files
# sample PDF and text files
DEMO_DS_PDF_FILE=</path/to/file.pdf>
DEMO_DS_TEXT_FILE=</path/to/file.txt>
# sample directory of files
DEMO_DS_DIR=<dir>
# sample table
DEMO_DS_OWNER=<user>
DEMO_DS_TABLE=<table>
DEMO_DS_COLUMN=<column>

# proxy for REST calls
DEMO_PROXY=<host>:<port>

# ONNX model
DEMO_ONNX_DIR=<dir_alias>
DEMO_ONNX_FILE=<file>
DEMO_ONNX_MODEL=<name>
27 changes: 27 additions & 0 deletions langchain4j-oracle/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,33 @@
<scope>test</scope>
</dependency>

<!-- document store dependencies -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>io.github.cdimascio</groupId>
<artifactId>dotenv-java</artifactId>
<version>3.0.0</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package dev.langchain4j.data.document.loader.oracle;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.Metadata;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Blob;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class OracleDocumentLoader {

private static final Logger log = LoggerFactory.getLogger(OracleDocumentLoader.class);

private final Connection conn;

/**
 * Creates a loader that runs its queries on the given JDBC connection.
 *
 * @param conn connection stored for later use; it is not validated or opened here
 */
public OracleDocumentLoader(Connection conn) {
this.conn = conn;
}

public List<Document> loadDocuments(String pref) {
List<Document> documents = new ArrayList<>();

try {
ObjectMapper mapper = new ObjectMapper();
JsonNode rootNode = mapper.readTree(pref);
JsonNode fileNode = rootNode.path("file");
JsonNode dirNode = rootNode.path("dir");
JsonNode ownerNode = rootNode.path("owner");
JsonNode tableNode = rootNode.path("tablename");
JsonNode colNode = rootNode.path("colname");

if (fileNode.textValue() != null) {
String filename = fileNode.textValue();
Document doc = loadDocument(filename, pref);
if (doc != null) {
documents.add(doc);
}
} else if (dirNode.textValue() != null) {
String dir = dirNode.textValue();
Path root = Paths.get(dir);
Files.walk(root).forEach(path -> {
if (path.toFile().isFile()) {
Document doc = loadDocument(path.toFile().toString(), pref);
if (doc != null) {
documents.add(doc);
}
}
});
} else if (colNode.textValue() != null) {
String column = colNode.textValue();

String table = tableNode.textValue();
String owner = ownerNode.textValue();
if (table == null) {
throw new RuntimeException("Missing table in preference");
}
if (owner == null) {
throw new RuntimeException("Missing owner in preference");
}

documents.addAll(loadDocuments(owner, table, column, pref));
} else {
throw new RuntimeException("Invalid preference");
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add more information in here, and it's probably more a InvalidParameterException: https://docs.oracle.com/javase/8/docs/api/java/security/InvalidParameterException.html

}
} catch (IOException | RuntimeException ex) {
String message = ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage();
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to catch the RuntimeException, and I think we should re-throw the exception if we can't recover from the IOException: https://www.baeldung.com/java-exceptions

log.warn("Failed to load '{}': {}", pref, message);
}
return documents;
}

/**
 * Loads one file by sending its bytes to {@code dbms_vector_chain.utl_to_text}.
 *
 * <p>The query converts the content twice: once with the caller's preference to obtain the
 * document text, and once with {@code {"plaintext": "false"}} whose result is scanned for
 * {@code <meta>} tags to build the metadata. The file name and its absolute parent directory
 * are added to the metadata.
 *
 * @param filename path of the file to read
 * @param pref     JSON preference passed to the text conversion
 * @return the loaded document, or {@code null} when reading or converting the file failed
 *         (the failure is logged at WARN level)
 */
private Document loadDocument(String filename, String pref) {
    Document document = null;

    try {
        // Resolve once; an absolute path always has a parent for a regular file, whereas
        // Paths.get("file.pdf").getParent() is null and used to cause a NullPointerException.
        Path path = Paths.get(filename).toAbsolutePath();
        byte[] bytes = Files.readAllBytes(path);

        String query = "select dbms_vector_chain.utl_to_text(?, json(?)) text, dbms_vector_chain.utl_to_text(?, json('{\"plaintext\": \"false\"}')) metadata from dual";

        try (PreparedStatement stmt = conn.prepareStatement(query)) {
            Blob blob = conn.createBlob();
            try {
                blob.setBytes(1, bytes);

                stmt.setBlob(1, blob);
                stmt.setObject(2, pref);
                stmt.setBlob(3, blob);

                try (ResultSet rs = stmt.executeQuery()) {
                    while (rs.next()) {
                        String text = rs.getString("text");
                        String html = rs.getString("metadata");

                        Metadata metadata = getMetadata(html);
                        Path fileName = path.getFileName();
                        Path parent = path.getParent();
                        if (fileName != null) {
                            metadata.put(Document.FILE_NAME, fileName.toString());
                        }
                        if (parent != null) {
                            metadata.put(Document.ABSOLUTE_DIRECTORY_PATH, parent.toString());
                        }
                        document = new Document(text, metadata);
                    }
                }
            } finally {
                // Release the temporary LOB instead of waiting for the connection to close.
                blob.free();
            }
        }
    } catch (IOException | SQLException e) {
        String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage();
        log.warn("Failed to load '{}': {}", filename, message, e);
    }

    return document;
}

/**
 * Matches an unquoted identifier (a letter followed by letters, digits, {@code _}, {@code $}
 * or {@code #}) or a double-quoted identifier that contains no double quote.
 */
private static final java.util.regex.Pattern SQL_IDENTIFIER =
        java.util.regex.Pattern.compile("[\\p{L}][\\p{L}\\p{N}_$#]*|\"[^\"]+\"");

/** Tells whether {@code name} can be spliced into a statement as a single identifier. */
private static boolean isSqlIdentifier(String name) {
    return name != null && SQL_IDENTIFIER.matcher(name).matches();
}

/**
 * Loads one document per row of {@code owner.table}, converting {@code column} with
 * {@code dbms_vector_chain.utl_to_text}.
 *
 * <p>Identifiers cannot be bound as statement parameters, so owner, table and column are
 * checked against {@link #SQL_IDENTIFIER} before being formatted into the query; anything else
 * is rejected to keep arbitrary SQL out of the statement.
 *
 * @param owner  schema that owns the table
 * @param table  table to read
 * @param column column holding the content to convert
 * @param pref   JSON preference passed to the text conversion
 * @return the loaded documents; empty when an identifier is rejected or the query fails
 *         (both are logged at WARN level)
 */
private List<Document> loadDocuments(String owner, String table, String column, String pref) {
    List<Document> documents = new ArrayList<>();

    if (!isSqlIdentifier(owner) || !isSqlIdentifier(table) || !isSqlIdentifier(column)) {
        log.warn("Failed to load '{}': {}", column,
                "owner, table and column must be valid SQL identifiers");
        return documents;
    }

    String query = String.format("select dbms_vector_chain.utl_to_text(t.%s, json(?)) text, dbms_vector_chain.utl_to_text(t.%s, json('{\"plaintext\": \"false\"}')) metadata from %s.%s t",
            column, column, owner, table);
    try (PreparedStatement stmt = conn.prepareStatement(query)) {
        stmt.setObject(1, pref);
        try (ResultSet rs = stmt.executeQuery()) {
            while (rs.next()) {
                String text = rs.getString("text");
                String html = rs.getString("metadata");

                Metadata metadata = getMetadata(html);
                documents.add(new Document(text, metadata));
            }
        }
    } catch (SQLException e) {
        String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage();
        log.warn("Failed to load '{}': {}", column, message, e);
    }

    return documents;
}

/**
 * Builds document metadata from the {@code <meta>} tags of an HTML string.
 *
 * <p>Each {@code <meta>} element with a non-empty {@code name} attribute contributes one entry
 * whose value is the element's {@code content} attribute. Tags without a name are skipped.
 *
 * @param html HTML to scan; may be {@code null} or empty (for example when the database
 *             returned SQL NULL), in which case empty metadata is returned
 * @return the collected metadata, never {@code null}
 */
private static Metadata getMetadata(String html) {
    Metadata metadata = new Metadata();

    // A NULL column value would otherwise make the parser throw and abort the whole load.
    if (html == null || html.isEmpty()) {
        return metadata;
    }

    org.jsoup.nodes.Document doc = Jsoup.parse(html);
    Elements metaTags = doc.getElementsByTag("meta");
    for (Element metaTag : metaTags) {
        String name = metaTag.attr("name");
        if (name.isEmpty()) {
            continue;
        }
        String content = metaTag.attr("content");
        metadata.put(name, content);
    }

    return metadata;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package dev.langchain4j.data.document.splitter.oracle;

/**
 * Plain data holder for one chunk row.
 *
 * <p>{@code OracleDocumentSplitter} fills instances of this class with Jackson from the JSON
 * values returned by {@code dbms_vector_chain.utl_to_chunks}; the field names therefore have
 * to stay exactly as they are.
 */
public class Chunk {

    /** Value of the {@code chunk_id} property. */
    public int chunk_id;

    /** Value of the {@code chunk_offset} property. */
    public int chunk_offset;

    /** Value of the {@code chunk_length} property. */
    public int chunk_length;

    /** Value of the {@code chunk_data} property; the only field the splitter reads. */
    public String chunk_data;

    /** No-argument constructor; leaves every field at its default value. */
    public Chunk() { }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package dev.langchain4j.data.document.splitter.oracle;

import com.fasterxml.jackson.databind.ObjectMapper;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link DocumentSplitter} that delegates chunking to the database through
 * {@code dbms_vector_chain.utl_to_chunks}.
 */
public class OracleDocumentSplitter implements DocumentSplitter {

    private static final Logger log = LoggerFactory.getLogger(OracleDocumentSplitter.class);

    private static final String INDEX = "index";

    /** Shared JSON mapper; built once instead of once per result row. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private final Connection conn;
    private final String pref;

    /**
     * Creates a splitter.
     *
     * @param conn connection used to run the chunking query
     * @param pref JSON preference passed to {@code utl_to_chunks} on every split
     */
    public OracleDocumentSplitter(Connection conn, String pref) {
        this.conn = conn;
        this.pref = pref;
    }

    /**
     * Splits the document text and wraps every part in a {@link TextSegment} that carries the
     * document metadata plus its position under the {@code "index"} key.
     *
     * @param document the document to split
     * @return the segments in the order returned by the database; empty when splitting failed
     */
    @Override
    public List<TextSegment> split(Document document) {
        List<TextSegment> segments = new ArrayList<>();
        String[] parts = split(document.text());
        for (int index = 0; index < parts.length; index++) {
            segments.add(createSegment(parts[index], document, index));
        }
        return segments;
    }

    /** Delegates to the default {@link DocumentSplitter#splitAll(List)} implementation. */
    @Override
    public List<TextSegment> splitAll(List<Document> list) {
        return DocumentSplitter.super.splitAll(list);
    }

    /**
     * Splits the provided text into parts. Implementation API.
     *
     * <p>Best-effort: when the query or the JSON decoding fails, the failure is logged at WARN
     * level and the parts collected so far (possibly none) are returned.
     *
     * @param content The text to be split.
     * @return An array of parts.
     */
    public String[] split(String content) {
        List<String> chunks = new ArrayList<>();

        String query = "select t.column_value as data from dbms_vector_chain.utl_to_chunks(?, json(?)) t";
        try (PreparedStatement stmt = conn.prepareStatement(query)) {
            stmt.setObject(1, content);
            stmt.setObject(2, pref);
            try (ResultSet rs = stmt.executeQuery()) {
                while (rs.next()) {
                    // Each row is a JSON object; only its chunk_data text is kept.
                    Chunk chunk = MAPPER.readValue(rs.getString("data"), Chunk.class);
                    chunks.add(chunk.chunk_data);
                }
            }
        } catch (IOException | SQLException ex) {
            String message = ex.getCause() != null ? ex.getCause().getMessage() : ex.getMessage();
            log.warn("Failed to split '{}': {}", pref, message, ex);
        }

        return chunks.toArray(new String[0]);
    }

    /**
     * Creates a new {@link TextSegment} from the provided text and document.
     *
     * <p>
     * The segment inherits all metadata from the document. The segment also
     * includes an "index" metadata key representing the segment position within
     * the document.
     *
     * @param text The text of the segment.
     * @param document The document to which the segment belongs.
     * @param index The index of the segment within the document.
     * @return the new segment
     */
    static TextSegment createSegment(String text, Document document, int index) {
        Metadata metadata = document.metadata().copy().put(INDEX, String.valueOf(index));
        return TextSegment.from(text, metadata);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package dev.langchain4j.model.oracle;

/**
 * Plain data holder with public fields for one embedding record.
 *
 * <p>NOTE(review): the snake_case field names presumably mirror the JSON properties of a
 * database result that is decoded elsewhere - confirm against the caller before renaming.
 */
public class Embedding {

    /** Value of the {@code embed_id} property. */
    public int embed_id;

    /** Value of the {@code embed_data} property. */
    public String embed_data;

    /** Value of the {@code embed_vector} property, kept as a string. */
    public String embed_vector;

    /** No-argument constructor; leaves every field at its default value. */
    public Embedding() { }
}
Loading
Loading