Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add doc loader #47

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions langchain4j-oracle/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,33 @@
<scope>test</scope>
</dependency>

<!-- document store dependencies -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.16.1</version>
</dependency>
<dependency>
<groupId>io.github.cdimascio</groupId>
<artifactId>dotenv-java</artifactId>
<version>3.0.0</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package dev.langchain4j.data.document.loader.oracle;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.Metadata;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.InvalidParameterException;
import java.sql.Blob;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads {@link Document}s by converting files or table columns to text with the
 * database function {@code dbms_vector_chain.utl_to_text}.
 *
 * <p>
 * The source is selected by a JSON preference string. Exactly one of the
 * following is used, checked in this order:
 * <ul>
 * <li>{@code "file"}: a single file path,</li>
 * <li>{@code "dir"}: a directory that is walked recursively,</li>
 * <li>{@code "colname"} together with {@code "owner"} and {@code "tablename"}:
 * one document per row of the given table column.</li>
 * </ul>
 * The complete preference string is also forwarded to {@code utl_to_text}.
 */
public class OracleDocumentLoader {

    private static final Logger log = LoggerFactory.getLogger(OracleDocumentLoader.class);

    /** Shared JSON parser; Jackson documents ObjectMapper as safe to reuse once configured. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Accepts either a plain identifier (letter first, then letters, digits,
     * {@code _}, {@code $}, {@code #}) or a double-quoted identifier without
     * embedded quotes. Used to keep caller-supplied names from altering the SQL text.
     */
    private static final java.util.regex.Pattern SQL_IDENTIFIER =
            java.util.regex.Pattern.compile("\\p{L}[\\p{L}\\p{N}_$#]*|\"[^\"\\x00]+\"");

    private static final String FILE_QUERY =
            "select dbms_vector_chain.utl_to_text(?, json(?)) text, dbms_vector_chain.utl_to_text(?, json('{\"plaintext\": \"false\"}')) metadata from dual";

    private final Connection conn;

    /**
     * @param conn the JDBC connection used for every conversion query; it is not
     *             closed by this class
     */
    public OracleDocumentLoader(Connection conn) {
        this.conn = conn;
    }

    /**
     * Loads all documents described by the given preference.
     *
     * @param pref JSON preference selecting a file, a directory, or a table column
     * @return the loaded documents; in directory mode, files that fail to load are
     *         logged and skipped
     * @throws JsonProcessingException   if {@code pref} is not valid JSON
     * @throws IOException               if a file or directory cannot be read
     * @throws SQLException              if the conversion query fails
     * @throws InvalidParameterException if the preference names no source, lacks
     *                                   the owner or table of a column, or contains
     *                                   an invalid SQL identifier
     */
    public List<Document> loadDocuments(String pref) throws JsonProcessingException, IOException, SQLException {
        JsonNode rootNode = MAPPER.readTree(pref);
        String file = rootNode.path("file").textValue();
        String dir = rootNode.path("dir").textValue();
        String column = rootNode.path("colname").textValue();

        List<Document> documents = new ArrayList<>();
        if (file != null) {
            Document doc = loadDocument(file, pref);
            if (doc != null) {
                documents.add(doc);
            }
        } else if (dir != null) {
            documents.addAll(loadDirectory(Paths.get(dir), pref));
        } else if (column != null) {
            String table = rootNode.path("tablename").textValue();
            String owner = rootNode.path("owner").textValue();
            if (table == null) {
                throw new InvalidParameterException("Missing table in preference");
            }
            if (owner == null) {
                throw new InvalidParameterException("Missing owner in preference");
            }
            documents.addAll(loadDocuments(owner, table, column, pref));
        } else {
            throw new InvalidParameterException("Missing file, dir, or table in preference");
        }

        return documents;
    }

    /** Walks {@code root} recursively and loads every regular file, skipping files that fail. */
    private List<Document> loadDirectory(Path root, String pref) throws IOException {
        List<Document> documents = new ArrayList<>();
        // Files.walk holds open directory handles until the stream is closed.
        try (java.util.stream.Stream<Path> paths = Files.walk(root)) {
            paths.filter(Files::isRegularFile).forEach(path -> {
                try {
                    Document doc = loadDocument(path.toString(), pref);
                    if (doc != null) {
                        documents.add(doc);
                    }
                } catch (IOException | SQLException e) {
                    // Best effort: one unreadable file must not abort the whole directory.
                    log.warn("Failed to load '{}'", path, e);
                }
            });
        }
        return documents;
    }

    /**
     * Converts a single file to a document.
     *
     * @return the document, or {@code null} if the query returned no row
     */
    private Document loadDocument(String filename, String pref) throws IOException, SQLException {
        Path path = Paths.get(filename);
        byte[] bytes = Files.readAllBytes(path);

        Document document = null;
        Blob blob = conn.createBlob();
        try (PreparedStatement stmt = conn.prepareStatement(FILE_QUERY)) {
            blob.setBytes(1, bytes);

            stmt.setBlob(1, blob);
            stmt.setObject(2, pref);
            stmt.setBlob(3, blob);

            try (ResultSet rs = stmt.executeQuery()) {
                // The query selects from dual, so at most one row is expected.
                if (rs.next()) {
                    String text = rs.getString("text");
                    String html = rs.getString("metadata");

                    Metadata metadata = getMetadata(html);
                    // Resolve against the working directory so a bare relative name
                    // still has a parent and the stored directory really is absolute.
                    Path absolute = path.toAbsolutePath();
                    Path name = absolute.getFileName();
                    if (name != null) {
                        metadata.put(Document.FILE_NAME, name.toString());
                    }
                    Path parent = absolute.getParent();
                    if (parent != null) {
                        metadata.put(Document.ABSOLUTE_DIRECTORY_PATH, parent.toString());
                    }
                    document = Document.from(text, metadata);
                }
            }
        } finally {
            // Release the temporary LOB created above.
            blob.free();
        }

        return document;
    }

    /** Converts every row of {@code owner.table.column} to a document. */
    private List<Document> loadDocuments(String owner, String table, String column, String pref) throws SQLException {
        // Identifiers cannot be bound as parameters, so validate them before
        // they are placed into the statement text.
        requireIdentifier(owner, "owner");
        requireIdentifier(table, "tablename");
        requireIdentifier(column, "colname");

        List<Document> documents = new ArrayList<>();

        String query = String.format("select dbms_vector_chain.utl_to_text(t.%s, json(?)) text, dbms_vector_chain.utl_to_text(t.%s, json('{\"plaintext\": \"false\"}')) metadata from %s.%s t",
                column, column, owner, table);
        try (PreparedStatement stmt = conn.prepareStatement(query)) {
            stmt.setObject(1, pref);
            try (ResultSet rs = stmt.executeQuery()) {
                while (rs.next()) {
                    String text = rs.getString("text");
                    String html = rs.getString("metadata");

                    documents.add(Document.from(text, getMetadata(html)));
                }
            }
        }

        return documents;
    }

    /** Rejects values that are not a single plain or double-quoted SQL identifier. */
    private static void requireIdentifier(String value, String key) {
        if (!SQL_IDENTIFIER.matcher(value).matches()) {
            throw new InvalidParameterException("Invalid " + key + " in preference: " + value);
        }
    }

    /**
     * Extracts metadata from the HTML rendition of a document: every
     * {@code <meta>} tag with a non-empty {@code name} attribute becomes one entry.
     *
     * @param html the HTML text, may be {@code null}
     * @return the collected metadata; empty if {@code html} is {@code null}
     */
    private static Metadata getMetadata(String html) {
        Metadata metadata = new Metadata();
        if (html == null) {
            return metadata;
        }

        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements metaTags = doc.getElementsByTag("meta");
        for (Element metaTag : metaTags) {
            String name = metaTag.attr("name");
            if (name.isEmpty()) {
                continue;
            }
            metadata.put(name, metaTag.attr("content"));
        }

        return metadata;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package dev.langchain4j.data.document.splitter.oracle;

/**
 * Plain data holder for one chunk of a split text.
 *
 * <p>
 * {@code OracleDocumentSplitter} binds the JSON rows returned by
 * {@code dbms_vector_chain.utl_to_chunks} onto this class with Jackson, so the
 * public field names must stay equal to the JSON keys.
 */
public class Chunk {

    /** No-argument constructor, available for reflective instantiation. */
    public Chunk() {
    }

    /** Identifier of the chunk. */
    public int chunk_id;

    /** Presumably the position of the chunk within the source text (confirm against the Oracle documentation). */
    public int chunk_offset;

    /** Presumably the size of the chunk (confirm units against the Oracle documentation). */
    public int chunk_length;

    /** Text content of the chunk. */
    public String chunk_data;

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package dev.langchain4j.data.document.splitter.oracle;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link DocumentSplitter} that delegates chunking to the database function
 * {@code dbms_vector_chain.utl_to_chunks}, passing it the JSON preference given
 * at construction time.
 */
public class OracleDocumentSplitter implements DocumentSplitter {

    private static final Logger log = LoggerFactory.getLogger(OracleDocumentSplitter.class);

    private static final String INDEX = "index";

    /** Shared JSON parser; Jackson documents ObjectMapper as safe to reuse once configured. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private static final String CHUNK_QUERY =
            "select t.column_value as data from dbms_vector_chain.utl_to_chunks(?, json(?)) t";

    private final Connection conn;
    private final String pref;

    /**
     * @param conn the JDBC connection used for the chunking query; it is not closed
     *             by this class
     * @param pref the JSON preference forwarded to {@code utl_to_chunks}
     */
    public OracleDocumentSplitter(Connection conn, String pref) {
        this.conn = conn;
        this.pref = pref;
    }

    /**
     * Splits the document text into segments that carry the document metadata
     * plus an "index" entry.
     *
     * <p>
     * This is best effort: if the query or the parsing of its result fails, the
     * failure is logged and an empty list is returned.
     *
     * @param document the document to split
     * @return the segments in the order returned by the database, possibly empty
     */
    @Override
    public List<TextSegment> split(Document document) {
        List<TextSegment> segments = new ArrayList<>();
        try {
            String[] parts = split(document.text());
            for (int index = 0; index < parts.length; index++) {
                segments.add(createSegment(parts[index], document, index));
            }
        } catch (SQLException | JsonProcessingException e) {
            // Pass the exception itself so the cause and stack trace are not lost.
            log.warn("Failed to split document with preference '{}'", pref, e);
        }
        return segments;
    }

    @Override
    public List<TextSegment> splitAll(List<Document> list) {
        return DocumentSplitter.super.splitAll(list);
    }

    /**
     * Splits the provided text into parts. Implementation API.
     *
     * @param content The text to be split.
     * @return An array of parts.
     * @throws SQLException            if the chunking query fails
     * @throws JsonProcessingException if a returned row is not valid chunk JSON
     */
    public String[] split(String content) throws SQLException, JsonProcessingException {

        List<String> parts = new ArrayList<>();

        try (PreparedStatement stmt = conn.prepareStatement(CHUNK_QUERY)) {
            stmt.setObject(1, content);
            stmt.setObject(2, pref);
            try (ResultSet rs = stmt.executeQuery()) {
                while (rs.next()) {
                    // Each row is a JSON object; only its chunk_data is kept.
                    Chunk chunk = MAPPER.readValue(rs.getString("data"), Chunk.class);
                    parts.add(chunk.chunk_data);
                }
            }
        }

        return parts.toArray(new String[0]);
    }

    /**
     * Creates a new {@link TextSegment} from the provided text and document.
     *
     * <p>
     * The segment inherits all metadata from the document. The segment also
     * includes an "index" metadata key representing the segment position within
     * the document.
     *
     * @param text     The text of the segment.
     * @param document The document to which the segment belongs.
     * @param index    The index of the segment within the document.
     * @return the new segment
     */
    static TextSegment createSegment(String text, Document document, int index) {
        Metadata metadata = document.metadata().copy().put(INDEX, String.valueOf(index));
        return TextSegment.from(text, metadata);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package dev.langchain4j.model.oracle;

/**
 * Plain data holder for one embedding record.
 *
 * <p>
 * NOTE(review): the snake_case public fields presumably mirror the keys of a
 * JSON object produced by the database so it can be bound by field name;
 * confirm against the code that deserializes it.
 */
public class Embedding {

    /** No-argument constructor, available for reflective instantiation. */
    public Embedding() {
    }

    /** Identifier of the embedding. */
    public int embed_id;

    /** Presumably the text the embedding was computed from (confirm). */
    public String embed_data;

    /** Presumably the vector in its textual form (confirm). */
    public String embed_vector;
}
Loading
Loading