diff --git a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java index 6f856510139..809fc556b8f 100644 --- a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java +++ b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java @@ -88,7 +88,8 @@ private List createDocuments(List texts, List metadataCopy = metadata.entrySet() .stream() - .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue())); + .filter(e -> e.getKey() != null && e.getValue() != null) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Document newDoc = new Document(chunk, metadataCopy); if (this.copyContentFormatter) { diff --git a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java index ed6044da5e2..a5caf706ee0 100644 --- a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java +++ b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java @@ -42,7 +42,7 @@ protected List splitText(String text) { List chunks = new ArrayList<>(); chunks.add(text.substring(0, chuckSize)); - chunks.add(text.substring(chuckSize, text.length())); + chunks.add(text.substring(chuckSize)); return chunks; } @@ -213,4 +213,35 @@ public void pageWithChunkSplit() { () -> assertThat(splitedDocument.get(3).getMetadata().get("page_number")).isEqualTo(3)); } + @Test + public void testSplitTextWithNullMetadata() { + + var contentFormatter = DefaultContentFormatter.defaultConfig(); + + var doc = new Document("In the end, writing arises when man realizes that memory is not enough."); + + doc.getMetadata().put("key1", "value1"); + doc.getMetadata().put("key2", null); + + doc.setContentFormatter(contentFormatter); + + List chunks = testTextSplitter.apply(List.of(doc)); + + assertThat(testTextSplitter.isCopyContentFormatter()).isTrue(); + + assertThat(chunks).hasSize(2); + + // Doc chunks: + assertThat(chunks.get(0).getContent()).isEqualTo("In the end, writing arises when man"); + assertThat(chunks.get(1).getContent()).isEqualTo(" realizes that memory is not enough."); + + // Verify that the same, merged metadata is copied to all chunks. + assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1"); + + // Verify that the content formatters are copied from the parents to the chunks. + assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter); + assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter); + } + }