diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
index 2251016f6d1..515e0bf797b 100644
--- a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
+++ b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
@@ -26,7 +26,7 @@ public interface IdGenerator {
 
 	/**
-	 * Generate a unique ID for the given content. Note: some generator, such as the
-	 * random generator might not dependant on or use the content parameters.
+	 * Generate a unique ID for the given content. Note: some generators, such as the
+	 * random generator, might not depend on or use the content parameters.
 	 * @param contents the content to generate an ID for.
 	 * @return the generated ID.
 	 */
diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
index 9d2dc8dca51..3e0a4f31022 100644
--- a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
+++ b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
@@ -63,18 +63,22 @@ private List<Document> doSplitDocuments(List<Document> documents) {
 		List<String> texts = new ArrayList<>();
 		List<Map<String, Object>> metadataList = new ArrayList<>();
 		List<ContentFormatter> formatters = new ArrayList<>();
+		List<Double> scores = new ArrayList<>();
+		List<String> originalIds = new ArrayList<>();
 
 		for (Document doc : documents) {
 			texts.add(doc.getText());
 			metadataList.add(doc.getMetadata());
 			formatters.add(doc.getContentFormatter());
+			scores.add(doc.getScore());
+			originalIds.add(doc.getId());
 		}
 
-		return createDocuments(texts, formatters, metadataList);
+		return createDocuments(texts, formatters, metadataList, scores, originalIds);
 	}
 
 	private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
-			List<Map<String, Object>> metadataList) {
+			List<Map<String, Object>> metadataList, List<Double> scores, List<String> originalIds) {
 
 		// Process the data in a column oriented way and recreate the Document
 		List<Document> documents = new ArrayList<>();
@@ -82,25 +86,38 @@ private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
 		for (int i = 0; i < texts.size(); i++) {
 			String text = texts.get(i);
 			Map<String, Object> metadata = metadataList.get(i);
+			Double originalScore = scores.get(i);
+			String originalId = originalIds.get(i);
+
 			List<String> chunks = splitText(text);
 			if (chunks.size() > 1) {
 				logger.info("Splitting up document into {} chunks.", chunks.size());
 			}
-			for (String chunk : chunks) {
-				// only primitive values are in here -
-				Map<String, Object> metadataCopy = metadata.entrySet()
+
+			for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
+				String chunk = chunks.get(chunkIndex);
+
+				Map<String, Object> enhancedMetadata = metadata.entrySet()
 					.stream()
 					.filter(e -> e.getKey() != null && e.getValue() != null)
 					.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
-				Document newDoc = new Document(chunk, metadataCopy);
+
+				enhancedMetadata.put("parent_document_id", originalId);
+				enhancedMetadata.put("chunk_index", chunkIndex);
+				enhancedMetadata.put("total_chunks", chunks.size());
+
+				Document newDoc = Document.builder()
+					.text(chunk)
+					.metadata(enhancedMetadata)
+					.score(originalScore)
+					.build();
 
 				if (this.copyContentFormatter) {
 					// Transfer the content-formatter of the parent to the chunked
-					// documents it was slit into.
+					// documents it was split into.
 					newDoc.setContentFormatter(formatters.get(i));
 				}
-				// TODO copy over other properties.
 				documents.add(newDoc);
 			}
 		}
 
diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
index 88fee22e92d..66ee5f4e9f5 100644
--- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
+++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
@@ -82,9 +82,22 @@ public void testSplitText() {
 		assertThat(chunks.get(3).getText())
 			.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
 
-		// Verify that the same, merged metadata is copied to all chunks.
-		assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-		assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
+		// Verify that the original metadata is copied to all chunks (including
+		// chunk-specific fields)
+		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+
+		// Verify chunk indices are correct
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+		assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
 
 		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
 	@Test
 	public void pageWithChunkSplit() {
 		// given
-
 		var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
 				+ "1The most oppressive thing about the labyrinth is that you are constantly "
 				+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
 		assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man");
 		assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough.");
 
-		// Verify that the same, merged metadata is copied to all chunks.
-		assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-		assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
+		// Verify that the original metadata is copied to all chunks (with chunk-specific
+		// fields)
+		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+				"total_chunks");
+
+		// Verify chunk indices are different
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
 
 		// Verify that the content formatters are copied from the parents to the chunks.
 		assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
 		assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
 	}
 
+	@Test
+	public void testScorePreservation() {
+		// given
+		Double originalScore = 0.95;
+		var doc = Document.builder()
+			.text("This is a test document that will be split into multiple chunks.")
+			.metadata(Map.of("source", "test.txt"))
+			.score(originalScore)
+			.build();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc));
+
+		// then
+		assertThat(chunks).hasSize(2);
+		assertThat(chunks.get(0).getScore()).isEqualTo(originalScore);
+		assertThat(chunks.get(1).getScore()).isEqualTo(originalScore);
+	}
+
+	@Test
+	public void testParentDocumentTracking() {
+		// given
+		var doc1 = new Document("First document content for testing splitting functionality.",
+				Map.of("source", "doc1.txt"));
+		var doc2 = new Document("Second document content for testing splitting functionality.",
+				Map.of("source", "doc2.txt"));
+
+		String originalId1 = doc1.getId();
+		String originalId2 = doc2.getId();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
+
+		// then
+		assertThat(chunks).hasSize(4);
+
+		// Verify parent document tracking for doc1 chunks
+		assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+		assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+
+		// Verify parent document tracking for doc2 chunks
+		assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
+		assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
+	}
+
+	@Test
+	public void testChunkMetadataInformation() {
+		// given
+		var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.",
+				Map.of("source", "test.txt"));
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc));
+
+		// then
+		assertThat(chunks).hasSize(2);
+
+		// Verify chunk index and total chunks for first chunk
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2);
+
+		// Verify chunk index and total chunks for second chunk
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+		assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2);
+
+		// Verify original metadata is preserved
+		assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt");
+		assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt");
+	}
+
+	@Test
+	public void testEnhancedMetadataWithMultipleDocuments() {
+		// given
+		var doc1 = Document.builder()
+			.text("First document with score and metadata.")
+			.metadata(Map.of("type", "article", "priority", "high"))
+			.score(0.8)
+			.build();
+
+		var doc2 = Document.builder()
+			.text("Second document with different score.")
+			.metadata(Map.of("type", "report", "priority", "medium"))
+			.score(0.6)
+			.build();
+
+		String originalId1 = doc1.getId();
+		String originalId2 = doc2.getId();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
+
+		// then
+		assertThat(chunks).hasSize(4);
+
+		// Verify first document chunks
+		for (int i = 0; i < 2; i++) {
+			Document chunk = chunks.get(i);
+			assertThat(chunk.getScore()).isEqualTo(0.8);
+			assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+			assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i);
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2); + assertThat(chunk.getMetadata().get("type")).isEqualTo("article"); + assertThat(chunk.getMetadata().get("priority")).isEqualTo("high"); + } + + // Verify second document chunks + for (int i = 2; i < 4; i++) { + Document chunk = chunks.get(i); + assertThat(chunk.getScore()).isEqualTo(0.6); + assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2); + assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2); + assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2); + assertThat(chunk.getMetadata().get("type")).isEqualTo("report"); + assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium"); + } + } + } diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java index e803c8a4e40..96c58f3fa9a 100644 --- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java +++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java @@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() { assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but"); assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting"); - // Verify that the same, merged metadata is copied to all chunks. - assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); - assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata()); + // Verify that the original metadata is copied to all chunks (including + // chunk-specific fields) + assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + + // Verify chunk indices are correct + assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1); + assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1); assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3"); assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");