feat: add project name in the artifact path #1165
bamthomas committed Sep 10, 2024
1 parent d5611c5 commit 2cab58c
Showing 3 changed files with 85 additions and 111 deletions.
@@ -83,7 +83,7 @@ protected void after() {
client.indices().delete(DeleteIndexRequest.of(dir -> dir.index(asList(indexesNames))));
client._transport().close();
} catch (IOException e) {
- e.printStackTrace();
+ throw new IllegalStateException("cannot close ES client properly", e);
}
}

@@ -86,7 +86,7 @@ public InputStream getEmbeddedSource(final Project project, final Document docum
String algorithm = hasher.toString();
int i = 0;
List<DigestingParser.Digester> digesters = new ArrayList<>(List.of());
- // Digester with the project name
+ // Digester without project name
digesters.add(new CommonsDigester(20 * 1024 * 1024, algorithm.replace("-", "")));
// Digester with the project name
digesters.add(new UpdatableDigester(project.getId(), algorithm));
@@ -104,7 +104,7 @@ public InputStream getEmbeddedSource(final Project project, final Document docum
try {
EmbeddedDocumentExtractor embeddedExtractor = new EmbeddedDocumentExtractor(
digester, algorithm,
- propertiesProvider.get(DatashareCliOptions.ARTIFACT_DIR_OPT).map(Path::of).orElse(null),false);
+ propertiesProvider.get(DatashareCliOptions.ARTIFACT_DIR_OPT).map(dir -> Path.of(dir).resolve(project.name)).orElse(null),false);
TikaDocumentSource source = embeddedExtractor.extract(rootDocument, document.getId());
InputStream inputStream = source.get();
if (filterMetadata) {
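Context for the hunk above: the only behavioral change in this file is that the artifact cache root gains a per-project segment. Below is a minimal before/after sketch using only java.nio.file; the ArtifactPathSketch class and its method names are illustrative, not code from this commit.

// Sketch (not part of the diff): how the artifact root is resolved before
// and after this commit, assuming artifactDirOption holds the value of
// DatashareCliOptions.ARTIFACT_DIR_OPT and projectName is project.name.
import java.nio.file.Path;
import java.util.Optional;

class ArtifactPathSketch {
    // Before: the configured artifact directory is used as-is.
    static Optional<Path> rootBefore(Optional<String> artifactDirOption) {
        return artifactDirOption.map(Path::of);
    }

    // After: the project name is appended, so two projects no longer share
    // one cached artifact tree for documents with the same hash.
    static Optional<Path> rootAfter(Optional<String> artifactDirOption, String projectName) {
        return artifactDirOption.map(dir -> Path.of(dir).resolve(projectName));
    }
}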
@@ -34,8 +34,7 @@

public class SourceExtractorTest {
@ClassRule static public TemporaryFolder tmpDir = new TemporaryFolder();
- @ClassRule
- public static ElasticsearchRule es = new ElasticsearchRule();
+ @ClassRule public static ElasticsearchRule es = new ElasticsearchRule();

@Test(expected = FileNotFoundException.class)
public void test_file_not_found() throws IOException {
@@ -55,7 +54,7 @@ public void test_content_not_found() {
.with(new HashMap<>())
.with(Document.Status.INDEXED)
.withContentLength(45L).build();
- new SourceExtractor(tmpDir.getRoot().toPath()).getEmbeddedSource(project("project"), document);
+ new SourceExtractor(new PropertiesProvider()).getEmbeddedSource(project("project"), document);
}

@Test
@@ -69,7 +68,7 @@ public void test_get_source_for_root_doc() throws IOException {
.with(Document.Status.INDEXED)
.withContentLength(45L).build();

- InputStream source = new SourceExtractor(tmpDir.getRoot().toPath()).getSource(document);
+ InputStream source = new SourceExtractor(new PropertiesProvider()).getSource(document);
assertThat(source).isNotNull();
assertThat(getBytes(source)).hasSize(70574);
}
@@ -85,8 +84,8 @@ public void test_get_source_for_doc_and_pdf_with_without_metadata() throws IOExc
.with(Document.Status.INDEXED)
.withContentLength(0L).build();

- InputStream inputStreamWithMetadata = new SourceExtractor(tmpDir.getRoot().toPath(), false).getSource(document);
- InputStream inputStreamWithoutMetadata = new SourceExtractor(tmpDir.getRoot().toPath(), true).getSource(document);
+ InputStream inputStreamWithMetadata = new SourceExtractor(new PropertiesProvider(), false).getSource(document);
+ InputStream inputStreamWithoutMetadata = new SourceExtractor(new PropertiesProvider(), true).getSource(document);
assertThat(inputStreamWithMetadata).isNotNull();
assertThat(inputStreamWithoutMetadata).isNotNull();
assertThat(getBytes(inputStreamWithMetadata).length).isEqualTo(9216);
@@ -95,162 +94,137 @@ public void test_get_source_for_doc_and_pdf_with_without_metadata() throws IOExc

@Test
public void test_get_source_for_embedded_doc() throws Exception {
- Options<String> options = Options.from(new HashMap<>() {{
- put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
- put("digestProjectName", TEST_INDEX);
- }});
- DocumentFactory tikaFactory = new DocumentFactory().configure(options);
- Extractor extractor = new Extractor(tikaFactory).configure(options);

Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
- final TikaDocument document = extractor.extract(path);
- ElasticsearchSpewer spewer = new ElasticsearchSpewer(createIndexer(TEST_INDEX),
- new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(
- new HashMap<>() {{
- put("defaultProject", TEST_INDEX);
- }}
- ));
- spewer.write(document);
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", TEST_INDEX,
+ "defaultProject", TEST_INDEX);
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, stringProperties);

- Document attachedPdf = createIndexer(TEST_INDEX).
+ Document attachedPdf = elasticsearchIndexer.
get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
"f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");

assertThat(attachedPdf).isNotNull();
assertThat(attachedPdf.getContentType()).isEqualTo("application/pdf");
- InputStream source = new SourceExtractor(tmpDir.getRoot().toPath()).getSource(project(TEST_INDEX), attachedPdf);
+ InputStream source = new SourceExtractor(new PropertiesProvider(stringProperties)).getSource(project(TEST_INDEX), attachedPdf);
assertThat(source).isNotNull();
assertThat(getBytes(source)).hasSize(49779);
}

- private static ElasticsearchIndexer createIndexer() {
- return createIndexer("local-datashare");
- }
+ @Test
+ public void test_get_source_for_embedded_doc_with_artifact_dir() throws Exception {
+ Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", TEST_INDEX,
+ "defaultProject", TEST_INDEX);
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, stringProperties);
+ Document attachedPdf = elasticsearchIndexer.
+ get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
+ "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");

- private static ElasticsearchIndexer createIndexer(String defaultProject) {
- return new ElasticsearchIndexer(es.client, new PropertiesProvider(Map.of("defaultProject", defaultProject))).withRefresh(Refresh.True);
+ InputStream source = new SourceExtractor(tmpDir.getRoot().toPath()).getSource(project(TEST_INDEX), attachedPdf);
+ assertThat(source).isNotNull();
+ assertThat(tmpDir.getRoot().toPath().resolve(TEST_INDEX).toFile()).isDirectory();
+ Path cachedArtifact = tmpDir.getRoot().toPath()
+ .resolve(TEST_INDEX)
+ .resolve("1b")
+ .resolve("f2")
+ .resolve("1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca")
+ .resolve("raw");
+ assertThat(cachedArtifact.toFile()).isFile();
+ assertThat(cachedArtifact.toFile()).hasSize(49779);
}
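Read as a path template, the assertions in test_get_source_for_embedded_doc_with_artifact_dir pin the cache layout to <artifactDir>/<project>/<id[0:2]>/<id[2:4]>/<id>/raw, where id is the document's SHA-384 digest. A hedged restatement as a standalone helper follows; artifactPath is illustrative only, not a method in this codebase.

import java.nio.file.Path;

class ArtifactLayoutSketch {
    // Mirrors the directory shape asserted in the test above: the digest
    // "1bf2b6aa..." shards into "1b" and "f2", then the full digest,
    // then the "raw" payload file.
    static Path artifactPath(Path artifactDir, String project, String digest) {
        return artifactDir
                .resolve(project)                 // segment added by this commit
                .resolve(digest.substring(0, 2))
                .resolve(digest.substring(2, 4))
                .resolve(digest)
                .resolve("raw");
    }
}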

@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
- Options<String> options = Options.from(new HashMap<>() {{
- put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
- put("digestProjectName", TEST_INDEX);
- }});
- DocumentFactory tikaFactory = new DocumentFactory().configure(options);
- Extractor extractor = new Extractor(tikaFactory).configure(options);

Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
- final TikaDocument document = extractor.extract(path);
- ElasticsearchSpewer spewer = new ElasticsearchSpewer(createIndexer(TEST_INDEX),
- new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(
- new HashMap<>() {{
- put("defaultProject", TEST_INDEX);
- }}
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", TEST_INDEX,
+ "defaultProject", TEST_INDEX);
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, stringProperties);

- ));
- spewer.write(document);

- Document attachedPdf = createIndexer(TEST_INDEX).
+ Document attachedPdf = elasticsearchIndexer.
get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
"f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");

- InputStream source = new SourceExtractor(tmpDir.getRoot().toPath(), true).getSource(project(TEST_INDEX), attachedPdf);
+ InputStream source = new SourceExtractor(new PropertiesProvider(stringProperties), true).getSource(project(TEST_INDEX), attachedPdf);
assertThat(source).isNotNull();
assertThat(getBytes(source).length).isNotEqualTo(49779);
}

@Test
public void test_get_source_for_embedded_doc_without_digest_project_name() throws Exception {
- Options<String> options = Options.from(new HashMap<>() {{
- put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
- put("digestProjectName", "");
- }});
- DocumentFactory tikaFactory = new DocumentFactory().configure(options);
- Extractor extractor = new Extractor(tikaFactory).configure(options);

Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
- final TikaDocument document = extractor.extract(path);
- ElasticsearchSpewer spewer = new ElasticsearchSpewer(createIndexer(TEST_INDEX),
- new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(
- new HashMap<>() {{
- put("defaultProject", TEST_INDEX);
- }}
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", "",
+ "defaultProject", TEST_INDEX);
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, stringProperties);

- ));
- spewer.write(document);

- Document attachedPdf = createIndexer(TEST_INDEX).
+ Document attachedPdf = elasticsearchIndexer.
get(TEST_INDEX, "754ea07d66c2ec23d2849b4d44f276a7ebe719e586c20d15c7b772dcd4a620b0117e7396b76496ed5c10a066bf19d907",
"c78925fb478426ccc4c5a7cb975bc0f35d4079cd8a55d7a340bdccb3a46379e4940daa198c0be0dfd247cde338194105");

- InputStream source = new SourceExtractor(tmpDir.getRoot().toPath()).getSource(project(TEST_INDEX), attachedPdf);
+ InputStream source = new SourceExtractor(new PropertiesProvider(stringProperties)).getSource(project(TEST_INDEX), attachedPdf);
assertThat(source).isNotNull();
assertThat(getBytes(source)).hasSize(49779);
}

@Test
public void test_get_source_for_embedded_doc_with_digest_project_name_using_legacy_value() throws Exception {
- PropertiesProvider propertiesProvider = new PropertiesProvider(new HashMap<>() {{
- put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
- put("digestProjectName", "local-datashare");
- put("artifactDir", tmpDir.newFolder("local_mode").toString());
- put("mode", "LOCAL");
- }});
- Options<String> options = Options.from(propertiesProvider.getProperties());
- DocumentFactory tikaFactory = new DocumentFactory().configure(options);
- Extractor extractor = new Extractor(tikaFactory).configure(options);

Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
- final TikaDocument document = extractor.extract(path);
- ElasticsearchSpewer spewer = new ElasticsearchSpewer(createIndexer(TEST_INDEX),
- new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(
- new HashMap<>() {{
- put("defaultProject", TEST_INDEX);
- put("digestProjectName", "local-datashare");
- }}

- ));
- spewer.write(document);

- Document attachedPdf = createIndexer(TEST_INDEX).
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", "local-datashare",
+ "artifactDir", tmpDir.newFolder("local_mode").toString(),
+ "mode", "LOCAL");
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, Map.of("defaultProject", TEST_INDEX, "digestProjectName", "local-datashare"));

+ Document attachedPdf = elasticsearchIndexer.
get(TEST_INDEX, "d365f488df3c84ecd6d7aa752ca268b78589f2082e4fe2fbe9f62dff6b3a6b74bedc645ec6df9ae5599dab7631433623",
"34ec4641c845234af66cfded88fed3ea92ee27da41e610d67eed0b9ba0c04ecf1cefae80d694050e29b8aadfd9cc7205");

- InputStream source = new SourceExtractor(propertiesProvider).getSource(project(TEST_INDEX), attachedPdf);
+ InputStream source = new SourceExtractor(new PropertiesProvider(stringProperties)).getSource(project(TEST_INDEX), attachedPdf);
assertThat(source).isNotNull();
assertThat(getBytes(source)).hasSize(49779);
}

@Test(expected = EmbeddedDocumentExtractor.ContentNotFoundException.class)
public void test_not_get_source_for_embedded_doc_with_digest_project_name_using_legacy_value_in_server() throws Exception {
- PropertiesProvider propertiesProvider = new PropertiesProvider(new HashMap<>() {{
- put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
- put("digestProjectName", "local-datashare");
- put("artifactDir", tmpDir.newFolder("server_mode").toString());
- put("mode", "SERVER");
- }});
- Options<String> options = Options.from(propertiesProvider.getProperties());
+ Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
+ Map<String, Object> stringProperties = Map.of(
+ "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
+ "digestProjectName", "local-datashare",
+ "defaultProject", TEST_INDEX,
+ "artifactDir", tmpDir.newFolder("server_mode").toString(),
+ "mode", "SERVER");
+ ElasticsearchIndexer elasticsearchIndexer = indexDocument(stringProperties, path, Map.of("defaultProject", TEST_INDEX, "digestProjectName", "local-datashare"));

+ Document attachedPdf = elasticsearchIndexer.
+ get(TEST_INDEX, "d365f488df3c84ecd6d7aa752ca268b78589f2082e4fe2fbe9f62dff6b3a6b74bedc645ec6df9ae5599dab7631433623",
+ "34ec4641c845234af66cfded88fed3ea92ee27da41e610d67eed0b9ba0c04ecf1cefae80d694050e29b8aadfd9cc7205");

+ new SourceExtractor(new PropertiesProvider(stringProperties)).getSource(project(TEST_INDEX), attachedPdf);
}

+ private static ElasticsearchIndexer indexDocument(Map<String, Object> properties, Path path, Map<String, Object> spewerProperties) throws IOException {
+ Options<String> options = Options.from(properties);
DocumentFactory tikaFactory = new DocumentFactory().configure(options);
Extractor extractor = new Extractor(tikaFactory).configure(options);

- Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
final TikaDocument document = extractor.extract(path);
- ElasticsearchSpewer spewer = new ElasticsearchSpewer(createIndexer(TEST_INDEX),
- new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(
- new HashMap<>() {{
- put("defaultProject", TEST_INDEX);
- put("digestProjectName", "local-datashare");
- }}

- ));
+ ElasticsearchIndexer indexer = createIndexer(TEST_INDEX);
+ ElasticsearchSpewer spewer = new ElasticsearchSpewer(indexer,
+ new MemoryDocumentCollectionFactory<>(), l -> Language.ENGLISH, new FieldNames(), new PropertiesProvider(spewerProperties));
spewer.write(document);
+ return indexer;
}

- Document attachedPdf = createIndexer(TEST_INDEX).
- get(TEST_INDEX, "d365f488df3c84ecd6d7aa752ca268b78589f2082e4fe2fbe9f62dff6b3a6b74bedc645ec6df9ae5599dab7631433623",
- "34ec4641c845234af66cfded88fed3ea92ee27da41e610d67eed0b9ba0c04ecf1cefae80d694050e29b8aadfd9cc7205");

- new SourceExtractor(propertiesProvider).getSource(project(TEST_INDEX), attachedPdf);
+ private static ElasticsearchIndexer createIndexer(String defaultProject) {
+ return new ElasticsearchIndexer(es.client, new PropertiesProvider(Map.of("defaultProject", defaultProject))).withRefresh(Refresh.True);
}
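With the two extracted helpers in place, each test reduces to an index-then-extract pattern. A usage sketch under the same assumptions as the tests above; documentId and rootId are placeholders for the SHA-384 digests used there, not values from the diff.

// Usage sketch (not part of the diff): index a fixture document, fetch an
// embedded attachment by its digests, then resolve its source bytes.
Map<String, Object> properties = Map.of(
        "digestAlgorithm", Document.DEFAULT_DIGESTER.toString(),
        "digestProjectName", TEST_INDEX,
        "defaultProject", TEST_INDEX);
ElasticsearchIndexer indexer = indexDocument(properties, path, properties);
Document attachedPdf = indexer.get(TEST_INDEX, documentId, rootId);
InputStream source = new SourceExtractor(new PropertiesProvider(properties)).getSource(project(TEST_INDEX), attachedPdf);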

private byte[] getBytes(InputStream source) throws IOException {
