Use of org.icij.datashare.text.Duplicate in project datashare by ICIJ.
From the class ElasticsearchSpewerTest, method test_duplicate_file.
/**
 * Writes two extracted files through the spewer and checks how each one is indexed.
 *
 * <p>The first file must be retrievable under its own id with {@code type = "Document"}.
 * The second file must be retrievable under the id of a {@link Duplicate} built from the
 * second file's path and the first document's id, with {@code type = "Duplicate"}.
 *
 * <p>NOTE(review): presumably {@code /docs/doc-duplicate.txt} has the same content as
 * {@code /docs/doc.txt}, so both extract to the same digest; confirm against the test resources.
 *
 * @throws Exception if extraction, indexing or the Elasticsearch lookups fail
 */
@Test
public void test_duplicate_file() throws Exception {
    // A plain map rather than double-brace initialisation: the latter creates an anonymous
    // HashMap subclass that keeps a hidden reference to this test instance.
    HashMap<String, String> factoryOptions = new HashMap<>();
    factoryOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(factoryOptions));

    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));

    // requireNonNull makes a missing test resource fail here, on the lookup itself.
    final TikaDocument document = extractor.extract(
            get(Objects.requireNonNull(getClass().getResource("/docs/doc.txt")).getPath()));
    final TikaDocument document2 = extractor.extract(
            get(Objects.requireNonNull(getClass().getResource("/docs/doc-duplicate.txt")).getPath()));

    spewer.write(document);
    spewer.write(document2);

    GetResponse actualDocument = es.client.get(
            new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    // The second file is looked up by the id of a Duplicate built from its own path and the
    // FIRST document's id, not by document2.getId().
    GetResponse actualDocument2 = es.client.get(
            new GetRequest(TEST_INDEX, new Duplicate(document2.getPath(), document.getId()).getId()),
            RequestOptions.DEFAULT);

    assertThat(actualDocument.isExists()).isTrue();
    assertThat(actualDocument.getSourceAsMap()).includes(entry("type", "Document"));
    assertThat(actualDocument2.isExists()).isTrue();
    assertThat(actualDocument2.getSourceAsMap()).includes(entry("type", "Duplicate"));
}
Aggregations