Use of org.icij.extract.document.DocumentFactory in the datashare project by ICIJ.
Class IndexerHelper, method indexEmbeddedFile.
/**
 * Extracts the classpath resource found at {@code docPath} and writes the extracted document
 * through an {@code ElasticsearchSpewer} bound to the {@code "test-datashare"} index and
 * configured with {@code withRefresh(IMMEDIATE)}.
 *
 * @param project name handed to the {@code UpdatableDigester} set on the extractor
 * @param docPath resource path, resolved with {@code getClass().getResource(docPath)}
 * @return the resolved resource location as a {@link File}
 * @throws IOException propagated from the calls made in this method
 * @throws NullPointerException if no resource exists at {@code docPath}
 */
File indexEmbeddedFile(String project, String docPath) throws IOException {
    // A missing resource would otherwise surface as a message-less NPE on getPath();
    // fail with the offending path instead, as the sibling tests do with requireNonNull.
    Path path = get(java.util.Objects.requireNonNull(
            getClass().getResource(docPath), "classpath resource not found: " + docPath).getPath());

    // Document ids come from a SHA-384 digest identifier.
    Extractor extractor = new Extractor(new DocumentFactory().withIdentifier(new DigestIdentifier("SHA-384", Charset.defaultCharset())));
    extractor.setDigester(new UpdatableDigester(project, Entity.HASHER.toString()));
    TikaDocument document = extractor.extract(path);

    // The language guesser always answers ENGLISH and the publisher is a mock.
    ElasticsearchSpewer elasticsearchSpewer = new ElasticsearchSpewer(client, l -> ENGLISH, new FieldNames(), mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex("test-datashare");
    elasticsearchSpewer.write(document);
    return path.toFile();
}
Use of org.icij.extract.document.DocumentFactory in the datashare project by ICIJ.
Class DatashareExtractIntegrationTest, method createExtractor.
/**
 * Builds an {@code Extractor} whose document factory uses a SHA-384 {@code DigestIdentifier}
 * and whose digester is an {@code UpdatableDigester} created for the name {@code "test"}.
 *
 * @return the configured extractor
 */
Extractor createExtractor() {
    DigestIdentifier sha384Identifier = new DigestIdentifier("SHA-384", Charset.defaultCharset());
    DocumentFactory factory = new DocumentFactory().withIdentifier(sha384Identifier);

    Extractor configured = new Extractor(factory);
    configured.setDigester(new UpdatableDigester("test", Entity.HASHER.toString()));
    return configured;
}
Use of org.icij.extract.document.DocumentFactory in the datashare project by ICIJ.
Class ElasticsearchSpewerTest, method test_duplicate_file.
/**
 * Extracts {@code /docs/doc.txt} and {@code /docs/doc-duplicate.txt}, writes both through the
 * spewer, then checks that the first is stored with type {@code "Document"} and that an entry
 * with type {@code "Duplicate"} exists under the id of a {@code Duplicate} built from the
 * second document's path and the first document's id.
 */
@Test
public void test_duplicate_file() throws Exception {
    // Factory options: the id digest method is taken from Document.HASHER.
    HashMap<String, String> factoryOptions = new HashMap<>();
    factoryOptions.put("idDigestMethod", Document.HASHER.toString());

    Extractor extractor = new Extractor(new DocumentFactory().configure(Options.from(factoryOptions)));
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));

    final TikaDocument original = extractor.extract(
            get(Objects.requireNonNull(getClass().getResource("/docs/doc.txt")).getPath()));
    final TikaDocument copy = extractor.extract(
            get(Objects.requireNonNull(getClass().getResource("/docs/doc-duplicate.txt")).getPath()));

    spewer.write(original);
    spewer.write(copy);

    Duplicate expectedDuplicate = new Duplicate(copy.getPath(), original.getId());
    GetResponse storedOriginal = es.client.get(
            new GetRequest(TEST_INDEX, original.getId()), RequestOptions.DEFAULT);
    GetResponse storedDuplicate = es.client.get(
            new GetRequest(TEST_INDEX, expectedDuplicate.getId()), RequestOptions.DEFAULT);

    assertThat(storedOriginal.isExists()).isTrue();
    assertThat(storedOriginal.getSourceAsMap()).includes(entry("type", "Document"));
    assertThat(storedDuplicate.isExists()).isTrue();
    assertThat(storedDuplicate.getSourceAsMap()).includes(entry("type", "Duplicate"));
}
Use of org.icij.extract.document.DocumentFactory in the datashare project by ICIJ.
Class ElasticsearchSpewerTest, method test_truncated_content.
/**
 * Writes a document whose reader yields "this content should be truncated" through a spewer
 * created with {@code maxContentLength} set to {@code "20"}, then checks that the stored
 * {@code content} field equals "this content should".
 */
@Test
public void test_truncated_content() throws Exception {
    HashMap<String, String> spewerProperties = new HashMap<>();
    spewerProperties.put("maxContentLength", "20");

    ElasticsearchSpewer limitedContentSpewer = new ElasticsearchSpewer(
            es.client, text -> Language.ENGLISH, new FieldNames(), publisher, new PropertiesProvider(spewerProperties))
            .withRefresh(IMMEDIATE)
            .withIndex("test-datashare");

    // The document is created from a path only; its content comes from the in-memory reader below.
    final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("fake-file.txt"));
    document.setReader(new ParsingReader(new ByteArrayInputStream("this content should be truncated".getBytes())));

    limitedContentSpewer.write(document);

    GetResponse stored = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(stored.getSourceAsMap()).includes(entry("content", "this content should"));
}
Use of org.icij.extract.document.DocumentFactory in the datashare project by ICIJ.
Class ElasticsearchSpewerTest, method test_extract_id_should_be_equal_to_datashare_id.
/**
 * Extracts {@code /docs/embedded_doc.eml}, builds a {@code Document} for project
 * {@code "project"} from the same path and the extracted metadata, and checks that both
 * report the same id.
 */
@Test
public void test_extract_id_should_be_equal_to_datashare_id() throws IOException {
    HashMap<String, String> factoryOptions = new HashMap<>();
    factoryOptions.put("idDigestMethod", Document.HASHER.toString());

    Extractor extractor = new Extractor(new DocumentFactory().configure(Options.from(factoryOptions)));
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));

    // Both the extractor and the Document are given the same resource location.
    final java.nio.file.Path emlPath =
            get(Objects.requireNonNull(getClass().getResource("/docs/embedded_doc.eml")).getPath());
    final TikaDocument extractDocument = extractor.extract(emlPath);

    Document document = new Document(
            Project.project("project"),
            emlPath,
            "This is a document to be parsed by datashare.",
            Language.FRENCH,
            Charset.defaultCharset(),
            "text/plain",
            convert(extractDocument.getMetadata()),
            Document.Status.INDEXED,
            45L);

    assertThat(document.getId()).isEqualTo(extractDocument.getId());
}
Aggregations