Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ.
Class IndexerHelper, method indexEmbeddedFile:
// Extracts the classpath resource at docPath (ids digested with SHA-384 for the
// given project) and spews it into the "test-datashare" Elasticsearch index.
// Returns the extracted source as a File so callers can inspect it.
File indexEmbeddedFile(String project, String docPath) throws IOException {
    Path sourcePath = get(getClass().getResource(docPath).getPath());

    DocumentFactory factory = new DocumentFactory()
            .withIdentifier(new DigestIdentifier("SHA-384", Charset.defaultCharset()));
    Extractor docExtractor = new Extractor(factory);
    docExtractor.setDigester(new UpdatableDigester(project, Entity.HASHER.toString()));
    TikaDocument extracted = docExtractor.extract(sourcePath);

    ElasticsearchSpewer esSpewer = new ElasticsearchSpewer(
            client, l -> ENGLISH, new FieldNames(), mock(Publisher.class), new PropertiesProvider())
            .withRefresh(IMMEDIATE)
            .withIndex("test-datashare");
    esSpewer.write(extracted);

    return sourcePath.toFile();
}
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ.
Class DatashareExtractIntegrationTest, method test_spew_and_read_index:
// Full round-trip: extract a plain-text fixture, spew it into the index, read
// it back and check every persisted field of the resulting Document.
@Test
public void test_spew_and_read_index() throws Exception {
    Path fixture = get(getClass().getResource("/docs/doc.txt").getPath());
    TikaDocument extracted = createExtractor().extract(fixture);
    spewer.write(extracted);

    Document indexed = indexer.get(TEST_INDEX, extracted.getId());

    assertThat(indexed.getId()).isEqualTo(extracted.getId());
    assertThat(indexed.getContent()).isEqualTo("This is a document to be parsed by datashare.");
    assertThat(indexed.getLanguage()).isEqualTo(ENGLISH);
    assertThat(indexed.getContentLength()).isEqualTo(45);
    assertThat(indexed.getDirname()).contains(get("docs"));
    assertThat(indexed.getPath()).contains(get("doc.txt"));
    assertThat(indexed.getContentEncoding()).isEqualTo(Charset.forName("iso-8859-1"));
    assertThat(indexed.getContentType()).isEqualTo("text/plain");
    assertThat(indexed.getExtractionLevel()).isEqualTo((short) 0); // not an embedded doc
    assertThat(indexed.getMetadata()).hasSize(6);
    assertThat(indexed.getParentDocument()).isNull();
    assertThat(indexed.getRootDocument()).isEqualTo(indexed.getId()); // a root is its own root
    assertThat(indexed.getCreationDate()).isNull();
}
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ.
Class DatashareExtractIntegrationTest, method test_spew_and_read_embedded_doc:
// An e-mail with an attachment: the embedded child must be retrievable using
// the parent's id as routing, point back to the parent as its root, and keep
// its own creation date.
@Test
public void test_spew_and_read_embedded_doc() throws Exception {
    Path fixture = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    TikaDocument extracted = createExtractor().extract(fixture);
    spewer.write(extracted);

    String embeddedId = extracted.getEmbeds().get(0).getId();
    Document child = indexer.get(TEST_INDEX, embeddedId, extracted.getId());

    assertThat(child).isNotNull();
    assertThat(child.getId()).isNotEqualTo(child.getRootDocument());
    assertThat(child.getRootDocument()).isEqualTo(extracted.getId());
    assertThat(child.getCreationDate()).isNotNull();
    assertThat(new SimpleDateFormat("HH:mm:ss").format(child.getCreationDate())).isEqualTo("23:22:36");
}
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ.
Class DatabaseSpewerTest, method test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters:
// A file written in ISO-8859-1 must come back from the database as readable
// text, with the original encoding recorded, the UTF-8 byte length, and the
// detected content type.
@Test
public void test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters() throws Exception {
    File latin1File = tmp.newFile("test_iso8859-1.txt");
    Files.write(latin1File.toPath(), singletonList("chaîne en iso8859"), forName("ISO-8859-1"));

    TikaDocument extracted = new Extractor().extract(latin1File.toPath());
    dbSpewer.write(extracted);

    Document stored = dbSpewer.repository.getDocument(extracted.getId());
    assertThat(stored.getContent()).isEqualTo("chaîne en iso8859");
    assertThat(stored.getContentEncoding()).isEqualTo(forName("iso8859-1"));
    assertThat(stored.getContentLength()).isEqualTo(18); // 17 chars, "î" takes 2 bytes in UTF-8
    assertThat(stored.getContentType()).isEqualTo("text/plain");
}
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ.
Class ElasticsearchSpewerTest, method test_duplicate_file:
// Two files with identical content: the first is indexed as a "Document", the
// second (same hash, different path) as a "Duplicate" pointing at the first.
@Test
public void test_duplicate_file() throws Exception {
    // Plain HashMap instead of double-brace initialization: the anonymous
    // subclass would needlessly capture a reference to the enclosing test
    // instance (Bloch, Effective Java — avoid anonymous-class initializers).
    HashMap<String, String> factoryOptions = new HashMap<>();
    factoryOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(factoryOptions));

    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc.txt")).getPath()));
    final TikaDocument document2 = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc-duplicate.txt")).getPath()));

    spewer.write(document);
    spewer.write(document2);

    GetResponse actualDocument = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    // The duplicate's id is derived from its own path plus the original's id.
    GetResponse actualDocument2 = es.client.get(new GetRequest(TEST_INDEX, new Duplicate(document2.getPath(), document.getId()).getId()), RequestOptions.DEFAULT);

    assertThat(actualDocument.isExists()).isTrue();
    assertThat(actualDocument.getSourceAsMap()).includes(entry("type", "Document"));
    assertThat(actualDocument2.isExists()).isTrue();
    assertThat(actualDocument2.getSourceAsMap()).includes(entry("type", "Duplicate"));
}
Aggregations