use of org.icij.extract.extractor.Extractor in project datashare by ICIJ.
the class IndexerHelper method indexEmbeddedFile.
/**
 * Extracts the classpath resource at {@code docPath} and writes the resulting document
 * through an {@code ElasticsearchSpewer} bound to the "test-datashare" index with an
 * immediate refresh policy.
 *
 * @param project project name handed to the {@code UpdatableDigester}
 * @param docPath resource path, resolved through {@code getClass().getResource}
 * @return the resolved resource location as a {@link File}
 * @throws IOException propagated from the extraction or spewer calls
 * @throws NullPointerException if no resource is found at {@code docPath}
 */
File indexEmbeddedFile(String project, String docPath) throws IOException {
    // Fail with a descriptive message instead of a bare NPE when the resource is missing.
    Path path = get(java.util.Objects.requireNonNull(
            getClass().getResource(docPath),
            "resource not found on classpath: " + docPath).getPath());

    Extractor extractor = new Extractor(new DocumentFactory().withIdentifier(new DigestIdentifier("SHA-384", Charset.defaultCharset())));
    extractor.setDigester(new UpdatableDigester(project, Entity.HASHER.toString()));
    TikaDocument document = extractor.extract(path);

    // The Publisher is a mock, so anything sent to it by the spewer is discarded.
    ElasticsearchSpewer elasticsearchSpewer = new ElasticsearchSpewer(client, l -> ENGLISH, new FieldNames(), mock(Publisher.class), new PropertiesProvider())
            .withRefresh(IMMEDIATE)
            .withIndex("test-datashare");
    elasticsearchSpewer.write(document);
    return path.toFile();
}
use of org.icij.extract.extractor.Extractor in project datashare by ICIJ.
the class DatashareExtractIntegrationTest method createExtractor.
/**
 * Builds an {@code Extractor} whose document factory uses a SHA-384
 * {@code DigestIdentifier} (platform default charset) and whose digester is an
 * {@code UpdatableDigester} created for the "test" project.
 *
 * @return the configured extractor
 */
Extractor createExtractor() {
    DigestIdentifier identifier = new DigestIdentifier("SHA-384", Charset.defaultCharset());
    DocumentFactory factory = new DocumentFactory().withIdentifier(identifier);

    Extractor configured = new Extractor(factory);
    configured.setDigester(new UpdatableDigester("test", Entity.HASHER.toString()));
    return configured;
}
use of org.icij.extract.extractor.Extractor in project datashare by ICIJ.
the class DatabaseSpewerTest method test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters.
/**
 * Writes an ISO-8859-1 encoded text file, extracts it, sends it through the database
 * spewer, and checks the content, encoding, length and type of the document read back
 * from the repository.
 */
@Test
public void test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters() throws Exception {
    String text = "chaîne en iso8859";
    File source = tmp.newFile("test_iso8859-1.txt");
    Files.write(source.toPath(), singletonList(text), forName("ISO-8859-1"));

    Extractor extractor = new Extractor();
    TikaDocument extracted = extractor.extract(source.toPath());
    dbSpewer.write(extracted);

    Document stored = dbSpewer.repository.getDocument(extracted.getId());
    assertThat(stored.getContent()).isEqualTo(text);
    assertThat(stored.getContentEncoding()).isEqualTo(forName("iso8859-1"));
    assertThat(stored.getContentLength()).isEqualTo(18);
    assertThat(stored.getContentType()).isEqualTo("text/plain");
}
use of org.icij.extract.extractor.Extractor in project datashare by ICIJ.
the class ElasticsearchSpewerTest method test_duplicate_file.
/**
 * Extracts {@code /docs/doc.txt} and {@code /docs/doc-duplicate.txt} with the same
 * extractor, writes both through the spewer, and checks that the first is indexed with
 * type "Document" while the second is indexed with type "Duplicate" under the id built
 * from its own path and the first document's id.
 */
@Test
public void test_duplicate_file() throws Exception {
    // Plain map rather than double-brace initialization, which would create an
    // anonymous HashMap subclass holding a reference to this test instance.
    HashMap<String, String> options = new HashMap<>();
    options.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(options));

    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));

    final TikaDocument document = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc.txt")).getPath()));
    final TikaDocument document2 = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc-duplicate.txt")).getPath()));

    spewer.write(document);
    spewer.write(document2);

    GetResponse actualDocument = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    // The second file is looked up under the Duplicate id, not under its own document id.
    GetResponse actualDocument2 = es.client.get(new GetRequest(TEST_INDEX, new Duplicate(document2.getPath(), document.getId()).getId()), RequestOptions.DEFAULT);

    assertThat(actualDocument.isExists()).isTrue();
    assertThat(actualDocument.getSourceAsMap()).includes(entry("type", "Document"));
    assertThat(actualDocument2.isExists()).isTrue();
    assertThat(actualDocument2.getSourceAsMap()).includes(entry("type", "Duplicate"));
}
use of org.icij.extract.extractor.Extractor in project datashare by ICIJ.
the class ElasticsearchSpewerTest method test_metadata.
/**
 * Extracts {@code /docs/a/b/c/doc.txt} with a default extractor, writes it through the
 * spewer, and checks the metadata fields of the document fetched back from the test index.
 */
@Test
public void test_metadata() throws Exception {
    Path docPath = get(Objects.requireNonNull(getClass().getResource("/docs/a/b/c/doc.txt")).getPath());

    Extractor extractor = new Extractor();
    TikaDocument extracted = extractor.extract(docPath);
    spewer.write(extracted);

    GetRequest request = new GetRequest(TEST_INDEX, extracted.getId());
    GetResponse response = es.client.get(request, RequestOptions.DEFAULT);

    assertThat(response.getSourceAsMap()).includes(
            entry("contentEncoding", "ISO-8859-1"),
            entry("contentType", "text/plain"),
            entry("nerTags", new ArrayList<>()),
            entry("contentLength", 45),
            entry("status", "INDEXED"),
            entry("path", docPath.toString()),
            entry("dirname", docPath.getParent().toString()));
}
Aggregations