use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class ElasticsearchIndexerTest method test_bulk_add_should_add_ner_pipeline_once_and_for_empty_list.
/**
 * Indexes a single document, then calls {@code bulkAdd} for the OPENNLP pipeline with an
 * empty entity list. Verifies that the call reports success and that the stored document
 * ends up with status {@code "DONE"} and a {@code nerTags} field containing exactly
 * {@code "OPENNLP"}.
 *
 * @throws IOException if indexing or reading back from the index fails
 */
@Test
public void test_bulk_add_should_add_ner_pipeline_once_and_for_empty_list() throws IOException {
    // Plain HashSet rather than double-brace initialization, which would create an
    // anonymous HashSet subclass just to add one element.
    HashSet<Pipeline.Type> nerTags = new HashSet<>();
    nerTags.add(OPENNLP);
    Document doc = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, nerTags, 432L);
    indexer.add(TEST_INDEX, doc);

    assertThat(indexer.bulkAdd(TEST_INDEX, OPENNLP, emptyList(), doc)).isTrue();

    GetResponse resp = es.client.get(new GetRequest(TEST_INDEX, doc.getId()), RequestOptions.DEFAULT);
    assertThat(resp.getSourceAsMap().get("status")).isEqualTo("DONE");
    assertThat((ArrayList<String>) resp.getSourceAsMap().get("nerTags")).containsExactly("OPENNLP");
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class SourceExtractorTest method test_get_source_for_embedded_doc.
/**
 * Extracts {@code /docs/embedded_doc.eml}, writes the result to {@code TEST_INDEX} through
 * an {@link ElasticsearchSpewer}, fetches the attached PDF by its hard-coded identifiers and
 * checks that {@link SourceExtractor#getSource} returns a 49779-byte stream for it.
 *
 * @throws Exception if extraction, indexing or source retrieval fails
 */
@Test
public void test_get_source_for_embedded_doc() throws Exception {
    // Plain HashMap rather than double-brace initialization (no anonymous subclass needed).
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));

    Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);

    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(),
            Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);

    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX,
            "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
            "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    assertThat(attachedPdf).isNotNull();
    assertThat(attachedPdf.getContentType()).isEqualTo("application/pdf");

    // try-with-resources closes the stream even when an assertion fails; a null
    // resource is skipped on close, so the not-null assertion still reports properly.
    try (InputStream source = new SourceExtractor().getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source)).hasSize(49779);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class SourceExtractorTest method test_get_source_for_embedded_doc_without_metadata.
/**
 * Same scenario as the embedded-document source test, but the source is read through
 * {@code new SourceExtractor(true)} (per the test name, the mode that drops metadata —
 * confirm against {@code SourceExtractor}). Verifies that the returned stream is non-null
 * and that its length differs from 49779 bytes.
 *
 * @throws Exception if extraction, indexing or source retrieval fails
 */
@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
    // Plain HashMap rather than double-brace initialization (no anonymous subclass needed).
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));

    Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);

    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(),
            Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);

    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX,
            "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
            "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    // Fail with a clear assertion if the lookup found nothing, instead of failing
    // somewhere inside getSource.
    assertThat(attachedPdf).isNotNull();

    // try-with-resources closes the stream even when an assertion fails.
    try (InputStream source = new SourceExtractor(true).getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source).length).isNotEqualTo(49779);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class SourceExtractorTest method test_get_source_for_doc_and_pdf_with_without_metadata.
/**
 * Reads the source of {@code /docs/office_document.doc} twice: once through
 * {@code new SourceExtractor(false)} and once through {@code new SourceExtractor(true)}.
 * Verifies that both streams are non-null, that the first yields 9216 bytes and that the
 * second yields a different length.
 *
 * @throws IOException if reading or closing either source stream fails
 */
@Test
public void test_get_source_for_doc_and_pdf_with_without_metadata() throws IOException {
    Document document = new Document(project("project"),
            get(getClass().getResource("/docs/office_document.doc").getPath()), null, Language.ENGLISH,
            Charset.defaultCharset(), "application/msword", new HashMap<>(), Document.Status.INDEXED, 0L);

    // Both streams are opened before any assertion (as before) and are closed in
    // reverse order on exit, including when an assertion fails.
    try (InputStream inputStreamWithMetadata = new SourceExtractor(false).getSource(document);
         InputStream inputStreamWithoutMetadata = new SourceExtractor(true).getSource(document)) {
        assertThat(inputStreamWithMetadata).isNotNull();
        assertThat(inputStreamWithoutMetadata).isNotNull();
        assertThat(getBytes(inputStreamWithMetadata).length).isEqualTo(9216);
        assertThat(getBytes(inputStreamWithoutMetadata).length).isNotEqualTo(9216);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class SourceExtractorTest method test_get_source_for_root_doc.
/**
 * Builds a {@link Document} pointing at {@code /docs/embedded_doc.eml} and verifies that
 * {@link SourceExtractor#getSource} returns a non-null stream of 70574 bytes for it.
 *
 * @throws IOException if reading or closing the source stream fails
 */
@Test
public void test_get_source_for_root_doc() throws IOException {
    Document document = new Document(project("project"),
            get(getClass().getResource("/docs/embedded_doc.eml").getPath()), "it has been parsed",
            Language.FRENCH, Charset.defaultCharset(), "message/rfc822", new HashMap<>(),
            Document.Status.INDEXED, 45L);

    // try-with-resources closes the stream even when an assertion fails.
    try (InputStream source = new SourceExtractor().getSource(document)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source)).hasSize(70574);
    }
}
Aggregations