Search in sources :

Example 36 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_should_add_ner_pipeline_once_and_for_empty_list.

/**
 * Indexes a document whose NER tag set already contains {@code OPENNLP}, calls
 * {@code bulkAdd} for the same pipeline with an empty entity list, and verifies
 * that the stored document has status {@code "DONE"} and an {@code nerTags}
 * field containing exactly {@code "OPENNLP"}.
 *
 * @throws IOException propagated from the indexer / Elasticsearch client calls
 */
@Test
public void test_bulk_add_should_add_ner_pipeline_once_and_for_empty_list() throws IOException {
    // Build the tag set explicitly instead of via double-brace initialization, which would
    // create an anonymous HashSet subclass capturing a reference to this test instance.
    HashSet<Pipeline.Type> nerTags = new HashSet<>();
    nerTags.add(OPENNLP);
    Document doc = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, nerTags, 432L);
    indexer.add(TEST_INDEX, doc);

    // The pipeline is already present on the document and no entities are supplied.
    assertThat(indexer.bulkAdd(TEST_INDEX, OPENNLP, emptyList(), doc)).isTrue();

    GetResponse resp = es.client.get(new GetRequest(TEST_INDEX, doc.getId()), RequestOptions.DEFAULT);
    assertThat(resp.getSourceAsMap().get("status")).isEqualTo("DONE");
    // containsExactly guards against the pipeline name being appended a second time.
    assertThat((ArrayList<String>) resp.getSourceAsMap().get("nerTags")).containsExactly("OPENNLP");
}
Also used : ScriptType(org.elasticsearch.script.ScriptType) Type(org.icij.datashare.text.nlp.Pipeline.Type) GetRequest(org.elasticsearch.action.get.GetRequest) Document(org.icij.datashare.text.Document) GetResponse(org.elasticsearch.action.get.GetResponse) Test(org.junit.Test)

Example 37 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class SourceExtractorTest method test_get_source_for_embedded_doc.

/**
 * Extracts the {@code /docs/embedded_doc.eml} fixture, writes it to the test index,
 * fetches the attached PDF through the indexer using two hard-coded ids, and checks
 * that {@code SourceExtractor} returns a non-null stream of 49779 bytes for it.
 *
 * @throws Exception propagated from extraction, indexing or source retrieval
 */
@Test
public void test_get_source_for_embedded_doc() throws Exception {
    // Plain map instead of double-brace initialization (avoids an anonymous HashMap
    // subclass that captures this test instance).
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));

    Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);

    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(), Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);

    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca", "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    assertThat(attachedPdf).isNotNull();
    assertThat(attachedPdf.getContentType()).isEqualTo("application/pdf");

    // try-with-resources closes the stream even when an assertion fails;
    // a null resource is simply skipped on close, so the null check stays meaningful.
    try (InputStream source = new SourceExtractor().getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source)).hasSize(49779);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Publisher(org.icij.datashare.com.Publisher) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentFactory(org.icij.extract.document.DocumentFactory) UpdatableDigester(org.icij.extract.extractor.UpdatableDigester) FieldNames(org.icij.spewer.FieldNames) Extractor(org.icij.extract.extractor.Extractor) Test(org.junit.Test)

Example 38 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class SourceExtractorTest method test_get_source_for_embedded_doc_without_metadata.

/**
 * Same scenario as the embedded-document source test, but retrieves the source with
 * {@code new SourceExtractor(true)} and checks that the returned byte count differs
 * from 49779 (the flag presumably enables metadata filtering, per the test name —
 * confirm against {@code SourceExtractor}).
 *
 * @throws Exception propagated from extraction, indexing or source retrieval
 */
@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
    // Plain map instead of double-brace initialization (avoids an anonymous HashMap
    // subclass that captures this test instance).
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));

    Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);

    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(), Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);

    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca", "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    // Fail with a clear message if the attachment was not indexed, rather than
    // failing somewhere inside getSource.
    assertThat(attachedPdf).isNotNull();

    // try-with-resources closes the stream even when an assertion fails.
    try (InputStream source = new SourceExtractor(true).getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source).length).isNotEqualTo(49779);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Publisher(org.icij.datashare.com.Publisher) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentFactory(org.icij.extract.document.DocumentFactory) UpdatableDigester(org.icij.extract.extractor.UpdatableDigester) FieldNames(org.icij.spewer.FieldNames) Extractor(org.icij.extract.extractor.Extractor) Test(org.junit.Test)

Example 39 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class SourceExtractorTest method test_get_source_for_doc_and_pdf_with_without_metadata.

/**
 * Reads the {@code /docs/office_document.doc} fixture through
 * {@code new SourceExtractor(false)} and {@code new SourceExtractor(true)} and checks
 * that the first yields exactly 9216 bytes while the second yields a different size.
 *
 * @throws IOException propagated from source retrieval, reading or closing the streams
 */
@Test
public void test_get_source_for_doc_and_pdf_with_without_metadata() throws IOException {
    Document document = new Document(project("project"), get(getClass().getResource("/docs/office_document.doc").getPath()), null, Language.ENGLISH, Charset.defaultCharset(), "application/msword", new HashMap<>(), Document.Status.INDEXED, 0L);

    // Both streams are opened in the original order and closed automatically,
    // including when one of the assertions below fails.
    try (InputStream inputStreamWithMetadata = new SourceExtractor(false).getSource(document);
         InputStream inputStreamWithoutMetadata = new SourceExtractor(true).getSource(document)) {
        assertThat(inputStreamWithMetadata).isNotNull();
        assertThat(inputStreamWithoutMetadata).isNotNull();
        assertThat(getBytes(inputStreamWithMetadata).length).isEqualTo(9216);
        assertThat(getBytes(inputStreamWithoutMetadata).length).isNotEqualTo(9216);
    }
}
Also used : InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 40 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class SourceExtractorTest method test_get_source_for_root_doc.

/**
 * Builds a {@code Document} pointing at the {@code /docs/embedded_doc.eml} fixture and
 * checks that {@code SourceExtractor} returns a non-null stream of 70574 bytes for it.
 *
 * @throws IOException propagated from source retrieval, reading or closing the stream
 */
@Test
public void test_get_source_for_root_doc() throws IOException {
    Document document = new Document(project("project"), get(getClass().getResource("/docs/embedded_doc.eml").getPath()), "it has been parsed", Language.FRENCH, Charset.defaultCharset(), "message/rfc822", new HashMap<>(), Document.Status.INDEXED, 45L);

    // try-with-resources closes the stream even when an assertion fails.
    try (InputStream source = new SourceExtractor().getSource(document)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source)).hasSize(70574);
    }
}
Also used : InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Aggregations

Document (org.icij.datashare.text.Document)63 Test (org.junit.Test)48 PropertiesProvider (org.icij.datashare.PropertiesProvider)19 BatchSearch (org.icij.datashare.batch.BatchSearch)15 NamedEntity (org.icij.datashare.text.NamedEntity)11 TikaDocument (org.icij.extract.document.TikaDocument)10 HashMap (java.util.HashMap)9 Path (java.nio.file.Path)6 Date (java.util.Date)5 Indexer (org.icij.datashare.text.indexing.Indexer)5 File (java.io.File)4 IOException (java.io.IOException)4 InputStream (java.io.InputStream)4 IntStream (java.util.stream.IntStream)4 DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc)4 Project.project (org.icij.datashare.text.Project.project)4 User (org.icij.datashare.user.User)4 Rule (org.junit.Rule)4 Arrays.asList (java.util.Arrays.asList)3 List (java.util.List)3