Search in sources:

Example 41 with PropertiesProvider

Use of org.icij.datashare.PropertiesProvider in project datashare by ICIJ.

From the class SourceExtractorTest, method test_get_source_for_embedded_doc.

/**
 * Extracts the {@code /docs/embedded_doc.eml} fixture, writes it to the test index and checks that
 * the source of its embedded PDF can be read back through {@code SourceExtractor}.
 *
 * @throws Exception if extraction, indexing or source retrieval fails
 */
@Test
public void test_get_source_for_embedded_doc() throws Exception {
    // Plain map instead of double-brace initialization: no anonymous HashMap subclass is created.
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));
    // URL.getPath() keeps percent-escapes (e.g. %20 for a space), which yields a non-existent path;
    // converting through a URI decodes them.
    // NOTE(review): assumes `get` is the statically imported java.nio.file.Paths.get, which has a URI overload.
    Path path = get(getClass().getResource("/docs/embedded_doc.eml").toURI());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);
    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(), Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);
    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca", "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    assertThat(attachedPdf).isNotNull();
    assertThat(attachedPdf.getContentType()).isEqualTo("application/pdf");
    // try-with-resources closes the stream even when an assertion fails; a null resource is skipped.
    try (InputStream source = new SourceExtractor().getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source)).hasSize(49779);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Publisher(org.icij.datashare.com.Publisher) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentFactory(org.icij.extract.document.DocumentFactory) UpdatableDigester(org.icij.extract.extractor.UpdatableDigester) FieldNames(org.icij.spewer.FieldNames) Extractor(org.icij.extract.extractor.Extractor) Test(org.junit.Test)

Example 42 with PropertiesProvider

Use of org.icij.datashare.PropertiesProvider in project datashare by ICIJ.

From the class SourceExtractorTest, method test_get_source_for_embedded_doc_without_metadata.

/**
 * Extracts the {@code /docs/embedded_doc.eml} fixture, writes it to the test index and checks that
 * the embedded PDF read back through {@code new SourceExtractor(true)} does not have the 49779-byte
 * size expected from the default extractor.
 *
 * @throws Exception if extraction, indexing or source retrieval fails
 */
@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
    // Plain map instead of double-brace initialization: no anonymous HashMap subclass is created.
    HashMap<String, String> tikaOptions = new HashMap<>();
    tikaOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(tikaOptions));
    // URL.getPath() keeps percent-escapes (e.g. %20 for a space), which yields a non-existent path;
    // converting through a URI decodes them.
    // NOTE(review): assumes `get` is the statically imported java.nio.file.Paths.get, which has a URI overload.
    Path path = get(getClass().getResource("/docs/embedded_doc.eml").toURI());
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);
    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(), Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);
    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX, "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca", "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    // Fail with a clear message here rather than somewhere inside getSource if the document was not indexed.
    assertThat(attachedPdf).isNotNull();
    // NOTE(review): the `true` flag presumably enables metadata filtering (per the test name) — confirm.
    // try-with-resources closes the stream even when an assertion fails; a null resource is skipped.
    try (InputStream source = new SourceExtractor(true).getSource(project(TEST_INDEX), attachedPdf)) {
        assertThat(source).isNotNull();
        assertThat(getBytes(source).length).isNotEqualTo(49779);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) InputStream(java.io.InputStream) TikaDocument(org.icij.extract.document.TikaDocument) Publisher(org.icij.datashare.com.Publisher) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentFactory(org.icij.extract.document.DocumentFactory) UpdatableDigester(org.icij.extract.extractor.UpdatableDigester) FieldNames(org.icij.spewer.FieldNames) Extractor(org.icij.extract.extractor.Extractor) Test(org.junit.Test)

Example 43 with PropertiesProvider

Use of org.icij.datashare.PropertiesProvider in project datashare by ICIJ.

From the class CommonMode, method configure.

/**
 * Registers the bindings shared by this mode: properties, language guesser, batch queues,
 * Elasticsearch client and indexer, task management, document collections, data bus and
 * the NLP pipeline registry.
 *
 * The {@code queueType} and {@code busType} properties select the in-memory implementations
 * when set to {@code "memory"}; any other value (or none) selects the Redis ones.
 */
@Override
protected void configure() {
    bind(PropertiesProvider.class).toInstance(propertiesProvider);
    bind(LanguageGuesser.class).to(OptimaizeLanguageGuesser.class);

    // Both batch queues are built from the same implementation class name.
    final String queueClassName = propertiesProvider.get("batchQueueType").orElse("org.icij.datashare.extract.MemoryBlockingQueue");
    bind(new TypeLiteral<BlockingQueue<String>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueClassName, "ds:batchsearch:queue"));
    bind(new TypeLiteral<BlockingQueue<BatchDownload>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueClassName, "ds:batchdownload:queue"));

    final RestHighLevelClient elasticsearchClient = createESClient(propertiesProvider);
    bind(RestHighLevelClient.class).toInstance(elasticsearchClient);
    bind(Indexer.class).to(ElasticsearchIndexer.class).asEagerSingleton();

    bind(TaskManagerMemory.class).toInstance(new TaskManagerMemory(propertiesProvider));
    install(new FactoryModuleBuilder().build(TaskFactory.class));

    // Document collections: Redis-backed factory unless the memory queue type is requested.
    if (!"memory".equals(propertiesProvider.getProperties().get("queueType"))) {
        install(new FactoryModuleBuilder().implement(DocumentQueue.class, RedisUserDocumentQueue.class).implement(ReportMap.class, RedisUserReportMap.class).build(DocumentCollectionFactory.class));
    } else {
        bind(DocumentCollectionFactory.class).to(MemoryDocumentCollectionFactory.class).asEagerSingleton();
    }

    // The same bus instance is exposed both as DataBus and as Publisher.
    final DataBus bus = "memory".equals(propertiesProvider.getProperties().get("busType"))
            ? new MemoryDataBus()
            : new RedisDataBus(propertiesProvider);
    bind(DataBus.class).toInstance(bus);
    bind(Publisher.class).toInstance(bus);

    final PipelineRegistry registry = new PipelineRegistry(propertiesProvider);
    registry.register(EmailPipeline.class);
    registry.register(Pipeline.Type.CORENLP);
    try {
        registry.load();
    } catch (FileNotFoundException notFound) {
        // A FileNotFoundException from load() is logged at info level and the registry is still bound.
        LoggerFactory.getLogger(getClass()).info("extensions dir not found " + notFound.getMessage());
    }
    bind(PipelineRegistry.class).toInstance(registry);
}
Also used : BatchDownload(org.icij.datashare.batch.BatchDownload) FactoryModuleBuilder(com.google.inject.assistedinject.FactoryModuleBuilder) FileNotFoundException(java.io.FileNotFoundException) MemoryDataBus(org.icij.datashare.com.MemoryDataBus) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue) DocumentQueue(org.icij.extract.queue.DocumentQueue) RestHighLevelClient(org.elasticsearch.client.RestHighLevelClient) DataBus(org.icij.datashare.com.DataBus) MemoryDataBus(org.icij.datashare.com.MemoryDataBus) RedisDataBus(org.icij.datashare.com.RedisDataBus) Publisher(org.icij.datashare.com.Publisher) TaskManagerMemory(org.icij.datashare.tasks.TaskManagerMemory) PipelineRegistry(org.icij.datashare.extension.PipelineRegistry) PropertiesProvider(org.icij.datashare.PropertiesProvider) MemoryDocumentCollectionFactory(org.icij.datashare.tasks.MemoryDocumentCollectionFactory) TypeLiteral(com.google.inject.TypeLiteral) MemoryDocumentCollectionFactory(org.icij.datashare.tasks.MemoryDocumentCollectionFactory) DocumentCollectionFactory(org.icij.datashare.tasks.DocumentCollectionFactory) TaskFactory(org.icij.datashare.tasks.TaskFactory) RedisDataBus(org.icij.datashare.com.RedisDataBus) OptimaizeLanguageGuesser(org.icij.datashare.nlp.OptimaizeLanguageGuesser) LanguageGuesser(org.icij.datashare.text.indexing.LanguageGuesser) ElasticsearchIndexer(org.icij.datashare.text.indexing.elasticsearch.ElasticsearchIndexer) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue)

Example 44 with PropertiesProvider

Use of org.icij.datashare.PropertiesProvider in project datashare by ICIJ.

From the class ElasticsearchConfigurationTest, method test_create_client_creates_settings.

/**
 * Creates a client through {@code ElasticsearchConfiguration.createESClient} and checks that a
 * GET on the test index returns a body containing {@code "settings"}.
 *
 * @throws Exception if the client cannot be created or closed, or the request fails
 */
@Test
public void test_create_client_creates_settings() throws Exception {
    // The created client is not used directly, but it must still be closed so it does not
    // outlive the test. AutoCloseable is used as the resource type to avoid a new import.
    // NOTE(review): assumes createESClient returns a closeable RestHighLevelClient — confirm.
    try (AutoCloseable createdClient = ElasticsearchConfiguration.createESClient(new PropertiesProvider())) {
        Response response = es.client.getLowLevelClient().performRequest(new Request("GET", TEST_INDEX));
        assertThat(EntityUtils.toString(response.getEntity())).contains("settings");
    }
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) Response(org.elasticsearch.client.Response) Request(org.elasticsearch.client.Request) Test(org.junit.Test)

Example 45 with PropertiesProvider

Use of org.icij.datashare.PropertiesProvider in project datashare by ICIJ.

From the class ElasticsearchSpewerTest, method test_truncated_content_if_document_is_smaller_than_limit.

/**
 * Writes an 18-character document through a spewer configured with {@code maxContentLength} = 20
 * and checks that the stored {@code content} field holds the whole text.
 *
 * @throws Exception if writing the document or fetching it back fails
 */
@Test
public void test_truncated_content_if_document_is_smaller_than_limit() throws Exception {
    // Spewer settings: content limit above the length of the text written below.
    HashMap<String, String> spewerSettings = new HashMap<>();
    spewerSettings.put("maxContentLength", "20");
    ElasticsearchSpewer limitedContentSpewer = new ElasticsearchSpewer(es.client, text -> Language.ENGLISH, new FieldNames(), publisher, new PropertiesProvider(spewerSettings))
            .withRefresh(IMMEDIATE)
            .withIndex("test-datashare");

    TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("ok-file.txt"));
    document.setReader(new ParsingReader(new ByteArrayInputStream("this content is ok".getBytes())));
    limitedContentSpewer.write(document);

    GetRequest lookup = new GetRequest(TEST_INDEX, document.getId());
    GetResponse documentFields = es.client.get(lookup, RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap()).includes(entry("content", "this content is ok"));
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentFactory(org.icij.extract.document.DocumentFactory) FieldNames(org.icij.spewer.FieldNames) HashMap(java.util.HashMap) ParsingReader(org.apache.tika.parser.ParsingReader) ByteArrayInputStream(java.io.ByteArrayInputStream) GetRequest(org.elasticsearch.action.get.GetRequest) PathIdentifier(org.icij.extract.document.PathIdentifier) TikaDocument(org.icij.extract.document.TikaDocument) GetResponse(org.elasticsearch.action.get.GetResponse) Test(org.junit.Test)

Aggregations

PropertiesProvider (org.icij.datashare.PropertiesProvider): 73, Test (org.junit.Test): 44, HashMap (java.util.HashMap): 27, Document (org.icij.datashare.text.Document): 18, BatchSearch (org.icij.datashare.batch.BatchSearch): 17, LocalUserFilter (org.icij.datashare.session.LocalUserFilter): 15, AbstractProdWebServerTest (org.icij.datashare.web.testhelpers.AbstractProdWebServerTest): 15, Path (java.nio.file.Path): 11, Before (org.junit.Before): 10, BatchDownload (org.icij.datashare.batch.BatchDownload): 9, Publisher (org.icij.datashare.com.Publisher): 8, Indexer (org.icij.datashare.text.indexing.Indexer): 7, User (org.icij.datashare.user.User): 7, Date (java.util.Date): 6, Properties (java.util.Properties): 6, RestAssert (net.codestory.rest.RestAssert): 5, PipelineRegistry (org.icij.datashare.extension.PipelineRegistry): 5, DocumentFactory (org.icij.extract.document.DocumentFactory): 5, TikaDocument (org.icij.extract.document.TikaDocument): 5, FieldNames (org.icij.spewer.FieldNames): 5