Search in sources:

Example 1 with Indexer

use of org.icij.datashare.text.indexing.Indexer in project datashare by ICIJ.

the class NamedEntityResourceTest method test_hide_named_entity_when_success.

/**
 * Checks that a PUT on {@code /api/index/namedEntities/hide/to_update} answers 200
 * and that the indexer receives a bulk update, on index {@code "index"}, holding the
 * entity returned by the stubbed search.
 */
@Test
public void test_hide_named_entity_when_success() throws IOException {
    NamedEntity entity = create(PERSON, "to_update", asList(123L), "docId", "root", CORENLP, FRENCH);
    // Precondition: the freshly created entity is not hidden.
    assertThat(entity.isHidden()).isFalse();

    // Stub the whole search chain: indexer.search(...) hands out the stub searcher,
    // any field filter returns that same searcher, and executing it yields the entity.
    Indexer.Searcher searcherStub = mock(Indexer.Searcher.class);
    doReturn(searcherStub).when(indexer).search("index", NamedEntity.class);
    doReturn(searcherStub).when(searcherStub).thatMatchesFieldValue(any(), any());
    doReturn(Stream.of(entity)).when(searcherStub).execute();

    put("/api/index/namedEntities/hide/to_update").should().respond(200);

    // The entity found by the search must be the one sent back to the indexer.
    verify(indexer).bulkUpdate("index", singletonList(entity));
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Indexer(org.icij.datashare.text.indexing.Indexer) AbstractProdWebServerTest(org.icij.datashare.web.testhelpers.AbstractProdWebServerTest) Test(org.junit.Test)

Example 2 with Indexer

use of org.icij.datashare.text.indexing.Indexer in project datashare by ICIJ.

the class NlpAppTest method runNlpApp.

/**
 * Builds an {@link NlpApp} around the mocked pipeline, submits it to the test executor
 * and returns it after waiting at most two seconds on a latch that is counted down
 * through the callback handed to the application.
 *
 * <p>The outcome of the wait is not checked: the application is returned whether or
 * not the callback fired within the timeout.
 *
 * @param parallelism value stored under {@code NLP_PARALLELISM_OPT} in the app properties
 * @param nlpProcessDelayMillis pause, in milliseconds, applied on every stubbed
 *        {@code pipeline.process(...)} call; no pause when zero or negative
 * @return the application that was handed to the executor
 * @throws InterruptedException if the current thread is interrupted while waiting on the latch
 */
private NlpApp runNlpApp(String parallelism, int nlpProcessDelayMillis) throws InterruptedException {
    Properties appProperties = new Properties();
    appProperties.setProperty(NLP_PARALLELISM_OPT, parallelism);
    appProperties.setProperty("messageBusAddress", "redis");

    CountDownLatch callbackLatch = new CountDownLatch(1);

    // The stubbed pipeline optionally sleeps, then reports no named entity.
    when(pipeline.process(any())).thenAnswer((Answer<List<NamedEntity>>) invocation -> {
        if (nlpProcessDelayMillis > 0) {
            Thread.sleep(nlpProcessDelayMillis);
        }
        return emptyList();
    });

    // NOTE(review): the callback is presumably run by NlpApp once it is ready to consume - confirm.
    NlpApp app = new NlpApp(dataBus, indexer, pipeline, appProperties, callbackLatch::countDown, 1, true, local());
    executor.execute(app);
    callbackLatch.await(2, SECONDS);
    return app;
}
Also used : IntStream(java.util.stream.IntStream) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) INIT_MONITORING(org.icij.datashare.com.Message.Type.INIT_MONITORING) RunWith(org.junit.runner.RunWith) Matchers.anyString(org.mockito.Matchers.anyString) Answer(org.mockito.stubbing.Answer) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) Arrays.asList(java.util.Arrays.asList) After(org.junit.After) Parameterized(org.junit.runners.Parameterized) ExecutorService(java.util.concurrent.ExecutorService) Before(org.junit.Before) NLP_PARALLELISM_OPT(org.icij.datashare.cli.DatashareCliOptions.NLP_PARALLELISM_OPT) Field(org.icij.datashare.com.Message.Field) AbstractPipeline(org.icij.datashare.text.nlp.AbstractPipeline) Properties(java.util.Properties) PropertiesProvider(org.icij.datashare.PropertiesProvider) Collections.emptyList(java.util.Collections.emptyList) EXTRACT_NLP(org.icij.datashare.com.Message.Type.EXTRACT_NLP) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) org.icij.datashare.com(org.icij.datashare.com) Collection(java.util.Collection) Indexer(org.icij.datashare.text.indexing.Indexer) Test(org.junit.Test) OPENNLP(org.icij.datashare.text.nlp.Pipeline.Type.OPENNLP) Executors(java.util.concurrent.Executors) User.local(org.icij.datashare.user.User.local) Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) Mockito(org.mockito.Mockito) List(java.util.List) Language(org.icij.datashare.text.Language) SECONDS(java.util.concurrent.TimeUnit.SECONDS) NamedEntity(org.icij.datashare.text.NamedEntity) Arrays.asList(java.util.Arrays.asList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) Properties(java.util.Properties) CountDownLatch(java.util.concurrent.CountDownLatch)

Example 3 with Indexer

use of org.icij.datashare.text.indexing.Indexer in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_run_batch_search_truncate_to_60k_max_results.

/**
 * Feeds the mocked search with {@code MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1}
 * pages of {@code MAX_SCROLL_SIZE} documents and checks that the value returned by
 * the batch search runner stays below 60000.
 */
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    // One full page of documents named doc0, doc1, ...
    Document[] fullPage = IntStream.range(0, MAX_SCROLL_SIZE)
            .mapToObj(i -> createDoc("doc" + i).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1, fullPage);

    BatchSearch queuedSearch = new BatchSearch("uuid1", project("test-datashare"), "name", "desc",
            asSet("query"), new Date(), BatchSearch.State.QUEUED, local());
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), queuedSearch, resultConsumer);

    assertThat(runner.call()).isLessThan(60000);
}
Also used : IntStream(java.util.stream.IntStream) MAX_SCROLL_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_SCROLL_SIZE) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) Date(java.util.Date) Assert.assertThrows(org.junit.Assert.assertThrows) HashMap(java.util.HashMap) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) Arrays.asList(java.util.Arrays.asList) SearchException(org.icij.datashare.batch.SearchException) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) DatashareTimeRule(org.icij.datashare.test.DatashareTimeRule) BATCH_SEARCH_MAX_TIME(org.icij.datashare.cli.DatashareCliOptions.BATCH_SEARCH_MAX_TIME) ExecutorService(java.util.concurrent.ExecutorService) Before(org.junit.Before) BATCH_THROTTLE(org.icij.datashare.cli.DatashareCliOptions.BATCH_THROTTLE) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) Indexer(org.icij.datashare.text.indexing.Indexer) CollectionUtils.asSet(org.icij.datashare.CollectionUtils.asSet) Test(org.junit.Test) MAX_BATCH_RESULT_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_BATCH_RESULT_SIZE) Document(org.icij.datashare.text.Document) Executors(java.util.concurrent.Executors) User.local(org.icij.datashare.user.User.local) TimeUnit(java.util.concurrent.TimeUnit) Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) Mockito(org.mockito.Mockito) List(java.util.List) Rule(org.junit.Rule) BatchSearch(org.icij.datashare.batch.BatchSearch) TerFunction(org.icij.datashare.function.TerFunction) PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 4 with Indexer

use of org.icij.datashare.text.indexing.Indexer in project datashare by ICIJ.

the class CliApp method runTaskRunner.

/**
 * Runs the command-line workflow described by {@code properties}.
 *
 * <p>The method first handles the "one-shot" options, each of which ends the JVM with
 * status 0 once served: nothing to resume, index creation, and API key
 * generation / lookup / deletion. Otherwise it starts one task per pipeline stage
 * present ({@code DEDUPLICATE}, {@code SCANIDX}, {@code SCAN}, {@code INDEX},
 * {@code NLP}), waits for the task manager to terminate and closes the indexer.
 *
 * @param injector source of the task manager, task factory, indexer, document queue
 *        and pipeline registry instances
 * @param properties command-line options driving which branches are taken
 * @throws Exception any failure raised by the queue, the indexer or the tasks
 */
private static void runTaskRunner(Injector injector, Properties properties) throws Exception {
    TaskManagerMemory taskManager = injector.getInstance(TaskManagerMemory.class);
    TaskFactory taskFactory = injector.getInstance(TaskFactory.class);
    Set<Pipeline.Type> nlpPipelines = parseAll(properties.getProperty(DatashareCliOptions.NLP_PIPELINES_OPT));
    Indexer indexer = injector.getInstance(Indexer.class);
    if (resume(properties)) {
        RedisUserDocumentQueue queue = new RedisUserDocumentQueue(nullUser(), new PropertiesProvider(properties));
        boolean queueIsEmpty;
        try {
            queueIsEmpty = queue.isEmpty();
        } finally {
            // Always release the queue, even when isEmpty() fails; otherwise it would stay open.
            queue.close();
        }
        // Nothing to resume when the search built with without(nlpPipelines) is empty
        // and the document queue is empty as well.
        if (indexer.search(properties.getProperty("defaultProject"), Document.class).withSource(false).without(nlpPipelines.toArray(new Pipeline.Type[] {})).execute().count() == 0 && queueIsEmpty) {
            logger.info("nothing to resume, exiting normally");
            System.exit(0);
        }
    }
    // One-shot option: create the requested index and exit.
    if (properties.getProperty(CREATE_INDEX_OPT) != null) {
        indexer.createIndex(properties.getProperty(CREATE_INDEX_OPT));
        System.exit(0);
    }
    // One-shot option: generate an API key for the given user, log it, and exit.
    if (properties.getProperty(CRE_API_KEY_OPT) != null) {
        String userName = properties.getProperty(CRE_API_KEY_OPT);
        String secretKey = taskFactory.createGenApiKey(localUser(userName)).call();
        logger.info("generated secret key for user {} (store it somewhere safe, datashare cannot retrieve it later): {}", userName, secretKey);
        System.exit(0);
    }
    // One-shot option: log the hashed API key of the given user (or its absence) and exit.
    if (properties.getProperty(GET_API_KEY_OPT) != null) {
        String userName = properties.getProperty(GET_API_KEY_OPT);
        String hashedKey = taskFactory.createGetApiKey(localUser(userName)).call();
        if ((hashedKey == null)) {
            logger.info("no user {} exists", userName);
        } else {
            logger.info("hashed key for user {} is {}", userName, hashedKey);
        }
        System.exit(0);
    }
    // One-shot option: delete the API key of the given user and exit.
    if (properties.getProperty(DEL_API_KEY_OPT) != null) {
        String userName = properties.getProperty(DEL_API_KEY_OPT);
        taskFactory.createDelApiKey(localUser(userName)).call();
        System.exit(0);
    }
    // From here on: start one task per stage present in the pipeline.
    PipelineHelper pipeline = new PipelineHelper(new PropertiesProvider(properties));
    if (pipeline.has(DatashareCli.Stage.DEDUPLICATE)) {
        taskManager.startTask(taskFactory.createDeduplicateTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.DEDUPLICATE)));
    }
    if (pipeline.has(DatashareCli.Stage.SCANIDX)) {
        // The report map name falls back to "extract:report" when the option is absent.
        TaskView<Long> taskView = taskManager.startTask(taskFactory.createScanIndexTask(nullUser(), ofNullable(properties.getProperty(MAP_NAME_OPTION)).orElse("extract:report")));
        logger.info("scanned {}", taskView.getResult(true));
    }
    // The scan stage is skipped when resuming.
    if (pipeline.has(DatashareCli.Stage.SCAN) && !resume(properties)) {
        taskManager.startTask(taskFactory.createScanTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.SCAN), Paths.get(properties.getProperty(DatashareCliOptions.DATA_DIR_OPT)), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.INDEX)) {
        taskManager.startTask(taskFactory.createIndexTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.INDEX), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.NLP)) {
        // One NLP task per configured pipeline type.
        for (Pipeline.Type nlp : nlpPipelines) {
            Pipeline pipelineClass = injector.getInstance(PipelineRegistry.class).get(nlp);
            taskManager.startTask(taskFactory.createNlpTask(nullUser(), pipelineClass));
        }
        if (resume(properties)) {
            taskManager.startTask(taskFactory.createResumeNlpTask(nullUser(), nlpPipelines));
        }
    }
    taskManager.shutdownAndAwaitTermination(Integer.MAX_VALUE, SECONDS);
    indexer.close();
}
Also used : DocumentQueue(org.icij.extract.queue.DocumentQueue) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue) TaskManagerMemory(org.icij.datashare.tasks.TaskManagerMemory) PipelineRegistry(org.icij.datashare.extension.PipelineRegistry) Pipeline(org.icij.datashare.text.nlp.Pipeline) Indexer(org.icij.datashare.text.indexing.Indexer) TaskFactory(org.icij.datashare.tasks.TaskFactory) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue)

Example 5 with Indexer

use of org.icij.datashare.text.indexing.Indexer in project datashare by ICIJ.

the class DocumentIngestor method main.

/**
 * Entry point of the ingestion tool: reads the command-line options, builds an
 * Elasticsearch-backed indexer, runs a {@code DocumentProducer} on the calling thread
 * and then submits one {@code DocumentConsumer} per thread to a fixed thread pool.
 *
 * <p>The pool is shut down right after submission; this method does not wait for the
 * consumers to finish.
 *
 * @param args command-line arguments, parsed by {@code parseArgs}
 */
public static void main(String[] args) {
    OptionSet optionSet = parseArgs(args);
    Integer nbThreads = (Integer) optionSet.valueOf("t");
    Integer bulkSize = (Integer) optionSet.valueOf("bulkSize");
    String elasticsearchUrl = (String) optionSet.valueOf("elasticsearchAddress");
    String indexName = (String) optionSet.valueOf("indexName");
    Integer nbDocuments = (Integer) optionSet.valueOf("nbDocuments");
    // Plain map instead of double-brace initialization, which would create a needless
    // anonymous HashMap subclass just to hold a single entry.
    HashMap<String, String> indexerProperties = new HashMap<>();
    indexerProperties.put(INDEX_ADDRESS_PROP, elasticsearchUrl);
    PropertiesProvider propertiesProvider = new PropertiesProvider(indexerProperties);
    Indexer indexer = new ElasticsearchIndexer(ElasticsearchConfiguration.createESClient(propertiesProvider), propertiesProvider);
    logger.info("ingest {} documents in elasticsearch {} with bulk of {} and {} threads", nbDocuments, elasticsearchUrl, bulkSize, nbThreads);
    executorService = Executors.newFixedThreadPool(nbThreads);
    // The producer runs to completion on this thread before any consumer is submitted.
    new DocumentProducer(nbDocuments, bulkSize).run();
    IntStream.range(0, nbThreads).forEach(n -> executorService.submit(new DocumentConsumer(indexer, indexName)));
    // Stop accepting new work; consumers that were already submitted keep running.
    executorService.shutdown();
}
Also used : ElasticsearchIndexer(org.icij.datashare.text.indexing.elasticsearch.ElasticsearchIndexer) Indexer(org.icij.datashare.text.indexing.Indexer) OptionSet(joptsimple.OptionSet) ElasticsearchIndexer(org.icij.datashare.text.indexing.elasticsearch.ElasticsearchIndexer)

Aggregations

Indexer (org.icij.datashare.text.indexing.Indexer): 7, Test (org.junit.Test): 5, IntStream (java.util.stream.IntStream): 4, Assertions.assertThat (org.fest.assertions.Assertions.assertThat): 4, PropertiesProvider (org.icij.datashare.PropertiesProvider): 4, DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc): 4, Before (org.junit.Before): 4, Mock (org.mockito.Mock): 4, MockitoAnnotations.initMocks (org.mockito.MockitoAnnotations.initMocks): 4, HashMap (java.util.HashMap): 3, Document (org.icij.datashare.text.Document): 3, Project.project (org.icij.datashare.text.Project.project): 3, User (org.icij.datashare.user.User): 3, Rule (org.junit.Rule): 3, File (java.io.File): 2, IOException (java.io.IOException): 2, String.valueOf (java.lang.String.valueOf): 2, StandardCharsets (java.nio.charset.StandardCharsets): 2, Files (java.nio.file.Files): 2, Path (java.nio.file.Path): 2