Search in sources :

Example 1 with Pipeline

use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.

the class TaskResource method extractNlp.

/**
 * Find names using the given pipeline :
 *
 * - OPENNLP
 * - CORENLP
 * - IXAPIPE
 * - GATENLP
 * - MITIE
 *
 * This endpoint is going to find all Documents that are not tagged with the given pipeline,
 * and extract named entities for all these documents.
 *
 * The request options are merged over the server properties. Two of them are read here:
 * "syncModels" (default "true") is forwarded to syncModels(), and "resume" (default "true")
 * decides whether a resume task is started in addition to the NLP task.
 *
 * @param pipelineName name of the pipeline, parsed with Pipeline.Type.parse
 * @param optionsWrapper options that override the default properties for this request
 * @param context request context, used to get the current user
 * @return 200 and the list of created tasks : the resume task followed by the NLP task
 *         when "resume" is true, only the NLP task otherwise
 *
 * Example :
 * $(curl -XPOST http://dsenv:8080/api/task/findNames/CORENLP -d {})
 */
@Post("/findNames/:pipeline")
public List<TaskView<?>> extractNlp(final String pipelineName, final OptionsWrapper<String> optionsWrapper, Context context) {
    Properties mergedProps = propertiesProvider.createOverriddenWith(optionsWrapper.getOptions());
    syncModels(parseBoolean(mergedProps.getProperty("syncModels", "true")));
    // Parse the pipeline name once and reuse the result for both the NLP and the resume task.
    Pipeline.Type pipelineType = Pipeline.Type.parse(pipelineName);
    Pipeline pipeline = pipelineRegistry.get(pipelineType);
    TaskView<Void> nlpTask = createNlpApp(context, mergedProps, pipeline);
    if (parseBoolean(mergedProps.getProperty("resume", "true"))) {
        // A plain HashSet is used instead of double-brace initialization: the anonymous
        // subclass would keep a hidden reference to this resource inside the started task.
        TaskView<Long> resumeNlpTask = taskManager.startTask(taskFactory.createResumeNlpTask((User) context.currentUser(), new HashSet<Pipeline.Type>(singletonList(pipelineType))));
        return asList(resumeNlpTask, nlpTask);
    }
    return singletonList(nlpTask);
}
Also used : User(org.icij.datashare.user.User) Pipeline(org.icij.datashare.text.nlp.Pipeline)

Example 2 with Pipeline

use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.

the class TaskResourceTest method test_findNames_with_options_should_merge_with_property_provider.

// Posting options to findNames must forward them (merged with the provider's properties)
// to the NLP task, and still create the resume task for the requested pipeline.
@Test
public void test_findNames_with_options_should_merge_with_property_provider() {
    ArgumentCaptor<Properties> capturedProperties = ArgumentCaptor.forClass(Properties.class);
    ArgumentCaptor<Pipeline> capturedPipeline = ArgumentCaptor.forClass(Pipeline.class);

    post("/api/task/findNames/EMAIL", "{\"options\":{\"waitForNlpApp\": false, \"key\":\"val\",\"foo\":\"loo\"}}")
            .should().haveType("application/json");

    verify(taskFactory).createResumeNlpTask(local(), singleton(Pipeline.Type.EMAIL));
    verify(taskFactory).createNlpTask(eq(local()), capturedPipeline.capture(), capturedProperties.capture(), any());

    // The options sent in the request body must be present in the properties given to the task.
    Properties taskProperties = capturedProperties.getValue();
    assertThat(taskProperties).includes(entry("key", "val"), entry("foo", "loo"));

    Pipeline.Type usedPipelineType = capturedPipeline.getValue().getType();
    assertThat(usedPipelineType).isEqualTo(Pipeline.Type.EMAIL);
}
Also used : Properties(java.util.Properties) RestAssert(net.codestory.rest.RestAssert) EmailPipeline(org.icij.datashare.nlp.EmailPipeline) Pipeline(org.icij.datashare.text.nlp.Pipeline) AbstractProdWebServerTest(org.icij.datashare.web.testhelpers.AbstractProdWebServerTest)

Example 3 with Pipeline

use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.

the class NerResource method getAnnotations.

/**
 * When datashare is launched in NER mode (without index) it exposes a name finding HTTP API.
 * The text to analyse is sent in the HTTP body; its language is guessed before the pipeline runs.
 *
 * @param pipeline name of the pipeline to use
 * @param text to analyse in the request body
 * @return list of NamedEntities annotations, or an empty list when the pipeline
 *         could not be initialized for the guessed language
 *
 * Example :
 * $(curl -XPOST http://dsenv:8080/api/ner/findNames/CORENLP -d "Please find attached a PDF copy of the advance tax clearance obtained for our client John Doe.")
 */
@Post("/findNames/:pipeline")
public List<NamedEntity> getAnnotations(final String pipeline, String text) throws Exception {
    // Logs the class loader of this resource on every call.
    LoggerFactory.getLogger(getClass()).info(String.valueOf(getClass().getClassLoader()));
    Pipeline nlpPipeline = pipelineRegistry.get(Pipeline.Type.parse(pipeline));
    Language guessedLanguage = languageGuesser.guess(text);
    if (!nlpPipeline.initialize(guessedLanguage)) {
        return emptyList();
    }
    // The text is wrapped in an in-memory document named "inline" before being processed.
    return nlpPipeline.process(DocumentBuilder.createDoc("inline").with(text).with(guessedLanguage).build());
}
Also used : Language(org.icij.datashare.text.Language) Pipeline(org.icij.datashare.text.nlp.Pipeline) Post(net.codestory.http.annotations.Post)

Example 4 with Pipeline

use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.

the class CliApp method runTaskRunner.

/**
 * Runs the command line tasks selected by the given properties.
 *
 * The steps are evaluated in this order :
 *
 * - in resume mode, exits with status 0 when the document queue is empty and the search for
 *   documents without the requested NLP pipelines returns no result
 * - one-shot commands (create index, create / get / delete API key), each of which exits
 *   with status 0 once done
 * - the pipeline stages (DEDUPLICATE, SCANIDX, SCAN, INDEX, NLP) that the PipelineHelper reports
 *
 * Finally it waits for the task manager to terminate and closes the indexer.
 *
 * @param injector used to get the task manager, task factory, indexer, queues and pipeline registry
 * @param properties command line options
 * @throws Exception when one of the called components fails
 */
private static void runTaskRunner(Injector injector, Properties properties) throws Exception {
    TaskManagerMemory taskManager = injector.getInstance(TaskManagerMemory.class);
    TaskFactory taskFactory = injector.getInstance(TaskFactory.class);
    Set<Pipeline.Type> nlpPipelines = parseAll(properties.getProperty(DatashareCliOptions.NLP_PIPELINES_OPT));
    Indexer indexer = injector.getInstance(Indexer.class);
    if (resume(properties)) {
        RedisUserDocumentQueue queue = new RedisUserDocumentQueue(nullUser(), new PropertiesProvider(properties));
        boolean queueIsEmpty;
        try {
            queueIsEmpty = queue.isEmpty();
        } finally {
            // Close the queue even when isEmpty() fails, so that it is not leaked.
            queue.close();
        }
        if (indexer.search(properties.getProperty("defaultProject"), Document.class).withSource(false).without(nlpPipelines.toArray(new Pipeline.Type[] {})).execute().count() == 0 && queueIsEmpty) {
            logger.info("nothing to resume, exiting normally");
            System.exit(0);
        }
    }
    // One-shot commands : each of them terminates the JVM once it is done.
    if (properties.getProperty(CREATE_INDEX_OPT) != null) {
        indexer.createIndex(properties.getProperty(CREATE_INDEX_OPT));
        System.exit(0);
    }
    if (properties.getProperty(CRE_API_KEY_OPT) != null) {
        String userName = properties.getProperty(CRE_API_KEY_OPT);
        String secretKey = taskFactory.createGenApiKey(localUser(userName)).call();
        logger.info("generated secret key for user {} (store it somewhere safe, datashare cannot retrieve it later): {}", userName, secretKey);
        System.exit(0);
    }
    if (properties.getProperty(GET_API_KEY_OPT) != null) {
        String userName = properties.getProperty(GET_API_KEY_OPT);
        String hashedKey = taskFactory.createGetApiKey(localUser(userName)).call();
        if (hashedKey == null) {
            logger.info("no user {} exists", userName);
        } else {
            logger.info("hashed key for user {} is {}", userName, hashedKey);
        }
        System.exit(0);
    }
    if (properties.getProperty(DEL_API_KEY_OPT) != null) {
        String userName = properties.getProperty(DEL_API_KEY_OPT);
        taskFactory.createDelApiKey(localUser(userName)).call();
        System.exit(0);
    }
    // Pipeline stages : only the stages reported by the helper are started.
    PipelineHelper pipeline = new PipelineHelper(new PropertiesProvider(properties));
    if (pipeline.has(DatashareCli.Stage.DEDUPLICATE)) {
        taskManager.startTask(taskFactory.createDeduplicateTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.DEDUPLICATE)));
    }
    if (pipeline.has(DatashareCli.Stage.SCANIDX)) {
        TaskView<Long> taskView = taskManager.startTask(taskFactory.createScanIndexTask(nullUser(), ofNullable(properties.getProperty(MAP_NAME_OPTION)).orElse("extract:report")));
        logger.info("scanned {}", taskView.getResult(true));
    }
    // The scan stage is skipped in resume mode.
    if (pipeline.has(DatashareCli.Stage.SCAN) && !resume(properties)) {
        taskManager.startTask(taskFactory.createScanTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.SCAN), Paths.get(properties.getProperty(DatashareCliOptions.DATA_DIR_OPT)), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.INDEX)) {
        taskManager.startTask(taskFactory.createIndexTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.INDEX), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.NLP)) {
        // One NLP task per requested pipeline, plus a resume task in resume mode.
        for (Pipeline.Type nlp : nlpPipelines) {
            Pipeline pipelineClass = injector.getInstance(PipelineRegistry.class).get(nlp);
            taskManager.startTask(taskFactory.createNlpTask(nullUser(), pipelineClass));
        }
        if (resume(properties)) {
            taskManager.startTask(taskFactory.createResumeNlpTask(nullUser(), nlpPipelines));
        }
    }
    taskManager.shutdownAndAwaitTermination(Integer.MAX_VALUE, SECONDS);
    indexer.close();
}
Also used : DocumentQueue(org.icij.extract.queue.DocumentQueue) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue) TaskManagerMemory(org.icij.datashare.tasks.TaskManagerMemory) PipelineRegistry(org.icij.datashare.extension.PipelineRegistry) Pipeline(org.icij.datashare.text.nlp.Pipeline) Indexer(org.icij.datashare.text.indexing.Indexer) TaskFactory(org.icij.datashare.tasks.TaskFactory) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue)

Example 5 with Pipeline

use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.

the class TaskResourceTest method test_findNames_should_create_resume.

// Posting to findNames must start two tasks : a resume task and an NLP task for the
// requested pipeline, the latter receiving the default properties plus the posted option.
@Test
public void test_findNames_should_create_resume() {
    ArgumentCaptor<Pipeline> capturedPipeline = ArgumentCaptor.forClass(Pipeline.class);

    post("/api/task/findNames/EMAIL", "{\"options\":{\"waitForNlpApp\": false}}")
            .should().haveType("application/json");

    List<String> startedTaskNames = taskManager.waitTasksToBeDone(1, SECONDS)
            .stream()
            .map(task -> task.name)
            .collect(toList());
    assertThat(startedTaskNames.size()).isEqualTo(2);

    verify(taskFactory).createResumeNlpTask(local(), singleton(Pipeline.Type.EMAIL));

    // Expected properties : the defaults overridden with the option sent in the request body.
    HashMap<String, String> expectedProperties = getDefaultProperties();
    expectedProperties.put("waitForNlpApp", "false");
    verify(taskFactory).createNlpTask(eq(local()), capturedPipeline.capture(), eq(new PropertiesProvider(expectedProperties).getProperties()), any());

    Pipeline.Type usedPipelineType = capturedPipeline.getValue().getType();
    assertThat(usedPipelineType).isEqualTo(Pipeline.Type.EMAIL);
}
Also used : Routes(net.codestory.http.routes.Routes) AbstractProdWebServerTest(org.icij.datashare.web.testhelpers.AbstractProdWebServerTest) CommonMode(org.icij.datashare.mode.CommonMode) NlpApp(org.icij.datashare.nlp.NlpApp) HashMap(java.util.HashMap) AbstractModels(org.icij.datashare.text.nlp.AbstractModels) RestAssert(net.codestory.rest.RestAssert) ArgumentCaptor(org.mockito.ArgumentCaptor) Collections.singleton(java.util.Collections.singleton) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) EmailPipeline(org.icij.datashare.nlp.EmailPipeline) Matchers.eq(org.mockito.Matchers.eq) Filter(net.codestory.http.filters.Filter) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) BatchDownload(org.icij.datashare.batch.BatchDownload) DatashareTimeRule(org.icij.datashare.test.DatashareTimeRule) Path(java.nio.file.Path) Pipeline(org.icij.datashare.text.nlp.Pipeline) org.icij.datashare.tasks(org.icij.datashare.tasks) Properties(java.util.Properties) PropertiesProvider(org.icij.datashare.PropertiesProvider) Indexer(org.icij.datashare.text.indexing.Indexer) String.format(java.lang.String.format) PipelineRegistry(org.icij.datashare.extension.PipelineRegistry) ShouldChain(net.codestory.rest.ShouldChain) Mockito(org.mockito.Mockito) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) Paths(java.nio.file.Paths) LocalUserFilter(org.icij.datashare.session.LocalUserFilter) DatashareUser.local(org.icij.datashare.session.DatashareUser.local) org.junit(org.junit) NotNull(org.jetbrains.annotations.NotNull) MapAssert.entry(org.fest.assertions.MapAssert.entry) SECONDS(java.util.concurrent.TimeUnit.SECONDS) PropertiesProvider(org.icij.datashare.PropertiesProvider) RestAssert(net.codestory.rest.RestAssert) EmailPipeline(org.icij.datashare.nlp.EmailPipeline) Pipeline(org.icij.datashare.text.nlp.Pipeline) AbstractProdWebServerTest(org.icij.datashare.web.testhelpers.AbstractProdWebServerTest)

Aggregations

Pipeline (org.icij.datashare.text.nlp.Pipeline)5 Properties (java.util.Properties)2 RestAssert (net.codestory.rest.RestAssert)2 PipelineRegistry (org.icij.datashare.extension.PipelineRegistry)2 EmailPipeline (org.icij.datashare.nlp.EmailPipeline)2 Indexer (org.icij.datashare.text.indexing.Indexer)2 User (org.icij.datashare.user.User)2 AbstractProdWebServerTest (org.icij.datashare.web.testhelpers.AbstractProdWebServerTest)2 String.format (java.lang.String.format)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 Collections.singleton (java.util.Collections.singleton)1 HashMap (java.util.HashMap)1 List (java.util.List)1 SECONDS (java.util.concurrent.TimeUnit.SECONDS)1 Collectors.toList (java.util.stream.Collectors.toList)1 Post (net.codestory.http.annotations.Post)1 Filter (net.codestory.http.filters.Filter)1 Routes (net.codestory.http.routes.Routes)1 ShouldChain (net.codestory.rest.ShouldChain)1