use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.
The class TaskResource, method extractNlp:
/**
 * Find names using the given pipeline :
 *
 * - OPENNLP
 * - CORENLP
 * - IXAPIPE
 * - GATENLP
 * - MITIE
 *
 * This endpoint is going to find all Documents that are not tagged with the given pipeline,
 * and extract named entities for all these documents.
 *
 * @param pipelineName name of the NLP pipeline to run (parsed via Pipeline.Type.parse)
 * @param optionsWrapper request options that override the server-side properties
 * @return 200 and the list of created tasks
 *
 * Example :
 * $(curl -XPOST http://dsenv:8080/api/task/findNames/CORENLP -d {})
 */
@Post("/findNames/:pipeline")
public List<TaskView<?>> extractNlp(final String pipelineName, final OptionsWrapper<String> optionsWrapper, Context context) {
    // request body options take precedence over the server's configured properties
    Properties mergedProps = propertiesProvider.createOverriddenWith(optionsWrapper.getOptions());
    syncModels(parseBoolean(mergedProps.getProperty("syncModels", "true")));
    Pipeline pipeline = pipelineRegistry.get(Pipeline.Type.parse(pipelineName));
    TaskView<Void> nlpTask = createNlpApp(context, mergedProps, pipeline);
    if (parseBoolean(mergedProps.getProperty("resume", "true"))) {
        // Collections.singleton replaces the former double-brace HashSet initialization,
        // which created a needless anonymous subclass retaining the enclosing instance.
        TaskView<Long> resumeNlpTask = taskManager.startTask(taskFactory.createResumeNlpTask(
                (User) context.currentUser(),
                java.util.Collections.singleton(Pipeline.Type.parse(pipelineName))));
        return asList(resumeNlpTask, nlpTask);
    }
    return singletonList(nlpTask);
}
use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.
The class TaskResourceTest, method test_findNames_with_options_should_merge_with_property_provider:
@Test
public void test_findNames_with_options_should_merge_with_property_provider() {
    // POST with extra body options: they must be merged over the server-side properties
    RestAssert response = post("/api/task/findNames/EMAIL", "{\"options\":{\"waitForNlpApp\": false, \"key\":\"val\",\"foo\":\"loo\"}}");
    response.should().haveType("application/json");
    verify(taskFactory).createResumeNlpTask(local(), singleton(Pipeline.Type.EMAIL));
    ArgumentCaptor<Properties> capturedProperties = ArgumentCaptor.forClass(Properties.class);
    ArgumentCaptor<Pipeline> capturedPipeline = ArgumentCaptor.forClass(Pipeline.class);
    verify(taskFactory).createNlpTask(eq(local()), capturedPipeline.capture(), capturedProperties.capture(), any());
    // the nlp task received the requested pipeline and the merged properties
    assertThat(capturedPipeline.getValue().getType()).isEqualTo(Pipeline.Type.EMAIL);
    assertThat(capturedProperties.getValue()).includes(entry("key", "val"), entry("foo", "loo"));
}
use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.
The class NerResource, method getAnnotations:
/**
 * When datashare is launched in NER mode (without index) it exposes a name finding HTTP API. The text is sent with the HTTP body.
 *
 * @param pipeline to use
 * @param text to analyse in the request body
 * @return list of NamedEntities annotations, empty if the pipeline could not be initialized for the guessed language
 *
 * Example :
 * $(curl -XPOST http://dsenv:8080/api/ner/findNames/CORENLP -d "Please find attached a PDF copy of the advance tax clearance obtained for our client John Doe.")
 */
@Post("/findNames/:pipeline")
public List<NamedEntity> getAnnotations(final String pipeline, String text) throws Exception {
    // parameterized SLF4J logging instead of an eager String.valueOf() conversion
    LoggerFactory.getLogger(getClass()).info("{}", getClass().getClassLoader());
    Pipeline p = pipelineRegistry.get(Pipeline.Type.parse(pipeline));
    Language language = languageGuesser.guess(text);
    if (p.initialize(language)) {
        return p.process(DocumentBuilder.createDoc("inline").with(text).with(language).build());
    }
    // pipeline has no model / support for the guessed language: nothing to annotate
    return emptyList();
}
use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.
The class CliApp, method runTaskRunner:
/**
 * Runs the CLI pipeline stages (deduplicate, scan, index, nlp...) as in-memory tasks.
 * Several option branches (index creation, API key management) perform their action
 * and exit the JVM immediately; statement order below is therefore significant.
 *
 * @param injector Guice injector providing task manager, factory and indexer
 * @param properties parsed CLI properties
 * @throws Exception if any task creation or call fails
 */
private static void runTaskRunner(Injector injector, Properties properties) throws Exception {
    TaskManagerMemory taskManager = injector.getInstance(TaskManagerMemory.class);
    TaskFactory taskFactory = injector.getInstance(TaskFactory.class);
    Set<Pipeline.Type> nlpPipelines = parseAll(properties.getProperty(DatashareCliOptions.NLP_PIPELINES_OPT));
    Indexer indexer = injector.getInstance(Indexer.class);
    if (resume(properties)) {
        RedisUserDocumentQueue queue = new RedisUserDocumentQueue(nullUser(), new PropertiesProvider(properties));
        boolean queueIsEmpty = queue.isEmpty();
        queue.close();
        // nothing left to index and nothing left to tag: resuming would be a no-op
        if (indexer.search(properties.getProperty("defaultProject"), Document.class).withSource(false).without(nlpPipelines.toArray(new Pipeline.Type[0])).execute().count() == 0 && queueIsEmpty) {
            logger.info("nothing to resume, exiting normally");
            System.exit(0);
        }
    }
    // one-shot administrative options: do the work then exit
    if (properties.getProperty(CREATE_INDEX_OPT) != null) {
        indexer.createIndex(properties.getProperty(CREATE_INDEX_OPT));
        System.exit(0);
    }
    if (properties.getProperty(CRE_API_KEY_OPT) != null) {
        String userName = properties.getProperty(CRE_API_KEY_OPT);
        String secretKey = taskFactory.createGenApiKey(localUser(userName)).call();
        logger.info("generated secret key for user {} (store it somewhere safe, datashare cannot retrieve it later): {}", userName, secretKey);
        System.exit(0);
    }
    if (properties.getProperty(GET_API_KEY_OPT) != null) {
        String userName = properties.getProperty(GET_API_KEY_OPT);
        String hashedKey = taskFactory.createGetApiKey(localUser(userName)).call();
        if (hashedKey == null) {
            logger.info("no user {} exists", userName);
        } else {
            logger.info("hashed key for user {} is {}", userName, hashedKey);
        }
        System.exit(0);
    }
    if (properties.getProperty(DEL_API_KEY_OPT) != null) {
        String userName = properties.getProperty(DEL_API_KEY_OPT);
        taskFactory.createDelApiKey(localUser(userName)).call();
        System.exit(0);
    }
    PipelineHelper pipeline = new PipelineHelper(new PropertiesProvider(properties));
    if (pipeline.has(DatashareCli.Stage.DEDUPLICATE)) {
        taskManager.startTask(taskFactory.createDeduplicateTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.DEDUPLICATE)));
    }
    if (pipeline.has(DatashareCli.Stage.SCANIDX)) {
        TaskView<Long> taskView = taskManager.startTask(taskFactory.createScanIndexTask(nullUser(), ofNullable(properties.getProperty(MAP_NAME_OPTION)).orElse("extract:report")));
        logger.info("scanned {}", taskView.getResult(true));
    }
    // a resume run must not rescan the data dir: the queue already holds pending documents
    if (pipeline.has(DatashareCli.Stage.SCAN) && !resume(properties)) {
        taskManager.startTask(taskFactory.createScanTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.SCAN), Paths.get(properties.getProperty(DatashareCliOptions.DATA_DIR_OPT)), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.INDEX)) {
        taskManager.startTask(taskFactory.createIndexTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.INDEX), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.NLP)) {
        for (Pipeline.Type nlp : nlpPipelines) {
            Pipeline pipelineClass = injector.getInstance(PipelineRegistry.class).get(nlp);
            taskManager.startTask(taskFactory.createNlpTask(nullUser(), pipelineClass));
        }
        if (resume(properties)) {
            taskManager.startTask(taskFactory.createResumeNlpTask(nullUser(), nlpPipelines));
        }
    }
    // block until every queued task has finished, then release the index client
    taskManager.shutdownAndAwaitTermination(Integer.MAX_VALUE, SECONDS);
    indexer.close();
}
use of org.icij.datashare.text.nlp.Pipeline in project datashare by ICIJ.
The class TaskResourceTest, method test_findNames_should_create_resume:
@Test
public void test_findNames_should_create_resume() {
    RestAssert response = post("/api/task/findNames/EMAIL", "{\"options\":{\"waitForNlpApp\": false}}");
    response.should().haveType("application/json");
    // exactly two tasks must have been started: the resume task and the nlp task
    List<String> startedTasks = taskManager.waitTasksToBeDone(1, SECONDS).stream().map(t -> t.name).collect(toList());
    assertThat(startedTasks.size()).isEqualTo(2);
    verify(taskFactory).createResumeNlpTask(local(), singleton(Pipeline.Type.EMAIL));
    HashMap<String, String> expectedProperties = getDefaultProperties();
    expectedProperties.put("waitForNlpApp", "false");
    ArgumentCaptor<Pipeline> capturedPipeline = ArgumentCaptor.forClass(Pipeline.class);
    verify(taskFactory).createNlpTask(eq(local()), capturedPipeline.capture(), eq(new PropertiesProvider(expectedProperties).getProperties()), any());
    assertThat(capturedPipeline.getValue().getType()).isEqualTo(Pipeline.Type.EMAIL);
}
Aggregations