Example usage of org.icij.datashare.text.indexing.Indexer in the datashare project by ICIJ.
Taken from the class NamedEntityResourceTest, method test_hide_named_entity_when_success.
@Test
public void test_hide_named_entity_when_success() throws IOException {
    // A visible (non-hidden) entity indexed under "docId" that the endpoint must flip to hidden.
    NamedEntity entityToHide = create(PERSON, "to_update", asList(123L), "docId", "root", CORENLP, FRENCH);
    assertThat(entityToHide.isHidden()).isFalse();

    // Stub the searcher chain so the resource finds exactly this one entity:
    // indexer.search(...) -> searcher.thatMatchesFieldValue(...) -> searcher.execute().
    Indexer.Searcher mockSearcher = mock(Indexer.Searcher.class);
    doReturn(mockSearcher).when(indexer).search("index", NamedEntity.class);
    doReturn(mockSearcher).when(mockSearcher).thatMatchesFieldValue(any(), any());
    doReturn(Stream.of(entityToHide)).when(mockSearcher).execute();

    // Hiding the entity over HTTP succeeds and persists the change through a bulk update.
    put("/api/index/namedEntities/hide/to_update").should().respond(200);
    verify(indexer).bulkUpdate("index", singletonList(entityToHide));
}
Example usage of org.icij.datashare.text.indexing.Indexer in the datashare project by ICIJ.
Taken from the class NlpAppTest, method runNlpApp.
/**
 * Starts an {@link NlpApp} on the shared executor and waits for it to signal readiness.
 *
 * @param parallelism           value for the NLP parallelism option
 * @param nlpProcessDelayMillis artificial latency (in ms) injected into each mocked
 *                              pipeline.process() call; 0 means no delay
 * @return the running app, ready to consume messages
 * @throws InterruptedException if the wait for readiness is interrupted
 * @throws IllegalStateException if the app does not become ready within 2 seconds
 */
private NlpApp runNlpApp(String parallelism, int nlpProcessDelayMillis) throws InterruptedException {
    Properties properties = new Properties();
    properties.setProperty(NLP_PARALLELISM_OPT, parallelism);
    properties.setProperty("messageBusAddress", "redis");
    CountDownLatch latch = new CountDownLatch(1);
    // Mocked pipeline: optionally sleeps to simulate slow NLP work, extracts nothing.
    when(pipeline.process(any())).thenAnswer((Answer<List<NamedEntity>>) invocationOnMock -> {
        if (nlpProcessDelayMillis > 0) {
            Thread.sleep(nlpProcessDelayMillis);
        }
        return emptyList();
    });
    NlpApp nlpApp = new NlpApp(dataBus, indexer, pipeline, properties, latch::countDown, 1, true, local());
    executor.execute(nlpApp);
    // Fail fast on timeout: the original ignored await()'s boolean result, so a
    // startup timeout silently returned an app that was not yet subscribed.
    if (!latch.await(2, SECONDS)) {
        throw new IllegalStateException("NlpApp did not become ready within 2 seconds");
    }
    return nlpApp;
}
Example usage of org.icij.datashare.text.indexing.Indexer in the datashare project by ICIJ.
Taken from the class BatchSearchRunnerTest, method test_run_batch_search_truncate_to_60k_max_results.
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    // Build one full scroll page of documents, then have the mocked search serve
    // enough pages to overflow the cap (MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1).
    Document[] scrollPage = IntStream.range(0, MAX_SCROLL_SIZE)
            .mapToObj(i -> createDoc("doc" + i).build())
            .toArray(Document[]::new);
    int nbPages = MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1;
    mockSearch.willReturn(nbPages, scrollPage);

    BatchSearch batchSearch = new BatchSearch("uuid1", project("test-datashare"), "name", "desc", asSet("query"), new Date(), BatchSearch.State.QUEUED, local());

    // The runner must stop collecting before reaching 60k results
    // (60000 presumably equals MAX_BATCH_RESULT_SIZE — confirm against the constant).
    assertThat(new BatchSearchRunner(indexer, new PropertiesProvider(), batchSearch, resultConsumer).call()).isLessThan(60000);
}
Example usage of org.icij.datashare.text.indexing.Indexer in the datashare project by ICIJ.
Taken from the class CliApp, method runTaskRunner.
/**
 * Entry point for CLI task execution: handles one-shot commands (index creation,
 * API-key management) that exit the JVM, then runs the configured pipeline stages
 * to completion and closes the indexer.
 *
 * @param injector   Guice injector providing task/indexer/queue instances
 * @param properties parsed CLI options
 * @throws Exception if any stage or one-shot command fails
 */
private static void runTaskRunner(Injector injector, Properties properties) throws Exception {
    TaskManagerMemory taskManager = injector.getInstance(TaskManagerMemory.class);
    TaskFactory taskFactory = injector.getInstance(TaskFactory.class);
    Set<Pipeline.Type> nlpPipelines = parseAll(properties.getProperty(DatashareCliOptions.NLP_PIPELINES_OPT));
    Indexer indexer = injector.getInstance(Indexer.class);

    // One-shot commands: each of these calls System.exit(0) when its option is set.
    exitIfNothingToResume(properties, nlpPipelines, indexer);
    handleIndexCreation(properties, indexer);
    handleApiKeyCommands(properties, taskFactory);

    // try/finally so the indexer is released even when a stage fails
    // (the original leaked it on any exception).
    try {
        startPipelineStages(injector, properties, taskManager, taskFactory, nlpPipelines);
        taskManager.shutdownAndAwaitTermination(Integer.MAX_VALUE, SECONDS);
    } finally {
        indexer.close();
    }
}

/** Exits (status 0) when --resume is requested but both the queue and the index have nothing left to process. */
private static void exitIfNothingToResume(Properties properties, Set<Pipeline.Type> nlpPipelines, Indexer indexer) throws Exception {
    if (!resume(properties)) {
        return;
    }
    RedisUserDocumentQueue queue = new RedisUserDocumentQueue(nullUser(), new PropertiesProvider(properties));
    boolean queueIsEmpty = queue.isEmpty();
    queue.close();
    // Documents still missing at least one of the requested NLP pipelines.
    long remainingDocuments = indexer.search(properties.getProperty("defaultProject"), Document.class)
            .withSource(false)
            .without(nlpPipelines.toArray(new Pipeline.Type[] {}))
            .execute()
            .count();
    if (remainingDocuments == 0 && queueIsEmpty) {
        logger.info("nothing to resume, exiting normally");
        System.exit(0);
    }
}

/** Creates the index named by the create-index option then exits (status 0); no-op when the option is absent. */
private static void handleIndexCreation(Properties properties, Indexer indexer) throws Exception {
    String indexName = properties.getProperty(CREATE_INDEX_OPT);
    if (indexName != null) {
        indexer.createIndex(indexName);
        System.exit(0);
    }
}

/** Handles the create/get/delete API-key options; each matching option runs its command then exits (status 0). */
private static void handleApiKeyCommands(Properties properties, TaskFactory taskFactory) throws Exception {
    String createKeyUser = properties.getProperty(CRE_API_KEY_OPT);
    if (createKeyUser != null) {
        String secretKey = taskFactory.createGenApiKey(localUser(createKeyUser)).call();
        logger.info("generated secret key for user {} (store it somewhere safe, datashare cannot retrieve it later): {}", createKeyUser, secretKey);
        System.exit(0);
    }
    String getKeyUser = properties.getProperty(GET_API_KEY_OPT);
    if (getKeyUser != null) {
        String hashedKey = taskFactory.createGetApiKey(localUser(getKeyUser)).call();
        if (hashedKey == null) {
            logger.info("no user {} exists", getKeyUser);
        } else {
            logger.info("hashed key for user {} is {}", getKeyUser, hashedKey);
        }
        System.exit(0);
    }
    String delKeyUser = properties.getProperty(DEL_API_KEY_OPT);
    if (delKeyUser != null) {
        taskFactory.createDelApiKey(localUser(delKeyUser)).call();
        System.exit(0);
    }
}

/** Starts every configured pipeline stage (deduplicate, scan-index, scan, index, NLP) on the task manager. */
private static void startPipelineStages(Injector injector, Properties properties, TaskManagerMemory taskManager,
                                        TaskFactory taskFactory, Set<Pipeline.Type> nlpPipelines) throws Exception {
    PipelineHelper pipeline = new PipelineHelper(new PropertiesProvider(properties));
    if (pipeline.has(DatashareCli.Stage.DEDUPLICATE)) {
        taskManager.startTask(taskFactory.createDeduplicateTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.DEDUPLICATE)));
    }
    if (pipeline.has(DatashareCli.Stage.SCANIDX)) {
        TaskView<Long> taskView = taskManager.startTask(taskFactory.createScanIndexTask(nullUser(), ofNullable(properties.getProperty(MAP_NAME_OPTION)).orElse("extract:report")));
        logger.info("scanned {}", taskView.getResult(true));
    }
    // SCAN is skipped on resume: the queue already holds the paths from the previous run.
    if (pipeline.has(DatashareCli.Stage.SCAN) && !resume(properties)) {
        taskManager.startTask(taskFactory.createScanTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.SCAN), Paths.get(properties.getProperty(DatashareCliOptions.DATA_DIR_OPT)), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.INDEX)) {
        taskManager.startTask(taskFactory.createIndexTask(nullUser(), pipeline.getQueueNameFor(DatashareCli.Stage.INDEX), properties), () -> closeAndLogException(injector.getInstance(DocumentQueue.class)).run());
    }
    if (pipeline.has(DatashareCli.Stage.NLP)) {
        for (Pipeline.Type nlp : nlpPipelines) {
            Pipeline pipelineClass = injector.getInstance(PipelineRegistry.class).get(nlp);
            taskManager.startTask(taskFactory.createNlpTask(nullUser(), pipelineClass));
        }
        if (resume(properties)) {
            taskManager.startTask(taskFactory.createResumeNlpTask(nullUser(), nlpPipelines));
        }
    }
}
Example usage of org.icij.datashare.text.indexing.Indexer in the datashare project by ICIJ.
Taken from the class DocumentIngestor, method main.
/**
 * Benchmark/ingestion entry point: generates {@code nbDocuments} fake documents and
 * indexes them into Elasticsearch with {@code nbThreads} consumers, {@code bulkSize}
 * documents per bulk request.
 *
 * @param args CLI options: -t (threads), bulkSize, elasticsearchAddress, indexName, nbDocuments
 */
public static void main(String[] args) {
    OptionSet optionSet = parseArgs(args);
    Integer nbThreads = (Integer) optionSet.valueOf("t");
    Integer bulkSize = (Integer) optionSet.valueOf("bulkSize");
    String elasticsearchUrl = (String) optionSet.valueOf("elasticsearchAddress");
    String indexName = (String) optionSet.valueOf("indexName");
    Integer nbDocuments = (Integer) optionSet.valueOf("nbDocuments");
    // singletonMap replaces the double-brace HashMap initializer: the anonymous
    // subclass idiom creates a needless class holding an enclosing-context reference.
    PropertiesProvider propertiesProvider = new PropertiesProvider(
            java.util.Collections.singletonMap(INDEX_ADDRESS_PROP, elasticsearchUrl));
    Indexer indexer = new ElasticsearchIndexer(ElasticsearchConfiguration.createESClient(propertiesProvider), propertiesProvider);
    logger.info("ingest {} documents in elasticsearch {} with bulk of {} and {} threads", nbDocuments, elasticsearchUrl, bulkSize, nbThreads);
    executorService = Executors.newFixedThreadPool(nbThreads);
    // The producer fills the shared queue synchronously before the consumers start.
    new DocumentProducer(nbDocuments, bulkSize).run();
    IntStream.range(0, nbThreads).forEach(n -> executorService.submit(new DocumentConsumer(indexer, indexName)));
    executorService.shutdown();
    // NOTE(review): shutdown() is not awaited and the indexer is never closed; the JVM
    // stays alive until the (non-daemon) pool threads finish — confirm this is intended.
}
Aggregations