Use of org.icij.datashare.com.Message in project datashare by ICIJ.
From the class IndexTask, method call():
@Override
public Long call() throws Exception {
    logger.info("Processing up to {} file(s) in parallel", parallelism);
    totalToProcess = drainer.drain(POISON).get();
    drainer.shutdown();
    // drain is finished
    drainer.awaitTermination(10, SECONDS);
    logger.info("drained {} documents. Waiting for consumer to shutdown", totalToProcess);
    publisher.publish(Channel.NLP, new Message(INIT_MONITORING).add(VALUE, valueOf(totalToProcess)));
    consumer.shutdown();
    // documents may still be being processed
    try {
        while (!consumer.awaitTermination(30, MINUTES)) {
            logger.info("Consumer has not terminated yet.");
        }
    } catch (InterruptedException iex) {
        logger.info("Got InterruptedException while waiting for the consumer shutdown.");
    }
    publisher.publish(Channel.NLP, new ShutdownMessage());
    if (consumer.getReporter() != null) consumer.getReporter().close();
    queue.close();
    logger.info("exiting");
    return totalToProcess;
}
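For context, a minimal sketch of how such a monitoring message could be built and published outside IndexTask, using the fluent Message API and the publish(Channel, Message) call shape seen above. The MonitoringNotifier class and its nested Publisher interface are hypothetical, standing in for whatever type the publisher field has in datashare:

// Hypothetical helper, not part of datashare.
class MonitoringNotifier {
    // stand-in for the publisher type used in the snippets above (assumption)
    interface Publisher { void publish(Channel channel, Message message); }

    private final Publisher publisher;

    MonitoringNotifier(Publisher publisher) {
        this.publisher = publisher;
    }

    // same message shape as IndexTask above: type INIT_MONITORING, a VALUE field
    void announce(long totalToProcess) {
        publisher.publish(Channel.NLP,
                new Message(Message.Type.INIT_MONITORING)
                        .add(Message.Field.VALUE, String.valueOf(totalToProcess)));
    }
}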
Use of org.icij.datashare.com.Message in project datashare by ICIJ.
From the class ResumeNlpTask, method call():
@Override
public Long call() throws IOException {
    Indexer.Searcher searcher = indexer.search(projectName, Document.class)
            .withSource("rootDocument")
            .without(nlpPipelines.toArray(new Pipeline.Type[] {}));
    logger.info("resuming NLP name finding for index {} and {} : {} documents found", projectName, nlpPipelines, searcher.totalHits());
    List<? extends Entity> docsToProcess = searcher.scroll().collect(toList());
    long totalHits = searcher.totalHits();
    this.publisher.publish(Channel.NLP, new Message(Message.Type.INIT_MONITORING).add(Message.Field.VALUE, valueOf(totalHits)));
    do {
        docsToProcess.forEach(doc -> this.publisher.publish(Channel.NLP,
                new Message(Message.Type.EXTRACT_NLP)
                        .add(Message.Field.INDEX_NAME, projectName)
                        .add(Message.Field.DOC_ID, doc.getId())
                        .add(Message.Field.R_ID, ((Document) doc).getRootDocument())));
        docsToProcess = searcher.scroll().collect(toList());
    } while (!docsToProcess.isEmpty());
    logger.info("sent {} message for {} files without {} pipeline tags", Message.Type.EXTRACT_NLP, totalHits, nlpPipelines);
    searcher.clearScroll();
    this.publisher.publish(Channel.NLP, new ShutdownMessage());
    return totalHits;
}
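The scroll-until-empty loop above generalizes. A sketch of the same pattern factored into a helper, assuming only the Searcher API shown in the snippet (scroll() returning a stream, clearScroll() releasing the server-side scroll context) and the static toList import; the helper name is hypothetical:

// Hypothetical helper: apply an action to every scrolled hit, then clear the scroll.
static long forEachScrolled(Indexer.Searcher searcher, java.util.function.Consumer<? super Entity> action) throws IOException {
    long seen = 0;
    List<? extends Entity> batch = searcher.scroll().collect(toList());
    while (!batch.isEmpty()) {
        batch.forEach(action);
        seen += batch.size();
        batch = searcher.scroll().collect(toList()); // next page of results
    }
    searcher.clearScroll(); // release the scroll context once iteration is done
    return seen;
}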
Use of org.icij.datashare.com.Message in project datashare by ICIJ.
From the class ElasticsearchSpewer, method writeDocument():
@Override
protected void writeDocument(TikaDocument doc, TikaDocument parent, TikaDocument root, int level) throws IOException {
    final IndexRequest req = prepareRequest(doc, parent, root, level);
    long before = currentTimeMillis();
    IndexResponse indexResponse = client.index(req, RequestOptions.DEFAULT);
    logger.info("{} {} added to elasticsearch in {}ms: {}", parent == null ? "Document" : "Child",
            shorten(indexResponse.getId(), 4), currentTimeMillis() - before, doc);
    synchronized (publisher) {
        // jedis instance is not thread safe and Spewer is shared in DocumentConsumer threads
        publisher.publish(NLP, new Message(EXTRACT_NLP)
                .add(Message.Field.INDEX_NAME, indexName)
                .add(Message.Field.DOC_ID, indexResponse.getId())
                .add(Message.Field.R_ID, parent == null ? doc.getId() : root.getId()));
    }
}
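Synchronizing on the publisher works, but it relies on every caller remembering to take the lock. One alternative is to serialize access once at a wrapper level; a sketch, where SynchronizedPublisher is a hypothetical class and Publisher again stands in for whatever type the publisher field has above:

// Hypothetical wrapper, not part of datashare: serializes calls to a
// non-thread-safe publisher (e.g. one backed by a single jedis connection)
// so callers no longer need their own synchronized blocks.
class SynchronizedPublisher {
    interface Publisher { void publish(Channel channel, Message message); } // stand-in type

    private final Publisher delegate;

    SynchronizedPublisher(Publisher delegate) {
        this.delegate = delegate;
    }

    synchronized void publish(Channel channel, Message message) {
        delegate.publish(channel, message); // one thread at a time reaches the delegate
    }
}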
Use of org.icij.datashare.com.Message in project datashare by ICIJ.
From the class NlpConsumer, method call():
@Override
public Integer call() {
    boolean exitAsked = false;
    int nbMessages = 0;
    while (!exitAsked) {
        try {
            Message message = messageQueue.poll(30, TimeUnit.SECONDS);
            if (message != null) {
                switch (message.type) {
                    case EXTRACT_NLP:
                        findNamedEntities(message.content.get(INDEX_NAME), message.content.get(DOC_ID), message.content.get(R_ID));
                        nbMessages++;
                        break;
                    case SHUTDOWN:
                        exitAsked = true;
                        break;
                    default:
                        logger.info("ignore {}", message);
                }
                synchronized (messageQueue) {
                    if (messageQueue.isEmpty()) {
                        logger.debug("queue is empty notifying messageQueue {}", messageQueue.hashCode());
                        messageQueue.notify();
                    }
                }
            }
        } catch (Throwable e) {
            logger.warn("error in consumer main loop", e);
        }
    }
    logger.info("exiting main loop");
    return nbMessages;
}
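The notify() above only makes sense with a matching waiter on the queue's monitor. A sketch of the other side of that handshake, blocking until the consumer has drained the queue; the helper is hypothetical and its timeout handling is deliberately simple:

// Hypothetical helper: waits on the monitor that NlpConsumer notifies
// whenever it observes an empty queue.
static void awaitQueueDrained(BlockingQueue<Message> messageQueue, long timeoutMs) throws InterruptedException {
    long deadline = System.currentTimeMillis() + timeoutMs;
    synchronized (messageQueue) {
        while (!messageQueue.isEmpty()) {
            long remaining = deadline - System.currentTimeMillis();
            if (remaining <= 0) {
                return; // timed out; the caller decides how to handle leftovers
            }
            messageQueue.wait(remaining); // woken by the consumer's notify()
        }
    }
}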
Use of org.icij.datashare.com.Message in project datashare by ICIJ.
From the class ElasticsearchSpewerTest, method test_simple_write():
@Test
public void test_simple_write() throws Exception {
    final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("test-file.txt"));
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
    document.setReader(reader);
    spewer.write(document);
    GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.isExists()).isTrue();
    assertThat(documentFields.getId()).isEqualTo(document.getId());
    assertEquals(new HashMap<String, String>() {
        {
            put("name", "Document");
        }
    }, documentFields.getSourceAsMap().get("join"));
    ArgumentCaptor<Message> argument = ArgumentCaptor.forClass(Message.class);
    verify(publisher).publish(eq(Channel.NLP), argument.capture());
    assertThat(argument.getValue().content).includes(entry(Field.DOC_ID, document.getId()));
}
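The captured message could be checked further, for instance asserting its type, assuming the type field accessed as message.type in NlpConsumer above is reachable from the test:

// Optional extra assertion on the same captor (a sketch, not from the original test):
assertThat(argument.getValue().type).isEqualTo(Message.Type.EXTRACT_NLP);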