Use of org.icij.datashare.batch.BatchDownload in the datashare project by ICIJ.
From class TaskResource, method batchDownload.
/**
 * Download files from a search query. Expected parameters are:
 *
 * * project: string
 * * query: string or elasticsearch JSON query
 *
 * If the query is a string it is taken as an ES query string, else it is a raw JSON query (without the query part).
 * @see org.elasticsearch.index.query.WrapperQueryBuilder that is used to wrap the query
 *
 * @param optionsWrapper wrapper for options json
 * @param context request context, used to resolve the application folder and the current user
 *
 * @return 200 and json task
 *
 * @throws JsonProcessingException if a JSON (map) query cannot be serialized to a string
 *
 * Example :
 * $(curl -XPOST -H 'Content-Type: application/json' localhost:8080/api/task/batchDownload -d '{"options": {"project":"genapi-datashare", "query": "*" }}')
 */
@Post("/batchDownload")
public TaskView<File> batchDownload(final OptionsWrapper<Object> optionsWrapper, Context context) throws JsonProcessingException {
    Map<String, Object> options = optionsWrapper.getOptions();

    // the download is prepared in the "tmp" sub-folder of the application folder, created on demand
    Path tmpPath = get(context.env().appFolder(), "tmp");
    if (!tmpPath.toFile().exists()) {
        tmpPath.toFile().mkdirs();
    }

    // a map-shaped query is re-serialized to its JSON text, anything else is expected to already be a string
    Object rawQuery = options.get("query");
    String query = rawQuery instanceof Map ? JsonObjectMapper.MAPPER.writeValueAsString(rawQuery) : (String) rawQuery;

    boolean batchDownloadEncrypt = parseBoolean(propertiesProvider.get("batchDownloadEncrypt").orElse("false"));
    BatchDownload batchDownload = new BatchDownload(project((String) options.get("project")), (User) context.currentUser(), query, tmpPath, batchDownloadEncrypt);
    BatchDownloadRunner downloadTask = taskFactory.createDownloadRunner(batchDownload, v -> null);

    // plain HashMap instead of double-brace initialization: the anonymous subclass would
    // keep a hidden reference to this resource instance for as long as the task properties live
    HashMap<String, Object> taskProperties = new HashMap<>();
    taskProperties.put("batchDownload", batchDownload);
    return taskManager.startTask(downloadTask, taskProperties);
}
Use of org.icij.datashare.batch.BatchDownload in the datashare project by ICIJ.
From class BatchDownloadRunnerIntTest, method test_two_results_two_dirs.
/**
 * Indexes two text files located in two different directories, runs a batch download
 * matching every document ("*") and checks that the produced zip holds both files.
 */
@Test
public void test_two_results_two_dirs() throws Exception {
    File doc1 = new IndexerHelper(es.client).indexFile("dir1/doc1.txt", "The quick brown fox jumps over the lazy dog", fs);
    File doc2 = new IndexerHelper(es.client).indexFile("dir2/doc2.txt", "Portez ce vieux whisky au juge blond qui fume", fs);
    BatchDownload bd = createBatchDownload("*");

    new BatchDownloadRunner(indexer, createProvider(), bd, updateCallback).call();

    assertThat(bd.filename.toFile()).isFile();
    // open the archive once and close it afterwards so the test does not leak file handles
    try (ZipFile zip = new ZipFile(bd.filename.toFile())) {
        assertThat(zip.size()).isEqualTo(2);
        // entries are looked up by the file path minus its first character
        // (presumably the leading '/' of an absolute path -- confirm against the runner)
        assertThat(zip.getEntry(doc1.toString().substring(1))).isNotNull();
        assertThat(zip.getEntry(doc2.toString().substring(1))).isNotNull();
    }
}
Use of org.icij.datashare.batch.BatchDownload in the datashare project by ICIJ.
From class BatchDownloadRunnerIntTest, method test_embedded_doc_should_not_interrupt_zip_creation.
/**
 * Indexes an e-mail file through the embedded-document helper, runs a batch download
 * matching every document ("*") and checks that the zip is still produced with
 * two entries, one of them being the indexed file itself.
 */
@Test
public void test_embedded_doc_should_not_interrupt_zip_creation() throws Exception {
    File file = new IndexerHelper(es.client).indexEmbeddedFile(TEST_INDEX, "/docs/embedded_doc.eml");
    BatchDownload batchDownload = createBatchDownload("*");

    new BatchDownloadRunner(indexer, createProvider(), batchDownload, updateCallback).call();

    // open the archive once and close it afterwards so the test does not leak file handles
    try (ZipFile zip = new ZipFile(batchDownload.filename.toFile())) {
        assertThat(zip.size()).isEqualTo(2);
        // the entry is looked up by the file path minus its first character
        // (presumably the leading '/' of an absolute path -- confirm against the runner)
        assertThat(zip.getEntry(file.toString().substring(1))).isNotNull();
    }
}
Use of org.icij.datashare.batch.BatchDownload in the datashare project by ICIJ.
From class CommonMode, method configure.
/**
 * Declares the injection bindings shared by the application modes: properties,
 * language guesser, the batch search / batch download queues, the Elasticsearch
 * client and indexer, task management, document collections, the data bus and
 * the pipeline registry.
 *
 * The "queueType" and "busType" properties select the in-memory implementations
 * when set to "memory"; any other value falls back to the Redis-based ones.
 */
@Override
protected void configure() {
    bind(PropertiesProvider.class).toInstance(propertiesProvider);
    bind(LanguageGuesser.class).to(OptimaizeLanguageGuesser.class);

    // both batch queues use the implementation class named by "batchQueueType"
    final String queueImplementation = propertiesProvider.get("batchQueueType").orElse("org.icij.datashare.extract.MemoryBlockingQueue");
    bind(new TypeLiteral<BlockingQueue<String>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueImplementation, "ds:batchsearch:queue"));
    bind(new TypeLiteral<BlockingQueue<BatchDownload>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueImplementation, "ds:batchdownload:queue"));

    // Elasticsearch client and the indexer built on top of it
    final RestHighLevelClient elasticsearchClient = createESClient(propertiesProvider);
    bind(RestHighLevelClient.class).toInstance(elasticsearchClient);
    bind(Indexer.class).to(ElasticsearchIndexer.class).asEagerSingleton();

    bind(TaskManagerMemory.class).toInstance(new TaskManagerMemory(propertiesProvider));
    install(new FactoryModuleBuilder().build(TaskFactory.class));

    // document queue / report map: Redis-backed unless "queueType" is "memory"
    if (!"memory".equals(propertiesProvider.getProperties().get("queueType"))) {
        install(new FactoryModuleBuilder()
                .implement(DocumentQueue.class, RedisUserDocumentQueue.class)
                .implement(ReportMap.class, RedisUserReportMap.class)
                .build(DocumentCollectionFactory.class));
    } else {
        bind(DocumentCollectionFactory.class).to(MemoryDocumentCollectionFactory.class).asEagerSingleton();
    }

    // a single bus instance is exposed both as DataBus and as Publisher
    final boolean inMemoryBus = "memory".equals(propertiesProvider.getProperties().get("busType"));
    final DataBus bus = inMemoryBus ? new MemoryDataBus() : new RedisDataBus(propertiesProvider);
    bind(DataBus.class).toInstance(bus);
    bind(Publisher.class).toInstance(bus);

    // built-in pipelines are registered first, then the registry loads whatever else it can find
    final PipelineRegistry registry = new PipelineRegistry(propertiesProvider);
    registry.register(EmailPipeline.class);
    registry.register(Pipeline.Type.CORENLP);
    try {
        registry.load();
    } catch (FileNotFoundException notFound) {
        // a failed load is not fatal: it is only reported at info level
        LoggerFactory.getLogger(getClass()).info("extensions dir not found " + notFound.getMessage());
    }
    bind(PipelineRegistry.class).toInstance(registry);
}
Use of org.icij.datashare.batch.BatchDownload in the datashare project by ICIJ.
From class BatchDownloadLoop, method run.
/**
 * Polls the batch download queue until the POISON batch is received.
 *
 * Each iteration waits up to 60 seconds for a batch, runs the download cleaner,
 * and, when a real batch was received, executes its download runner in the
 * current thread and saves the resulting task view through the manager.
 * Errors raised while handling a batch are logged and the loop carries on.
 * If the thread is interrupted, the interrupt flag is restored and the loop stops.
 */
public void run() {
    logger.info("Datashare running in batch mode. Waiting batch from ds:batchdownload.queue ({})", batchDownloadQueue.getClass());
    BatchDownload currentBatch = null;
    while (!POISON.equals(currentBatch)) {
        try {
            // a null batch means the poll timed out without receiving anything
            currentBatch = batchDownloadQueue.poll(60, TimeUnit.SECONDS);
            // the cleaner runs on every iteration, including timeouts and the poison batch
            createDownloadCleaner(DOWNLOAD_DIR, ttlHour).run();
            if (currentBatch != null && !POISON.equals(currentBatch)) {
                HashMap<String, Object> taskProperties = new HashMap<>();
                taskProperties.put("batchDownload", currentBatch);
                MonitorableFutureTask<File> downloadTask = new MonitorableFutureTask<>(factory.createDownloadRunner(currentBatch, manager::save), taskProperties);
                downloadTask.run();
                manager.save(new TaskView<>(downloadTask));
            }
        } catch (InterruptedException ex) {
            // restore the flag swallowed by the exception and stop instead of polling again
            Thread.currentThread().interrupt();
            logger.info("batch download loop interrupted, exiting");
            return;
        } catch (Exception ex) {
            logger.error("error in loop", ex);
        }
    }
}
Aggregations