Search in sources :

Example 11 with BatchDownload

use of org.icij.datashare.batch.BatchDownload in project datashare by ICIJ.

the class TaskResource method batchDownload.

/**
 * download files from a search query. Expected parameters are :
 *
 * * project: string
 * * query: string or elasticsearch JSON query
 *
 * if the query is a string it is taken as an ES query string, else it is a raw JSON query (without the query part)
 * @see org.elasticsearch.index.query.WrapperQueryBuilder that is used to wrap the query
 *
 * The archive is prepared under the "tmp" sub-directory of the application folder, which is created when missing.
 * Encryption of the download is driven by the "batchDownloadEncrypt" property (defaults to "false").
 *
 * @param optionsWrapper wrapper for options json
 * @param context request context, used to read the application folder and the current user
 * @throws JsonProcessingException when a JSON object query cannot be serialized to a string
 *
 * @return 200 and json task
 *
 * Example :
 * $(curl -XPOST -H 'Content-Type: application/json' localhost:8080/api/task/batchDownload -d '{"options": {"project":"genapi-datashare", "query": "*" }}')
 */
@Post("/batchDownload")
public TaskView<File> batchDownload(final OptionsWrapper<Object> optionsWrapper, Context context) throws JsonProcessingException {
    Map<String, Object> options = optionsWrapper.getOptions();
    Path tmpPath = get(context.env().appFolder(), "tmp");
    if (!tmpPath.toFile().exists()) {
        tmpPath.toFile().mkdirs();
    }
    // A JSON object query is serialized back to its string form; any other value is cast to String as-is.
    String query = options.get("query") instanceof Map ? JsonObjectMapper.MAPPER.writeValueAsString(options.get("query")) : (String) options.get("query");
    boolean batchDownloadEncrypt = parseBoolean(propertiesProvider.get("batchDownloadEncrypt").orElse("false"));
    BatchDownload batchDownload = new BatchDownload(project((String) options.get("project")), (User) context.currentUser(), query, tmpPath, batchDownloadEncrypt);
    BatchDownloadRunner downloadTask = taskFactory.createDownloadRunner(batchDownload, v -> null);
    // Plain HashMap instead of double-brace initialization: the anonymous subclass would keep
    // a hidden reference to this resource instance for as long as the task properties live.
    HashMap<String, Object> taskProperties = new HashMap<>();
    taskProperties.put("batchDownload", batchDownload);
    return taskManager.startTask(downloadTask, taskProperties);
}
Also used : Path(java.nio.file.Path) BatchDownload(org.icij.datashare.batch.BatchDownload) Collectors.toMap(java.util.stream.Collectors.toMap)

Example 12 with BatchDownload

use of org.icij.datashare.batch.BatchDownload in project datashare by ICIJ.

the class BatchDownloadRunnerIntTest method test_two_results_two_dirs.

/**
 * Indexes two documents located in two different directories, runs a batch download on "*"
 * and checks that the produced zip file contains both of them.
 */
@Test
public void test_two_results_two_dirs() throws Exception {
    File doc1 = new IndexerHelper(es.client).indexFile("dir1/doc1.txt", "The quick brown fox jumps over the lazy dog", fs);
    File doc2 = new IndexerHelper(es.client).indexFile("dir2/doc2.txt", "Portez ce vieux whisky au juge blond qui fume", fs);
    BatchDownload bd = createBatchDownload("*");
    new BatchDownloadRunner(indexer, createProvider(), bd, updateCallback).call();
    assertThat(bd.filename.toFile()).isFile();
    // Open the archive once and close it when done, instead of leaking one ZipFile handle per assertion.
    try (ZipFile zipFile = new ZipFile(bd.filename.toFile())) {
        assertThat(zipFile.size()).isEqualTo(2);
        // Entries are looked up by the document path stripped of its first character.
        assertThat(zipFile.getEntry(doc1.toString().substring(1))).isNotNull();
        assertThat(zipFile.getEntry(doc2.toString().substring(1))).isNotNull();
    }
}
Also used : BatchDownload(org.icij.datashare.batch.BatchDownload) ZipFile(java.util.zip.ZipFile) File(java.io.File)

Example 13 with BatchDownload

use of org.icij.datashare.batch.BatchDownload in project datashare by ICIJ.

the class BatchDownloadRunnerIntTest method test_embedded_doc_should_not_interrupt_zip_creation.

/**
 * Indexes the "/docs/embedded_doc.eml" fixture as an embedded file, runs a batch download on "*"
 * and checks that the zip file is still produced with two entries, including the indexed file.
 */
@Test
public void test_embedded_doc_should_not_interrupt_zip_creation() throws Exception {
    File file = new IndexerHelper(es.client).indexEmbeddedFile(TEST_INDEX, "/docs/embedded_doc.eml");
    BatchDownload batchDownload = createBatchDownload("*");
    new BatchDownloadRunner(indexer, createProvider(), batchDownload, updateCallback).call();
    // Open the archive once and close it when done, instead of leaking one ZipFile handle per assertion.
    try (ZipFile zipFile = new ZipFile(batchDownload.filename.toFile())) {
        assertThat(zipFile.size()).isEqualTo(2);
        // The entry is looked up by the file path stripped of its first character.
        assertThat(zipFile.getEntry(file.toString().substring(1))).isNotNull();
    }
}
Also used : BatchDownload(org.icij.datashare.batch.BatchDownload) ZipFile(java.util.zip.ZipFile) File(java.io.File)

Example 14 with BatchDownload

use of org.icij.datashare.batch.BatchDownload in project datashare by ICIJ.

the class CommonMode method configure.

/**
 * Registers the injector bindings of this module: properties provider, language guesser,
 * the batch search and batch download queues, the Elasticsearch client and indexer,
 * the in-memory task manager and task factory, the document collection factory,
 * the data bus / publisher and the pipeline registry.
 *
 * The queue and bus implementations are selected from the "batchQueueType", "queueType"
 * and "busType" properties.
 */
@Override
protected void configure() {
    bind(PropertiesProvider.class).toInstance(propertiesProvider);
    bind(LanguageGuesser.class).to(OptimaizeLanguageGuesser.class);

    // Both batch queues share the same implementation class, only the queue name differs.
    String queueImplementation = propertiesProvider.get("batchQueueType").orElse("org.icij.datashare.extract.MemoryBlockingQueue");
    bind(new TypeLiteral<BlockingQueue<String>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueImplementation, "ds:batchsearch:queue"));
    bind(new TypeLiteral<BlockingQueue<BatchDownload>>() {
    }).toInstance(getBlockingQueue(propertiesProvider, queueImplementation, "ds:batchdownload:queue"));

    // Indexing and task infrastructure.
    RestHighLevelClient elasticsearchClient = createESClient(propertiesProvider);
    bind(RestHighLevelClient.class).toInstance(elasticsearchClient);
    bind(Indexer.class).to(ElasticsearchIndexer.class).asEagerSingleton();
    bind(TaskManagerMemory.class).toInstance(new TaskManagerMemory(propertiesProvider));
    install(new FactoryModuleBuilder().build(TaskFactory.class));

    // Document collections: Redis-backed unless "queueType" is "memory".
    Object queueType = propertiesProvider.getProperties().get("queueType");
    if (!"memory".equals(queueType)) {
        install(new FactoryModuleBuilder().implement(DocumentQueue.class, RedisUserDocumentQueue.class).implement(ReportMap.class, RedisUserReportMap.class).build(DocumentCollectionFactory.class));
    } else {
        bind(DocumentCollectionFactory.class).to(MemoryDocumentCollectionFactory.class).asEagerSingleton();
    }

    // Data bus: Redis-backed unless "busType" is "memory"; the same instance also serves as Publisher.
    Object busType = propertiesProvider.getProperties().get("busType");
    DataBus bus = "memory".equals(busType) ? new MemoryDataBus() : new RedisDataBus(propertiesProvider);
    bind(DataBus.class).toInstance(bus);
    bind(Publisher.class).toInstance(bus);

    // Pipelines: two are always registered, then the registry load is attempted.
    PipelineRegistry registry = new PipelineRegistry(propertiesProvider);
    registry.register(EmailPipeline.class);
    registry.register(Pipeline.Type.CORENLP);
    try {
        registry.load();
    } catch (FileNotFoundException notFound) {
        // A FileNotFoundException from load() is not fatal: it is only reported at info level.
        LoggerFactory.getLogger(getClass()).info("extensions dir not found " + notFound.getMessage());
    }
    bind(PipelineRegistry.class).toInstance(registry);
}
Also used : BatchDownload(org.icij.datashare.batch.BatchDownload) FactoryModuleBuilder(com.google.inject.assistedinject.FactoryModuleBuilder) FileNotFoundException(java.io.FileNotFoundException) MemoryDataBus(org.icij.datashare.com.MemoryDataBus) RedisUserDocumentQueue(org.icij.datashare.extract.RedisUserDocumentQueue) DocumentQueue(org.icij.extract.queue.DocumentQueue) RestHighLevelClient(org.elasticsearch.client.RestHighLevelClient) DataBus(org.icij.datashare.com.DataBus) RedisDataBus(org.icij.datashare.com.RedisDataBus) Publisher(org.icij.datashare.com.Publisher) TaskManagerMemory(org.icij.datashare.tasks.TaskManagerMemory) PipelineRegistry(org.icij.datashare.extension.PipelineRegistry) PropertiesProvider(org.icij.datashare.PropertiesProvider) MemoryDocumentCollectionFactory(org.icij.datashare.tasks.MemoryDocumentCollectionFactory) TypeLiteral(com.google.inject.TypeLiteral) DocumentCollectionFactory(org.icij.datashare.tasks.DocumentCollectionFactory) TaskFactory(org.icij.datashare.tasks.TaskFactory) OptimaizeLanguageGuesser(org.icij.datashare.nlp.OptimaizeLanguageGuesser) LanguageGuesser(org.icij.datashare.text.indexing.LanguageGuesser) ElasticsearchIndexer(org.icij.datashare.text.indexing.elasticsearch.ElasticsearchIndexer)

Example 15 with BatchDownload

use of org.icij.datashare.batch.BatchDownload in project datashare by ICIJ.

the class BatchDownloadLoop method run.

/**
 * Polls the batch download queue until the POISON element is received.
 *
 * Each iteration waits up to 60 seconds for a batch, runs the download cleaner, and, when a
 * real batch was received, runs its download task in the calling thread and saves the
 * resulting task view through the manager.
 *
 * Errors raised while handling a batch are logged and the loop goes on. If the thread is
 * interrupted while waiting, the interrupt flag is restored and the loop exits.
 */
public void run() {
    logger.info("Datashare running in batch mode. Waiting batch from ds:batchdownload:queue ({})", batchDownloadQueue.getClass());
    BatchDownload currentBatch = null;
    while (!POISON.equals(currentBatch)) {
        try {
            // poll returns null when nothing arrived within the timeout
            currentBatch = batchDownloadQueue.poll(60, TimeUnit.SECONDS);
            // the cleaner runs on every iteration, including idle ones
            createDownloadCleaner(DOWNLOAD_DIR, ttlHour).run();
            if (currentBatch != null && !POISON.equals(currentBatch)) {
                HashMap<String, Object> taskProperties = new HashMap<>();
                taskProperties.put("batchDownload", currentBatch);
                MonitorableFutureTask<File> fileMonitorableFutureTask = new MonitorableFutureTask<>(factory.createDownloadRunner(currentBatch, manager::save), taskProperties);
                fileMonitorableFutureTask.run();
                manager.save(new TaskView<>(fileMonitorableFutureTask));
            }
        } catch (InterruptedException ex) {
            // Do not swallow the interruption: restore the flag for the caller and stop looping,
            // otherwise the next poll would fail immediately and spin.
            Thread.currentThread().interrupt();
            logger.error("batch download loop interrupted, exiting", ex);
            return;
        } catch (Exception ex) {
            logger.error("error in loop", ex);
        }
    }
}
Also used : BatchDownload(org.icij.datashare.batch.BatchDownload) HashMap(java.util.HashMap) File(java.io.File) IOException(java.io.IOException)

Aggregations

BatchDownload (org.icij.datashare.batch.BatchDownload)21 ZipFile (java.util.zip.ZipFile)9 PropertiesProvider (org.icij.datashare.PropertiesProvider)8 File (java.io.File)7 Test (org.junit.Test)7 Path (java.nio.file.Path)5 IOException (java.io.IOException)4 HashMap (java.util.HashMap)4 ElasticsearchStatusException (org.elasticsearch.ElasticsearchStatusException)4 Function (java.util.function.Function)3 Assertions.assertThat (org.fest.assertions.Assertions.assertThat)3 Project.project (org.icij.datashare.text.Project.project)3 Indexer (org.icij.datashare.text.indexing.Indexer)3 User (org.icij.datashare.user.User)3 TemporaryFolder (org.junit.rules.TemporaryFolder)3 Mock (org.mockito.Mock)3 MockitoAnnotations.initMocks (org.mockito.MockitoAnnotations.initMocks)3 String.valueOf (java.lang.String.valueOf)2 StandardCharsets (java.nio.charset.StandardCharsets)2 Files (java.nio.file.Files)2