use of org.icij.datashare.Entity in project datashare by ICIJ.
the class ElasticsearchIndexer method bulkUpdate.
/**
 * Updates several entities of the given index with a single bulk request.
 *
 * <p>One update request is built per entity (type, id, JSON body, parent and root are
 * derived from the entity itself) and the whole batch is handed to {@code executeBulk}.
 *
 * @param indexName name of the index holding the entities
 * @param entities  entities to update; one update request is created for each of them
 * @param <T>       concrete entity type
 * @return the result reported by {@code executeBulk} for the batch
 * @throws IOException if the bulk execution fails with an I/O error
 */
@Override
public <T extends Entity> boolean bulkUpdate(String indexName, List<T> entities) throws IOException {
    final BulkRequest updates = new BulkRequest();
    for (T entity : entities) {
        updates.add(createUpdateRequest(indexName, getType(entity), entity.getId(), getJson(entity), getParent(entity), getRoot(entity)));
    }
    return executeBulk(updates);
}
use of org.icij.datashare.Entity in project datashare by ICIJ.
the class BatchDownloadRunner method call.
/**
 * Runs the batch download: searches the documents matching the batch query, adds them to a
 * zip archive through a {@code Zipper}, and returns the resulting file.
 *
 * <p>The number of zip entries is capped by {@code BATCH_DOWNLOAD_MAX_NB_FILES} and the
 * cumulated size of added files by {@code BATCH_DOWNLOAD_MAX_SIZE}. As soon as one of these
 * limits is reached, the method stops scrolling through the remaining search results.
 * The progress callback is invoked after each file that was effectively added.
 *
 * @return the zip file of the batch download, or {@code null} when the query has no result
 * @throws Exception if the search, the zipping or the file size lookup fails
 */
@Override
public File call() throws Exception {
    // NOTE(review): throttleMs is only logged in this method, no delay is applied here —
    // confirm whether throttling is expected to happen elsewhere.
    int throttleMs = parseInt(propertiesProvider.get(BATCH_THROTTLE).orElse("0"));
    int maxResultSize = parseInt(propertiesProvider.get(BATCH_DOWNLOAD_MAX_NB_FILES).orElse(valueOf(MAX_BATCH_RESULT_SIZE)));
    int scrollSize = min(parseInt(propertiesProvider.get(SCROLL_SIZE).orElse("1000")), MAX_SCROLL_SIZE);
    long maxZipSizeBytes = HumanReadableSize.parse(propertiesProvider.get(BATCH_DOWNLOAD_MAX_SIZE).orElse("100M"));
    long zippedFilesSize = 0;

    logger.info("running batch download for user {} on project {} with throttle {}ms and scroll size of {}", batchDownload.user.getId(), batchDownload.project, throttleMs, scrollSize);
    // The "content" source field is excluded from the search results.
    Indexer.Searcher searcher = indexer.search(batchDownload.project.getId(), Document.class).withoutSource("content").limit(scrollSize);
    if (batchDownload.isJsonQuery()) {
        searcher.set(batchDownload.queryAsJson());
    } else {
        searcher.with(batchDownload.query);
    }

    List<? extends Entity> docsToProcess = searcher.scroll().collect(toList());
    if (docsToProcess.isEmpty()) {
        logger.warn("no results for batchDownload {}", batchDownload.uuid);
        return null;
    }
    docsToProcessSize = searcher.totalHits();
    if (docsToProcessSize > maxResultSize) {
        logger.warn("number of results for batch download > {} for {}/{} (nb zip entries will be limited)", maxResultSize, batchDownload.uuid, batchDownload.user);
    }

    try (Zipper zipper = createZipper(batchDownload, propertiesProvider, mailSenderSupplier)) {
        HashMap<String, Object> taskProperties = new HashMap<>();
        taskProperties.put("batchDownload", batchDownload);
        while (!docsToProcess.isEmpty()) {
            for (int i = 0; i < docsToProcess.size() && numberOfResults.get() < maxResultSize && zippedFilesSize <= maxZipSizeBytes; i++) {
                Entity doc = docsToProcess.get(i);
                int addedBytes = zipper.add((Document) doc);
                // Only files that were effectively added count towards the limits and progress.
                if (addedBytes > 0) {
                    zippedFilesSize += addedBytes;
                    numberOfResults.incrementAndGet();
                    batchDownload.setZipSize(zippedFilesSize);
                    updateCallback.apply(new TaskView<>(new MonitorableFutureTask<>(this, taskProperties)));
                }
            }
            if (numberOfResults.get() >= maxResultSize || zippedFilesSize > maxZipSizeBytes) {
                // A limit has been reached: nothing more will be zipped, so fetching the
                // remaining pages of results would only be wasted round trips.
                break;
            }
            docsToProcess = searcher.scroll().collect(toList());
        }
    }

    logger.info("created batch download file {} ({} bytes/{} entries) for user {}", batchDownload.filename, Files.size(batchDownload.filename), numberOfResults, batchDownload.user.getId());
    return batchDownload.filename.toFile();
}
use of org.icij.datashare.Entity in project datashare by ICIJ.
the class ElasticsearchIndexer method bulkAdd.
/**
 * Indexes the named entities found in a document and updates that document, all in one
 * bulk request.
 *
 * <p>The bulk request contains, in order:
 * <ol>
 *   <li>an update setting the parent document {@code status} field to {@code Document.Status.DONE};</li>
 *   <li>a scripted update adding {@code nerType} to the parent {@code nerTags} when it is not
 *       already there;</li>
 *   <li>one index request per named entity, with the parent document id as parent.</li>
 * </ol>
 * Every request is routed with the parent's root document id, or with the parent id itself
 * when it has no root document.
 *
 * @param indexName     name of the index to write to
 * @param nerType       pipeline type recorded in the parent {@code nerTags}
 * @param namedEntities named entities to index as children of {@code parent}
 * @param parent        document the named entities were extracted from
 * @return {@code true} when the bulk response has no failure, {@code false} otherwise
 *         (each failed item is logged as an error)
 * @throws IOException if building the requests or executing the bulk request fails
 */
@Override
public boolean bulkAdd(final String indexName, Pipeline.Type nerType, List<NamedEntity> namedEntities, Document parent) throws IOException {
    BulkRequest bulkRequest = new BulkRequest();
    String routing = ofNullable(parent.getRootDocument()).orElse(parent.getId());

    bulkRequest.add(new UpdateRequest(indexName, parent.getId()).doc(jsonBuilder().startObject().field("status", Document.Status.DONE).endObject()).routing(routing));

    // Plain map instead of double-brace initialization, which would create an anonymous
    // HashMap subclass holding a reference to this indexer.
    HashMap<String, Object> scriptParams = new HashMap<>();
    scriptParams.put("nerTag", nerType.toString());
    bulkRequest.add(new UpdateRequest(indexName, parent.getId()).script(new Script(ScriptType.INLINE, "painless", "if (!ctx._source.nerTags.contains(params.nerTag)) ctx._source.nerTags.add(params.nerTag);", scriptParams)).routing(routing));

    for (Entity child : namedEntities) {
        bulkRequest.add(createIndexRequest(indexName, JsonObjectMapper.getType(child), child.getId(), getJson(child), parent.getId(), routing));
    }
    bulkRequest.setRefreshPolicy(esCfg.refreshPolicy);

    BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT);
    if (bulkResponse.hasFailures()) {
        for (BulkItemResponse resp : bulkResponse.getItems()) {
            if (resp.isFailed()) {
                LOGGER.error("bulk add failed : {}", resp.getFailureMessage());
            }
        }
        return false;
    }
    return true;
}
use of org.icij.datashare.Entity in project datashare by ICIJ.
the class ElasticsearchIndexer method bulkAdd.
/**
 * Indexes several entities into the given index with a single bulk request.
 *
 * <p>One index request is built per entity (type, id, JSON body, parent and root are
 * derived from the entity itself) and the whole batch is handed to {@code executeBulk}.
 *
 * @param indexName name of the index to write to
 * @param objs      entities to index; one index request is created for each of them
 * @param <T>       concrete entity type
 * @return the result reported by {@code executeBulk} for the batch
 * @throws IOException if the bulk execution fails with an I/O error
 */
@Override
public <T extends Entity> boolean bulkAdd(final String indexName, List<T> objs) throws IOException {
    final BulkRequest additions = new BulkRequest();
    for (T entity : objs) {
        additions.add(createIndexRequest(indexName, getType(entity), entity.getId(), getJson(entity), getParent(entity), getRoot(entity)));
    }
    return executeBulk(additions);
}
Aggregations