use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.
the class BenchBatchSearch method testReadsAndWrites.
/**
 * Rough benchmark of batch search persistence.
 *
 * Saves 100 batch searches of 1000 queries each through {@code repository},
 * stores 10 result documents per query, and logs the elapsed wall-clock time.
 * The read phase is currently disabled (the repository call is commented out),
 * so the second timing measures an empty interval.
 */
@Test
public void testReadsAndWrites() {
    final int batchCount = 100;
    final int queriesPerBatch = 1000;
    final int resultsPerQuery = 10;
    logger.info("writing {} batch searches with {} queries and {} results per query", batchCount, queriesPerBatch, resultsPerQuery);

    final long writeStart = System.currentTimeMillis();
    for (int batchIndex = 0; batchIndex < batchCount; batchIndex++) {
        // the query list is rebuilt for every batch
        String[] queryTexts = new String[queriesPerBatch];
        for (int queryIndex = 0; queryIndex < queriesPerBatch; queryIndex++) {
            queryTexts[queryIndex] = "query " + queryIndex;
        }

        BatchSearch currentBatch = new BatchSearch(project("test"), "name" + batchIndex, "desc" + batchIndex, asSet(queryTexts), User.local());
        repository.save(currentBatch);

        for (String queryText : queryTexts) {
            List<Document> hits = IntStream.range(0, resultsPerQuery)
                    .mapToObj(docIndex -> createDoc("doc" + docIndex).build())
                    .collect(Collectors.toList());
            repository.saveResults(currentBatch.uuid, queryText, hits);
        }

        // progress trace on every even batch index
        if ((batchIndex & 1) == 0) {
            logger.info("wrote {} batches", batchIndex);
        }
    }
    final long writeElapsed = System.currentTimeMillis() - writeStart;
    logger.info("done in {}ms", writeElapsed);

    logger.info("reading batch searches");
    final long readStart = System.currentTimeMillis();
    // read step disabled in the original:
    // repository.get(User.local());
    final long readStop = System.currentTimeMillis();
    logger.info("done in {}ms", readStop - readStart);
}
use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.
the class BatchSearchRunnerTest method test_cancel_current_batch_search.
/**
 * Submits a batch search runner to the executor, waits on the latch handed to
 * the runner, cancels the runner and expects the executor to terminate within
 * two seconds.
 */
@Test
public void test_cancel_current_batch_search() throws Exception {
    // latch shared with the runner; presumably counted down once the run has started (confirm in BatchSearchRunner)
    CountDownLatch runnerLatch = new CountDownLatch(1);
    BatchSearch queuedSearch = new BatchSearch(
            "uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());
    Document[] singleHit = new Document[] { createDoc("doc").build() };
    mockSearch.willReturn(1, singleHit);

    BatchSearchRunner runner = new BatchSearchRunner(
            indexer, new PropertiesProvider(), queuedSearch, resultConsumer, runnerLatch);
    executor.submit(runner);
    executor.shutdown();

    runnerLatch.await();
    runner.cancel();

    boolean terminated = executor.awaitTermination(2, TimeUnit.SECONDS);
    assertThat(terminated).isTrue();
}
use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.
the class BatchSearchRunnerTest method test_run_batch_search_failure.
/**
 * Expects a {@link RuntimeException} from the runner's {@code call()} when the
 * result consumer is stubbed to throw one.
 */
@Test(expected = RuntimeException.class)
public void test_run_batch_search_failure() throws Exception {
    Document[] singleHit = new Document[] { createDoc("doc").build() };
    mockSearch.willReturn(1, singleHit);
    BatchSearch queuedSearch = new BatchSearch(
            "uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());

    // any invocation of the consumer fails
    when(resultConsumer.apply(anyString(), any(), anyList())).thenThrow(new RuntimeException());

    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), queuedSearch, resultConsumer);
    runner.call();
}
use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.
the class BatchSearchRunnerTest method test_run_batch_search_truncate_to_60k_max_results.
/**
 * Configures the search mock with {@code MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1}
 * as its first argument and an array of {@code MAX_SCROLL_SIZE} documents, then
 * checks that the value returned by the runner's {@code call()} is below 60000.
 */
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    Document[] scrollPage = new Document[MAX_SCROLL_SIZE];
    for (int docIndex = 0; docIndex < MAX_SCROLL_SIZE; docIndex++) {
        scrollPage[docIndex] = createDoc("doc" + docIndex).build();
    }
    mockSearch.willReturn(MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1, scrollPage);

    BatchSearch queuedSearch = new BatchSearch(
            "uuid1", project("test-datashare"), "name", "desc",
            asSet("query"), new Date(), BatchSearch.State.QUEUED, local());
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), queuedSearch, resultConsumer);

    assertThat(runner.call()).isLessThan(60000);
}
use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.
the class BatchSearchResource method search.
/**
 * Creates a new batch search. This is a multipart form with 8 fields :
 * name, description, csvFile, published, fileTypes, paths, fuzziness, phrase_matches
 *
 * No matter the order. The name and csv file are mandatory else it will return 400 (bad request)
 * Csv file must have under 60 000 lines else it will return 413 (payload too large)
 * Queries with less than two characters are filtered
 *
 * Optional fields: published is true only when its value is "true" (case
 * insensitive); fuzziness defaults to 0 and must be an integer, else it will
 * return 400 (bad request); phrase_matches defaults to false.
 *
 * To do so with bash you can create a text file like :
 * ```
 * --BOUNDARY
 * Content-Disposition: form-data; name="name"
 *
 * my batch search
 * --BOUNDARY
 * Content-Disposition: form-data; name="description"
 *
 * search description
 * --BOUNDARY
 * Content-Disposition: form-data; name="csvFile"; filename="search.csv"
 * Content-Type: text/csv
 *
 * Obama
 * skype
 * test
 * query three
 * --BOUNDARY
 * Content-Disposition: form-data; name="published"
 *
 * true
 * --BOUNDARY--
 * ```
 * Then replace `\n` with `\r\n` with a sed like this:
 *
 * `sed -i 's/$/^M/g' ~/multipart.txt`
 *
 * Then make a curl request with this file :
 * ```
 * curl -i -XPOST localhost:8080/api/batch/search/prj -H 'Content-Type: multipart/form-data; boundary=BOUNDARY' --data-binary @/home/dev/multipart.txt
 * ```
 * @param projectId the project identifier taken from the URL
 * @param context : the request body
 * @return 200 with the batch search uuid, or 400 (missing name/csv file, invalid
 *         fuzziness, or batch search not saved), or 413 (too many queries)
 */
@Post("/search/:project")
public Payload search(String projectId, Context context) throws Exception {
    List<Part> parts = context.parts();
    String name = fieldValue("name", parts);
    String csv = fieldValue("csvFile", parts);
    if (name == null || csv == null) {
        return badRequest();
    }
    String description = fieldValue("description", parts);
    boolean published = "true".equalsIgnoreCase(fieldValue("published", parts));
    List<String> fileTypes = fieldValues("fileTypes", parts);
    List<String> paths = fieldValues("paths", parts);

    int fuzziness = 0;
    Optional<Part> fuzzinessPart = parts.stream().filter(p -> "fuzziness".equals(p.name())).findAny();
    if (fuzzinessPart.isPresent()) {
        try {
            fuzziness = parseInt(fuzzinessPart.get().content());
        } catch (NumberFormatException e) {
            // client sent a non-integer fuzziness: answer 400 instead of failing with a server error
            return badRequest();
        }
    }

    Optional<Part> phraseMatchesPart = parts.stream().filter(p -> "phrase_matches".equals(p.name())).findAny();
    boolean phraseMatches = phraseMatchesPart.isPresent() && parseBoolean(phraseMatchesPart.get().content());

    // queries containing double quotes are kept untouched only when phrase matching is requested
    LinkedHashSet<String> queries = getQueries(csv).stream()
            .map(query -> (phraseMatches && query.contains("\"")) ? query : sanitizeDoubleQuotesInQuery(query))
            .collect(Collectors.toCollection(LinkedHashSet::new));
    if (queries.size() >= MAX_BATCH_SIZE) {
        return new Payload(413);
    }

    BatchSearch batchSearch = new BatchSearch(project(projectId), name, description, queries, (User) context.currentUser(), published, fileTypes, paths, fuzziness, phraseMatches);
    boolean isSaved = batchSearchRepository.save(batchSearch);
    if (isSaved) {
        // only a persisted batch search is queued for execution
        batchSearchQueue.put(batchSearch.uuid);
    }
    return isSaved ? new Payload("application/json", batchSearch.uuid, 200) : badRequest();
}
Aggregations