Search in sources :

Example 16 with BatchSearch

use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.

the class BenchBatchSearch method testReadsAndWrites.

/**
 * Write/read timing bench for the batch search repository.
 *
 * Saves 100 batch searches of 1000 queries each, stores 10 documents per query,
 * and logs the elapsed wall-clock time. The read phase is currently disabled
 * (the repository call is commented out), so the second timing only measures
 * two consecutive clock reads.
 */
@Test
public void testReadsAndWrites() {
    final int batchCount = 100;
    final int queryCount = 1000;
    final int resultsPerQuery = 10;
    logger.info("writing {} batch searches with {} queries and {} results per query", batchCount, queryCount, resultsPerQuery);

    long start = System.currentTimeMillis();
    for (int batchIndex = 0; batchIndex < batchCount; batchIndex++) {
        // Same query strings for every batch: "query 0" .. "query 999".
        String[] queries = new String[queryCount];
        for (int queryIndex = 0; queryIndex < queryCount; queryIndex++) {
            queries[queryIndex] = "query " + queryIndex;
        }

        BatchSearch batch = new BatchSearch(project("test"), "name" + batchIndex, "desc" + batchIndex, asSet(queries), User.local());
        repository.save(batch);

        for (String query : queries) {
            List<Document> documents = IntStream.range(0, resultsPerQuery)
                    .mapToObj(docIndex -> createDoc("doc" + docIndex).build())
                    .collect(Collectors.toList());
            repository.saveResults(batch.uuid, query, documents);
        }

        // Progress trace on every even batch index.
        boolean evenIndex = (batchIndex & 1) == 0;
        if (evenIndex) {
            logger.info("wrote {} batches", batchIndex);
        }
    }
    long writeElapsed = System.currentTimeMillis() - start;
    logger.info("done in {}ms", writeElapsed);

    logger.info("reading batch searches");
    start = System.currentTimeMillis();
    // repository.get(User.local());
    long readElapsed = System.currentTimeMillis() - start;
    logger.info("done in {}ms", readElapsed);
}
Also used : IntStream(java.util.stream.IntStream) Logger(org.slf4j.Logger) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) LoggerFactory(org.slf4j.LoggerFactory) CollectionUtils.asSet(org.icij.datashare.CollectionUtils.asSet) Test(org.junit.Test) BatchSearchRepository(org.icij.datashare.batch.BatchSearchRepository) Collectors(java.util.stream.Collectors) Document(org.icij.datashare.text.Document) List(java.util.List) Rule(org.junit.Rule) BatchSearch(org.icij.datashare.batch.BatchSearch) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) SQLDialect(org.jooq.SQLDialect) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 17 with BatchSearch

use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_cancel_current_batch_search.

/**
 * Submits a batch search runner to the executor, waits on the latch handed to
 * the runner, cancels the runner, and expects the executor to terminate
 * within two seconds.
 */
@Test
public void test_cancel_current_batch_search() throws Exception {
    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name1", "desc1", asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());
    mockSearch.willReturn(1, new Document[] { createDoc("doc").build() });

    CountDownLatch latch = new CountDownLatch(1);
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer, latch);

    executor.submit(runner);
    executor.shutdown();
    // Block until the latch given to the runner is released, then cancel.
    latch.await();
    runner.cancel();

    boolean terminated = executor.awaitTermination(2, TimeUnit.SECONDS);
    assertThat(terminated).isTrue();
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) CountDownLatch(java.util.concurrent.CountDownLatch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 18 with BatchSearch

use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_run_batch_search_failure.

/**
 * Expects a RuntimeException when the runner is called while the result
 * consumer is stubbed to throw on any invocation.
 */
@Test(expected = RuntimeException.class)
public void test_run_batch_search_failure() throws Exception {
    mockSearch.willReturn(1, new Document[] { createDoc("doc").build() });
    // The consumer fails on every call, whatever the arguments.
    when(resultConsumer.apply(anyString(), any(), anyList())).thenThrow(new RuntimeException());

    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name1", "desc1", asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer);
    runner.call();
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 19 with BatchSearch

use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_run_batch_search_truncate_to_60k_max_results.

/**
 * Configures the search mock with MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1
 * and an array of MAX_SCROLL_SIZE documents, then checks that the value
 * returned by the runner is below 60000.
 */
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    Document[] scrollPage = new Document[MAX_SCROLL_SIZE];
    for (int docIndex = 0; docIndex < scrollPage.length; docIndex++) {
        scrollPage[docIndex] = createDoc("doc" + docIndex).build();
    }
    mockSearch.willReturn(MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1, scrollPage);

    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name", "desc", asSet("query"), new Date(), BatchSearch.State.QUEUED, local());
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer);

    assertThat(runner.call()).isLessThan(60000);
}
Also used : IntStream(java.util.stream.IntStream) MAX_SCROLL_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_SCROLL_SIZE) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) Date(java.util.Date) Assert.assertThrows(org.junit.Assert.assertThrows) HashMap(java.util.HashMap) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) Arrays.asList(java.util.Arrays.asList) SearchException(org.icij.datashare.batch.SearchException) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) DatashareTimeRule(org.icij.datashare.test.DatashareTimeRule) BATCH_SEARCH_MAX_TIME(org.icij.datashare.cli.DatashareCliOptions.BATCH_SEARCH_MAX_TIME) ExecutorService(java.util.concurrent.ExecutorService) Before(org.junit.Before) BATCH_THROTTLE(org.icij.datashare.cli.DatashareCliOptions.BATCH_THROTTLE) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) Indexer(org.icij.datashare.text.indexing.Indexer) CollectionUtils.asSet(org.icij.datashare.CollectionUtils.asSet) Test(org.junit.Test) MAX_BATCH_RESULT_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_BATCH_RESULT_SIZE) Document(org.icij.datashare.text.Document) Executors(java.util.concurrent.Executors) User.local(org.icij.datashare.user.User.local) TimeUnit(java.util.concurrent.TimeUnit) Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) Mockito(org.mockito.Mockito) List(java.util.List) Rule(org.junit.Rule) BatchSearch(org.icij.datashare.batch.BatchSearch) TerFunction(org.icij.datashare.function.TerFunction) PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 20 with BatchSearch

use of org.icij.datashare.batch.BatchSearch in project datashare by ICIJ.

the class BatchSearchResource method search.

/**
 * Creates a new batch search. This is a multipart form with 8 fields :
 * name, description, csvFile, published, fileTypes, paths, fuzziness, phrase_matches
 *
 * No matter the order. The name and csv file are mandatory else it will return 400 (bad request)
 * Csv file must have under 60 000 lines else it will return 413 (payload too large)
 * Queries with less than two characters are filtered
 * If the fuzziness field is present it must be an integer, else it will return 400 (bad request)
 *
 * To do so with bash you can create a text file like :
 * ```
 * --BOUNDARY
 * Content-Disposition: form-data; name="name"
 *
 * my batch search
 * --BOUNDARY
 * Content-Disposition: form-data; name="description"
 *
 * search description
 * --BOUNDARY
 * Content-Disposition: form-data; name="csvFile"; filename="search.csv"
 * Content-Type: text/csv
 *
 * Obama
 * skype
 * test
 * query three
 * --BOUNDARY
 * Content-Disposition: form-data; name="published"
 *
 * true
 * --BOUNDARY--
 * ```
 * Note that only the very last delimiter carries the trailing `--`: it closes the
 * multipart body, and anything after it is ignored.
 *
 * Then replace `\n` with `\r\n` with a sed like this:
 *
 * `sed -i 's/$/^M/g' ~/multipart.txt`
 *
 * Then make a curl request with this file :
 * ```
 * curl -i -XPOST localhost:8080/api/batch/search/prj -H 'Content-Type: multipart/form-data; boundary=BOUNDARY' --data-binary @/home/dev/multipart.txt
 * ```
 * @param projectId : the project the batch search is created for
 * @param context : the request body
 * @return 200 with the batch search uuid, or 400 (missing name/csvFile, non-integer fuzziness,
 *         or batch search not saved), or 413 (too many queries)
 */
@Post("/search/:project")
public Payload search(String projectId, Context context) throws Exception {
    List<Part> parts = context.parts();
    String name = fieldValue("name", parts);
    String csv = fieldValue("csvFile", parts);
    if (name == null || csv == null) {
        return badRequest();
    }
    String description = fieldValue("description", parts);
    boolean published = "true".equalsIgnoreCase(fieldValue("published", parts));
    List<String> fileTypes = fieldValues("fileTypes", parts);
    List<String> paths = fieldValues("paths", parts);

    // Fuzziness is optional and defaults to 0; a value that is not an integer is a client error,
    // so answer 400 rather than letting NumberFormatException escape the handler.
    Optional<Part> fuzzinessPart = parts.stream().filter(p -> "fuzziness".equals(p.name())).findAny();
    int fuzziness = 0;
    if (fuzzinessPart.isPresent()) {
        try {
            fuzziness = parseInt(fuzzinessPart.get().content());
        } catch (NumberFormatException e) {
            return badRequest();
        }
    }

    // Phrase matching is optional and defaults to false.
    Optional<Part> phraseMatchesPart = parts.stream().filter(p -> "phrase_matches".equals(p.name())).findAny();
    boolean phraseMatches = phraseMatchesPart.isPresent() && parseBoolean(phraseMatchesPart.get().content());

    // Queries containing double quotes are kept verbatim only when phrase matching is on;
    // every other query goes through sanitizeDoubleQuotesInQuery. LinkedHashSet keeps the
    // csv order while dropping duplicates.
    LinkedHashSet<String> queries = getQueries(csv).stream()
            .map(query -> (phraseMatches && query.contains("\"")) ? query : sanitizeDoubleQuotesInQuery(query))
            .collect(Collectors.toCollection(LinkedHashSet::new));
    if (queries.size() >= MAX_BATCH_SIZE) {
        return new Payload(413);
    }

    BatchSearch batchSearch = new BatchSearch(project(projectId), name, description, queries, (User) context.currentUser(), published, fileTypes, paths, fuzziness, phraseMatches);
    boolean isSaved = batchSearchRepository.save(batchSearch);
    if (!isSaved) {
        return badRequest();
    }
    // Only queue the batch search once it has been persisted.
    batchSearchQueue.put(batchSearch.uuid);
    return new Payload("application/json", batchSearch.uuid, 200);
}
Also used : java.util(java.util) Inject(com.google.inject.Inject) UnauthorizedException(net.codestory.http.errors.UnauthorizedException) SearchResult(org.icij.datashare.batch.SearchResult) JooqBatchSearchRepository(org.icij.datashare.db.JooqBatchSearchRepository) Boolean(java.lang.Boolean) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) DatashareUser(org.icij.datashare.session.DatashareUser) Part(net.codestory.http.Part) net.codestory.http.annotations(net.codestory.http.annotations) PropertiesProvider(org.icij.datashare.PropertiesProvider) Project(org.icij.datashare.text.Project) Context(net.codestory.http.Context) Payload(net.codestory.http.payload.Payload) CollectionUtils.asSet(org.icij.datashare.CollectionUtils.asSet) IOException(java.io.IOException) BlockingQueue(java.util.concurrent.BlockingQueue) BatchSearchRecord(org.icij.datashare.batch.BatchSearchRecord) NotFoundException(net.codestory.http.errors.NotFoundException) BatchSearchRepository(org.icij.datashare.batch.BatchSearchRepository) Collectors(java.util.stream.Collectors) Integer.parseInt(java.lang.Integer.parseInt) String.format(java.lang.String.format) BatchSearch(org.icij.datashare.batch.BatchSearch) Arrays.stream(java.util.Arrays.stream) Singleton(com.google.inject.Singleton) BatchSearch(org.icij.datashare.batch.BatchSearch) Part(net.codestory.http.Part) Payload(net.codestory.http.payload.Payload)

Aggregations

BatchSearch (org.icij.datashare.batch.BatchSearch)32 PropertiesProvider (org.icij.datashare.PropertiesProvider)18 Test (org.junit.Test)17 Document (org.icij.datashare.text.Document)15 AbstractProdWebServerTest (org.icij.datashare.web.testhelpers.AbstractProdWebServerTest)9 Date (java.util.Date)6 Response (net.codestory.rest.Response)4 BatchSearchRepository (org.icij.datashare.batch.BatchSearchRepository)4 SearchException (org.icij.datashare.batch.SearchException)4 JooqBatchSearchRepository (org.icij.datashare.db.JooqBatchSearchRepository)4 User (org.icij.datashare.user.User)4 CollectionUtils.asSet (org.icij.datashare.CollectionUtils.asSet)3 SearchResult (org.icij.datashare.batch.SearchResult)3 Project.project (org.icij.datashare.text.Project.project)3 HashMap (java.util.HashMap)2 List (java.util.List)2 CountDownLatch (java.util.concurrent.CountDownLatch)2 Collectors (java.util.stream.Collectors)2 IntStream (java.util.stream.IntStream)2 NotFoundException (net.codestory.http.errors.NotFoundException)2