Search in sources :

Example 21 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_for_embedded_doc.

/**
 * Indexes a root e-mail and a document embedded in it, bulk-adds a single named entity
 * against the embedded document, and then verifies that:
 * <ul>
 *   <li>the embedded document is tagged with the CORENLP pipeline only and has status DONE;</li>
 *   <li>the named entity can be read back using the embedded document's root document,
 *       and reports that same root document.</li>
 * </ul>
 *
 * @throws IOException propagated from the indexer calls
 */
@Test
public void test_bulk_add_for_embedded_doc() throws IOException {
    // Root document: the e-mail itself, stored under the id "id".
    Document mailDoc = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("mail.eml"), "content", Language.FRENCH,
            Charset.defaultCharset(), "message/rfc822", new HashMap<>(), INDEXED,
            new HashSet<>(), 321L);
    // Embedded document: both the "id" arguments point back at the e-mail above.
    Document bodyDoc = new Document(
            project("prj"), "childId", Paths.get("mail.eml"), "mail body", FRENCH,
            Charset.defaultCharset(), "text/plain", new HashMap<>(), Document.Status.INDEXED,
            new HashSet<>(), new Date(), "id", "id", (short) 1, 123L);
    indexer.add(TEST_INDEX, mailDoc);
    indexer.add(TEST_INDEX, bodyDoc);

    NamedEntity person = create(PERSON, "Jane Daffodil", asList(12L), mailDoc.getId(), "root", CORENLP, Language.FRENCH);

    // The entity is bulk-added against the embedded document, not the e-mail.
    assertThat(indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(person), bodyDoc)).isTrue();

    // Re-read the embedded document, passing the e-mail id as the third argument.
    Document reloaded = indexer.get(TEST_INDEX, bodyDoc.getId(), mailDoc.getId());
    assertThat(reloaded.getNerTags()).containsOnly(CORENLP);
    assertThat(reloaded.getStatus()).isEqualTo(Document.Status.DONE);

    NamedEntity storedEntity = indexer.get(TEST_INDEX, person.getId(), reloaded.getRootDocument());
    assertThat(storedEntity).isNotNull();
    assertThat(storedEntity.getRootDocument()).isEqualTo(reloaded.getRootDocument());
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 22 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class DatabaseSpewer method writeDocument.

/**
 * Builds a {@link Document} from an extracted Tika document and stores it through the
 * repository.
 *
 * <p>Content encoding, content type and content length are read from the Tika metadata;
 * when a value is absent the fallbacks are {@code "utf-8"}, {@code DEFAULT_VALUE_UNKNOWN}
 * and {@code "-1"} respectively. Only the part of the content type before the first
 * {@code ';'} is kept.
 *
 * @param tikaDocument the document whose content and metadata are persisted
 * @param parent       the parent document, or {@code null}; its id becomes the parent id
 * @param root         the root document, or {@code null}; its id becomes the root id
 * @param level        passed to the {@link Document} constructor as a {@code short}
 * @throws IOException declared by the overridden method; raised by the calls made here
 */
@Override
protected void writeDocument(TikaDocument tikaDocument, TikaDocument parent, TikaDocument root, int level) throws IOException {
    String content = toString(tikaDocument.getReader()).trim();

    String encodingName = ofNullable(tikaDocument.getMetadata().get(CONTENT_ENCODING)).orElse("utf-8");
    Charset charset = Charset.forName(encodingName);

    // Drop any parameters that follow the first ';' in the raw content type.
    String rawContentType = ofNullable(tikaDocument.getMetadata().get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN);
    String contentType = rawContentType.split(";")[0];

    String rawContentLength = ofNullable(tikaDocument.getMetadata().get(CONTENT_LENGTH)).orElse("-1");
    Long contentLength = valueOf(rawContentLength);

    // Parent and root ids stay null when the corresponding document is absent.
    String parentId = null;
    if (parent != null) {
        parentId = parent.getId();
    }
    String rootId = null;
    if (root != null) {
        rootId = root.getId();
    }

    Document document = new Document(
            project,
            tikaDocument.getId(),
            tikaDocument.getPath(),
            content,
            languageGuesser.guess(content),
            charset,
            contentType,
            getMetadata(tikaDocument),
            Document.Status.INDEXED,
            new HashSet<>(),
            new Date(),
            parentId,
            rootId,
            (short) level,
            contentLength);
    repository.create(document);
}
Also used : Charset(java.nio.charset.Charset) TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) Date(java.util.Date)

Example 23 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_cancel_current_batch_search.

/**
 * Submits a batch search runner to the executor, waits on the latch handed to the runner,
 * cancels the runner, and checks that the executor terminates within two seconds.
 *
 * @throws Exception propagated from the latch wait or the executor
 */
@Test
public void test_cancel_current_batch_search() throws Exception {
    CountDownLatch latch = new CountDownLatch(1);
    BatchSearch search = new BatchSearch(
            "uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());

    // The mocked search is configured with a single document.
    Document[] hits = new Document[] { createDoc("doc").build() };
    mockSearch.willReturn(1, hits);

    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer, latch);
    executor.submit(runner);
    executor.shutdown();

    // NOTE(review): presumably the runner counts the latch down once it is running — confirm in BatchSearchRunner.
    latch.await();
    runner.cancel();

    assertThat(executor.awaitTermination(2, TimeUnit.SECONDS)).isTrue();
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) CountDownLatch(java.util.concurrent.CountDownLatch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 24 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_run_batch_search_failure.

/**
 * Stubs the result consumer to throw a {@link RuntimeException} and expects that exception
 * to escape from {@code BatchSearchRunner.call()}.
 *
 * @throws Exception the {@link RuntimeException} expected by the {@code @Test} annotation
 */
@Test(expected = RuntimeException.class)
public void test_run_batch_search_failure() throws Exception {
    // The mocked search is configured with a single document.
    Document[] hits = new Document[] { createDoc("doc").build() };
    mockSearch.willReturn(1, hits);

    BatchSearch search = new BatchSearch(
            "uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());

    // Any call to the consumer fails, whatever its arguments.
    when(resultConsumer.apply(anyString(), any(), anyList())).thenThrow(new RuntimeException());

    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer);
    runner.call();
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document) Date(java.util.Date) Test(org.junit.Test)

Example 25 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class BatchSearchRunnerTest method test_run_batch_search_truncate_to_60k_max_results.

/**
 * Configures the mocked search with {@code MAX_SCROLL_SIZE} documents and a first argument
 * of {@code MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1}, runs a single-query batch search,
 * and checks that the value returned by {@code call()} is below 60000.
 *
 * @throws Exception propagated from {@code BatchSearchRunner.call()}
 */
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    // One document per index in [0, MAX_SCROLL_SIZE), named "doc0", "doc1", ...
    Document[] hits = IntStream.range(0, MAX_SCROLL_SIZE)
            .mapToObj(index -> "doc" + index)
            .map(docId -> createDoc(docId).build())
            .toArray(Document[]::new);
    // NOTE(review): the first argument looks like a number of result pages — confirm against mockSearch.
    mockSearch.willReturn(MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1, hits);

    BatchSearch search = new BatchSearch(
            "uuid1", project("test-datashare"), "name", "desc",
            asSet("query"), new Date(), BatchSearch.State.QUEUED, local());

    assertThat(new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer).call()).isLessThan(60000);
}
Also used : IntStream(java.util.stream.IntStream) MAX_SCROLL_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_SCROLL_SIZE) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) Date(java.util.Date) Assert.assertThrows(org.junit.Assert.assertThrows) HashMap(java.util.HashMap) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) Arrays.asList(java.util.Arrays.asList) SearchException(org.icij.datashare.batch.SearchException) User(org.icij.datashare.user.User) Project.project(org.icij.datashare.text.Project.project) DatashareTimeRule(org.icij.datashare.test.DatashareTimeRule) BATCH_SEARCH_MAX_TIME(org.icij.datashare.cli.DatashareCliOptions.BATCH_SEARCH_MAX_TIME) ExecutorService(java.util.concurrent.ExecutorService) Before(org.junit.Before) BATCH_THROTTLE(org.icij.datashare.cli.DatashareCliOptions.BATCH_THROTTLE) PropertiesProvider(org.icij.datashare.PropertiesProvider) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) Indexer(org.icij.datashare.text.indexing.Indexer) CollectionUtils.asSet(org.icij.datashare.CollectionUtils.asSet) Test(org.junit.Test) MAX_BATCH_RESULT_SIZE(org.icij.datashare.tasks.BatchSearchRunner.MAX_BATCH_RESULT_SIZE) Document(org.icij.datashare.text.Document) Executors(java.util.concurrent.Executors) User.local(org.icij.datashare.user.User.local) TimeUnit(java.util.concurrent.TimeUnit) Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) Mockito(org.mockito.Mockito) List(java.util.List) Rule(org.junit.Rule) BatchSearch(org.icij.datashare.batch.BatchSearch) TerFunction(org.icij.datashare.function.TerFunction)

Aggregations

Document (org.icij.datashare.text.Document) 63, Test (org.junit.Test) 48, PropertiesProvider (org.icij.datashare.PropertiesProvider) 19, BatchSearch (org.icij.datashare.batch.BatchSearch) 15, NamedEntity (org.icij.datashare.text.NamedEntity) 11, TikaDocument (org.icij.extract.document.TikaDocument) 10, HashMap (java.util.HashMap) 9, Path (java.nio.file.Path) 6, Date (java.util.Date) 5, Indexer (org.icij.datashare.text.indexing.Indexer) 5, File (java.io.File) 4, IOException (java.io.IOException) 4, InputStream (java.io.InputStream) 4, IntStream (java.util.stream.IntStream) 4, DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc) 4, Project.project (org.icij.datashare.text.Project.project) 4, User (org.icij.datashare.user.User) 4, Rule (org.junit.Rule) 4, Arrays.asList (java.util.Arrays.asList) 3, List (java.util.List) 3