Usage example of org.icij.datashare.text.Document from the datashare project by ICIJ: class ElasticsearchIndexerTest, method test_bulk_add_for_embedded_doc.
@Test
public void test_bulk_add_for_embedded_doc() throws IOException {
    // A parent e-mail and an embedded child document; both share the root id "id".
    Document parentDoc = new Document("id", project("prj"), Paths.get("mail.eml"), "content",
            Language.FRENCH, Charset.defaultCharset(), "message/rfc822",
            new HashMap<>(), INDEXED, new HashSet<>(), 321L);
    Document childDoc = new Document(project("prj"), "childId", Paths.get("mail.eml"), "mail body",
            Language.FRENCH, Charset.defaultCharset(), "text/plain",
            new HashMap<>(), INDEXED, new HashSet<>(), new Date(), "id", "id", (short) 1, 123L);
    indexer.add(TEST_INDEX, parentDoc);
    indexer.add(TEST_INDEX, childDoc);

    // Bulk-adding a named entity against the child must mark the child DONE and tag the pipeline.
    NamedEntity namedEntity = create(PERSON, "Jane Daffodil", asList(12L), parentDoc.getId(), "root", CORENLP, Language.FRENCH);
    assertThat(indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(namedEntity), childDoc)).isTrue();

    Document indexedChild = indexer.get(TEST_INDEX, childDoc.getId(), parentDoc.getId());
    assertThat(indexedChild.getNerTags()).containsOnly(CORENLP);
    assertThat(indexedChild.getStatus()).isEqualTo(Document.Status.DONE);

    // The stored entity must be retrievable through the child's root document routing.
    NamedEntity storedEntity = indexer.get(TEST_INDEX, namedEntity.getId(), indexedChild.getRootDocument());
    assertThat(storedEntity).isNotNull();
    assertThat(storedEntity.getRootDocument()).isEqualTo(indexedChild.getRootDocument());
}
Usage example of org.icij.datashare.text.Document from the datashare project by ICIJ: class DatabaseSpewer, method writeDocument.
/**
 * Persists an extracted Tika document into the repository as a datashare {@link Document}.
 * The charset, content type and content length are parsed from the Tika metadata headers
 * (with sensible defaults when a header is absent); the language is guessed from the content.
 *
 * @param tikaDocument the extracted document to store
 * @param parent       the direct parent document, or null for a root document
 * @param root         the root of the embedding tree, or null for a root document
 * @param level        embedding depth of this document (0 for a root)
 * @throws IOException if reading the document content fails
 */
@Override
protected void writeDocument(TikaDocument tikaDocument, TikaDocument parent, TikaDocument root, int level) throws IOException {
    String content = toString(tikaDocument.getReader()).trim();
    String parentId = parent == null ? null : parent.getId();
    String rootId = root == null ? null : root.getId();
    Document document = new Document(project, tikaDocument.getId(), tikaDocument.getPath(), content,
            languageGuesser.guess(content), charsetOf(tikaDocument), contentTypeOf(tikaDocument),
            getMetadata(tikaDocument), Document.Status.INDEXED, new HashSet<>(), new Date(),
            parentId, rootId, (short) level, contentLengthOf(tikaDocument));
    repository.create(document);
}

// Charset declared in the CONTENT_ENCODING metadata header; defaults to UTF-8 when absent.
private static Charset charsetOf(TikaDocument tikaDocument) {
    return Charset.forName(ofNullable(tikaDocument.getMetadata().get(CONTENT_ENCODING)).orElse("utf-8"));
}

// MIME type from the CONTENT_TYPE header with any charset parameter stripped; DEFAULT_VALUE_UNKNOWN when absent.
private static String contentTypeOf(TikaDocument tikaDocument) {
    return ofNullable(tikaDocument.getMetadata().get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN).split(";")[0];
}

// Content length from the CONTENT_LENGTH header; -1 signals an unknown length.
private static Long contentLengthOf(TikaDocument tikaDocument) {
    return valueOf(ofNullable(tikaDocument.getMetadata().get(CONTENT_LENGTH)).orElse("-1"));
}
Usage example of org.icij.datashare.text.Document from the datashare project by ICIJ: class BatchSearchRunnerTest, method test_cancel_current_batch_search.
@Test
public void test_cancel_current_batch_search() throws Exception {
    // Latch released by the runner once it is actually executing, so cancel() hits a live search.
    CountDownLatch startSignal = new CountDownLatch(1);
    mockSearch.willReturn(1, new Document[] { createDoc("doc").build() });
    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer, startSignal);

    executor.submit(runner);
    executor.shutdown();
    startSignal.await();
    runner.cancel();

    // Cancelling must let the executor drain promptly.
    assertThat(executor.awaitTermination(2, TimeUnit.SECONDS)).isTrue();
}
Usage example of org.icij.datashare.text.Document from the datashare project by ICIJ: class BatchSearchRunnerTest, method test_run_batch_search_failure.
@Test(expected = RuntimeException.class)
public void test_run_batch_search_failure() throws Exception {
    // The index returns one hit, but consuming any result blows up: call() must propagate it.
    mockSearch.willReturn(1, new Document[] { createDoc("doc").build() });
    when(resultConsumer.apply(anyString(), any(), anyList())).thenThrow(new RuntimeException());
    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name1", "desc1",
            asSet("query1", "query2"), new Date(), BatchSearch.State.QUEUED, local());

    new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer).call();
}
Usage example of org.icij.datashare.text.Document from the datashare project by ICIJ: class BatchSearchRunnerTest, method test_run_batch_search_truncate_to_60k_max_results.
@Test
public void test_run_batch_search_truncate_to_60k_max_results() throws Exception {
    // One full scroll page, served enough times to push past the 60k result cap.
    Document[] scrollPage = IntStream.range(0, MAX_SCROLL_SIZE)
            .mapToObj(i -> createDoc("doc" + i).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(MAX_BATCH_RESULT_SIZE / MAX_SCROLL_SIZE + 1, scrollPage);
    BatchSearch search = new BatchSearch("uuid1", project("test-datashare"), "name", "desc",
            asSet("query"), new Date(), BatchSearch.State.QUEUED, local());

    // The runner must truncate its result count below the 60k ceiling.
    assertThat(new BatchSearchRunner(indexer, new PropertiesProvider(), search, resultConsumer).call()).isLessThan(60000);
}
Aggregations