Example usage of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class DatashareExtractIntegrationTest, method test_spew_and_read_embedded_doc.
/**
 * Extracts the {@code /docs/embedded_doc.eml} fixture, writes the result through the
 * spewer, then reads the first embedded document back from the index and checks its
 * identity, its link to the root document and its creation date.
 *
 * @throws Exception if the fixture cannot be resolved, extracted, written or read back
 */
@Test
public void test_spew_and_read_embedded_doc() throws Exception {
    // Resolve the fixture through its URI rather than URL#getPath(): getPath() keeps
    // percent-escapes (e.g. "%20"), which yields a non-existent path as soon as the
    // checkout directory contains a space or another escaped character.
    Path path = get(getClass().getResource("/docs/embedded_doc.eml").toURI());
    TikaDocument tikaDocument = createExtractor().extract(path);
    spewer.write(tikaDocument);

    // The embedded document is looked up by its own id; the root document id is passed
    // as the third argument (presumably the routing key -- confirm against the indexer API).
    Document doc = indexer.get(TEST_INDEX, tikaDocument.getEmbeds().get(0).getId(), tikaDocument.getId());

    assertThat(doc).isNotNull();
    assertThat(doc.getId()).isNotEqualTo(doc.getRootDocument());
    assertThat(doc.getRootDocument()).isEqualTo(tikaDocument.getId());
    assertThat(doc.getCreationDate()).isNotNull();
    // NOTE(review): SimpleDateFormat formats in the JVM default time zone, so the expected
    // "23:22:36" looks environment-dependent -- confirm which zone the fixture assumes.
    assertThat(new SimpleDateFormat("HH:mm:ss").format(doc.getCreationDate())).isEqualTo("23:22:36");
}
Example usage of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_search_query_with_operator_and_phrase_match.
/**
 * Indexes one document whose content is "content with john doe" and runs the query
 * {@code john AND doe} twice, toggling only the last argument of {@code with(...)}:
 * with {@code true} no hit is expected, with {@code false} exactly one.
 */
@Test
public void test_search_query_with_operator_and_phrase_match() throws Exception {
    final String query = "john AND doe";
    Document indexed = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "content with john doe",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 34L);
    indexer.add(TEST_INDEX, indexed);

    // Last argument true: presumably the phrase-match flag, under which the operator query finds nothing.
    Object[] hitsWithFlag = indexer.search(TEST_INDEX, Document.class).with(query, 0, true).execute().toArray();
    assertThat(hitsWithFlag).isEmpty();

    // Last argument false: the same query now matches the indexed document.
    Object[] hitsWithoutFlag = indexer.search(TEST_INDEX, Document.class).with(query, 0, false).execute().toArray();
    assertThat(hitsWithoutFlag).hasSize(1);
}
Example usage of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_search_with_and_without_NLP_tags_no_tags.
/**
 * Indexes one document built with an empty pipeline set and checks that a search
 * using {@code without()} with no arguments returns exactly that one document.
 */
@Test
public void test_search_with_and_without_NLP_tags_no_tags() throws IOException {
    Document untagged = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 345L);
    indexer.add(TEST_INDEX, untagged);

    long matches = indexer.search(TEST_INDEX, Document.class).without().execute().count();

    assertThat((int) matches).isEqualTo(1);
}
Example usage of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_search_source_false.
/**
 * Indexes one document, searches with {@code withSource(false)} and checks that the
 * first returned hit still exposes a non-null id.
 */
@Test
public void test_search_source_false() throws IOException {
    Document indexed = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc_with_parent.txt"), "content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 222L);
    indexer.add(TEST_INDEX, indexed);

    // Only the first hit is inspected; the cast mirrors the untyped result of the search.
    Document firstHit = (Document) indexer.search(TEST_INDEX, Document.class)
            .withSource(false)
            .execute()
            .collect(toList())
            .get(0);

    assertThat(firstHit.getId()).isNotNull();
}
Example usage of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_execute_raw_search.
/**
 * Indexes one document with content "my content" and a pipeline set holding
 * {@code OPENNLP}, then issues two raw {@code POST <index>/_search} requests:
 * a {@code match_all} query whose response must contain the content, and a
 * {@code match} query on "foo" whose response must not.
 *
 * @throws Exception if indexing or one of the raw requests fails
 */
@Test
public void test_execute_raw_search() throws Exception {
    // Populate a plain HashSet instead of using double-brace initialization, which
    // would create an anonymous HashSet subclass capturing this test instance.
    HashSet<Pipeline.Type> pipelines = new HashSet<>();
    pipelines.add(OPENNLP);

    Document doc = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "my content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, pipelines, 432L);
    indexer.add(TEST_INDEX, doc);

    assertThat(indexer.executeRaw("POST", TEST_INDEX + "/_search", "{\"query\":{\"match_all\":{}}}")).contains("my content");
    assertThat(indexer.executeRaw("POST", TEST_INDEX + "/_search", "{\"query\":{\"match\":{\"content\":\"foo\"}}}")).doesNotContain("my content");
}
Aggregations