Use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
From the class ElasticsearchIndexerTest, method test_bulk_add_for_embedded_doc.
/**
 * Bulk-adds one named entity for a child document and verifies the outcome.
 *
 * <p>After the bulk add, the child document read back from the index must be tagged
 * with the CORENLP pipeline only, have the status DONE, and the named entity must be
 * retrievable using the child's root document value, which it must also carry.
 *
 * @throws IOException declared for the indexer calls made by this test
 */
@Test
public void test_bulk_add_for_embedded_doc() throws IOException {
    // Arrange: index the e-mail document and the document attached to it.
    Document rootMail = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("mail.eml"), "content", Language.FRENCH,
            Charset.defaultCharset(), "message/rfc822", new HashMap<>(), INDEXED, new HashSet<>(), 321L);
    Document embedded = new Document(
            project("prj"), "childId", Paths.get("mail.eml"), "mail body", FRENCH,
            Charset.defaultCharset(), "text/plain", new HashMap<>(), Document.Status.INDEXED,
            new HashSet<>(), new Date(), "id", "id", (short) 1, 123L);
    indexer.add(TEST_INDEX, rootMail);
    indexer.add(TEST_INDEX, embedded);
    NamedEntity entity = create(PERSON, "Jane Daffodil", asList(12L), rootMail.getId(), "root", CORENLP, Language.FRENCH);

    // Act: attach the entity to the child document in a single bulk request.
    boolean added = indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(entity), embedded);

    // Assert: the child document was updated ...
    assertThat(added).isTrue();
    Document storedChild = indexer.get(TEST_INDEX, embedded.getId(), rootMail.getId());
    assertThat(storedChild.getNerTags()).containsOnly(CORENLP);
    assertThat(storedChild.getStatus()).isEqualTo(Document.Status.DONE);

    // ... and the entity can be fetched with the child's root document value.
    NamedEntity storedEntity = indexer.get(TEST_INDEX, entity.getId(), storedChild.getRootDocument());
    assertThat(storedEntity).isNotNull();
    assertThat(storedEntity.getRootDocument()).isEqualTo(storedChild.getRootDocument());
}
Use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
From the class NlpAppTest, method runNlpApp.
/**
 * Creates an {@code NlpApp} wired to the test fixtures, submits it to the executor and
 * waits at most two seconds for the app's callback to count the latch down.
 *
 * <p>The mocked pipeline is stubbed to return an empty entity list, optionally after
 * sleeping, so that tests can simulate a slow NLP step.
 *
 * @param parallelism value stored under {@code NLP_PARALLELISM_OPT} in the app properties
 * @param nlpProcessDelayMillis sleep applied inside each stubbed {@code pipeline.process}
 *     call; no sleep is performed when the value is zero or negative
 * @return the app instance that was handed to the executor
 * @throws InterruptedException if the calling thread is interrupted while waiting on the latch
 */
private NlpApp runNlpApp(String parallelism, int nlpProcessDelayMillis) throws InterruptedException {
    Properties nlpProperties = new Properties();
    nlpProperties.setProperty(NLP_PARALLELISM_OPT, parallelism);
    nlpProperties.setProperty("messageBusAddress", "redis");

    // Stub the pipeline: optional artificial delay, then no entities.
    when(pipeline.process(any())).thenAnswer((Answer<List<NamedEntity>>) invocation -> {
        if (nlpProcessDelayMillis > 0) {
            Thread.sleep(nlpProcessDelayMillis);
        }
        return emptyList();
    });

    CountDownLatch callbackLatch = new CountDownLatch(1);
    NlpApp app = new NlpApp(dataBus, indexer, pipeline, nlpProperties, callbackLatch::countDown, 1, true, local());
    executor.execute(app);

    // The boolean result of await is deliberately not checked: on timeout the app is
    // returned anyway, exactly as if the callback had fired.
    callbackLatch.await(2, SECONDS);
    return app;
}
Use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
From the class EmailPipelineTest, method test_emails_chunked_content.
/**
 * Processes a three-line content with the extra arguments {@code 20} and {@code 72} and
 * expects only the last address, {@code baz@qux.fr}, to be reported at offset 74.
 */
@Test
public void test_emails_chunked_content() {
    // Line lengths are 40 and 30 characters (newline included), so "baz@qux.fr"
    // starts at 40 + 30 + 4 = 74 in the full content.
    String text = "this is a content with email@domain.com\n"
            + "and another one : foo@bar.com\n"
            + "and baz@qux.fr";
    Document document = createDocument(text, "docId", Language.ENGLISH);

    // NOTE(review): 20 and 72 presumably select a chunk (length and start offset) of
    // the content; confirm against the pipeline's process(Document, int, int) signature.
    List<NamedEntity> found = pipeline.process(document, 20, 72);

    assertThat(found).hasSize(1);
    NamedEntity onlyEntity = found.get(0);
    assertThat(onlyEntity.getMention()).isEqualTo("baz@qux.fr");
    assertThat(onlyEntity.getOffsets()).containsExactly(74L);
}
Use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
From the class EmailPipelineTest, method test_filter_headers_that_contains_mail_addresses.
/**
 * Processes a {@code message/rfc822} document whose metadata holds e-mail addresses and
 * checks exactly which ones come back as EMAIL named entities, and in which order.
 *
 * <p>Eighteen metadata entries are supplied; the values stored under the raw headers
 * {@code field} and {@code Message-ID} are not part of the expected result. Every
 * expected entity carries the single offset {@code -1}.
 */
@Test
public void test_filter_headers_that_contains_mail_addresses() {
    HashMap<String, Object> metadata = new HashMap<>();
    metadata.put(tikaRawHeader("field"), "email@domain.com");
    metadata.put(tikaRawHeader("Message-ID"), "id@domain.com");
    metadata.put(tikaRawHeader("Return-Path"), "return@head.er");
    metadata.put(tikaMsgHeader("To"), "to@head.er");
    metadata.put(tikaMsgHeader("From"), "from@head.er");
    metadata.put(tikaMsgHeader("Cc"), "cc@head.er");
    metadata.put(tikaMsgHeader("Bcc"), "bcc@head.er");
    metadata.put(tika("Dc-Title"), "subject@head.er");
    metadata.put(tikaRawHeader("Reply-To"), "replyto@head.er");
    metadata.put(tikaRawHeader("Followup-To"), "followup@head.er");
    metadata.put(tikaRawHeader("Alternate-Recipient"), "alternate@head.er");
    metadata.put(tikaRawHeader("For-Handling"), "forhandling@head.er");
    metadata.put(tikaRawHeader("Resent-Reply-To"), "resent-replyto@head.er");
    metadata.put(tikaRawHeader("Resent-Sender"), "resent-sender@head.er");
    metadata.put(tikaRawHeader("Resent-From"), "resent-from@head.er");
    metadata.put(tikaRawHeader("Resent-To"), "resent-to@head.er");
    metadata.put(tikaRawHeader("Resent-cc"), "resent-cc@head.er");
    metadata.put(tikaRawHeader("Resent-bcc"), "resent-bcc@head.er");

    Document doc = createDoc("docid")
            .with("mail content")
            .ofMimeType("message/rfc822")
            .with(metadata)
            .build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // Expected mentions, in the exact order the assertion requires.
    String[] expectedMentions = {
        "replyto@head.er",
        "alternate@head.er",
        "resent-sender@head.er",
        "cc@head.er",
        "from@head.er",
        "resent-cc@head.er",
        "forhandling@head.er",
        "resent-replyto@head.er",
        "return@head.er",
        "resent-to@head.er",
        "followup@head.er",
        "resent-bcc@head.er",
        "subject@head.er",
        "bcc@head.er",
        "to@head.er",
        "resent-from@head.er"
    };
    NamedEntity[] expectedEntities = new NamedEntity[expectedMentions.length];
    for (int i = 0; i < expectedMentions.length; i++) {
        expectedEntities[i] = NamedEntity.create(
                EMAIL, expectedMentions[i], asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH);
    }

    assertThat(namedEntities).containsExactly(expectedEntities);
}
Use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
From the class EmailPipelineTest, method test_one_email_twice.
/**
 * Processes a content in which {@code email@domain.com} appears twice and expects a
 * single named entity for that mention carrying both offsets, 23 and 70.
 */
@Test
public void test_one_email_twice() {
    // First occurrence follows the 23-character prefix "this is a content with ";
    // the second starts the third line, after 40 + 30 characters.
    String content = "this is a content with email@domain.com\n"
            + "that is twice in the document\n"
            + "email@domain.com";
    Document document = createDocument(content, "docId", Language.ENGLISH);

    List<NamedEntity> found = pipeline.process(document);

    assertThat(found).hasSize(1);
    NamedEntity email = found.get(0);
    assertThat(email.getOffsets()).containsExactly(23L, 70L);
    assertThat(email.getMention()).isEqualTo("email@domain.com");
}
Aggregations