Search in sources :

Example 6 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_for_embedded_doc.

/**
 * Indexes a parent e-mail and an embedded child document, bulk-adds one named entity for the
 * child, then checks that the re-fetched child carries only the CORENLP tag with status DONE and
 * that the entity can be fetched by id using the child's root document.
 */
@Test
public void test_bulk_add_for_embedded_doc() throws IOException {
    Document rootMail = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("mail.eml"), "content", Language.FRENCH,
            Charset.defaultCharset(), "message/rfc822", new HashMap<>(), INDEXED, new HashSet<>(), 321L);
    // NOTE(review): the two trailing "id" arguments presumably point at the parent/root document
    // built above - confirm against the Document constructor.
    Document embedded = new Document(
            project("prj"), "childId", Paths.get("mail.eml"), "mail body", FRENCH,
            Charset.defaultCharset(), "text/plain", new HashMap<>(), Document.Status.INDEXED, new HashSet<>(),
            new Date(), "id", "id", (short) 1, 123L);
    indexer.add(TEST_INDEX, rootMail);
    indexer.add(TEST_INDEX, embedded);

    NamedEntity person = create(PERSON, "Jane Daffodil", asList(12L), rootMail.getId(), "root", CORENLP, Language.FRENCH);
    assertThat(indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(person), embedded)).isTrue();

    // Re-read the child (the parent id is passed as third argument) and check what bulkAdd changed.
    Document reloaded = indexer.get(TEST_INDEX, embedded.getId(), rootMail.getId());
    assertThat(reloaded.getNerTags()).containsOnly(CORENLP);
    assertThat(reloaded.getStatus()).isEqualTo(Document.Status.DONE);

    // The entity is looked up with the child's root document and must report that same root.
    NamedEntity stored = indexer.get(TEST_INDEX, person.getId(), reloaded.getRootDocument());
    assertThat(stored).isNotNull();
    assertThat(stored.getRootDocument()).isEqualTo(reloaded.getRootDocument());
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 7 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class NlpAppTest method runNlpApp.

/**
 * Creates an {@code NlpApp} wired to the test's data bus, indexer and stubbed pipeline, hands it
 * to the executor and waits up to two seconds on a latch that the app is given as a callback.
 *
 * @param parallelism           value stored under {@code NLP_PARALLELISM_OPT} in the app properties
 * @param nlpProcessDelayMillis when positive, the stubbed {@code pipeline.process} sleeps this many
 *                              milliseconds before answering
 * @return the app that was submitted to the executor
 * @throws InterruptedException if the calling thread is interrupted while waiting on the latch
 */
private NlpApp runNlpApp(String parallelism, int nlpProcessDelayMillis) throws InterruptedException {
    Properties nlpProps = new Properties();
    nlpProps.setProperty(NLP_PARALLELISM_OPT, parallelism);
    nlpProps.setProperty("messageBusAddress", "redis");

    // Stubbed pipeline answer: optionally sleep, then report no named entities.
    Answer<List<NamedEntity>> delayedEmptyAnswer = invocation -> {
        if (nlpProcessDelayMillis > 0) {
            Thread.sleep(nlpProcessDelayMillis);
        }
        return emptyList();
    };
    when(pipeline.process(any())).thenAnswer(delayedEmptyAnswer);

    CountDownLatch ready = new CountDownLatch(1);
    NlpApp app = new NlpApp(dataBus, indexer, pipeline, nlpProps, ready::countDown, 1, true, local());
    executor.execute(app);

    // The boolean result of await is not inspected, so the app is returned even after a timeout.
    ready.await(2, SECONDS);
    return app;
}
Also used : IntStream(java.util.stream.IntStream) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) INIT_MONITORING(org.icij.datashare.com.Message.Type.INIT_MONITORING) RunWith(org.junit.runner.RunWith) Matchers.anyString(org.mockito.Matchers.anyString) Answer(org.mockito.stubbing.Answer) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) Arrays.asList(java.util.Arrays.asList) After(org.junit.After) Parameterized(org.junit.runners.Parameterized) ExecutorService(java.util.concurrent.ExecutorService) Before(org.junit.Before) NLP_PARALLELISM_OPT(org.icij.datashare.cli.DatashareCliOptions.NLP_PARALLELISM_OPT) Field(org.icij.datashare.com.Message.Field) AbstractPipeline(org.icij.datashare.text.nlp.AbstractPipeline) Properties(java.util.Properties) PropertiesProvider(org.icij.datashare.PropertiesProvider) Collections.emptyList(java.util.Collections.emptyList) EXTRACT_NLP(org.icij.datashare.com.Message.Type.EXTRACT_NLP) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) org.icij.datashare.com(org.icij.datashare.com) Collection(java.util.Collection) Indexer(org.icij.datashare.text.indexing.Indexer) Test(org.junit.Test) OPENNLP(org.icij.datashare.text.nlp.Pipeline.Type.OPENNLP) Executors(java.util.concurrent.Executors) User.local(org.icij.datashare.user.User.local) Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) Mockito(org.mockito.Mockito) List(java.util.List) Language(org.icij.datashare.text.Language) SECONDS(java.util.concurrent.TimeUnit.SECONDS) NamedEntity(org.icij.datashare.text.NamedEntity) Arrays.asList(java.util.Arrays.asList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) Properties(java.util.Properties) CountDownLatch(java.util.concurrent.CountDownLatch)

Example 8 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class EmailPipelineTest method test_emails_chunked_content.

/**
 * Processes a three-line text containing three e-mail addresses with the extra arguments
 * {@code 20} and {@code 72}, and expects a single entity: {@code baz@qux.fr} at offset 74.
 */
@Test
public void test_emails_chunked_content() {
    String text = "this is a content with email@domain.com\n"
            + "and another one : foo@bar.com\n"
            + "and baz@qux.fr";
    Document document = createDocument(text, "docId", Language.ENGLISH);

    // NOTE(review): 20 and 72 presumably describe the chunk (size / start) - confirm against process().
    List<NamedEntity> found = pipeline.process(document, 20, 72);

    assertThat(found).hasSize(1);
    NamedEntity onlyEntity = found.get(0);
    assertThat(onlyEntity.getMention()).isEqualTo("baz@qux.fr");
    assertThat(onlyEntity.getOffsets()).containsExactly(74L);
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 9 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class EmailPipelineTest method test_filter_headers_that_contains_mail_addresses.

/**
 * Builds a {@code message/rfc822} document whose metadata holds e-mail addresses under many
 * header keys and checks the entities returned by the pipeline. The values stored under the
 * "field" and "Message-ID" raw headers do not appear in the expected list; every other header
 * value is expected as an EMAIL entity with offset -1.
 */
@Test
public void test_filter_headers_that_contains_mail_addresses() {
    HashMap<String, Object> headers = new HashMap<>();
    headers.put(tikaRawHeader("field"), "email@domain.com");
    headers.put(tikaRawHeader("Message-ID"), "id@domain.com");
    headers.put(tikaRawHeader("Return-Path"), "return@head.er");
    headers.put(tikaMsgHeader("To"), "to@head.er");
    headers.put(tikaMsgHeader("From"), "from@head.er");
    headers.put(tikaMsgHeader("Cc"), "cc@head.er");
    headers.put(tikaMsgHeader("Bcc"), "bcc@head.er");
    headers.put(tika("Dc-Title"), "subject@head.er");
    headers.put(tikaRawHeader("Reply-To"), "replyto@head.er");
    headers.put(tikaRawHeader("Followup-To"), "followup@head.er");
    headers.put(tikaRawHeader("Alternate-Recipient"), "alternate@head.er");
    headers.put(tikaRawHeader("For-Handling"), "forhandling@head.er");
    headers.put(tikaRawHeader("Resent-Reply-To"), "resent-replyto@head.er");
    headers.put(tikaRawHeader("Resent-Sender"), "resent-sender@head.er");
    headers.put(tikaRawHeader("Resent-From"), "resent-from@head.er");
    headers.put(tikaRawHeader("Resent-To"), "resent-to@head.er");
    headers.put(tikaRawHeader("Resent-cc"), "resent-cc@head.er");
    headers.put(tikaRawHeader("Resent-bcc"), "resent-bcc@head.er");

    Document mail = createDoc("docid")
            .with("mail content")
            .ofMimeType("message/rfc822")
            .with(headers)
            .build();

    List<NamedEntity> extracted = pipeline.process(mail);

    // NOTE(review): containsExactly is order-sensitive in FEST, so this order presumably mirrors
    // the iteration order of the metadata map - confirm before reordering the entries.
    assertThat(extracted).containsExactly(
            NamedEntity.create(EMAIL, "replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "alternate@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-sender@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "forhandling@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "return@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "followup@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "subject@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) HashMap(java.util.HashMap) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 10 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class EmailPipelineTest method test_one_email_twice.

/**
 * Processes a text in which {@code email@domain.com} occurs twice and expects one entity for
 * that mention carrying both offsets, 23 and 70.
 */
@Test
public void test_one_email_twice() {
    String content = "this is a content with email@domain.com\n"
            + "that is twice in the document\n"
            + "email@domain.com";
    Document document = createDocument(content, "docId", Language.ENGLISH);

    List<NamedEntity> found = pipeline.process(document);

    // Both occurrences are reported through a single entity rather than two.
    assertThat(found).hasSize(1);
    NamedEntity email = found.get(0);
    assertThat(email.getOffsets()).containsExactly(23L, 70L);
    assertThat(email.getMention()).isEqualTo("email@domain.com");
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Test(org.junit.Test)

Aggregations

NamedEntity (org.icij.datashare.text.NamedEntity): 20, Test (org.junit.Test): 16, Document (org.icij.datashare.text.Document): 11, Arrays.asList (java.util.Arrays.asList): 2, HashMap (java.util.HashMap): 2, PropertiesProvider (org.icij.datashare.PropertiesProvider): 2, Language (org.icij.datashare.text.Language): 2, NamedEntitiesBuilder (org.icij.datashare.text.NamedEntitiesBuilder): 2, AbstractPipeline (org.icij.datashare.text.nlp.AbstractPipeline): 2, Annotations (org.icij.datashare.text.nlp.Annotations): 2, AbstractProdWebServerTest (org.icij.datashare.web.testhelpers.AbstractProdWebServerTest): 2, Inject (com.google.inject.Inject): 1, AbstractSequenceClassifier (edu.stanford.nlp.ie.AbstractSequenceClassifier): 1, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 1, Triple (edu.stanford.nlp.util.Triple): 1, IOException (java.io.IOException): 1, Charset (java.nio.charset.Charset): 1, Path (java.nio.file.Path): 1, java.util (java.util): 1, Collection (java.util.Collection): 1