Search in sources:

Example 11 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class EmailPipelineTest method test_adds_document_headers_parsing_for_email.

/**
 * Checks that processing a {@code message/rfc822} document yields EMAIL named entities
 * for the address passed to the document builder as well as for the addresses found in
 * the "To" and "Cc" Tika message headers.
 */
@Test
public void test_adds_document_headers_parsing_for_email() {
    // A plain HashMap instead of double-brace initialization: the latter creates an
    // anonymous HashMap subclass that keeps a reference to the enclosing test instance.
    HashMap<String, Object> headers = new HashMap<>();
    headers.put(tikaMsgHeader("To"), "email1@domain.com");
    headers.put(tikaMsgHeader("Cc"), "email2@domain.com");
    Document doc = createDoc("docid")
            .with("hello@world.com")
            .ofMimeType("message/rfc822")
            .with(headers)
            .build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // containsExactly also verifies the order; the header addresses are expected with offset -1.
    assertThat(namedEntities).containsExactly(
            NamedEntity.create(EMAIL, "hello@world.com", asList(0L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email2@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email1@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) HashMap(java.util.HashMap) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 12 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_query_like_js_front_finds_document_from_its_child_named_entity.

/**
 * Indexes a document together with one PERSON named entity ("John Doe") attached to it,
 * then searches documents for "john" while excluding the "content" source field.
 * Exactly one document must come back, with the expected id and an empty content.
 */
@Test
public void test_query_like_js_front_finds_document_from_its_child_named_entity() throws Exception {
    // Given: an indexed document and one named entity bulk-added for it
    Document document = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content with john doe", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, new HashSet<>(), 34L);
    indexer.add(TEST_INDEX, document);
    NamedEntity johnDoe = create(PERSON, "John Doe", asList(12L), document.getId(), "root", CORENLP, Language.FRENCH);
    indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(johnDoe), document);

    // When: searching documents without fetching their content
    Object[] hits = indexer.search(TEST_INDEX, Document.class)
            .withoutSource("content")
            .with("john")
            .execute()
            .toArray();

    // Then: the single hit is the indexed document, returned without its content
    assertThat(hits.length).isEqualTo(1);
    Document found = (Document) hits[0];
    assertThat(found.getId()).isEqualTo("id");
    assertThat(found.getContent()).isEmpty();
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 13 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class CorenlpPipeline method processNerClassifier.

/**
 * Named Entity Classifier (Conditional Random Fields) only.
 *
 * <p>Classifies the slice of the document content that starts at {@code contentOffset} and
 * spans at most {@code contentLength} characters, and collects every recognized mention
 * into a list of named entities. Mention offsets are shifted by {@code contentOffset}
 * before being added to the builder.
 *
 * @param doc the document
 * @param contentLength maximum number of characters to take from the content
 * @param contentOffset index in the document content at which the processed chunk starts
 * @return the named entities built from the classifier output for this chunk
 * @throws InterruptedException declared by this method; presumably raised while obtaining
 *         the language model (to be confirmed against {@code CoreNlpNerModels})
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);
    // Recognize named entities from input
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> abstractSequenceClassifierCoreNlpAnnotator;
    abstractSequenceClassifierCoreNlpAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());
    // Compute the end index with long arithmetic: contentOffset + contentLength may exceed
    // Integer.MAX_VALUE, and an overflowed (negative) end index would make substring throw.
    int chunkEnd = (int) Math.min((long) contentOffset + contentLength, doc.getContentTextLength());
    String chunk = doc.getContent().substring(contentOffset, chunkEnd);
    List<Triple<String, Integer, Integer>> items = abstractSequenceClassifierCoreNlpAnnotator.annotator.classifyToCharacterOffsets(chunk);
    // For each recognized named entity
    for (Triple<String, Integer, Integer> item : items) {
        // Triple: <category, begin, end>
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // begin is relative to the chunk; shift it by the chunk's starting offset
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
Also used : Triple(edu.stanford.nlp.util.Triple) AbstractSequenceClassifier(edu.stanford.nlp.ie.AbstractSequenceClassifier) NamedEntity(org.icij.datashare.text.NamedEntity) Annotations(org.icij.datashare.text.nlp.Annotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder)

Example 14 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_update.

/**
 * Indexes a document with two named entities, hides both entities and pushes them back
 * with a bulk update. The update must report success, and both entities returned by a
 * subsequent search must be hidden.
 */
@Test
public void test_bulk_update() throws IOException {
    // Given: a document with a PERSON and an ORGANIZATION entity already indexed
    Document document = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, new HashSet<>(), 34L);
    indexer.add(TEST_INDEX, document);
    NamedEntity person = create(PERSON, "John Doe", asList(12L), document.getId(), "root", CORENLP, Language.FRENCH);
    NamedEntity organization = create(ORGANIZATION, "AAA", asList(123L), document.getId(), "root", CORENLP, Language.FRENCH);
    indexer.bulkAdd(TEST_INDEX, CORENLP, asList(person, organization), document);

    // When: both entities are hidden locally and sent through bulkUpdate
    person.hide();
    organization.hide();
    boolean updated = indexer.bulkUpdate(TEST_INDEX, asList(person, organization));

    // Then: the update succeeded and every indexed entity is now hidden
    assertThat(updated).isTrue();
    Object[] hits = indexer.search(TEST_INDEX, NamedEntity.class).execute().toArray();
    assertThat(hits.length).isEqualTo(2);
    for (Object hit : hits) {
        assertThat(((NamedEntity) hit).isHidden()).isTrue();
    }
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 15 with NamedEntity

use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_named_entities.

/**
 * Bulk-adds two named entities for an indexed document and checks that the call reports
 * success, that the stored document has status DONE and carries only the CORENLP NER tag,
 * and that both entities can be fetched back by id.
 */
@Test
public void test_bulk_add_named_entities() throws IOException {
    // Given: an indexed document
    Document document = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, new HashSet<>(), 4324L);
    indexer.add(TEST_INDEX, document);
    // NOTE(review): the entities are created with "doc.txt" as document id while the document
    // id is "id"; other tests pass doc.getId() here — confirm whether this is intentional.
    NamedEntity person = create(PERSON, "John Doe", asList(12L), "doc.txt", "root", CORENLP, Language.FRENCH);
    NamedEntity organization = create(ORGANIZATION, "AAA", asList(123L), "doc.txt", "root", CORENLP, Language.FRENCH);

    // When: both entities are added in bulk for that document
    boolean added = indexer.bulkAdd(TEST_INDEX, CORENLP, asList(person, organization), document);

    // Then: the call succeeded and the stored document reflects the NER run
    assertThat(added).isTrue();
    Document withStatus = (Document) indexer.get(TEST_INDEX, document.getId());
    assertThat(withStatus.getStatus()).isEqualTo(Document.Status.DONE);
    Document withTags = (Document) indexer.get(TEST_INDEX, document.getId());
    assertThat(withTags.getNerTags()).containsOnly(CORENLP);

    // And: each entity is retrievable by its own id together with the document id
    NamedEntity storedPerson = (NamedEntity) indexer.get(TEST_INDEX, person.getId(), document.getId());
    assertThat(storedPerson).isNotNull();
    NamedEntity storedOrganization = (NamedEntity) indexer.get(TEST_INDEX, organization.getId(), document.getId());
    assertThat(storedOrganization).isNotNull();
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Aggregations

NamedEntity (org.icij.datashare.text.NamedEntity)20 Test (org.junit.Test)16 Document (org.icij.datashare.text.Document)11 Arrays.asList (java.util.Arrays.asList)2 HashMap (java.util.HashMap)2 PropertiesProvider (org.icij.datashare.PropertiesProvider)2 Language (org.icij.datashare.text.Language)2 NamedEntitiesBuilder (org.icij.datashare.text.NamedEntitiesBuilder)2 AbstractPipeline (org.icij.datashare.text.nlp.AbstractPipeline)2 Annotations (org.icij.datashare.text.nlp.Annotations)2 AbstractProdWebServerTest (org.icij.datashare.web.testhelpers.AbstractProdWebServerTest)2 Inject (com.google.inject.Inject)1 AbstractSequenceClassifier (edu.stanford.nlp.ie.AbstractSequenceClassifier)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 Triple (edu.stanford.nlp.util.Triple)1 IOException (java.io.IOException)1 Charset (java.nio.charset.Charset)1 Path (java.nio.file.Path)1 java.util (java.util)1 Collection (java.util.Collection)1