use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class EmailPipelineTest method test_adds_document_headers_parsing_for_email.
/**
 * Processes a "message/rfc822" document whose content is an email address and
 * whose metadata carries "To" and "Cc" message headers, then checks that the
 * pipeline returns exactly three EMAIL entities: the one from the content
 * (offset 0) followed by the two header addresses (offset -1).
 */
@Test
public void test_adds_document_headers_parsing_for_email() {
    // Message headers handed to the document builder as metadata.
    HashMap<String, Object> headers = new HashMap<>();
    headers.put(tikaMsgHeader("To"), "email1@domain.com");
    headers.put(tikaMsgHeader("Cc"), "email2@domain.com");

    Document doc = createDoc("docid")
            .with("hello@world.com")
            .ofMimeType("message/rfc822")
            .with(headers)
            .build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    NamedEntity fromContent = NamedEntity.create(EMAIL, "hello@world.com", asList(0L), "docid", "root", Pipeline.Type.EMAIL, FRENCH);
    NamedEntity fromCcHeader = NamedEntity.create(EMAIL, "email2@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH);
    NamedEntity fromToHeader = NamedEntity.create(EMAIL, "email1@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH);
    // containsExactly is order-sensitive: content first, then Cc, then To.
    assertThat(namedEntities).containsExactly(fromContent, fromCcHeader, fromToHeader);
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class ElasticsearchIndexerTest method test_query_like_js_front_finds_document_from_its_child_named_entity.
/**
 * Indexes a document together with one PERSON named entity attached to it,
 * then searches documents for "john" with the "content" field excluded from
 * the source. Expects a single hit: the indexed document, with empty content.
 */
@Test
public void test_query_like_js_front_finds_document_from_its_child_named_entity() throws Exception {
    Document doc = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "content with john doe",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 34L);
    indexer.add(TEST_INDEX, doc);

    NamedEntity johnDoe = create(PERSON, "John Doe", asList(12L), doc.getId(), "root", CORENLP, Language.FRENCH);
    indexer.bulkAdd(TEST_INDEX, CORENLP, singletonList(johnDoe), doc);

    Object[] hits = indexer.search(TEST_INDEX, Document.class)
            .withoutSource("content")
            .with("john")
            .execute()
            .toArray();

    assertThat(hits.length).isEqualTo(1);
    // Cast once, after the size check, so a wrong hit count fails on the assertion above.
    Document found = (Document) hits[0];
    assertThat(found.getId()).isEqualTo("id");
    assertThat(found.getContent()).isEmpty();
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class CorenlpPipeline method processNerClassifier.
/**
 * Named Entity Classifier (Conditional Random Fields) only, applied to one
 * chunk of the document content.
 *
 * @param doc the document
 * @param contentLength maximum number of characters of content to classify
 * @param contentOffset index in the document content at which the chunk starts
 * @return the named entities built from the classifier output; each mention
 *         offset is the offset inside the chunk plus {@code contentOffset}
 * @throws InterruptedException declared by the original signature
 *         (NOTE(review): presumably raised while obtaining the model - confirm)
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);

    // Recognize named entities from input
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());

    // Sum in long: contentOffset + contentLength may exceed Integer.MAX_VALUE,
    // which would wrap to a negative end index and make substring() throw.
    int chunkEnd = (int) Math.min((long) contentOffset + contentLength, doc.getContentTextLength());
    String chunk = doc.getContent().substring(contentOffset, chunkEnd);
    List<Triple<String, Integer, Integer>> items = nerAnnotator.annotator.classifyToCharacterOffsets(chunk);

    // For each recognized named entity
    for (Triple<String, Integer, Integer> item : items) {
        // Triple: <category, begin, end>
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // Offsets from the classifier are relative to the chunk; shift them back to the whole content.
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class ElasticsearchIndexerTest method test_bulk_update.
/**
 * Indexes a document with two named entities, hides both entities, sends them
 * through bulkUpdate and checks that the two entities found by a search are
 * reported as hidden.
 */
@Test
public void test_bulk_update() throws IOException {
    Document doc = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 34L);
    indexer.add(TEST_INDEX, doc);

    NamedEntity ne1 = create(PERSON, "John Doe", asList(12L), doc.getId(), "root", CORENLP, Language.FRENCH);
    NamedEntity ne2 = create(ORGANIZATION, "AAA", asList(123L), doc.getId(), "root", CORENLP, Language.FRENCH);
    List<NamedEntity> entities = asList(ne1, ne2);
    indexer.bulkAdd(TEST_INDEX, CORENLP, entities, doc);

    // Hide after the initial indexing so that only bulkUpdate can persist the flag.
    for (NamedEntity entity : entities) {
        entity.hide();
    }
    assertThat(indexer.bulkUpdate(TEST_INDEX, entities)).isTrue();

    Object[] found = indexer.search(TEST_INDEX, NamedEntity.class).execute().toArray();
    assertThat(found.length).isEqualTo(2);
    for (Object hit : found) {
        assertThat(((NamedEntity) hit).isHidden()).isTrue();
    }
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class ElasticsearchIndexerTest method test_bulk_add_named_entities.
/**
 * Indexes a document, bulk-adds two named entities for it and checks that:
 * bulkAdd returns true, the stored document has status DONE and only the
 * CORENLP tag, and both entities can be fetched by id with the document id
 * as third argument.
 */
@Test
public void test_bulk_add_named_entities() throws IOException {
    Document doc = new org.icij.datashare.text.Document(
            "id", project("prj"), Paths.get("doc.txt"), "content",
            Language.FRENCH, Charset.defaultCharset(), "application/pdf",
            new HashMap<>(), INDEXED, new HashSet<>(), 4324L);
    indexer.add(TEST_INDEX, doc);

    // NOTE(review): the entities are created with "doc.txt" as document id while the
    // document id is "id" - looks unintentional, confirm before changing.
    NamedEntity person = create(PERSON, "John Doe", asList(12L), "doc.txt", "root", CORENLP, Language.FRENCH);
    NamedEntity organization = create(ORGANIZATION, "AAA", asList(123L), "doc.txt", "root", CORENLP, Language.FRENCH);

    boolean added = indexer.bulkAdd(TEST_INDEX, CORENLP, asList(person, organization), doc);
    assertThat(added).isTrue();

    Document stored = (Document) indexer.get(TEST_INDEX, doc.getId());
    assertThat(stored.getStatus()).isEqualTo(Document.Status.DONE);
    assertThat(stored.getNerTags()).containsOnly(CORENLP);

    assertThat((NamedEntity) indexer.get(TEST_INDEX, person.getId(), doc.getId())).isNotNull();
    assertThat((NamedEntity) indexer.get(TEST_INDEX, organization.getId(), doc.getId())).isNotNull();
}
Aggregations