Search in sources :

Example 1 with NamedEntitiesBuilder

use of org.icij.datashare.text.NamedEntitiesBuilder in project datashare by ICIJ.

In the class CorenlpPipeline, the method processNerClassifier:

/**
 * Named Entity Classifier (Conditional Random Fields) only, applied to one chunk of a document.
 *
 * <p>The chunk is the part of {@code doc.getContent()} starting at {@code contentOffset} and
 * ending at {@code contentOffset + contentLength}, capped at {@code doc.getContentTextLength()}.
 * The classifier reports offsets relative to the chunk; they are shifted by
 * {@code contentOffset} before being handed to the builder.
 *
 * @param doc the document
 * @param contentLength maximum number of characters of content to classify
 * @param contentOffset index in the document content at which the chunk starts
 * @return the named entities produced by the {@link NamedEntitiesBuilder} for this chunk
 * @throws InterruptedException declared by the signature; presumably raised while the NER model
 *                              for the document language is being obtained (confirm against
 *                              {@code CoreNlpNerModels})
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);
    // Fetch the sequence classifier registered for the document language
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());
    // Only the requested window of the content is classified
    String chunk = doc.getContent().substring(contentOffset, Math.min(contentOffset + contentLength, doc.getContentTextLength()));
    List<Triple<String, Integer, Integer>> items = nerAnnotator.annotator.classifyToCharacterOffsets(chunk);
    // For each recognized named entity
    for (Triple<String, Integer, Integer> item : items) {
        // Triple: <category, begin, end>, with begin/end relative to the chunk
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // Store the offset relative to the whole document content, not to the chunk
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
Also used : Triple(edu.stanford.nlp.util.Triple) AbstractSequenceClassifier(edu.stanford.nlp.ie.AbstractSequenceClassifier) NamedEntity(org.icij.datashare.text.NamedEntity) Annotations(org.icij.datashare.text.nlp.Annotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder)

Example 2 with NamedEntitiesBuilder

use of org.icij.datashare.text.NamedEntitiesBuilder in project datashare by ICIJ.

In the class EmailPipeline, the method process:

/**
 * Collects the matches of {@code pattern} found in one chunk of a document as EMAIL entities.
 *
 * <p>The chunk runs from {@code contentOffset} to {@code contentOffset + contentLength}, capped
 * at {@code doc.getContentTextLength()}. Each match is recorded at its position in the chunk
 * shifted by {@code contentOffset}. When the document content type is {@code message/rfc822},
 * the metadata values for the keys in {@code parsedEmailHeaders} are joined with a space and
 * scanned as well; those matches are recorded with offset {@code -1}.
 *
 * @param doc the document to scan
 * @param contentLength maximum number of characters of content to scan
 * @param contentOffset index in the document content at which the chunk starts
 * @return the entities built from all matches
 */
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    final String chunk = doc.getContent().substring(contentOffset, Math.min(contentLength + contentOffset, doc.getContentTextLength()));
    final Matcher contentMatcher = pattern.matcher(chunk);
    final NamedEntitiesBuilder builder = new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());

    // Matches in the content chunk: position is reported relative to the whole content
    while (contentMatcher.find()) {
        builder.add(NamedEntity.Category.EMAIL, contentMatcher.group(), contentMatcher.start() + contentOffset);
    }

    // Headers are only inspected for RFC 822 messages
    if (!"message/rfc822".equals(doc.getContentType())) {
        return builder.build();
    }

    final String headerValues = parsedEmailHeaders.stream()
            .map(header -> doc.getMetadata().getOrDefault(header, "").toString())
            .collect(joining(" "));
    final Matcher headerMatcher = pattern.matcher(headerValues);
    while (headerMatcher.find()) {
        // Header matches are always added with offset -1
        builder.add(NamedEntity.Category.EMAIL, headerMatcher.group(), -1);
    }
    return builder.build();
}
Also used : NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder) EMAIL(org.icij.datashare.text.nlp.Pipeline.Type.EMAIL) java.util(java.util) NamedEntity.allFrom(org.icij.datashare.text.NamedEntity.allFrom) AbstractPipeline(org.icij.datashare.text.nlp.AbstractPipeline) PropertiesProvider(org.icij.datashare.PropertiesProvider) Inject(com.google.inject.Inject) Document(org.icij.datashare.text.Document) Collectors.joining(java.util.stream.Collectors.joining) Matcher(java.util.regex.Matcher) Collections.unmodifiableSet(java.util.Collections.unmodifiableSet) Charset(java.nio.charset.Charset) Arrays.asList(java.util.Arrays.asList) Annotations(org.icij.datashare.text.nlp.Annotations) Pattern(java.util.regex.Pattern) Language(org.icij.datashare.text.Language) NlpStage(org.icij.datashare.text.nlp.NlpStage) NamedEntity(org.icij.datashare.text.NamedEntity) Matcher(java.util.regex.Matcher) NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder)

Aggregations

NamedEntitiesBuilder (org.icij.datashare.text.NamedEntitiesBuilder)2 NamedEntity (org.icij.datashare.text.NamedEntity)2 Annotations (org.icij.datashare.text.nlp.Annotations)2 Inject (com.google.inject.Inject)1 AbstractSequenceClassifier (edu.stanford.nlp.ie.AbstractSequenceClassifier)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 Triple (edu.stanford.nlp.util.Triple)1 Charset (java.nio.charset.Charset)1 java.util (java.util)1 Arrays.asList (java.util.Arrays.asList)1 Collections.unmodifiableSet (java.util.Collections.unmodifiableSet)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 Collectors.joining (java.util.stream.Collectors.joining)1 PropertiesProvider (org.icij.datashare.PropertiesProvider)1 Document (org.icij.datashare.text.Document)1 Language (org.icij.datashare.text.Language)1 NamedEntity.allFrom (org.icij.datashare.text.NamedEntity.allFrom)1 AbstractPipeline (org.icij.datashare.text.nlp.AbstractPipeline)1 NlpStage (org.icij.datashare.text.nlp.NlpStage)1