Search in sources :

Example 91 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project textdb by TextDB.

the class NlpEntityOperator method extractNlpSpans.

/**
 * Runs a Stanford NLP pipeline over the text of a field and returns one span
 * for every token (or run of adjacent tokens) whose recognized type matches
 * the entity type requested by the predicate.
 * <p>
 * Pipeline selection: when {@code getNlpTypeIndicator} yields {@code "POS"}
 * for the predicate's entity type, the annotators
 * {@code tokenize, ssplit, pos} are used; otherwise
 * {@code tokenize, ssplit, pos, lemma, ner} are used. Each pipeline is built
 * on first use and cached in {@code posPipeline} / {@code nerPipeline}.
 * <p>
 * Token scan: for every token of every sentence, the part-of-speech tag (POS
 * case) or the named-entity tag (all other cases) is read and translated with
 * {@code mapNlpEntityType}. Tokens for which that mapping is {@code null} are
 * skipped. A token is kept when the predicate's entity type is
 * {@code NlpEntityType.NE_ALL} or equals the mapped type.
 * <p>
 * Span construction: the span carries the attribute name, the character
 * offsets reported by the NLP annotations, the mapped type (as the span key)
 * and the token text (as the span value).
 * <p>
 * Merging: when {@code getNlpTypeIndicator} yields {@code "NE_ALL"}, a new
 * span is merged (via {@code mergeTwoSpans}) into the previously emitted span
 * if both have the same attribute name and key and the new span starts at
 * most one character after the previous one ends.
 * <p>
 * Example: for the text "Microsoft, Google and Facebook are organizations
 * while Donald Trump and Barack Obama are persons", attribute name
 * "Sentence1" and a predicate asking for Organization, the tokens
 * "Microsoft", "Google" and "Facebook" qualify, while "Donald Trump" and
 * "Barack Obama" are tagged as Person and are dropped. The token "Microsoft"
 * yields the span ["Sentence1", 0, 9, Organization, "Microsoft"].
 *
 * @param iField
 *            the field to analyze; its value is cast to {@code String}
 * @param attributeName
 *            the attribute name recorded in every produced span
 * @return the matching spans in token order; empty when nothing matches or
 *         when the field value is {@code null}
 */
private List<Span> extractNlpSpans(IField iField, String attributeName) {
    List<Span> spanList = new ArrayList<>();
    String text = (String) iField.getValue();
    if (text == null) {
        // Nothing to annotate: report no spans instead of handing a null
        // document to the NLP pipeline.
        return spanList;
    }

    // These answers depend only on the predicate, so compute them once
    // instead of once per token.
    final boolean usePosTags = getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS");
    final boolean mergeAdjacent = getNlpTypeIndicator(predicate.getNlpEntityType()).equals("NE_ALL");
    final boolean acceptAllTypes = predicate.getNlpEntityType().equals(NlpEntityType.NE_ALL);

    // Pick the cached pipeline for the requested kind of annotation,
    // building it (and its configuration) only on first use.
    StanfordCoreNLP pipeline;
    if (usePosTags) {
        if (posPipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos");
            posPipeline = new StanfordCoreNLP(props);
        }
        pipeline = posPipeline;
    } else {
        if (nerPipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
            nerPipeline = new StanfordCoreNLP(props);
        }
        pipeline = nerPipeline;
    }

    Annotation documentAnnotation = new Annotation(text);
    pipeline.annotate(documentAnnotation);
    List<CoreMap> sentences = documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // Read the tag that corresponds to the selected pipeline.
            String stanfordNlpConstant = usePosTags
                    ? token.get(CoreAnnotations.PartOfSpeechAnnotation.class)
                    : token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            NlpEntityType nlpEntityType = mapNlpEntityType(stanfordNlpConstant);
            if (nlpEntityType == null) {
                // Tag has no counterpart among this package's entity types.
                continue;
            }
            if (!acceptAllTypes && !predicate.getNlpEntityType().equals(nlpEntityType)) {
                continue;
            }

            int start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            Span span = new Span(attributeName, start, end, nlpEntityType.toString(), word);

            if (mergeAdjacent && !spanList.isEmpty()) {
                int lastIndex = spanList.size() - 1;
                Span previousSpan = spanList.get(lastIndex);
                // Same attribute, same key, and at most one character apart:
                // fold the new token into the previously emitted span.
                if (previousSpan.getAttributeName().equals(span.getAttributeName())
                        && (span.getStart() - previousSpan.getEnd() <= 1)
                        && previousSpan.getKey().equals(span.getKey())) {
                    span = mergeTwoSpans(previousSpan, span);
                    spanList.remove(lastIndex);
                }
            }
            spanList.add(span);
        }
    }
    return spanList;
}
Also used : ArrayList(java.util.ArrayList) Properties(java.util.Properties) Span(edu.uci.ics.texera.api.span.Span) StanfordCoreNLP(edu.stanford.nlp.pipeline.StanfordCoreNLP) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Aggregations

Annotation (edu.stanford.nlp.pipeline.Annotation)91 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)58 CoreMap (edu.stanford.nlp.util.CoreMap)50 CoreLabel (edu.stanford.nlp.ling.CoreLabel)30 StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP)27 ArrayList (java.util.ArrayList)25 Properties (java.util.Properties)25 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)19 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)14 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)13 SentencesAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation)12 TreeAnnotation (edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation)12 List (java.util.List)11 Tree (edu.stanford.nlp.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)8 IOException (java.io.IOException)8 TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation)7 CorefChain (edu.stanford.nlp.coref.data.CorefChain)6 EntityMentionsAnnotation (edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.EntityMentionsAnnotation)6 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)6