Search in sources :

Example 86 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Rules method entityHaveExtraProperNoun.

/**
 * Tests whether each of the two mentions has a proper noun the other lacks,
 * ignoring words that were involved in a semantic match.
 *
 * <p>A token counts as a proper noun when its part-of-speech tag starts with
 * {@code "NNP"}. A proper noun is "extra" when its text is not a substring of
 * the other mention's span string and its lower-cased form is not in
 * {@code exceptWords}.
 *
 * @param m           the first mention
 * @param a           the second mention
 * @param exceptWords lower-cased words to disregard when looking for extras
 * @return {@code true} only if <em>both</em> mentions have at least one extra
 *         proper noun; {@code false} otherwise
 */
public static boolean entityHaveExtraProperNoun(Mention m, Mention a, Set<String> exceptWords) {
    final String mText = m.spanToString();
    final String aText = a.spanToString();

    // Gather the surface text of every NNP* token in each mention.
    final Set<String> mProperNouns = Generics.newHashSet();
    for (CoreLabel token : m.originalSpan) {
        String tag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (tag.startsWith("NNP")) {
            mProperNouns.add(token.get(CoreAnnotations.TextAnnotation.class));
        }
    }
    final Set<String> aProperNouns = Generics.newHashSet();
    for (CoreLabel token : a.originalSpan) {
        String tag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (tag.startsWith("NNP")) {
            aProperNouns.add(token.get(CoreAnnotations.TextAnnotation.class));
        }
    }

    // Each side is checked against the *other* mention's span string.
    final boolean mHasExtra = hasProperNounMissingFrom(mProperNouns, aText, exceptWords);
    final boolean aHasExtra = hasProperNounMissingFrom(aProperNouns, mText, exceptWords);
    return mHasExtra && aHasExtra;
}

/** Returns true if some proper noun is neither contained in {@code otherSpanText} nor excepted. */
private static boolean hasProperNounMissingFrom(Set<String> properNouns, String otherSpanText, Set<String> exceptWords) {
    for (String noun : properNouns) {
        if (!(otherSpanText.contains(noun) || exceptWords.contains(noun.toLowerCase()))) {
            return true;
        }
    }
    return false;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Example 87 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Rules method isAcronym.

/**
 * Decides whether one of the two token lists is an acronym of the other.
 *
 * <p>The candidate acronym is the text of the first token of the "shorter"
 * side; it must consist solely of the characters {@code 'A'}..{@code 'Z'}.
 * The upper-case letters ({@code 'A'}..{@code 'Z'}) of the "longer" side,
 * read in order across all of its tokens, must spell exactly that acronym,
 * and no token of the longer side may itself contain the acronym.
 *
 * @param first  tokens of one mention
 * @param second tokens of the other mention
 * @return {@code true} if the acronym relation holds; {@code false} if both
 *         lists have more than one token, both are empty, or any check fails
 */
public static boolean isAcronym(List<CoreLabel> first, List<CoreLabel> second) {
    final int firstSize = first.size();
    final int secondSize = second.size();

    // At least one side must be a single token, and not both may be empty.
    if (firstSize > 1 && secondSize > 1) {
        return false;
    }
    if (firstSize == 0 && secondSize == 0) {
        return false;
    }

    final List<CoreLabel> longer;
    final List<CoreLabel> shorter;
    if (firstSize == secondSize) {
        // Same token count: the side whose first word has more characters is the expansion.
        String firstWord = first.get(0).get(CoreAnnotations.TextAnnotation.class);
        String secondWord = second.get(0).get(CoreAnnotations.TextAnnotation.class);
        boolean firstWordIsLonger = firstWord.length() > secondWord.length();
        if (firstWordIsLonger) {
            longer = first;
            shorter = second;
        } else {
            longer = second;
            shorter = first;
        }
    } else {
        boolean firstHasMoreTokens = firstSize > secondSize;
        longer = firstHasMoreTokens ? first : second;
        // When first has more tokens but second is empty, first is used for both roles.
        shorter = (firstHasMoreTokens && secondSize > 0) ? second : first;
    }

    // An empty shorter side yields a placeholder that can never pass the upper-case check below.
    final String acronym;
    if (shorter.size() > 0) {
        acronym = shorter.get(0).get(CoreAnnotations.TextAnnotation.class);
    } else {
        acronym = "<UNK>";
    }

    // Reject cheaply if the candidate is not all capitals, before scanning the longer mention.
    for (char c : acronym.toCharArray()) {
        if (c < 'A' || c > 'Z') {
            return false;
        }
    }

    // Walk the capitals of the longer mention; they must match the acronym letter by letter.
    int matched = 0;
    for (CoreLabel token : longer) {
        String word = token.get(CoreAnnotations.TextAnnotation.class);
        for (int i = 0; i < word.length(); ++i) {
            char c = word.charAt(i);
            if (c < 'A' || c > 'Z') {
                continue;
            }
            // Fails when the longer mention has more capitals than the acronym, or a different one.
            if (matched >= acronym.length() || acronym.charAt(matched) != c) {
                return false;
            }
            ++matched;
        }
    }
    // Fails when the longer mention has fewer capitals than the acronym.
    if (matched != acronym.length()) {
        return false;
    }

    // The expansion must not literally contain the acronym in any of its tokens.
    for (CoreLabel token : longer) {
        String word = token.get(CoreAnnotations.TextAnnotation.class);
        if (word.contains(acronym)) {
            return false;
        }
    }
    return true;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Example 88 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class KBPStatisticalExtractor method surfaceFeatures.

/**
 * Adds surface-level (token sequence) features for a subject/object mention pair
 * to {@code feats}.
 *
 * <p>Features are emitted through {@code indicator(feats, name, value)} and cover:
 * sentence unigrams, lemma uni-/bi-grams of the span between the mentions,
 * NER/lemma bigrams around prepositions, a bucketed span length, punctuation
 * counts, NER tags occurring between the mentions, the tokens immediately left
 * and right of each mention, and a single-skip-word pattern.
 *
 * @param input          the featurizer input; its sentence and subject/object spans are read
 * @param simpleSentence not referenced in this method body (hence the suppressed warning)
 * @param feats          the counter that receives the features
 */
@SuppressWarnings("UnusedParameters")
private static void surfaceFeatures(KBPInput input, Sentence simpleSentence, ClassicCounter<String> feats) {
    // Parallel views (lemma / NER tag / POS tag) of the tokens between the two mentions.
    List<String> lemmaSpan = spanBetweenMentions(input, CoreLabel::lemma);
    List<String> nerSpan = spanBetweenMentions(input, CoreLabel::ner);
    List<String> posSpan = spanBetweenMentions(input, CoreLabel::tag);
    // Unigram features of the sentence
    List<CoreLabel> tokens = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
    for (CoreLabel token : tokens) {
        indicator(feats, "sentence_unigram", token.lemma());
    }
    // Full lemma span ( -0.3 F1 )
    //    if (lemmaSpan.size() <= 5) {
    //      indicator(feats, "full_lemma_span", withMentionsPositioned(input, StringUtils.join(lemmaSpan, " ")));
    //    }
    // Lemma n-grams
    // "_^_" and "_$_" act as start-of-span and end-of-span markers for the bigrams.
    String lastLemma = "_^_";
    for (String lemma : lemmaSpan) {
        indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " " + lemma));
        indicator(feats, "lemma_unigram", withMentionsPositioned(input, lemma));
        lastLemma = lemma;
    }
    // Closing bigram; for an empty span this is "_^_ _$_".
    indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " _$_"));
    // NER + lemma bi-grams
    // Fires for adjacent pairs where one token has a non-"O" NER tag and the other is an
    // untagged ("O") token with POS tag "IN"; the tagged token contributes its NER label,
    // the other its lemma.
    for (int i = 0; i < lemmaSpan.size() - 1; ++i) {
        if (!"O".equals(nerSpan.get(i)) && "O".equals(nerSpan.get(i + 1)) && "IN".equals(posSpan.get(i + 1))) {
            indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, nerSpan.get(i) + " " + lemmaSpan.get(i + 1)));
        }
        if (!"O".equals(nerSpan.get(i + 1)) && "O".equals(nerSpan.get(i)) && "IN".equals(posSpan.get(i))) {
            indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, lemmaSpan.get(i) + " " + nerSpan.get(i + 1)));
        }
    }
    // Distance between mentions
    // Buckets: 0, 1-3, 4-5, 6-10, 11-15; anything above 15 keeps the initial value ">10".
    // NOTE(review): the ">10" label for spans longer than 15 looks inconsistent with the
    // "<=15" bucket, but renaming it would change the emitted feature string.
    String distanceBucket = ">10";
    if (lemmaSpan.size() == 0) {
        distanceBucket = "0";
    } else if (lemmaSpan.size() <= 3) {
        distanceBucket = "<=3";
    } else if (lemmaSpan.size() <= 5) {
        distanceBucket = "<=5";
    } else if (lemmaSpan.size() <= 10) {
        distanceBucket = "<=10";
    } else if (lemmaSpan.size() <= 15) {
        distanceBucket = "<=15";
    }
    indicator(feats, "distance_between_entities_bucket", distanceBucket);
    // Punctuation features
    int numCommasInSpan = 0;
    int numQuotesInSpan = 0;
    // Despite the name, this is the net count of opening minus closing parentheses.
    int parenParity = 0;
    for (String lemma : lemmaSpan) {
        if (lemma.equals(",")) {
            numCommasInSpan += 1;
        }
        if (lemma.equals("\"") || lemma.equals("``") || lemma.equals("''")) {
            numQuotesInSpan += 1;
        }
        if (lemma.equals("(") || lemma.equals("-LRB-")) {
            parenParity += 1;
        }
        if (lemma.equals(")") || lemma.equals("-RRB-")) {
            parenParity -= 1;
        }
    }
    indicator(feats, "comma_parity", numCommasInSpan % 2 == 0 ? "even" : "odd");
    indicator(feats, "quote_parity", numQuotesInSpan % 2 == 0 ? "even" : "odd");
    // Emitted as the signed integer itself (e.g. "0", "1", "-1"), not as even/odd.
    indicator(feats, "paren_parity", "" + parenParity);
    // Is broken by entity
    // Distinct non-"O" NER tags seen between the mentions.
    Set<String> intercedingNERTags = nerSpan.stream().filter(ner -> !ner.equals("O")).collect(Collectors.toSet());
    if (!intercedingNERTags.isEmpty()) {
        indicator(feats, "has_interceding_ner", "t");
    }
    for (String ner : intercedingNERTags) {
        indicator(feats, "interceding_ner", ner);
    }
    // Left and right context
    // "^" / "$" stand in for the sentence boundary when a mention touches it.
    // The indexing below treats span.end() as exclusive (end() == sentence.size() means
    // the mention reaches the last token) — presumably Span's convention; confirm.
    // NOTE(review): only Sentence::nerTags is requested here but lemma() is read; this
    // appears to rely on lemmas having been computed by the earlier asCoreLabels call — confirm.
    List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::nerTags);
    if (input.subjectSpan.start() == 0) {
        indicator(feats, "subj_left", "^");
    } else {
        indicator(feats, "subj_left", sentence.get(input.subjectSpan.start() - 1).lemma());
    }
    if (input.subjectSpan.end() == sentence.size()) {
        indicator(feats, "subj_right", "$");
    } else {
        indicator(feats, "subj_right", sentence.get(input.subjectSpan.end()).lemma());
    }
    if (input.objectSpan.start() == 0) {
        indicator(feats, "obj_left", "^");
    } else {
        indicator(feats, "obj_left", sentence.get(input.objectSpan.start() - 1).lemma());
    }
    if (input.objectSpan.end() == sentence.size()) {
        indicator(feats, "obj_right", "$");
    } else {
        indicator(feats, "obj_right", sentence.get(input.objectSpan.end()).lemma());
    }
    // Skip-word patterns
    // Only when exactly one token separates the mentions and the subject comes first:
    // pairs the lemma left of the subject (or "^") with that single in-between lemma.
    if (lemmaSpan.size() == 1 && input.subjectSpan.isBefore(input.objectSpan)) {
        String left = input.subjectSpan.start() == 0 ? "^" : sentence.get(input.subjectSpan.start() - 1).lemma();
        indicator(feats, "X<subj>Y<obj>", left + "_" + lemmaSpan.get(0));
    }
}
Also used : edu.stanford.nlp.optimization(edu.stanford.nlp.optimization) CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) Counters(edu.stanford.nlp.stats.Counters) IOUtils(edu.stanford.nlp.io.IOUtils) DefaultPaths(edu.stanford.nlp.pipeline.DefaultPaths) edu.stanford.nlp.util(edu.stanford.nlp.util) Redwood(edu.stanford.nlp.util.logging.Redwood) Util(edu.stanford.nlp.util.logging.Redwood.Util) Datum(edu.stanford.nlp.ling.Datum) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Counter(edu.stanford.nlp.stats.Counter) java.io(java.io) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) edu.stanford.nlp.classify(edu.stanford.nlp.classify) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Sentence(edu.stanford.nlp.simple.Sentence) RedwoodConfiguration(edu.stanford.nlp.util.logging.RedwoodConfiguration) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) RVFDatum(edu.stanford.nlp.ling.RVFDatum) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Sentence(edu.stanford.nlp.simple.Sentence)

Example 89 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class KBPStatisticalExtractor method spanBetweenMentions.

/**
 * Get information from the span between the two mentions.
 * Canonically, get the words in this span.
 * For instance, for "Obama was born in Hawaii", this would return the list
 * "was born in" if the selector picks the surface word of each token,
 * or "be bear in" if the selector is {@code CoreLabel::lemma}.
 *
 * <p>The span runs from the end of whichever mention comes first to the start
 * of the other one. If the mentions overlap or are directly adjacent, an
 * empty, immutable list is returned; otherwise the result is a fresh
 * {@link ArrayList}.
 *
 * @param input The featurizer input.
 * @param selector The field to compute for each element in the span, for example
 *                 {@code CoreLabel::word}, {@code CoreLabel::lemma},
 *                 {@code CoreLabel::ner} or {@code CoreLabel::tag}.
 * @param <E> The type of element returned by the selector.
 *
 * @return A list of elements between the two mentions.
 * @throws IllegalArgumentException if, after ordering the two non-overlapping spans,
 *         the computed range is still inverted (a sanity check on the span logic).
 */
private static <E> List<E> spanBetweenMentions(KBPInput input, Function<CoreLabel, E> selector) {
    List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
    Span subjSpan = input.subjectSpan;
    Span objSpan = input.objectSpan;
    // Corner case: overlapping mentions have nothing between them.
    if (Span.overlaps(subjSpan, objSpan)) {
        // Type-safe replacement for the raw Collections.EMPTY_LIST constant.
        return Collections.emptyList();
    }
    // Get the range between the subject and object, assuming subject first...
    int begin = subjSpan.end();
    int end = objSpan.start();
    if (begin > end) {
        // ...and flip it when the object actually precedes the subject.
        begin = objSpan.end();
        end = subjSpan.start();
    }
    if (begin > end) {
        throw new IllegalArgumentException("Gabor sucks at logic and he should feel bad about it: " + subjSpan + " and " + objSpan);
    } else if (begin == end) {
        // Adjacent mentions: empty span.
        return Collections.emptyList();
    }
    // Compute the return value
    List<E> rtn = new ArrayList<>(end - begin);
    for (int i = begin; i < end; ++i) {
        rtn.add(selector.apply(sentence.get(i)));
    }
    return rtn;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) Sentence(edu.stanford.nlp.simple.Sentence) Span(edu.stanford.nlp.ie.machinereading.structure.Span)

Example 90 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class KBPTokensregexExtractor method classify.

/**
 * Classifies the relation between the subject and object mentions of
 * {@code input} by running the configured token-regex rules.
 *
 * <p>The sentence tokens are temporarily marked with the {@code Subject} and
 * {@code Object} annotation keys, and tokens inside a mention whose NER tag is
 * {@code "O"} are given the mention's entity type. Relation types are tried in
 * {@code RelationType.values()} order; the first one whose rules produce any
 * extraction wins. The {@code Subject}/{@code Object} marks are always removed
 * before returning, including when an exception is thrown. The NER tags set
 * here are not reverted.
 *
 * @param input the sentence with its subject/object spans and entity types
 * @return the canonical relation name with the weight of the best matched
 *         expression, or {@code NO_RELATION} with weight {@code 1.0} if no rule fired
 */
@Override
public Pair<String, Double> classify(KBPInput input) {
    // Annotate Sentence
    CoreMap sentenceAsMap = input.sentence.asCoreMap(Sentence::nerTags);
    List<CoreLabel> tokens = sentenceAsMap.get(CoreAnnotations.TokensAnnotation.class);
    try {
        // Annotate where the subject is
        for (int i : input.subjectSpan) {
            CoreLabel token = tokens.get(i);
            token.set(Subject.class, "true");
            if ("O".equals(token.ner())) {
                token.setNER(input.subjectType.name);
            }
        }
        // Annotate where the object is
        // NOTE(review): Object here is presumably an annotation key class declared next to
        // Subject, shadowing java.lang.Object — confirm against the enclosing class.
        for (int i : input.objectSpan) {
            CoreLabel token = tokens.get(i);
            token.set(Object.class, "true");
            if ("O".equals(token.ner())) {
                token.setNER(input.objectType.name);
            }
        }
        // Run Rules: only relation types compatible with the subject/object entity types.
        for (RelationType rel : RelationType.values()) {
            if (rules.containsKey(rel) && rel.entityType == input.subjectType && rel.validNamedEntityLabels.contains(input.objectType)) {
                CoreMapExpressionExtractor extractor = rules.get(rel);
                @SuppressWarnings("unchecked") List<MatchedExpression> extractions = extractor.extractExpressions(sentenceAsMap);
                if (extractions != null && !extractions.isEmpty()) {
                    MatchedExpression best = MatchedExpression.getBestMatched(extractions, MatchedExpression.EXPR_WEIGHT_SCORER);
                    return Pair.makePair(rel.canonicalName, best.getWeight());
                }
            }
        }
        return Pair.makePair(NO_RELATION, 1.0);
    } finally {
        // Un-Annotate Sentence: single cleanup point for every exit path, so the temporary
        // marks do not leak if annotation or rule extraction throws.
        for (CoreLabel token : tokens) {
            token.remove(Subject.class);
            token.remove(Object.class);
        }
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreMapExpressionExtractor(edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) Sentence(edu.stanford.nlp.simple.Sentence) MatchedExpression(edu.stanford.nlp.ling.tokensregex.MatchedExpression)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)536 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)311 CoreMap (edu.stanford.nlp.util.CoreMap)103 ArrayList (java.util.ArrayList)102 Tree (edu.stanford.nlp.trees.Tree)98 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)96 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)63 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)53 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)41 IndexedWord (edu.stanford.nlp.ling.IndexedWord)38 List (java.util.List)33 Annotation (edu.stanford.nlp.pipeline.Annotation)32 Mention (edu.stanford.nlp.coref.data.Mention)29 Label (edu.stanford.nlp.ling.Label)28 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)26 Properties (java.util.Properties)25 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)21 StringReader (java.io.StringReader)20 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)19 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)18