Search in sources :

Example 76 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class RuleBasedCorefMentionFinder method findSyntacticHead.

/**
 * Finds the tree node that serves as the syntactic head of a mention.
 *
 * <p>Strategies are tried in order:
 * <ol>
 *   <li>look for a constituent in {@code root} whose span matches the mention exactly;</li>
 *   <li>if {@code allowReparsing} is set, reparse the mention's tokens embedded in the
 *       carrier sentence "It was ... ." and map the head of that parse back onto {@code root};</li>
 *   <li>otherwise take the head of the smallest enclosing constituent, provided that head
 *       falls inside the mention;</li>
 *   <li>as a last resort, pick a leaf by part-of-speech tag (see the final loop).</li>
 * </ol>
 *
 * @param m      the mention; its {@code startIndex}, {@code endIndex} and {@code originalSpan} are read
 * @param root   parse tree of the sentence containing the mention
 * @param tokens tokens of the sentence, addressed with the same indices as the mention's start/end
 * @return the head node; NOTE(review): in the reparsing branch non-null-ness is only enforced
 *         by {@code assert}, so a null result looks possible when assertions are disabled
 */
protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
    // If the mention ends with a possessive marker ("'s" or "'"), leave that token out of
    // the span used for head finding -- unless the marker is the whole mention.
    int endIdx = m.endIndex;
    if (m.originalSpan.size() > 0) {
        String lastWord = m.originalSpan.get(m.originalSpan.size() - 1).get(CoreAnnotations.TextAnnotation.class);
        if ((lastWord.equals("'s") || lastWord.equals("'")) && m.originalSpan.size() != 1)
            endIdx--;
    }
    Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
    // A constituent covering exactly the mention span was found: use its head directly.
    if (exactMatch != null) {
        return safeHead(exactMatch, endIdx);
    }
    // No exact match. If reparsing is allowed, parse just the mention's extent, embedded in a
    // small sentence context ("It was <mention> ."), so as to make the parser work better :-)
    if (allowReparsing) {
        // number of "-" tokens left out of the reparse; used below as slack when mapping back
        int approximateness = 0;
        List<CoreLabel> extentTokens = new ArrayList<>();
        extentTokens.add(initCoreLabel("It"));
        extentTokens.add(initCoreLabel("was"));
        final int ADDED_WORDS = 2;
        for (int i = m.startIndex; i < endIdx; i++) {
            // Add everything except separated dashes! The separated dashes mess with the parser too badly.
            CoreLabel label = tokens.get(i);
            if (!"-".equals(label.word())) {
                // necessary to copy tokens in case the parser does things like
                // put new indices on the tokens
                extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
            } else {
                approximateness++;
            }
        }
        extentTokens.add(initCoreLabel("."));
        // constrain the parse to the part we're interested in.
        // Starting from ADDED_WORDS comes from skipping "It was".
        // -1 to exclude the period.
        // The label pattern ".*" lets the constrained span be any kind of constituent,
        // since there are VP and S ones as well as nominal ones
        ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
        List<ParserConstraint> constraints = Collections.singletonList(constraint);
        Tree tree = parse(extentTokens, constraints);
        // now unnecessary, as parser uses CoreLabels?
        convertToCoreLabels(tree);
        // remember it has ADDED_WORDS extra words at the beginning, so shift the span
        // numbering to line the real words up with their positions in the sentence
        tree.indexSpans(m.startIndex - ADDED_WORDS);
        Tree subtree = findPartialSpan(tree, m.startIndex);
        // There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
        // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
        // past the right end (that is, just that final period).
        Tree extentHead = safeHead(subtree, endIdx);
        assert (extentHead != null);
        // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
        // Because we deleted dashes, its index will be >= the index in the extent parse tree
        CoreLabel l = (CoreLabel) extentHead.label();
        Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
        assert (realHead != null);
        return realHead;
    }
    // If reparsing wasn't allowed, try to find a span in the tree
    // which happens to have the head
    Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
    if (wordMatch != null) {
        Tree head = safeHead(wordMatch, endIdx);
        if (head != null) {
            // the -1 converts the annotation to the 0-based indexing used by startIndex/endIdx
            // (presumably IndexAnnotation is 1-based -- confirm)
            int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
            // only accept the head if it actually lies inside the mention
            if (index >= m.startIndex && index < endIdx) {
                return head;
            }
        }
    }
    // If that didn't work, guess that it's the last word -- or, when present, the last token
    // whose tag starts with "N" that occurs before the first token whose tag starts with "W".
    // NOTE(review): this loop runs to m.endIndex, not the possessive-adjusted endIdx -- confirm intended.
    int lastNounIdx = endIdx - 1;
    for (int i = m.startIndex; i < m.endIndex; i++) {
        if (tokens.get(i).tag().startsWith("N"))
            lastNounIdx = i;
        else if (tokens.get(i).tag().startsWith("W"))
            break;
    }
    // assumes the leaves of root line up one-to-one with the token indices -- TODO confirm
    List<Tree> leaves = root.getLeaves();
    Tree endLeaf = leaves.get(lastNounIdx);
    return endLeaf;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint)

Example 77 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class RuleBasedCorefMentionFinder method extractNamedEntityMentions.

/**
 * Adds one mention for every maximal run of consecutive tokens that carry the same
 * named-entity tag, skipping runs whose tag is "O" or one of the numeric/temporal
 * types listed in {@link #isNonMentionNerTag(String)}.
 *
 * <p>A run is closed when a token with a different tag is reached; if that token is
 * "'s" it is absorbed into the run being closed. A run still open at the end of the
 * sentence is closed there. Spans already present in {@code mentionSpanSet} are not
 * added again. A token without a named-entity tag is treated as "O".
 *
 * @param s                  the sentence; its tokens and enhanced dependencies are read
 * @param mentions           receives the newly created mentions
 * @param mentionSpanSet     spans already claimed by a mention; new spans are added to it
 * @param namedEntitySpanSet receives the span of every mention created here
 */
protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    String preNE = "O";
    int beginIndex = -1;
    for (CoreLabel w : sent) {
        String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        if (nerString == null) {
            // a token without an NER tag is outside any entity
            nerString = "O";
        }
        if (!nerString.equals(preNE)) {
            // the tag changed at this token, so the previous run ends just before it
            int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
            if (!isNonMentionNerTag(preNE)) {
                if ("'s".equals(w.get(CoreAnnotations.TextAnnotation.class))) {
                    // attach a possessive marker to the entity it follows
                    endIndex++;
                }
                // The beginIndex < endIndex check is needed because the 's attached to the
                // previous NER by the heuristic above can leave an empty or inverted span here.
                if (beginIndex < endIndex) {
                    addNamedEntityMention(beginIndex, endIndex, sent, dependency, mentions, mentionSpanSet, namedEntitySpanSet);
                }
            }
            beginIndex = endIndex;
            preNE = nerString;
        }
    }
    // NE at the end of sentence
    if (!isNonMentionNerTag(preNE)) {
        addNamedEntityMention(beginIndex, sent.size(), sent, dependency, mentions, mentionSpanSet, namedEntitySpanSet);
    }
}

/** Returns true for NER tags whose token runs are never turned into mentions. */
private static boolean isNonMentionNerTag(String nerTag) {
    switch (nerTag) {
        case "O":
        case "QUANTITY":
        case "CARDINAL":
        case "PERCENT":
        case "DATE":
        case "DURATION":
        case "TIME":
        case "SET":
            return true;
        default:
            return false;
    }
}

/** Creates a mention over tokens [begin, end) and records its span, unless that span is already a mention. */
private static void addNamedEntityMention(int begin, int end, List<CoreLabel> sent, SemanticGraph dependency, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    IntPair mSpan = new IntPair(begin, end);
    if (mentionSpanSet.contains(mSpan)) {
        return;
    }
    int dummyMentionId = -1;
    Mention m = new Mention(dummyMentionId, begin, end, dependency, new ArrayList<>(sent.subList(begin, end)));
    mentions.add(m);
    mentionSpanSet.add(mSpan);
    namedEntitySpanSet.add(mSpan);
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint)

Example 78 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Mention method isListLike.

/**
 * Heuristic test for whether this mention looks like a conjunction or list of things.
 *
 * <p>Two sources of evidence are used. When the words of {@code mentionSubTree} match
 * the mention's span string, the immediate children of the subtree are inspected: a
 * "CC" child outside a named entity settles the question at once, and "," children
 * outside a named entity are counted. More than two such commas also means "list".
 * Otherwise the tokens of the span are scanned: any token tagged TO, IN or VB* makes
 * the answer {@code false}, and a non-initial "and"/"or" outside a named entity makes
 * it {@code true}.
 *
 * @return {@code true} if the mention appears to be list-like
 */
private boolean isListLike() {
    int commaCount = 0;
    String spanText = spanToString();
    String subTreeText = StringUtils.joinWords(mentionSubTree.yieldWords(), " ");
    if (subTreeText.equals(spanText)) {
        // the subtree represents this mention well, so its children can be trusted
        for (Tree child : mentionSubTree.getChildrenAsList()) {
            String childLabel = child.value();
            // only leaf children carry an NER tag we can look at
            String childNer = child.isLeaf() ? ((CoreLabel) child.getLeaves().get(0).label()).ner() : null;
            boolean outsideEntity = childNer == null || "O".equals(childNer);
            if ("CC".equals(childLabel)) {
                if (outsideEntity) {
                    return true;
                }
            } else if (childLabel.equals(",") && outsideEntity) {
                commaCount++;
            }
        }
    }
    if (commaCount > 2) {
        return true;
    }
    // too few commas to decide: look at the token string for and/or
    for (int i = 0; i < originalSpan.size(); i++) {
        CoreLabel token = originalSpan.get(i);
        String pos = token.tag();
        if (pos.equals("TO") || pos.equals("IN") || pos.startsWith("VB")) {
            // prepositions and verbs are too hard for us
            return false;
        }
        if (i == 0) {
            // a leading and/or does not join anything
            continue;
        }
        String word = token.word();
        if (word.equalsIgnoreCase("and") || word.equalsIgnoreCase("or")) {
            String ner = token.ner();
            if (ner == null || "O".equals(ner)) {
                return true;
            }
        }
    }
    return false;
}
Also used : AbstractCoreLabel(edu.stanford.nlp.ling.AbstractCoreLabel) CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Example 79 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Mention method removePhraseAfterHead.

/**
 * Returns the mention text with any trailing clause after the head word cut off.
 *
 * <p>The cut point is the first token tagged "," if there is one, otherwise the first
 * token whose tag starts with "W". With no cut point the full span string is returned;
 * with a cut point that does not lie after the head, the empty string is returned.
 *
 * @return the space-joined text of the tokens before the cut point, the whole span
 *         string, or {@code ""} as described above
 */
public String removePhraseAfterHead() {
    int firstComma = -1;
    int firstWh = -1;
    int offset = 0;
    for (CoreLabel token : this.originalSpan) {
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (firstComma == -1 && pos.equals(",")) {
            firstComma = this.startIndex + offset;
        }
        if (firstWh == -1 && pos.startsWith("W")) {
            firstWh = this.startIndex + offset;
        }
        offset++;
    }

    // a comma takes precedence over a wh-word as the place to cut
    final int cut;
    if (firstComma != -1) {
        cut = firstComma;
    } else if (firstWh != -1) {
        cut = firstWh;
    } else {
        return this.spanToString();
    }
    if (this.headIndex >= cut) {
        // the head is not before the cut point, so there is nothing safe to keep
        return "";
    }

    StringBuilder kept = new StringBuilder();
    int keepCount = cut - this.startIndex;
    for (int i = 0; i < keepCount; i++) {
        if (i > 0) {
            kept.append(" ");
        }
        kept.append(this.originalSpan.get(i).get(CoreAnnotations.TextAnnotation.class));
    }
    return kept.toString();
}
Also used : AbstractCoreLabel(edu.stanford.nlp.ling.AbstractCoreLabel) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Example 80 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class MentionExtractor method mergeLabels.

/**
 * Replaces the label of each leaf of {@code tree} with the CoreLabel at the same
 * position in {@code sentence}, then re-indexes the leaves.
 * The leaf's original {@code value()} is kept by copying it into the CoreLabel's
 * ValueAnnotation before the label is swapped in.
 *
 * @param tree     tree whose leaf labels are replaced
 * @param sentence CoreLabels in leaf order; the i-th entry is paired with the i-th leaf
 */
// todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
public static void mergeLabels(Tree tree, List<CoreLabel> sentence) {
    List<Tree> leaves = tree.getLeaves();
    for (int i = 0; i < leaves.size(); i++) {
        Tree leaf = leaves.get(i);
        CoreLabel token = sentence.get(i);
        token.set(CoreAnnotations.ValueAnnotation.class, leaf.value());
        leaf.setLabel(token);
    }
    tree.indexLeaves();
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)536 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)311 CoreMap (edu.stanford.nlp.util.CoreMap)103 ArrayList (java.util.ArrayList)102 Tree (edu.stanford.nlp.trees.Tree)98 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)96 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)63 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)53 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)41 IndexedWord (edu.stanford.nlp.ling.IndexedWord)38 List (java.util.List)33 Annotation (edu.stanford.nlp.pipeline.Annotation)32 Mention (edu.stanford.nlp.coref.data.Mention)29 Label (edu.stanford.nlp.ling.Label)28 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)26 Properties (java.util.Properties)25 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)21 StringReader (java.io.StringReader)20 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)19 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)18