Search in sources :

Example 16 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class RuleBasedCorefMentionFinder method filterPredictedMentions.

/**
 * Builds the predicted mentions for a document whose mention boundaries are already given.
 * <p>
 * For each index {@code i} of {@code allGoldMentions}, the gold mentions are copied into a fresh
 * list, {@code findHead} is called with the {@code i}-th sentence of {@code doc} and that list,
 * and {@code setBarePlural} is applied to it. Once all sentences are processed,
 * {@code removeSpuriousMentions} is invoked on the collected lists.
 *
 * @param allGoldMentions gold mentions, one list per sentence; index {@code i} is paired with the
 *        {@code i}-th entry of the document's {@code CoreAnnotations.SentencesAnnotation}
 * @param doc annotation from which the sentences are read
 * @param dict dictionaries handed through to {@code removeSpuriousMentions}
 * @param props properties from which {@code CorefProperties.removeNestedMentions} is evaluated
 * @return one mention list per sentence, in the same order as {@code allGoldMentions}
 */
public List<List<Mention>> filterPredictedMentions(List<List<Mention>> allGoldMentions, Annotation doc, Dictionaries dict, Properties props) {
    List<List<Mention>> predictedMentions = new ArrayList<>();
    for (int i = 0; i < allGoldMentions.size(); i++) {
        CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
        // Copy the gold mentions so the following steps operate on a separate list.
        List<Mention> mentions = new ArrayList<>(allGoldMentions.get(i));
        predictedMentions.add(mentions);
        findHead(s, mentions);
        // The mention-span and named-entity-span sets that used to be collected here were never
        // read afterwards, so that dead block has been removed.
        setBarePlural(mentions);
    }
    removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang);
    return predictedMentions;
}
Also used : Mention(edu.stanford.nlp.coref.data.Mention) ArrayList(java.util.ArrayList) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) ArrayList(java.util.ArrayList) List(java.util.List) CoreMap(edu.stanford.nlp.util.CoreMap) IntPair(edu.stanford.nlp.util.IntPair)

Example 17 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class Document method findTwinMentionsStrict.

/**
 * Marks twin mentions: a predicted mention is paired with a gold mention only when both mention
 * boundaries (start and end index) match.
 * <p>
 * Per sentence, gold mentions are grouped by their (startIndex, endIndex) pair. Each predicted
 * mention whose span still has an unclaimed gold mention takes over that gold mention's ID, and
 * both are flagged as not twinless; the gold mention is then removed from the group so it cannot
 * be claimed twice. Predicted mentions that remain twinless get 10000 added to their mention ID.
 */
private void findTwinMentionsStrict() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
        List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
        List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
        // For CoNLL training there are some documents with gold mentions with the same position offsets
        // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
        //  (Packwood - Roth)
        CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
        for (Mention g : golds) {
            IntPair ip = new IntPair(g.startIndex, g.endIndex);
            if (goldMentionPositions.containsKey(ip)) {
                // Duplicate span: list the IDs already registered for it in the warning.
                StringBuilder existingMentions = new StringBuilder();
                for (Mention eg : goldMentionPositions.get(ip)) {
                    if (existingMentions.length() > 0) {
                        existingMentions.append(",");
                    }
                    existingMentions.append(eg.mentionID);
                }
                SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
            }
            //assert(!goldMentionPositions.containsKey(ip));
            // Reuse the key built above instead of allocating an identical second pair.
            goldMentionPositions.add(ip, g);
        }
        for (Mention p : predicts) {
            IntPair pos = new IntPair(p.startIndex, p.endIndex);
            if (!goldMentionPositions.containsKey(pos)) {
                continue;
            }
            Collection<Mention> cm = goldMentionPositions.get(pos);
            // Every gold mention for this span may already have been claimed by an earlier
            // prediction with the same offsets; the key presumably stays in the map once its
            // collection is emptied, so guard before calling next() and leave p twinless.
            if (cm.isEmpty()) {
                continue;
            }
            Mention g = cm.iterator().next();
            cm.remove(g);
            p.mentionID = g.mentionID;
            p.twinless = false;
            g.twinless = false;
        }
        // temp: for making easy to recognize twinless mention
        for (Mention p : predicts) {
            if (p.twinless) {
                p.mentionID += 10000;
            }
        }
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) IntPair(edu.stanford.nlp.util.IntPair)

Example 18 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class IOBUtils method TokenSpansForIOB.

/**
 * Derives token spans from a sequence of labeled characters.
 * <p>
 * Each element supplies a character ({@code CoreAnnotations.CharAnnotation}) and a label
 * ({@code CoreAnnotations.AnswerAnnotation}). The sequence is scanned once while tracking at most
 * one open token:
 * <ul>
 *   <li>a character equal to {@code BoundaryChar} closes the open token, if any, and opens none;</li>
 *   <li>otherwise the label decides whether the position forces a split (close the open token and
 *       open a new one here) or merely opens a token when none is open;</li>
 *   <li>labels not handled by the switch leave the state untouched.</li>
 * </ul>
 * A token still open at the end of the sequence is closed at the sequence length.
 *
 * @param labeledSequence the labeled characters, in order
 * @return the spans as (start index, index at which the token was closed) pairs, in order
 */
public static List<IntPair> TokenSpansForIOB(List<CoreLabel> labeledSequence) {
    final List<IntPair> spans = CollectionUtils.makeList();
    final int sequenceLength = labeledSequence.size();
    String previousLabel = "";
    boolean open = false;
    int openStart = 0;
    for (int pos = 0; pos < sequenceLength; ++pos) {
        final CoreLabel current = labeledSequence.get(pos);
        final String ch = current.get(CoreAnnotations.CharAnnotation.class);
        final String tag = current.get(CoreAnnotations.AnswerAnnotation.class);
        if (ch.equals(BoundaryChar)) {
            // A boundary character ends whatever token is open and starts nothing.
            if (open) {
                spans.add(new IntPair(openStart, pos));
            }
            open = false;
        } else {
            // forceSplit: this position must begin a new token even if one is already open.
            boolean forceSplit = false;
            // recognized: false for labels the switch does not handle; those change nothing.
            boolean recognized = true;
            switch(tag) {
                case BeginSymbol:
                    forceSplit = previousLabel.equals(ContinuationSymbol) || previousLabel.equals(BeginSymbol) || previousLabel.equals(RewriteSymbol);
                    break;
                case ContinuationSymbol:
                    break;
                case BoundarySymbol:
                case NosegSymbol:
                    forceSplit = true;
                    break;
                case RewriteSymbol:
                case "REWAL":
                case "REWTA":
                    forceSplit = ch.equals("ل");
                    break;
                default:
                    recognized = false;
                    break;
            }
            if (recognized) {
                if (forceSplit && open) {
                    spans.add(new IntPair(openStart, pos));
                }
                if (forceSplit || !open) {
                    open = true;
                    openStart = pos;
                }
            }
        }
        previousLabel = tag;
    }
    if (open) {
        spans.add(new IntPair(openStart, sequenceLength));
    }
    return spans;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) IntPair(edu.stanford.nlp.util.IntPair)

Example 19 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class MultiWordStringMatcher method findTargetStringOffsetsExct.

/**
 * Finds the target string in the text span from character start to end (exclusive) and returns
 * the offsets of its occurrences (does exact string matching).
 * <p>
 * An occurrence is reported only if the character directly before it (if any) and the character
 * directly after it (if any) are not letters or digits. Only the start of an occurrence has to
 * lie before {@code end}.
 *
 * @param text - String in which to look for the target string
 * @param targetString - Target string to look for
 * @param start - position to start search
 * @param end - position to end search
 * @return list of integer pairs indicating the character offsets (begin, end - exclusive)
 *         at which the targetString can be found; {@code null} if {@code start} or {@code end}
 *         exceeds the text length or no occurrence starts inside the span; an empty list if
 *         occurrences start inside the span but none of them is properly delimited
 */
protected List<IntPair> findTargetStringOffsetsExct(String text, String targetString, int start, int end) {
    if (start > text.length() || end > text.length()) {
        return null;
    }
    final int targetLength = targetString.length();
    // An empty target matches without consuming any characters. Always step forward by at least
    // one position after a match, otherwise indexOf would return the same index forever.
    final int stepAfterMatch = Math.max(1, targetLength);
    List<IntPair> offsets = null;
    int i = text.indexOf(targetString, start);
    while (i >= 0 && i < end) {
        if (offsets == null) {
            // Created as soon as one occurrence starts inside the span, even if it is rejected.
            offsets = new ArrayList<>();
        }
        final int afterIndex = i + targetLength;
        final boolean delimitedBefore = i == 0 || !Character.isLetterOrDigit(text.charAt(i - 1));
        final boolean delimitedAfter = afterIndex >= text.length() || !Character.isLetterOrDigit(text.charAt(afterIndex));
        if (delimitedBefore && delimitedAfter) {
            offsets.add(new IntPair(i, afterIndex));
            i += stepAfterMatch;
        } else {
            i++;
        }
        i = text.indexOf(targetString, i);
    }
    return offsets;
}
Also used : IntPair(edu.stanford.nlp.util.IntPair)

Example 20 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class MultiWordStringMatcher method findOffsets.

/**
 * Finds the pattern in the text span from character start to end (exclusive) and returns the
 * offsets of its matches.
 * <p>
 * A match is reported only if the character directly before it (if any) and the character
 * directly after it (if any) are not letters or digits.
 *
 * @param pattern - pattern to look for
 * @param text - String in which to look for the pattern
 * @param start - position to start search
 * @param end - position to end search
 * @return list of integer pairs indicating the character offsets (begin, end - exclusive)
 *         at which the pattern can be found; {@code null} if the first match is absent or does
 *         not start before {@code end}; an empty list if matches exist but none is delimited
 */
public static List<IntPair> findOffsets(Pattern pattern, String text, int start, int end) {
    final Matcher matcher = pattern.matcher(text);
    matcher.region(start, end);
    List<IntPair> offsets = null;
    while (matcher.find()) {
        final int matchStart = matcher.start();
        if (matchStart >= end) {
            // A match that starts at the end of the span stops the scan.
            break;
        }
        if (offsets == null) {
            // Created on the first in-range match, whether or not it ends up being reported.
            offsets = new ArrayList<>();
        }
        final int matchEnd = matcher.end();
        final boolean letterOrDigitBefore = matchStart > 0 && Character.isLetterOrDigit(text.charAt(matchStart - 1));
        final boolean letterOrDigitAfter = matchEnd < text.length() && Character.isLetterOrDigit(text.charAt(matchEnd));
        if (!letterOrDigitBefore && !letterOrDigitAfter) {
            offsets.add(new IntPair(matchStart, matchEnd));
        }
    }
    return offsets;
}
Also used : Matcher(java.util.regex.Matcher) IntPair(edu.stanford.nlp.util.IntPair)

Aggregations

IntPair (edu.stanford.nlp.util.IntPair)37 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)19 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)16 Mention (edu.stanford.nlp.coref.data.Mention)14 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)11 CoreMap (edu.stanford.nlp.util.CoreMap)9 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)8 ArrayList (java.util.ArrayList)8 IndexedWord (edu.stanford.nlp.ling.IndexedWord)7 Tree (edu.stanford.nlp.trees.Tree)7 List (java.util.List)6 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)5 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)3 TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher)3 TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern)3 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)3 Set (java.util.Set)3 BasicDependenciesAnnotation (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)2 Constituent (edu.stanford.nlp.trees.Constituent)2