Search in sources:

Example 21 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class ArabicSegmenter method segmentStringToTokenList.

/**
 * Segments {@code line} and returns one {@link CoreLabel} per token span found in the
 * resulting IOB-labeled sequence.
 *
 * <p>Each returned token carries the segmented text (as word, value and
 * {@code TextAnnotation}), the flag {@code ArabicSegAnnotation = "1"}, the matching
 * substring of {@code line} as its original text, and begin/end character offsets taken
 * from the first and last labels of the span.
 *
 * @param line the text to segment
 * @return the tokens, in the order the spans are produced by {@code IOBUtils.TokenSpansForIOB}
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
    final List<CoreLabel> result = CollectionUtils.makeList();
    final List<CoreLabel> iobLabels = segmentStringToIOB(line);

    for (IntPair tokenSpan : IOBUtils.TokenSpansForIOB(iobLabels)) {
        // The span target is one past the last label of the token (see the "- 1" below).
        final int firstLabel = tokenSpan.getSource();
        final int pastLastLabel = tokenSpan.getTarget();

        final CoreLabel segmented = new CoreLabel();
        final String surface = IOBUtils.IOBToString(iobLabels, prefixMarker, suffixMarker, firstLabel, pastLastLabel);
        segmented.setWord(surface);
        segmented.setValue(surface);
        segmented.set(CoreAnnotations.TextAnnotation.class, surface);
        segmented.set(CoreAnnotations.ArabicSegAnnotation.class, "1");

        // Character offsets into the input line, taken from the labels at the span edges.
        final int beginOffset = iobLabels.get(firstLabel).beginPosition();
        final int endOffset = iobLabels.get(pastLastLabel - 1).endPosition();
        segmented.setOriginalText(line.substring(beginOffset, endOffset));
        segmented.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, beginOffset);
        segmented.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endOffset);

        result.add(segmented);
    }
    return result;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) IntPair(edu.stanford.nlp.util.IntPair)

Example 22 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class CorefMentionFinder method extractNamedEntityModifiers.

/**
 * Extracts mentions which have the same string as another stand-alone mention.
 *
 * <p>For every sentence, every token position and every string in {@code neStrings}, the
 * tokens starting at that position are joined with single spaces and compared
 * (case-insensitively) with the named-entity string. On a match the span may be widened to
 * take in a following possessive {@code 's} (tagged {@code POS}) and/or a preceding
 * determiner (tagged {@code DT}), each only if the parse tree's join node for the widened
 * span starts and ends exactly on the widened span's first and last leaves. If the final
 * span is not yet in the sentence's span set, a new {@link Mention} is appended to the
 * sentence's predicted mentions and the span is recorded.
 *
 * <p>The phrase comparison is done before any parse-tree work, so the tree is only
 * consulted for candidate spans that can actually produce a mention.
 *
 * @param sentences          the sentences to scan
 * @param mentionSpanSetList per-sentence sets of already known mention spans; updated in place
 * @param predictedMentions  per-sentence mention lists; new mentions are appended in place
 * @param neStrings          space-separated named-entity strings to look for
 */
protected static void extractNamedEntityModifiers(List<CoreMap> sentences, List<Set<IntPair>> mentionSpanSetList, List<List<Mention>> predictedMentions, Set<String> neStrings) {
    for (int i = 0, sz = sentences.size(); i < sz; i++) {
        List<Mention> mentions = predictedMentions.get(i);
        CoreMap sent = sentences.get(i);
        List<CoreLabel> tokens = sent.get(TokensAnnotation.class);
        Set<IntPair> mentionSpanSet = mentionSpanSetList.get(i);
        for (int j = 0, tSize = tokens.size(); j < tSize; j++) {
            for (String ne : neStrings) {
                int len = ne.split(" ").length;
                if (j + len > tSize) {
                    continue;
                }
                StringBuilder sb = new StringBuilder();
                for (int k = 0; k < len; k++) {
                    sb.append(tokens.get(k + j).word()).append(" ");
                }
                String phrase = sb.toString().trim();
                // Only a matching phrase can yield a mention, so skip the tree work otherwise.
                if (!phrase.equalsIgnoreCase(ne)) {
                    continue;
                }
                int beginIndex = j;
                int endIndex = j + len;
                // include "'s" if it belongs to this named entity
                if (endIndex < tSize && tokens.get(endIndex).word().equals("'s") && tokens.get(endIndex).tag().equals("POS")) {
                    Tree tree = sent.get(TreeAnnotation.class);
                    if (joinNodeIsBoundedBy(tree, beginIndex, endIndex)) {
                        endIndex++;
                    }
                }
                // include DT if it belongs to this named entity
                if (beginIndex > 0 && tokens.get(beginIndex - 1).tag().equals("DT")) {
                    Tree tree = sent.get(TreeAnnotation.class);
                    if (joinNodeIsBoundedBy(tree, beginIndex - 1, endIndex - 1)) {
                        beginIndex--;
                    }
                }
                IntPair span = new IntPair(beginIndex, endIndex);
                if (!mentionSpanSet.contains(span)) {
                    int dummyMentionId = -1;
                    Mention m = new Mention(
                            dummyMentionId,
                            beginIndex,
                            endIndex,
                            tokens,
                            sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class),
                            // prefer the enhanced dependencies, fall back to the basic ones
                            sent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class) != null
                                    ? sent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)
                                    : sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class),
                            new ArrayList<>(tokens.subList(beginIndex, endIndex)));
                    mentions.add(m);
                    mentionSpanSet.add(span);
                }
            }
        }
    }
}

/**
 * Reports whether the node returned by {@code tree.joinNode} for the leaves at
 * {@code firstLeaf} and {@code lastLeaf} has exactly those two leaves as its first and
 * last leaf (compared by identity).
 */
private static boolean joinNodeIsBoundedBy(Tree tree, int firstLeaf, int lastLeaf) {
    List<Tree> leaves = tree.getLeaves();
    Tree sToken = leaves.get(firstLeaf);
    Tree eToken = leaves.get(lastLeaf);
    List<Tree> joinLeaves = tree.joinNode(sToken, eToken).getLeaves();
    return sToken == joinLeaves.get(0) && eToken == joinLeaves.get(joinLeaves.size() - 1);
}
Also used : SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) IntPair(edu.stanford.nlp.util.IntPair) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Mention(edu.stanford.nlp.coref.data.Mention) Tree(edu.stanford.nlp.trees.Tree) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 23 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class DocumentPreprocessor method findSpeakersInArticle.

/**
 * Walks all tokens of the document in order, detects quotation stretches from the tokens'
 * {@code UtteranceAnnotation}, and calls {@code findQuotationSpeaker} once per stretch.
 *
 * <p>A stretch opens at the first token with a non-zero utterance index seen while no
 * stretch is open, and closes at the next token whose utterance index is zero; the closing
 * position passed on is that zero-utterance token. A stretch still open after the last
 * token is closed at the last token of the last sentence. The utterance number passed on
 * is the one from the token that opened the stretch.
 *
 * @param doc  the document whose sentences are scanned
 * @param dict dictionaries handed through to {@code findQuotationSpeaker}
 */
private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
    final List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
    IntPair quoteStart = null;
    int openUtterance = -1;
    boolean quoteOpen = false;

    for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
        final List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
        for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
            final int utterance = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
            final boolean tokenQuoted = utterance != 0;
            if (tokenQuoted == quoteOpen) {
                // No transition into or out of a quotation at this token.
                continue;
            }
            if (tokenQuoted) {
                openUtterance = utterance;
                quoteOpen = true;
                quoteStart = new IntPair(sentIdx, tokIdx);
            } else {
                quoteOpen = false;
                final IntPair quoteEnd = new IntPair(sentIdx, tokIdx);
                findQuotationSpeaker(doc, openUtterance, sentences, quoteStart, quoteEnd, dict);
            }
        }
    }

    if (quoteOpen) {
        // The quotation runs to the end of the document: close it at the very last token.
        final int lastSentence = sentences.size() - 1;
        final int lastToken = sentences.get(lastSentence).get(CoreAnnotations.TokensAnnotation.class).size() - 1;
        final IntPair quoteEnd = new IntPair(lastSentence, lastToken);
        findQuotationSpeaker(doc, openUtterance, sentences, quoteStart, quoteEnd, dict);
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) IntPair(edu.stanford.nlp.util.IntPair)

Example 24 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class DocumentPreprocessor method findTwinMentionsRelaxed.

/**
 * Mark twin mentions: heads of the mentions are matched.
 *
 * <p>Per sentence, a predicted mention is first paired with the gold mention that has
 * exactly the same (start, end) span. Predicted mentions without an exact match are then
 * paired, in order, with a still-unpaired gold mention that has the same head index.
 * A paired predicted mention takes over the gold mention's ID, and both are flagged with
 * {@code hasTwin}.
 *
 * <p>If several predicted mentions share the span of one gold mention, each of them is
 * paired with that gold mention; the head-index bookkeeping tolerates the gold mention
 * having already been consumed.
 *
 * @param doc the document whose gold and predicted mentions are compared; mentions are
 *            updated in place
 */
private static void findTwinMentionsRelaxed(Document doc) {
    for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
        List<Mention> golds = doc.goldMentions.get(sentNum);
        List<Mention> predicts = doc.predictedMentions.get(sentNum);
        Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
        Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
        for (Mention g : golds) {
            goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
            LinkedList<Mention> sameHead = goldMentionHeadPositions.get(g.headIndex);
            if (sameHead == null) {
                sameHead = new LinkedList<>();
                goldMentionHeadPositions.put(g.headIndex, sameHead);
            }
            sameHead.add(g);
        }

        // Pass 1: exact span matches; everything else is kept for the head-based pass.
        List<Mention> remains = new ArrayList<>();
        for (Mention p : predicts) {
            Mention g = goldMentionPositions.get(new IntPair(p.startIndex, p.endIndex));
            if (g == null) {
                remains.add(p);
                continue;
            }
            p.mentionID = g.mentionID;
            p.hasTwin = true;
            g.hasTwin = true;
            // The bucket may already be gone if an earlier predicted mention with the same
            // span consumed this gold mention; dereferencing it blindly would throw.
            LinkedList<Mention> sameHead = goldMentionHeadPositions.get(g.headIndex);
            if (sameHead != null) {
                sameHead.remove(g);
                if (sameHead.isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            }
        }

        // Pass 2: pair leftovers with an unpaired gold mention that has the same head index.
        for (Mention r : remains) {
            LinkedList<Mention> candidates = goldMentionHeadPositions.get(r.headIndex);
            if (candidates == null) {
                continue;
            }
            // Buckets are removed as soon as they become empty, so poll() yields a mention.
            Mention g = candidates.poll();
            r.mentionID = g.mentionID;
            r.hasTwin = true;
            g.hasTwin = true;
            if (candidates.isEmpty()) {
                goldMentionHeadPositions.remove(g.headIndex);
            }
        }
    }
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.stanford.nlp.util.IntPair) LinkedList(java.util.LinkedList)

Example 25 with IntPair

use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.

the class DocumentPreprocessor method findTwinMentionsStrict.

/**
 * Mark twin mentions: All mention boundaries should be matched.
 *
 * <p>Per sentence, gold mentions are grouped by their (start, end) span. Each predicted
 * mention whose span has at least one unpaired gold mention left is paired with the
 * remaining gold mention of lowest mention ID: the predicted mention takes over that ID,
 * both are flagged with {@code hasTwin}, and the gold mention is removed from its group so
 * it cannot be paired twice. Predicted mentions whose span has no unpaired gold mention
 * left are skipped.
 *
 * @param doc the document whose gold and predicted mentions are compared; mentions are
 *            updated in place
 */
private static void findTwinMentionsStrict(Document doc) {
    for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
        List<Mention> golds = doc.goldMentions.get(sentNum);
        List<Mention> predicts = doc.predictedMentions.get(sentNum);
        // For CoNLL training there are some documents with gold mentions with the same position offsets
        // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
        //  (Packwood - Roth)
        CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
        for (Mention g : golds) {
            IntPair ip = new IntPair(g.startIndex, g.endIndex);
            if (goldMentionPositions.containsKey(ip)) {
                // Report the IDs already stored under this span alongside the new one.
                StringBuilder existingMentions = new StringBuilder();
                for (Mention eg : goldMentionPositions.get(ip)) {
                    if (existingMentions.length() > 0) {
                        existingMentions.append(",");
                    }
                    existingMentions.append(eg.mentionID);
                }
                Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
            }
            //assert(!goldMentionPositions.containsKey(ip));
            goldMentionPositions.add(ip, g);
        }
        for (Mention p : predicts) {
            IntPair pos = new IntPair(p.startIndex, p.endIndex);
            if (!goldMentionPositions.containsKey(pos)) {
                continue;
            }
            Collection<Mention> cm = goldMentionPositions.get(pos);
            // Pick the remaining gold mention with the smallest ID.
            int minId = Integer.MAX_VALUE;
            Mention g = null;
            for (Mention m : cm) {
                if (m.mentionID < minId) {
                    g = m;
                    minId = m.mentionID;
                }
            }
            if (g == null) {
                // Every gold mention at this span was already paired (the key outlives its
                // emptied collection), so there is nothing left to pair with.
                continue;
            }
            cm.remove(g);
            p.mentionID = g.mentionID;
            p.hasTwin = true;
            g.hasTwin = true;
        }
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) IntPair(edu.stanford.nlp.util.IntPair)

Aggregations

IntPair (edu.stanford.nlp.util.IntPair): 37; CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 19; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 17; SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 16; Mention (edu.stanford.nlp.coref.data.Mention): 14; TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 11; CoreMap (edu.stanford.nlp.util.CoreMap): 9; SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 8; ArrayList (java.util.ArrayList): 8; IndexedWord (edu.stanford.nlp.ling.IndexedWord): 7; Tree (edu.stanford.nlp.trees.Tree): 7; List (java.util.List): 6; ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 5; SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge): 3; TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher): 3; TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern): 3; CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap): 3; Set (java.util.Set): 3; BasicDependenciesAnnotation (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation): 2; Constituent (edu.stanford.nlp.trees.Constituent): 2