Search in sources :

Example 1 with CollectionValuedMap

use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.

The following is the ApplyPatternsMulti class's call() method.

/**
 * Worker callable that applies a batch of surface patterns to this shard of sentences.
 * Runs a single {@code multiPatternMatcher} pass per sentence and, for every match of the
 * "$term" capture group, records the matched token span and counts the extracted
 * (phrase, lemma) pair under the pattern that fired.
 *
 * @return a pair of (per-pattern counts of matched (phrase, lemma) pairs,
 *         per-pattern matched spans as (sentence id, start token, end token) triples)
 * @throws Exception declared by {@code Callable}; propagated from matching/annotation lookups
 */
@Override
public Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
    //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
    // Pattern -> (sentence id, start, end) spans where it matched.
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    // (phrase, lemma) -> pattern -> match count.
    TwoDimensionalCounter<Pair<String, String>, E> allFreq = new TwoDimensionalCounter<>();
    for (String sentid : sentids) {
        List<CoreLabel> sent = sents.get(sentid).getTokens();
        //FIND_ALL is faster than FIND_NONOVERLAP
        Iterable<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.find(sent, SequenceMatcher.FindType.FIND_ALL);
        for (SequenceMatchResult<CoreMap> m : matched) {
            // Token span of the "$term" capture group; e is the exclusive end index.
            int s = m.start("$term");
            int e = m.end("$term");
            E matchedPat = patterns.get(m.pattern());
            // NOTE(review): the span recorded here is the raw match, BEFORE the
            // neighbor-clubbing adjustment of s/e below — confirm that is intended.
            matchedTokensByPat.add(matchedPat, new Triple<>(sentid, s, e));
            String phrase = "";
            String phraseLemma = "";
            boolean useWordNotLabeled = false;
            boolean doNotUse = false;
            //find if the neighboring words are labeled - if so - club them together
            if (constVars.clubNeighboringLabeledWords) {
                // Grow the span leftwards over tokens that already carry this label.
                // NOTE(review): if EVERY token before s is labeled, the loop finishes
                // without resetting s to 0 — verify that edge case is intended.
                for (int i = s - 1; i >= 0; i--) {
                    if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                        s = i + 1;
                        break;
                    }
                }
                // Grow the span rightwards symmetrically (same caveat when the whole
                // sentence tail is labeled: e is then left unchanged).
                for (int i = e; i < sent.size(); i++) {
                    if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                        e = i;
                        break;
                    }
                }
            }
            //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
            boolean[] addedindices = new boolean[e - s];
            Arrays.fill(addedindices, false);
            for (int i = s; i < e; i++) {
                CoreLabel l = sent.get(i);
                // Mark the token as matched and remember which pattern matched it.
                l.set(PatternsAnnotations.MatchedPattern.class, true);
                if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class))
                    l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                l.get(PatternsAnnotations.MatchedPatterns.class).add(matchedPat);
                // }
                // Discard the phrase if any token carries a class/value flagged as
                // "ignore during selection" for this label.
                for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                    if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                        doNotUse = true;
                    }
                }
                boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                if (removePhrasesWithStopWords && containsStop) {
                    doNotUse = true;
                } else {
                    if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                        // The phrase is only useful if at least one token is not already
                        // labeled with this label.
                        if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                            useWordNotLabeled = true;
                        }
                        phrase += " " + l.word();
                        phraseLemma += " " + l.lemma();
                        addedindices[i - s] = true;
                    }
                }
            }
            // Reject the phrase when a token was dropped strictly BETWEEN two kept tokens
            // (a stop word in the middle, as opposed to one trimmed at either end).
            for (int i = 0; i < addedindices.length; i++) {
                if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
                    doNotUse = true;
                    break;
                }
            }
            if (!doNotUse && useWordNotLabeled) {
                phrase = phrase.trim();
                phraseLemma = phraseLemma.trim();
                allFreq.incrementCount(new Pair<>(phrase, phraseLemma), matchedPat, 1.0);
            }
        }
    //      for (SurfacePattern pat : patterns.keySet()) {
    //        String patternStr = pat.toString();
    //
    //        TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
    //        if (pat == null || p == null)
    //          throw new RuntimeException("why is the pattern " + pat + " null?");
    //
    //        TokenSequenceMatcher m = p.getMatcher(sent);
    //        while (m.find()) {
    //
    //          int s = m.start("$term");
    //          int e = m.end("$term");
    //
    //          String phrase = "";
    //          String phraseLemma = "";
    //          boolean useWordNotLabeled = false;
    //          boolean doNotUse = false;
    //          for (int i = s; i < e; i++) {
    //            CoreLabel l = sent.get(i);
    //            l.set(PatternsAnnotations.MatchedPattern.class, true);
    //            if (restrictToMatched) {
    //              tokensMatchedPattern.add(sentid, i);
    //            }
    //            for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) {
    //              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
    //                doNotUse = true;
    //              }
    //            }
    //            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords);
    //            if (removePhrasesWithStopWords && containsStop) {
    //              doNotUse = true;
    //            } else {
    //              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
    //
    //                if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) {
    //                  useWordNotLabeled = true;
    //                }
    //                phrase += " " + l.word();
    //                phraseLemma += " " + l.lemma();
    //
    //              }
    //            }
    //          }
    //          if (!doNotUse && useWordNotLabeled) {
    //            phrase = phrase.trim();
    //            phraseLemma = phraseLemma.trim();
    //            allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0);
    //          }
    //        }
    //      }
    }
    return new Pair<>(allFreq, matchedTokensByPat);
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) SequenceMatchResult(edu.stanford.nlp.ling.tokensregex.SequenceMatchResult) Pair(edu.stanford.nlp.util.Pair) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 2 with CollectionValuedMap

use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.

The following is the CoNLLMentionExtractor class's extractGoldMentions method.

/**
 * Builds per-sentence lists of gold mentions from the document's coref chain map.
 * The first mention of each chain reuses the chain id as its mention id; subsequent
 * mentions get fresh ids above the largest chain id and point back via originalRef.
 * Side effect: updates the {@code maxID} field to the largest mention id seen.
 *
 * @param conllDoc the CoNLL document whose gold coref chains are read
 * @return one list of gold {@code Mention}s per sentence, indexed by sentence
 */
public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
    List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
    CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();
    // One (initially empty) mention list per sentence.
    List<List<Mention>> allGoldMentions = new ArrayList<>();
    for (int idx = 0; idx < sentences.size(); idx++) {
        allGoldMentions.add(new ArrayList<>());
    }
    // Fresh mention ids must start above the largest existing cluster id.
    int maxCorefClusterId = -1;
    for (String corefIdStr : corefChainMap.keySet()) {
        maxCorefClusterId = Math.max(maxCorefClusterId, Integer.parseInt(corefIdStr));
    }
    int nextMentionId = maxCorefClusterId + 1;
    for (Map.Entry<String, Collection<CoreMap>> chainEntry : corefChainMap.entrySet()) {
        int clusterId = Integer.parseInt(chainEntry.getKey());
        boolean firstInCluster = true;
        for (CoreMap m : chainEntry.getValue()) {
            Mention mention = new Mention();
            mention.goldCorefClusterID = clusterId;
            if (firstInCluster) {
                // The cluster's first mention reuses the cluster id itself.
                mention.mentionID = clusterId;
                mention.originalRef = -1;
                firstInCluster = false;
            } else {
                mention.mentionID = nextMentionId++;
                mention.originalRef = clusterId;
            }
            maxID = Math.max(maxID, mention.mentionID);
            int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
            CoreMap sent = sentences.get(sentIndex);
            // Convert document-level token offsets to sentence-relative offsets.
            int sentTokenBegin = sent.get(CoreAnnotations.TokenBeginAnnotation.class);
            mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenBegin;
            mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenBegin;
            // will be set by arrange
            mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
            // Mention dependency graph is the enhanced dependency graph of the sentence
            mention.dependency = sent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
            allGoldMentions.get(sentIndex).add(mention);
        }
    }
    return allGoldMentions;
}
Also used : ArrayList(java.util.ArrayList) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) CoreMap(edu.stanford.nlp.util.CoreMap) CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) Map(java.util.Map) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 3 with CollectionValuedMap

use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.

The following is the Document class's findTwinMentionsStrict method.

/** Mark twin mentions: All mention boundaries should be matched */
/**
 * Mark twin mentions: a predicted mention is twinned with a gold mention only when
 * both token boundaries match exactly. Twinned mentions get {@code twinless = false}
 * and the predicted mention inherits the gold mention's id; remaining twinless
 * predicted mentions get their ids shifted by 10000 for easy recognition.
 */
private void findTwinMentionsStrict() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
        List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
        List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
        // For CoNLL training there are some documents with gold mentions with the same position offsets
        // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
        //  (Packwood - Roth)
        CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
        // Pass 1: index gold mentions by (start, end); warn on duplicate offsets.
        for (Mention g : golds) {
            IntPair ip = new IntPair(g.startIndex, g.endIndex);
            if (goldMentionPositions.containsKey(ip)) {
                StringBuilder existingMentions = new StringBuilder();
                String sep = "";
                for (Mention eg : goldMentionPositions.get(ip)) {
                    existingMentions.append(sep).append(eg.mentionID);
                    sep = ",";
                }
                SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
            }
            //assert(!goldMentionPositions.containsKey(ip));
            goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
        }
        // Pass 2: twin each predicted mention with some gold mention at the same span,
        // consuming the gold mention so it cannot be matched twice.
        for (Mention p : predicts) {
            IntPair span = new IntPair(p.startIndex, p.endIndex);
            if (!goldMentionPositions.containsKey(span)) {
                continue;
            }
            Collection<Mention> candidates = goldMentionPositions.get(span);
            Mention g = candidates.iterator().next();
            candidates.remove(g);
            p.mentionID = g.mentionID;
            p.twinless = false;
            g.twinless = false;
        }
        // temp: for making easy to recognize twinless mention
        for (Mention p : predicts) {
            if (p.twinless) {
                p.mentionID += 10000;
            }
        }
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) IntPair(edu.stanford.nlp.util.IntPair)

Example 4 with CollectionValuedMap

use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.

The following is the DocumentPreprocessor class's findTwinMentionsStrict method.

/** Mark twin mentions: All mention boundaries should be matched */
/**
 * Mark twin mentions: a predicted mention is twinned with a gold mention only when
 * both token boundaries match exactly. When several gold mentions share the same
 * span, the one with the smallest mention id is chosen (deterministic tie-break).
 * Twinned mentions are flagged via {@code hasTwin} and the predicted mention
 * inherits the gold mention's id.
 *
 * @param doc the document whose gold and predicted mentions are matched in place
 */
private static void findTwinMentionsStrict(Document doc) {
    for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
        List<Mention> golds = doc.goldMentions.get(sentNum);
        List<Mention> predicts = doc.predictedMentions.get(sentNum);
        // For CoNLL training there are some documents with gold mentions with the same position offsets
        // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
        //  (Packwood - Roth)
        CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
        // Pass 1: index gold mentions by (start, end); warn on duplicate offsets.
        for (Mention g : golds) {
            IntPair ip = new IntPair(g.startIndex, g.endIndex);
            if (goldMentionPositions.containsKey(ip)) {
                StringBuilder existingMentions = new StringBuilder();
                String sep = "";
                for (Mention eg : goldMentionPositions.get(ip)) {
                    existingMentions.append(sep).append(eg.mentionID);
                    sep = ",";
                }
                Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
            }
            //assert(!goldMentionPositions.containsKey(ip));
            goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
        }
        // Pass 2: twin each predicted mention with the lowest-id gold mention at the
        // same span, consuming that gold mention so it cannot be matched twice.
        for (Mention p : predicts) {
            IntPair span = new IntPair(p.startIndex, p.endIndex);
            if (goldMentionPositions.containsKey(span)) {
                Collection<Mention> candidates = goldMentionPositions.get(span);
                Mention g = null;
                for (Mention cand : candidates) {
                    if (g == null || cand.mentionID < g.mentionID) {
                        g = cand;
                    }
                }
                candidates.remove(g);
                p.mentionID = g.mentionID;
                p.hasTwin = true;
                g.hasTwin = true;
            }
        }
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) IntPair(edu.stanford.nlp.util.IntPair)

Example 5 with CollectionValuedMap

use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.

The following is the ApplyPatterns class's call method.

/**
 * Worker callable that applies each compiled token-sequence pattern to every sentence
 * in this shard. For each "$term" match it records the matched span, counts the
 * extracted {@code CandidatePhrase} under the firing pattern, and collects phrases
 * whose tokens were all already labeled.
 *
 * @return a triple of (per-pattern counts of candidate phrases,
 *         per-pattern matched spans as (sentence id, start, inclusive end) triples,
 *         set of phrases that were already fully labeled)
 * @throws Exception rethrown after printing the stack trace
 */
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
    // CollectionValuedMap<String, Integer>();
    try {
        // Phrases all of whose kept tokens already carry the label.
        Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
        // phrase -> pattern -> match count.
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
        // Pattern -> (sentence id, start, inclusive end) spans where it matched.
        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
        for (String sentid : sentids) {
            List<CoreLabel> sent = sents.get(sentid).getTokens();
            for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
                if (pEn.getKey() == null)
                    throw new RuntimeException("why is the pattern " + pEn + " null?");
                TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
                //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.setBranchLimit(5);
                while (m.find()) {
                    // Token span of the "$term" capture group; e is the exclusive end index.
                    int s = m.start("$term");
                    int e = m.end("$term");
                    assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
                    String phrase = "";
                    String phraseLemma = "";
                    boolean useWordNotLabeled = false;
                    boolean doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords) {
                        // Grow the span leftwards over tokens that already carry this label.
                        // NOTE(review): if EVERY token before s is labeled, the loop finishes
                        // without resetting s to 0 — verify that edge case is intended.
                        for (int i = s - 1; i >= 0; i--) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                s = i + 1;
                                break;
                            }
                        }
                        // Grow the span rightwards symmetrically (same caveat for a fully
                        // labeled sentence tail).
                        for (int i = e; i < sent.size(); i++) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                e = i;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    boolean[] addedindices = new boolean[e - s];
                    Arrays.fill(addedindices, false);
                    for (int i = s; i < e; i++) {
                        CoreLabel l = sent.get(i);
                        // Mark the token as matched and remember which pattern matched it.
                        l.set(PatternsAnnotations.MatchedPattern.class, true);
                        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                        SurfacePattern pSur = (SurfacePattern) pEn.getValue();
                        assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                        assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                        l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                        // Discard the phrase if any token carries a class/value flagged as
                        // "ignore during selection" for this label.
                        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                                doNotUse = true;
                            }
                        }
                        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop) {
                            doNotUse = true;
                        } else {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                                // Track whether at least one kept token is not already labeled.
                                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.word();
                                phraseLemma += " " + l.lemma();
                                addedindices[i - s] = true;
                            }
                        }
                    }
                    // Reject the phrase when a token was dropped strictly BETWEEN two kept
                    // tokens (a stop word in the middle, not trimmed at an end).
                    for (int i = 0; i < addedindices.length; i++) {
                        if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse) {
                        // Note: the span is recorded with an INCLUSIVE end (e - 1), unlike
                        // the exclusive e used internally above.
                        matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                        phrase = phrase.trim();
                        if (!phrase.isEmpty()) {
                            phraseLemma = phraseLemma.trim();
                            CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
                            allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
                            if (!useWordNotLabeled)
                                alreadyLabeledPhrases.add(candPhrase);
                        }
                    }
                }
            }
        }
        return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    } catch (Exception e) {
        // NOTE(review): printStackTrace before rethrow duplicates error reporting and
        // bypasses the project's logging — consider logging via Redwood instead.
        e.printStackTrace();
        throw e;
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Aggregations

CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)6 IntPair (edu.stanford.nlp.util.IntPair)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 Triple (edu.stanford.nlp.util.Triple)2 ArrayList (java.util.ArrayList)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 IndexedWord (edu.stanford.nlp.ling.IndexedWord)1 SequenceMatchResult (edu.stanford.nlp.ling.tokensregex.SequenceMatchResult)1 TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher)1 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)1 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)1 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)1 Pair (edu.stanford.nlp.util.Pair)1 Collection (java.util.Collection)1 List (java.util.List)1 Map (java.util.Map)1