Search in sources :

Example 1 with Pattern

Usage of edu.stanford.nlp.patterns.Pattern in the CoreNLP project by stanfordnlp.

Source: the call() method of the class ApplyDepPatterns.

@Override
public Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
    // Applies every dependency (semgrex) pattern to every sentence assigned to this
    // worker and collects:
    //   first:  how often each candidate phrase was extracted by each pattern
    //   second: the (sentence id, start, end-inclusive) token spans each pattern matched
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    for (String sentid : sentids) {
        DataInstance sent = sents.get(sentid);
        List<CoreLabel> tokens = sent.getTokens();
        for (Map.Entry<SemgrexPattern, E> pEn : patterns.entrySet()) {
            if (pEn.getKey() == null)
                throw new RuntimeException("why is the pattern " + pEn + " null?");
            // NOTE(review): assumes every DataInstance in this shard is a DataInstanceDep;
            // a plain DataInstance here would throw ClassCastException.
            SemanticGraph graph = ((DataInstanceDep) sent).getGraph();
            Collection<ExtractedPhrase> matched = getMatchedTokensIndex(graph, pEn.getKey(), sent, label);
            for (ExtractedPhrase match : matched) {
                int s = match.startIndex;
                // e is exclusive: one past the last matched token index.
                int e = match.endIndex + 1;
                StringBuilder phrase = new StringBuilder();
                StringBuilder phraseLemma = new StringBuilder();
                // true iff at least one token in the span is not already labeled with `label`
                boolean useWordNotLabeled = false;
                boolean doNotUse = false;
                // If the neighboring words are already labeled, club them into the span,
                // up to the configured maximum compound length for this label.
                if (constVars.clubNeighboringLabeledWords) {
                    for (int i = s - 1; i >= 0; i--) {
                        if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                            s = i;
                        } else
                            break;
                    }
                    for (int i = e; i < tokens.size(); i++) {
                        if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (i - s + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                            e = i;
                        } else
                            break;
                    }
                }
                // Track which span positions contributed a word, so we can discard phrases
                // with stop words in the middle while still allowing stop words that were
                // dropped from the ends (when removeStopWordsFromSelectedPhrases is true).
                boolean[] addedindices = new boolean[e - s];
                for (int i = s; i < e; i++) {
                    CoreLabel l = tokens.get(i);
                    l.set(PatternsAnnotations.MatchedPattern.class, true);
                    if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                        l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                    Pattern pSur = pEn.getValue();
                    assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                    assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                    l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                    // Skip phrases containing any token explicitly marked to be ignored for this label.
                    for (Map.Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                        if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                            doNotUse = true;
                        }
                    }
                    boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                    if (removePhrasesWithStopWords && containsStop) {
                        doNotUse = true;
                    } else {
                        if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                            if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label)) {
                                useWordNotLabeled = true;
                            }
                            phrase.append(' ').append(l.word());
                            phraseLemma.append(' ').append(l.lemma());
                            addedindices[i - s] = true;
                        }
                    }
                }
                // Reject spans where a word was dropped strictly between two kept words
                // (i.e. a stop word in the interior of the phrase).
                for (int i = 1; i < addedindices.length - 1; i++) {
                    if (addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
                        doNotUse = true;
                        break;
                    }
                }
                if (!doNotUse && useWordNotLabeled) {
                    matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                    allFreq.incrementCount(CandidatePhrase.createOrGet(phrase.toString().trim(), phraseLemma.toString().trim(), match.getFeatures()), pEn.getValue(), 1.0);
                }
            }
        }
    }
    return new Pair<>(allFreq, matchedTokensByPat);
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) DataInstance(edu.stanford.nlp.patterns.DataInstance) CandidatePhrase(edu.stanford.nlp.patterns.CandidatePhrase) HashSet(java.util.HashSet) IntPair(edu.stanford.nlp.util.IntPair) Pair(edu.stanford.nlp.util.Pair) Pattern(edu.stanford.nlp.patterns.Pattern) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) Map(java.util.Map) CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) PatternsAnnotations(edu.stanford.nlp.patterns.PatternsAnnotations)

Example 2 with Pattern

Usage of edu.stanford.nlp.patterns.Pattern in the CoreNLP project by stanfordnlp.

Source: the getMatchedTokensIndex() method of the class ApplyDepPatterns.

/**
 * Runs the given semgrex pattern over the sentence's dependency graph and
 * returns the phrases it extracts.
 *
 * @param graph   dependency graph of the sentence
 * @param pattern semgrex pattern to match against the graph
 * @param sent    the sentence (used for its token list)
 * @param label   label whose compound-length limit bounds extracted phrases
 * @return the phrases extracted by the pattern (never null; may be empty)
 */
private Collection<ExtractedPhrase> getMatchedTokensIndex(SemanticGraph graph, SemgrexPattern pattern, DataInstance sent, String label) {
    // TODO: look at the ignoreCommonTags flag
    ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(false, PatternFactory.numWordsCompoundMapped.get(label));
    Collection<IntPair> outputIndices = new ArrayList<>();
    boolean findSubTrees = true;
    List<CoreLabel> tokensC = sent.getTokens();
    // TODO: see if you can get rid of this (only used for matchedGraphs)
    List<String> tokens = tokensC.stream().map(CoreLabel::word).collect(Collectors.toList());
    List<String> outputPhrases = new ArrayList<>();
    List<ExtractedPhrase> extractedPhrases = new ArrayList<>();
    // NOTE(review): a feature-extracting Function (counting PARENTREL-* of the matched
    // vertex) used to be built here but was never passed to any call — removed as dead
    // code. Re-add it if getSemGrexPatternNodes grows a feature-extractor parameter.
    extract.getSemGrexPatternNodes(graph, tokens, outputPhrases, outputIndices, pattern, findSubTrees, extractedPhrases, constVars.matchLowerCaseContext, matchingWordRestriction);
    return extractedPhrases;
}
Also used : Pattern(edu.stanford.nlp.patterns.Pattern) Callable(java.util.concurrent.Callable) Function(java.util.function.Function) ArrayList(java.util.ArrayList) IntPair(edu.stanford.nlp.util.IntPair) HashSet(java.util.HashSet) ConstantsAndVariables(edu.stanford.nlp.patterns.ConstantsAndVariables) Counter(edu.stanford.nlp.stats.Counter) Map(java.util.Map) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) Pair(edu.stanford.nlp.util.Pair) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) IndexedWord(edu.stanford.nlp.ling.IndexedWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) GrammaticalRelation(edu.stanford.nlp.trees.GrammaticalRelation) Predicate(java.util.function.Predicate) Collection(java.util.Collection) Set(java.util.Set) DataInstance(edu.stanford.nlp.patterns.DataInstance) Collectors(java.util.stream.Collectors) List(java.util.List) CandidatePhrase(edu.stanford.nlp.patterns.CandidatePhrase) PatternsAnnotations(edu.stanford.nlp.patterns.PatternsAnnotations) Triple(edu.stanford.nlp.util.Triple) PatternFactory(edu.stanford.nlp.patterns.PatternFactory) ArrayList(java.util.ArrayList) IntPair(edu.stanford.nlp.util.IntPair) Function(java.util.function.Function) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Counter(edu.stanford.nlp.stats.Counter) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) GrammaticalRelation(edu.stanford.nlp.trees.GrammaticalRelation) IndexedWord(edu.stanford.nlp.ling.IndexedWord) IntPair(edu.stanford.nlp.util.IntPair) Pair(edu.stanford.nlp.util.Pair)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 CandidatePhrase (edu.stanford.nlp.patterns.CandidatePhrase)2 DataInstance (edu.stanford.nlp.patterns.DataInstance)2 Pattern (edu.stanford.nlp.patterns.Pattern)2 PatternsAnnotations (edu.stanford.nlp.patterns.PatternsAnnotations)2 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)2 SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)2 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)2 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)2 IntPair (edu.stanford.nlp.util.IntPair)2 Pair (edu.stanford.nlp.util.Pair)2 Triple (edu.stanford.nlp.util.Triple)2 HashSet (java.util.HashSet)2 Map (java.util.Map)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 IndexedWord (edu.stanford.nlp.ling.IndexedWord)1 ConstantsAndVariables (edu.stanford.nlp.patterns.ConstantsAndVariables)1 PatternFactory (edu.stanford.nlp.patterns.PatternFactory)1 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)1 Counter (edu.stanford.nlp.stats.Counter)1