Search in sources :

Example 36 with IntPair

Use of `edu.stanford.nlp.util.IntPair` in the CoreNLP project by stanfordnlp.

Source: the `getSemGrexPatternNodes` method of the class `ExtractPhraseFromPattern`.

/*
 * Runs the given SemgrexPattern over the SemanticGraph {@code g} and collects
 * the matched nodes.
 *
 * For each match produced by the SemgrexMatcher:
 *  - the nodes bound to "node" and "parent" in the pattern are retrieved;
 *  - matches whose depth from the parent exceeds {@code maxDepth} are skipped;
 *  - when DEBUG > 3, the (sentence text, graph) pair is appended to
 *    {@code matchedGraphsForPattern} for later inspection;
 *  - {@code printSubGraph} is called on the matched node, which (as a side
 *    effect) fills {@code outputNodes}, {@code outputIndices} and
 *    {@code extractedPhrases} with the extracted phrase data.
 *
 * @param g                dependency graph of the sentence
 * @param tokens           sentence tokens, joined to render matched text in debug mode
 *                         and sliced by printSubGraph to build phrase strings
 * @param outputNodes      accumulator for matched phrase strings (filled by printSubGraph)
 * @param outputIndices    accumulator for matched phrase (start, end) index pairs
 * @param pattern          semgrex pattern; expected to bind "node" and "parent"
 * @param findSubTrees     whether printSubGraph should recurse into sub-phrases
 * @param extractedPhrases accumulator for ExtractedPhrase objects
 * @param lowercase        whether to match against a lowercased graph
 * @param acceptWord       filter deciding which words may enter a phrase
 * @return the set of "parent" nodes of all matches that satisfied the depth check
 */
public Set<IndexedWord> getSemGrexPatternNodes(SemanticGraph g, List<String> tokens, Collection<String> outputNodes, Collection<IntPair> outputIndices, SemgrexPattern pattern, boolean findSubTrees, Collection<ExtractedPhrase> extractedPhrases, boolean lowercase, Function<CoreLabel, Boolean> acceptWord) {
    Set<IndexedWord> foundWordsParents = new HashSet<>();
    SemgrexMatcher m = pattern.matcher(g, lowercase);
    while (m.find()) {
        IndexedWord w = m.getNode("node");
        IndexedWord parent = m.getNode("parent");
        // Skip matches deeper than maxDepth below the parent. (The second
        // IntPair component is unused by the depth check at this call site.)
        if (!checkIfSatisfiedMaxDepth(g, parent, w, new IntPair(maxDepth, 0)))
            continue;
        if (DEBUG > 3) {
            // Debug-only bookkeeping: remember which sentences/graphs this
            // pattern matched. get-then-put is kept because the concrete map
            // type of matchedGraphsForPattern is not visible here.
            List<Pair<String, SemanticGraph>> matchedGraphs = matchedGraphsForPattern.get(pattern);
            if (matchedGraphs == null)
                matchedGraphs = new ArrayList<>();
            matchedGraphs.add(new Pair<>(StringUtils.join(tokens, " "), g));
            matchedGraphsForPattern.put(pattern, matchedGraphs);
        }
        foundWordsParents.add(parent);
        // No extra cutoff relations are supplied here; printSubGraph still
        // appends the class-level cutoffRelations internally.
        List<String> cutoffrelations = new ArrayList<>();
        printSubGraph(g, w, cutoffrelations, tokens, outputNodes, outputIndices, new ArrayList<>(), new ArrayList<>(), findSubTrees, extractedPhrases, pattern, acceptWord);
    }
    return foundWordsParents;
}
Also used: SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) ArrayList(java.util.ArrayList) IntPair(edu.stanford.nlp.util.IntPair) IndexedWord(edu.stanford.nlp.ling.IndexedWord) HashSet(java.util.HashSet) IntPair(edu.stanford.nlp.util.IntPair) Pair(edu.stanford.nlp.util.Pair)

Example 37 with IntPair

Use of `edu.stanford.nlp.util.IntPair` in the CoreNLP project by stanfordnlp.

Source: the `printSubGraph` method of the class `ExtractPhraseFromPattern`.

// Extracts the phrase spanned by the descendants of {@code w} in {@code g}
// and records it (text, indices, ExtractedPhrase) into the output
// accumulators. The (startIndex, endIndex) pair stored in
// listOfOutputIndices is 0-based and inclusive of endIndex.
//
// Behavior, in order:
//  1. Skips w if already seen or explicitly excluded (doNotAddThese).
//  2. Recurses first into w's "conj_and" descendants, then excludes them from
//     w's own phrase so conjuncts become separate phrases.
//  3. Collects w's descendants, cut off at cutoffRelations plus
//     additionalCutOffRels, and takes the [min, max] token-index window over
//     them, clamped to maxPhraseLength tokens.
//  4. Adds the phrase text/indices/ExtractedPhrase if not already present,
//     and optionally recurses into each descendant when findSubTrees is set.
//
// @param g                   dependency graph of the sentence
// @param w                   root word of the phrase to extract
// @param additionalCutOffRels extra relations (may be null) at which to stop
//                            descending, on top of the class-level cutoffRelations
// @param textTokens          sentence tokens; subList'd to build the phrase string
// @param listOfOutput        accumulator of phrase strings (also used for dedup)
// @param listOfOutputIndices accumulator of inclusive (start, end) index pairs
// @param seenNodes           words already processed in this extraction (mutated)
// @param doNotAddThese       words excluded from any phrase (mutated: conj_and nodes added)
// @param findSubTrees        whether to recurse into each descendant word
// @param extractedPhrases    accumulator of ExtractedPhrase objects
// @param pattern             pattern that triggered this extraction (stored in the phrase)
// @param acceptWord          filter deciding which words may enter the phrase
public void printSubGraph(SemanticGraph g, IndexedWord w, List<String> additionalCutOffRels, List<String> textTokens, Collection<String> listOfOutput, Collection<IntPair> listOfOutputIndices, List<IndexedWord> seenNodes, List<IndexedWord> doNotAddThese, boolean findSubTrees, Collection<ExtractedPhrase> extractedPhrases, SemgrexPattern pattern, Function<CoreLabel, Boolean> acceptWord) {
    try {
        if (seenNodes.contains(w))
            return;
        seenNodes.add(w);
        if (doNotAddThese.contains(w))
            return;
        // Extract each "conj_and" conjunct as its own phrase first, then keep
        // those nodes out of w's phrase.
        List<IndexedWord> andNodes = new ArrayList<>();
        descendantsWithReln(g, w, "conj_and", new ArrayList<>(), andNodes);
        for (IndexedWord w1 : andNodes) {
            printSubGraph(g, w1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
        }
        doNotAddThese.addAll(andNodes);
        List<String> allCutOffRels = new ArrayList<>();
        if (additionalCutOffRels != null)
            allCutOffRels.addAll(additionalCutOffRels);
        allCutOffRels.addAll(cutoffRelations);
        CollectionValuedMap<Integer, String> featPerToken = new CollectionValuedMap<>();
        Collection<String> feat = new ArrayList<>();
        GetPatternsFromDataMultiClass.getFeatures(g, w, true, feat, null);
        Set<IndexedWord> words = descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
        if (!words.isEmpty()) {
            // Token window [min, max] over the descendants (IndexedWord.index()
            // is 1-based), clamped to at most maxPhraseLength tokens.
            int min = Integer.MAX_VALUE, max = -1;
            for (IndexedWord word : words) {
                if (word.index() < min)
                    min = word.index();
                if (word.index() > max)
                    max = word.index();
            }
            if ((max - min + 1) > maxPhraseLength) {
                max = min + maxPhraseLength - 1;
            }
            // Stored indices are 0-based and inclusive of the end index.
            IntPair indices = new IntPair(min - 1, max - 1);
            String phrase = StringUtils.join(textTokens.subList(min - 1, max), " ").trim();
            feat.add("LENGTH-" + (max - min + 1));
            for (int i = min; i <= max; i++) feat.addAll(featPerToken.get(i));
            ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.asCounter(feat));
            // NOTE(review): doNotAddThese holds IndexedWord, so contains(phrase)
            // with a String is always false; kept for behavior compatibility,
            // but the intended check should be confirmed upstream.
            if (!listOfOutput.contains(phrase) && !doNotAddThese.contains(phrase)) {
                listOfOutput.add(phrase);
                if (!listOfOutputIndices.contains(indices)) {
                    listOfOutputIndices.add(indices);
                    extractedPhrases.add(extractedPh);
                }
                if (findSubTrees) {
                    for (IndexedWord word : words) if (!seenNodes.contains(word))
                        printSubGraph(g, word, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
                }
            }
        }
    } catch (Exception e) {
        // NOTE(review): broad catch-and-print deliberately preserved — callers
        // appear to rely on extraction being best-effort; a logger and a
        // narrower exception type would be preferable.
        e.printStackTrace();
    }
}
Also used: CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) ArrayList(java.util.ArrayList) IntPair(edu.stanford.nlp.util.IntPair) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Aggregations

IntPair (edu.stanford.nlp.util.IntPair)37 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)19 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)16 Mention (edu.stanford.nlp.coref.data.Mention)14 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)11 CoreMap (edu.stanford.nlp.util.CoreMap)9 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)8 ArrayList (java.util.ArrayList)8 IndexedWord (edu.stanford.nlp.ling.IndexedWord)7 Tree (edu.stanford.nlp.trees.Tree)7 List (java.util.List)6 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)5 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)3 TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher)3 TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern)3 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)3 Set (java.util.Set)3 BasicDependenciesAnnotation (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)2 Constituent (edu.stanford.nlp.trees.Constituent)2