Search in sources :

Example 11 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

In the class GrammarCompactor, the method compactGrammar:

/**
 * Compacts the grammar specified by the Pair.
 *
 * @param grammar            a Pair of grammars, ordered UnaryGrammar BinaryGrammar.
 * @param allTrainPaths      a Map from String passive constituents to Lists of paths
 * @param allTestPaths       a Map from String passive constituents to Lists of paths
 * @param originalStateIndex the state Index of the uncompacted grammar
 * @return a Triple of the new state Index and the compacted grammars,
 *         ordered UnaryGrammar BinaryGrammar.
 */
public Triple<Index<String>, UnaryGrammar, BinaryGrammar> compactGrammar(Pair<UnaryGrammar, BinaryGrammar> grammar, Map<String, List<List<String>>> allTrainPaths, Map<String, List<List<String>>> allTestPaths, Index<String> originalStateIndex) {
    // The input prior is computed once over all training paths and shared by every category.
    inputPrior = computeInputPrior(allTrainPaths);
    this.stateIndex = originalStateIndex;
    Set<UnaryRule> unaryRules = Generics.newHashSet();
    Set<BinaryRule> binaryRules = Generics.newHashSet();
    // One TransducerGraph per passive category, built from the grammar's rules.
    Map<String, TransducerGraph> graphs = convertGrammarToGraphs(grammar, unaryRules, binaryRules);
    compactedGraphs = Generics.newHashSet();
    if (verbose) {
        System.out.println("There are " + graphs.size() + " categories to compact.");
    }
    int numCompacted = 0;
    Iterator<Entry<String, TransducerGraph>> it = graphs.entrySet().iterator();
    while (it.hasNext()) {
        Entry<String, TransducerGraph> categoryEntry = it.next();
        String category = categoryEntry.getKey();
        TransducerGraph uncompacted = categoryEntry.getValue();
        if (verbose) {
            System.out.println("About to compact grammar for " + category + " with numNodes=" + uncompacted.getNodes().size());
        }
        // Pull this category's paths out of the maps as we go so they become
        // garbage-collectable; missing entries are treated as empty path lists.
        List<List<String>> trainPaths = allTrainPaths.remove(category);
        if (trainPaths == null) {
            trainPaths = new ArrayList<>();
        }
        List<List<String>> testPaths = allTestPaths.remove(category);
        if (testPaths == null) {
            testPaths = new ArrayList<>();
        }
        TransducerGraph compacted = doCompaction(uncompacted, trainPaths, testPaths);
        numCompacted++;
        if (verbose) {
            System.out.println(numCompacted + ". Compacted grammar for " + category + " from " + uncompacted.getArcs().size() + " arcs to " + compacted.getArcs().size() + " arcs.");
        }
        // Drop the uncompacted graph from the map before moving on, again to save memory.
        it.remove();
        compactedGraphs.add(compacted);
    }
    // Rebuild the rule-based grammars from the compacted graphs.
    Pair<UnaryGrammar, BinaryGrammar> ugbg = convertGraphsToGrammar(compactedGraphs, unaryRules, binaryRules);
    return new Triple<>(newStateIndex, ugbg.first(), ugbg.second());
}
Also used : Triple(edu.stanford.nlp.util.Triple) Entry(java.util.Map.Entry) TransducerGraph(edu.stanford.nlp.fsm.TransducerGraph)

Example 12 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

In the class ApplyPatterns, the method call:

/**
 * Applies every TokenSequencePattern in {@code patterns} to each sentence named by
 * {@code sentids}, extracting candidate phrases for {@code label}.
 *
 * @return a Triple of (per-phrase, per-pattern extraction counts;
 *         a map from pattern to the (sentence id, start token, inclusive end token)
 *         spans it matched; and the set of candidate phrases whose every kept token
 *         was already labeled with this label)
 * @throws Exception rethrown after printing the stack trace; matching itself may fail
 */
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
    // CollectionValuedMap<String, Integer>();
    try {
        // Phrases for which no token was unlabeled (see useWordNotLabeled below).
        Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
        // Counts how often each candidate phrase is extracted by each pattern.
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
        // Records, per pattern, every (sentid, start, inclusive end) span matched.
        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
        for (String sentid : sentids) {
            List<CoreLabel> sent = sents.get(sentid).getTokens();
            for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
                if (pEn.getKey() == null)
                    throw new RuntimeException("why is the pattern " + pEn + " null?");
                TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
                //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.setBranchLimit(5);
                while (m.find()) {
                    // [s, e) is the token span of the "$term" capture group.
                    int s = m.start("$term");
                    int e = m.end("$term");
                    assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
                    String phrase = "";
                    String phraseLemma = "";
                    // True once any token in the span is NOT already labeled with `label`.
                    boolean useWordNotLabeled = false;
                    // True if the phrase should be discarded (ignored-class token,
                    // interior stop word, or a gap in the kept tokens).
                    boolean doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords) {
                        // Extend the span left over contiguously labeled tokens.
                        // NOTE(review): if every token back to index 0 is labeled, the loop
                        // falls through without updating s — confirm that is intended.
                        for (int i = s - 1; i >= 0; i--) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                s = i + 1;
                                break;
                            }
                        }
                        // Extend the span right over contiguously labeled tokens
                        // (same fall-through caveat at the sentence end).
                        for (int i = e; i < sent.size(); i++) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                e = i;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    // addedindices[i - s] marks whether token i was kept in the phrase.
                    boolean[] addedindices = new boolean[e - s];
                    Arrays.fill(addedindices, false);
                    for (int i = s; i < e; i++) {
                        CoreLabel l = sent.get(i);
                        // Side effect: mark the token as matched, and record which
                        // patterns matched it (initializing the set lazily).
                        l.set(PatternsAnnotations.MatchedPattern.class, true);
                        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                        SurfacePattern pSur = (SurfacePattern) pEn.getValue();
                        assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                        assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                        l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                        // Discard the phrase entirely if the token carries any
                        // class/value pair configured to be ignored for this label.
                        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                                doNotUse = true;
                            }
                        }
                        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop) {
                            doNotUse = true;
                        } else {
                            // Keep the token unless it is a stop word being stripped out.
                            if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.word();
                                phraseLemma += " " + l.lemma();
                                addedindices[i - s] = true;
                            }
                        }
                    }
                    // Reject phrases with a dropped token strictly inside the span
                    // (kept-dropped-kept) — i.e. a stop word in the middle; stop words
                    // removed only at the ends are fine.
                    for (int i = 0; i < addedindices.length; i++) {
                        if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse) {
                        // Span is recorded with an INCLUSIVE end index (e - 1).
                        matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                        phrase = phrase.trim();
                        if (!phrase.isEmpty()) {
                            phraseLemma = phraseLemma.trim();
                            CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
                            allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
                            // Every kept token already carried the label, so this phrase
                            // is known rather than newly discovered.
                            if (!useWordNotLabeled)
                                alreadyLabeledPhrases.add(candPhrase);
                        }
                    }
                }
            }
        }
        return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    } catch (Exception e) {
        // Printed here (workers run inside an executor) and rethrown to the caller.
        e.printStackTrace();
        throw e;
    }
}
Also used : CollectionValuedMap(edu.stanford.nlp.util.CollectionValuedMap) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Aggregations

Triple (edu.stanford.nlp.util.Triple)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 Pair (edu.stanford.nlp.util.Pair)3 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)2 Tree (edu.stanford.nlp.trees.Tree)2 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)2 PrintWriter (java.io.PrintWriter)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 TransducerGraph (edu.stanford.nlp.fsm.TransducerGraph)1 Language (edu.stanford.nlp.international.Language)1 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 HasWord (edu.stanford.nlp.ling.HasWord)1 Label (edu.stanford.nlp.ling.Label)1 TaggedWord (edu.stanford.nlp.ling.TaggedWord)1 SequenceMatchResult (edu.stanford.nlp.ling.tokensregex.SequenceMatchResult)1 TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher)1 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)1 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)1