Use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.
In class GrammarCompactor, method compactGrammar:
/**
 * Compacts the grammar specified by the Pair.
 *
 * @param grammar a Pair of grammars, ordered UnaryGrammar BinaryGrammar
 * @param allTrainPaths a Map from String passive constituents to Lists of paths
 * @param allTestPaths a Map from String passive constituents to Lists of paths
 * @param originalStateIndex the Index of grammar states before compaction
 * @return a Triple of the compacted state Index and the compacted grammars, ordered UnaryGrammar BinaryGrammar
 */
public Triple<Index<String>, UnaryGrammar, BinaryGrammar> compactGrammar(Pair<UnaryGrammar, BinaryGrammar> grammar, Map<String, List<List<String>>> allTrainPaths, Map<String, List<List<String>>> allTestPaths, Index<String> originalStateIndex) {
  // computed once for the whole grammar
  inputPrior = computeInputPrior(allTrainPaths);
  // BinaryGrammar bg = grammar.second;
  this.stateIndex = originalStateIndex;
  List<List<String>> trainPaths, testPaths;
  Set<UnaryRule> unaryRules = Generics.newHashSet();
  Set<BinaryRule> binaryRules = Generics.newHashSet();
  Map<String, TransducerGraph> graphs = convertGrammarToGraphs(grammar, unaryRules, binaryRules);
  compactedGraphs = Generics.newHashSet();
  if (verbose) {
    System.out.println("There are " + graphs.size() + " categories to compact.");
  }
  int i = 0;
  for (Iterator<Entry<String, TransducerGraph>> graphIter = graphs.entrySet().iterator(); graphIter.hasNext(); ) {
    Map.Entry<String, TransducerGraph> entry = graphIter.next();
    String cat = entry.getKey();
    TransducerGraph graph = entry.getValue();
    if (verbose) {
      System.out.println("About to compact grammar for " + cat + " with numNodes=" + graph.getNodes().size());
    }
    // remove this category's paths as we go, to save memory
    trainPaths = allTrainPaths.remove(cat);
    if (trainPaths == null) {
      trainPaths = new ArrayList<>();
    }
    testPaths = allTestPaths.remove(cat);
    if (testPaths == null) {
      testPaths = new ArrayList<>();
    }
    TransducerGraph compactedGraph = doCompaction(graph, trainPaths, testPaths);
    i++;
    if (verbose) {
      System.out.println(i + ". Compacted grammar for " + cat + " from " + graph.getArcs().size() + " arcs to " + compactedGraph.getArcs().size() + " arcs.");
    }
    // to save memory, remove the graph we just compacted
    graphIter.remove();
    compactedGraphs.add(compactedGraph);
  }
  Pair<UnaryGrammar, BinaryGrammar> ugbg = convertGraphsToGrammar(compactedGraphs, unaryRules, binaryRules);
  return new Triple<>(newStateIndex, ugbg.first(), ugbg.second());
}
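The Triple here bundles the rebuilt state Index with the two compacted grammar halves into a single return value, so the caller can swap in all three at once. A minimal sketch of unpacking it at a call site, assuming a GrammarCompactor instance named compactor and grammar, allTrainPaths, allTestPaths, and stateIndex variables already in scope (those names are assumptions, not part of the snippet above):

import edu.stanford.nlp.parser.lexparser.BinaryGrammar;
import edu.stanford.nlp.parser.lexparser.UnaryGrammar;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Triple;

// Hypothetical call site: unpack the Triple returned by compactGrammar.
Triple<Index<String>, UnaryGrammar, BinaryGrammar> compacted =
    compactor.compactGrammar(grammar, allTrainPaths, allTestPaths, stateIndex);
Index<String> newStateIndex = compacted.first();  // state index after compaction
UnaryGrammar compactedUg = compacted.second();    // compacted unary grammar
BinaryGrammar compactedBg = compacted.third();    // compacted binary grammar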
Use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.
In class ApplyPatterns, method call:
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
  try {
    Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    for (String sentid : sentids) {
      List<CoreLabel> sent = sents.get(sentid).getTokens();
      for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
        if (pEn.getKey() == null)
          throw new RuntimeException("why is the pattern " + pEn + " null?");
        TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
        // Setting this find type can save time in searching; greedy and reluctant quantifiers are not enforced
        // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
        // Higher branch limits make matching faster but use more memory
        m.setBranchLimit(5);
        while (m.find()) {
          int s = m.start("$term");
          int e = m.end("$term");
          assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
          String phrase = "";
          String phraseLemma = "";
          boolean useWordNotLabeled = false;
          boolean doNotUse = false;
          // if the neighboring words are labeled, club them together with the match
          if (constVars.clubNeighboringLabeledWords) {
            for (int i = s - 1; i >= 0; i--) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                s = i + 1;
                break;
              }
            }
            for (int i = e; i < sent.size(); i++) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                e = i;
                break;
              }
            }
          }
          // Discard phrases with stopwords in the middle, but keep phrases whose stopwords
          // were removed at the ends if removeStopWordsFromSelectedPhrases is true
          boolean[] addedindices = new boolean[e - s];
          Arrays.fill(addedindices, false);
          for (int i = s; i < e; i++) {
            CoreLabel l = sent.get(i);
            l.set(PatternsAnnotations.MatchedPattern.class, true);
            if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
              l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
            SurfacePattern pSur = (SurfacePattern) pEn.getValue();
            assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
            assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
            l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
            for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                doNotUse = true;
              }
            }
            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
            if (removePhrasesWithStopWords && containsStop) {
              doNotUse = true;
            } else {
              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                  useWordNotLabeled = true;
                }
                phrase += " " + l.word();
                phraseLemma += " " + l.lemma();
                addedindices[i - s] = true;
              }
            }
          }
          // reject the phrase if an interior word was skipped (a stopword gap)
          for (int i = 0; i < addedindices.length; i++) {
            if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
              doNotUse = true;
              break;
            }
          }
          if (!doNotUse) {
            matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
            phrase = phrase.trim();
            if (!phrase.isEmpty()) {
              phraseLemma = phraseLemma.trim();
              CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
              allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
              if (!useWordNotLabeled)
                alreadyLabeledPhrases.add(candPhrase);
            }
          }
        }
      }
    }
    return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
  } catch (Exception e) {
    e.printStackTrace();
    throw e;
  }
}
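Because call() packs all three result structures into one Triple, a driver that shards sentences across several ApplyPatterns tasks can merge the per-task results afterwards. Below is a rough sketch of such a merge; the helper name mergeResults and the assumption that the task outputs have already been collected into a List are illustrative, not CoreNLP API:

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.patterns.CandidatePhrase;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.Triple;

// Hypothetical helper: fold the Triples returned by several ApplyPatterns
// tasks into one aggregate Triple of the same shape.
static <E> Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> mergeResults(List<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> results) {
  TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
  CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
  Set<CandidatePhrase> alreadyLabeled = new HashSet<>();
  for (Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> r : results) {
    // fold per-task phrase/pattern counts into the global counter
    for (CandidatePhrase p : r.first().firstKeySet()) {
      for (E pat : r.first().getCounter(p).keySet()) {
        allFreq.incrementCount(p, pat, r.first().getCount(p, pat));
      }
    }
    // union the matched-token offsets recorded for each pattern
    for (Map.Entry<E, Collection<Triple<String, Integer, Integer>>> en : r.second().entrySet()) {
      for (Triple<String, Integer, Integer> t : en.getValue()) {
        matchedTokensByPat.add(en.getKey(), t);
      }
    }
    alreadyLabeled.addAll(r.third());
  }
  return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeled);
}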