Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
From the class ExtractPhraseFromPattern, method getSemGrexPatternNodes.
/*
 * Given a SemanticGraph g, a SemgrexPattern pattern, and the output
 * collections and filters below, run the Semgrex matcher over g and
 * iterate through the matches to collect the matching words/phrases.
 *
 * When DEBUG is high enough, each match is also recorded in
 * matchedGraphsForPattern.get(pattern), a list of matched
 * (sentence text, SemanticGraph) pairs.
 *
 * tokens holds the sentence's word forms; tokens.get(i) is the word
 * with 1-based dependency index i + 1.
 */
public Set<IndexedWord> getSemGrexPatternNodes(SemanticGraph g, List<String> tokens,
    Collection<String> outputNodes, Collection<IntPair> outputIndices, SemgrexPattern pattern,
    boolean findSubTrees, Collection<ExtractedPhrase> extractedPhrases, boolean lowercase,
    Function<CoreLabel, Boolean> acceptWord) {
  Set<IndexedWord> foundWordsParents = new HashSet<>();
  SemgrexMatcher m = pattern.matcher(g, lowercase);
  while (m.find()) {
    // The pattern is expected to bind the node names "node" and "parent".
    IndexedWord w = m.getNode("node");
    IndexedWord parent = m.getNode("parent");
    // Skip matches where the node sits too deep below the parent.
    if (!checkIfSatisfiedMaxDepth(g, parent, w, new IntPair(maxDepth, 0)))
      continue;
    if (DEBUG > 3) {
      // Record the matched (sentence text, graph) pair for this pattern.
      List<Pair<String, SemanticGraph>> matchedGraphs = matchedGraphsForPattern.get(pattern);
      if (matchedGraphs == null)
        matchedGraphs = new ArrayList<>();
      matchedGraphs.add(new Pair<>(StringUtils.join(tokens, " "), g));
      matchedGraphsForPattern.put(pattern, matchedGraphs);
    }
    foundWordsParents.add(parent);
    ArrayList<IndexedWord> seenNodes = new ArrayList<>();
    List<String> cutoffrelations = new ArrayList<>();
    // Element-specific cut-off relations could be plugged in here, e.g.:
    // if (elementStr.equalsIgnoreCase("technique"))
    //   cutoffrelations = cutoffRelationsForTech;
    // if (elementStr.equalsIgnoreCase("app"))
    //   cutoffrelations = this.cuttoffRelationsForApp;
    printSubGraph(g, w, cutoffrelations, tokens, outputNodes, outputIndices, seenNodes,
        new ArrayList<>(), findSubTrees, extractedPhrases, pattern, acceptWord);
  }
  return foundWordsParents;
}
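For context, here is a minimal sketch of how this method might be driven. Everything outside the quoted method is an assumption: the bracketed SemanticGraph.valueOf form with explicit word indices, the two-argument ExtractPhraseFromPattern constructor, and the toy sentence are hypothetical; the Semgrex pattern, however, must bind the names "node" and "parent", since the method reads m.getNode("node") and m.getNode("parent").

import java.util.*;
import java.util.function.Function;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.patterns.dep.ExtractPhraseFromPattern;
import edu.stanford.nlp.patterns.dep.ExtractedPhrase;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.util.IntPair;

public class GetSemGrexPatternNodesDemo {
  public static void main(String[] args) {
    // Toy dependency graph in the bracketed valueOf format, with explicit 1-based word indices.
    SemanticGraph g = SemanticGraph.valueOf("[ate-2 nsubj>Bill-1 obj>[muffins-4 amod>blueberry-3]]");
    // tokens.get(i) is the word with 1-based index i + 1.
    List<String> tokens = Arrays.asList("Bill", "ate", "blueberry", "muffins");

    // The pattern must bind "node" and "parent"; here "node" is the direct object.
    SemgrexPattern pattern = SemgrexPattern.compile("{}=parent >obj {}=node");

    // Hypothetical constructor arguments: keep common tags, cap phrases at 5 tokens.
    ExtractPhraseFromPattern extractor = new ExtractPhraseFromPattern(false, 5);
    Collection<String> outputNodes = new ArrayList<>();
    Collection<IntPair> outputIndices = new ArrayList<>();
    Collection<ExtractedPhrase> extractedPhrases = new ArrayList<>();
    Function<CoreLabel, Boolean> acceptAll = l -> true;

    Set<IndexedWord> parents = extractor.getSemGrexPatternNodes(g, tokens, outputNodes,
        outputIndices, pattern, true, extractedPhrases, false, acceptAll);

    System.out.println("parents: " + parents);      // e.g. [ate-2]
    System.out.println("phrases: " + outputNodes);  // e.g. [blueberry muffins]
    System.out.println("spans:   " + outputIndices); // 0-based, end-inclusive spans
  }
}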
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
From the class ExtractPhraseFromPattern, method printSubGraph.
// The output span (startIndex, endIndex) stored in listOfOutputIndices is 0-based and inclusive of endIndex.
public void printSubGraph(SemanticGraph g, IndexedWord w, List<String> additionalCutOffRels,
    List<String> textTokens, Collection<String> listOfOutput, Collection<IntPair> listOfOutputIndices,
    List<IndexedWord> seenNodes, List<IndexedWord> doNotAddThese, boolean findSubTrees,
    Collection<ExtractedPhrase> extractedPhrases, SemgrexPattern pattern,
    Function<CoreLabel, Boolean> acceptWord) {
  try {
    if (seenNodes.contains(w))
      return;
    seenNodes.add(w);
    if (doNotAddThese.contains(w))
      return;
    // Recurse into "conj_and" conjuncts first, then exclude them from this phrase.
    List<IndexedWord> andNodes = new ArrayList<>();
    descendantsWithReln(g, w, "conj_and", new ArrayList<>(), andNodes);
    for (IndexedWord w1 : andNodes) {
      printSubGraph(g, w1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices,
          seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
    }
    doNotAddThese.addAll(andNodes);
    List<String> allCutOffRels = new ArrayList<>();
    if (additionalCutOffRels != null)
      allCutOffRels.addAll(additionalCutOffRels);
    allCutOffRels.addAll(cutoffRelations);
    CollectionValuedMap<Integer, String> featPerToken = new CollectionValuedMap<>();
    Collection<String> feat = new ArrayList<>();
    GetPatternsFromDataMultiClass.getFeatures(g, w, true, feat, null);
    Set<IndexedWord> words = descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
    if (!words.isEmpty()) {
      // Compute the 1-based token span covered by the descendants.
      int min = Integer.MAX_VALUE, max = -1;
      for (IndexedWord word : words) {
        if (word.index() < min)
          min = word.index();
        if (word.index() > max)
          max = word.index();
      }
      // Truncate overly long phrases to maxPhraseLength tokens.
      if ((max - min + 1) > maxPhraseLength) {
        max = min + maxPhraseLength - 1;
      }
      // 0-based, endIndex-inclusive span into textTokens.
      IntPair indices = new IntPair(min - 1, max - 1);
      String phrase = StringUtils.join(textTokens.subList(min - 1, max), " ").trim();
      feat.add("LENGTH-" + (max - min + 1));
      for (int i = min; i <= max; i++)
        feat.addAll(featPerToken.get(i));
      ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.asCounter(feat));
      // Note: doNotAddThese holds IndexedWord, so this String check can never match.
      if (!listOfOutput.contains(phrase) && !doNotAddThese.contains(phrase)) {
        listOfOutput.add(phrase);
        if (!listOfOutputIndices.contains(indices)) {
          listOfOutputIndices.add(indices);
          extractedPhrases.add(extractedPh);
        }
        if (findSubTrees) {
          for (IndexedWord word : words)
            if (!seenNodes.contains(word))
              printSubGraph(g, word, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices,
                  seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
        }
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
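Because the emitted spans are 0-based with an inclusive end index, a caller can recover the phrase text from an output IntPair as in this minimal sketch; the token list and span are made up, and getSource()/getTarget() read the pair's two components.

import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.util.IntPair;

public class PhraseSpanDemo {
  public static void main(String[] args) {
    List<String> tokens = Arrays.asList("Bill", "ate", "blueberry", "muffins");
    // Hypothetical span that printSubGraph could emit for "blueberry muffins".
    IntPair span = new IntPair(2, 3);
    // The end index is inclusive, so subList's exclusive upper bound is getTarget() + 1.
    String phrase = String.join(" ", tokens.subList(span.getSource(), span.getTarget() + 1));
    System.out.println(phrase); // blueberry muffins
  }
}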