Use of edu.stanford.nlp.patterns.Pattern in the CoreNLP project by stanfordnlp:
class ApplyDepPatterns, method call().
@Override
public Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
// Applies every semgrex (dependency) pattern to every assigned sentence and returns:
//  - allFreq: (candidate phrase, pattern) -> match count
//  - matchedTokensByPat: pattern -> (sentence id, start token index, end token index) per matched span
TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
for (String sentid : sentids) {
DataInstance sent = sents.get(sentid);
List<CoreLabel> tokens = sent.getTokens();
for (Map.Entry<SemgrexPattern, E> pEn : patterns.entrySet()) {
if (pEn.getKey() == null)
throw new RuntimeException("why is the pattern " + pEn + " null?");
SemanticGraph graph = ((DataInstanceDep) sent).getGraph();
Collection<ExtractedPhrase> matched = getMatchedTokensIndex(graph, pEn.getKey(), sent, label);
for (ExtractedPhrase match : matched) {
int s = match.startIndex;
// e is the EXCLUSIVE end of the matched span.
int e = match.endIndex + 1;
String phrase = "";
String phraseLemma = "";
boolean useWordNotLabeled = false;
boolean doNotUse = false;
// If neighboring words are already labeled with this label, club them into the
// phrase, bounded by the maximum compound length configured for the label.
if (constVars.clubNeighboringLabeledWords) {
for (int i = s - 1; i >= 0; i--) {
// NOTE(review): since e is exclusive, the span [i, e) has length (e - i);
// the (e - i + 1) bound looks one too strict compared with the forward
// loop's (i - s + 1) — confirm against upstream CoreNLP before changing.
if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
s = i;
} else
break;
}
for (int i = e; i < tokens.size(); i++) {
if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label) && (i - s + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
e = i;
} else
break;
}
}
// addedindices[i - s] records whether token i contributed to the phrase. It is
// used below to discard phrases with a stop word removed from the MIDDLE, while
// still accepting phrases whose stop words were trimmed only at the ends
// (when removeStopWordsFromSelectedPhrases is true).
boolean[] addedindices = new boolean[e - s];
for (int i = s; i < e; i++) {
CoreLabel l = tokens.get(i);
l.set(PatternsAnnotations.MatchedPattern.class, true);
if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
Pattern pSur = pEn.getValue();
assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
// Reject the whole phrase if any token carries a class/value pair that is
// configured to be ignored during selection.
for (Map.Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
doNotUse = true;
}
}
boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
if (removePhrasesWithStopWords && containsStop) {
doNotUse = true;
} else {
if (!containsStop || !removeStopWordsFromSelectedPhrases) {
// The phrase only counts as a new candidate if at least one of its
// words is not already labeled with this label.
if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label)) {
useWordNotLabeled = true;
}
phrase += " " + l.word();
phraseLemma += " " + l.lemma();
addedindices[i - s] = true;
}
}
}
// Discard the phrase if a skipped token sits strictly between two kept tokens,
// i.e. a stop word was removed from the middle rather than trimmed at the ends.
// (Loop bounds replace the original per-iteration i > 0 / i < length-1 guard.)
for (int i = 1; i < addedindices.length - 1; i++) {
if (addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
doNotUse = true;
break;
}
}
if (!doNotUse && useWordNotLabeled) {
matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
// The original nested a redundant if (useWordNotLabeled) here; the enclosing
// condition already guarantees it, so the dead check was removed.
phrase = phrase.trim();
phraseLemma = phraseLemma.trim();
allFreq.incrementCount(CandidatePhrase.createOrGet(phrase, phraseLemma, match.getFeatures()), pEn.getValue(), 1.0);
}
}
}
}
return new Pair<>(allFreq, matchedTokensByPat);
}
Use of edu.stanford.nlp.patterns.Pattern in the CoreNLP project by stanfordnlp:
class ApplyDepPatterns, method getMatchedTokensIndex().
private Collection<ExtractedPhrase> getMatchedTokensIndex(SemanticGraph graph, SemgrexPattern pattern, DataInstance sent, String label) {
// Runs the given semgrex pattern over the sentence's dependency graph and returns
// the extracted phrases (with their token spans). outputPhrases/outputIndices are
// out-parameters required by the extractor's signature; only extractedPhrases is
// returned to the caller.
// TODO: look at the ignoreCommonTags flag
ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(false, PatternFactory.numWordsCompoundMapped.get(label));
Collection<IntPair> outputIndices = new ArrayList<>();
boolean findSubTrees = true;
// TODO: see if you can get rid of this (only used for matchedGraphs)
List<String> tokens = sent.getTokens().stream().map(CoreLabel::word).collect(Collectors.toList());
List<String> outputPhrases = new ArrayList<>();
List<ExtractedPhrase> extractedPhrases = new ArrayList<>();
// NOTE(review): removed an unused local Function ("extractFeatures") that built
// PARENTREL-* features from parentPairs() but was never passed to the extractor
// nor referenced anywhere in this method.
extract.getSemGrexPatternNodes(graph, tokens, outputPhrases, outputIndices, pattern, findSubTrees, extractedPhrases, constVars.matchLowerCaseContext, matchingWordRestriction);
return extractedPhrases;
}
Aggregations