Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
From the class MWEPreprocessor, method countMWEStatistics.
/**
 * Updates the unigram tagger from the given tree, then, for every node matched by
 * {@code pMWE}, records the co-occurrence of the node's label with its pre-terminal
 * yield and with its terminal yield, in both directions.
 *
 * <p>When {@code RESOLVE_DUMMY_TAGS} is set, matches whose label equals
 * {@code FrenchXMLTreeReader.MISSING_PHRASAL} are not counted.
 *
 * @param t the tree to collect statistics from
 * @param unigramTagger counter handed to {@code updateTagger} together with the tree
 * @param labelPreterm incremented at (label, pre-terminal yield)
 * @param pretermLabel incremented at (pre-terminal yield, label)
 * @param labelTerm incremented at (label, terminal yield)
 * @param termLabel incremented at (terminal yield, label)
 */
public static void countMWEStatistics(Tree t, TwoDimensionalCounter<String, String> unigramTagger, TwoDimensionalCounter<String, String> labelPreterm, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> labelTerm, TwoDimensionalCounter<String, String> termLabel) {
  updateTagger(unigramTagger, t);

  // Walk every node of the tree that the MWE pattern matches.
  for (TregexMatcher mweMatcher = pMWE.matcher(t); mweMatcher.findNextMatchingNode(); ) {
    Tree mweNode = mweMatcher.getMatch();
    String phrasalLabel = mweNode.value();

    boolean isDummyLabel = RESOLVE_DUMMY_TAGS && phrasalLabel.equals(FrenchXMLTreeReader.MISSING_PHRASAL);
    if (!isDummyLabel) {
      String pretermYield = SentenceUtils.listToString(mweNode.preTerminalYield());
      String termYield = SentenceUtils.listToString(mweNode.yield());

      labelPreterm.incrementCount(phrasalLabel, pretermYield);
      pretermLabel.incrementCount(pretermYield, phrasalLabel);
      labelTerm.incrementCount(phrasalLabel, termYield);
      termLabel.incrementCount(termYield, phrasalLabel);
    }
  }
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
From the class RHSFrequency, method main.
/**
 * Command-line entry point: counts the right-hand sides (space-joined child labels)
 * of every treebank node matching the given root category and prints them, most
 * frequent first, as {@code rhs<TAB>count} lines.
 *
 * <p>Recognized options are {@code -l <language>} and {@code -e <encoding>}; the first
 * non-option argument is the root category and the argument after it is the treebank
 * path. Malformed command lines (too few arguments, an unknown option, an option or
 * category with no following value, or no category/treebank at all) print the usage
 * message and exit with status -1.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;
  for (int i = 0; i < args.length; i++) {
    // Both an option and a root category consume the argument that follows them,
    // so a trailing one is a malformed command line rather than an index error.
    if (i + 1 >= args.length) {
      System.out.println(usage.toString());
      System.exit(-1);
    }
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      rootMatch = TregexPattern.compile("@" + args[i++]);
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      // NOTE(review): together with the loop's own increment this also steps over the
      // argument after the path; kept as-is so valid invocations behave as before.
      tb.loadPath(args[i++]);
    }
  }

  // An options-only command line never sets the pattern or the treebank.
  if (tb == null || rootMatch == null) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) {
        sb.append(kid.value()).append(" ");
      }
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }

  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

  // try-with-resources closes the writer even if printing fails part-way.
  try (PrintWriter pw = tlpp.pw()) {
    for (String rhs : biggestKeys) {
      pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    }
  }
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
From the class RuleBasedCorefMentionFinder, method extractNPorPRP.
/**
 * Adds a {@code Mention} for each parse-tree node matched by
 * {@code npOrPrpMentionPattern} whose token span has not been recorded yet and is
 * not rejected by {@code insideNE}.
 *
 * <p>The span begins at the first leaf's {@code IndexAnnotation} minus one and ends at
 * the last leaf's {@code IndexAnnotation}; a trailing "," token is dropped from the span.
 *
 * @param s the sentence; its tokens, parse tree and enhanced dependencies are read,
 *          and the tree's leaves are re-indexed via {@code indexLeaves()}
 * @param mentions list that newly created mentions are appended to
 * @param mentionSpanSet spans already used; every newly added mention's span is added
 * @param namedEntitySpanSet spans passed to {@code insideNE} for the exclusion check
 */
protected static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree parse = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  parse.indexLeaves();
  SemanticGraph deps = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

  for (TregexMatcher npMatcher = npOrPrpMentionPattern.matcher(parse); npMatcher.find(); ) {
    Tree node = npMatcher.getMatch();
    List<Tree> leaves = node.getLeaves();

    CoreLabel firstLeaf = (CoreLabel) leaves.get(0).label();
    int beginIdx = firstLeaf.get(CoreAnnotations.IndexAnnotation.class) - 1;
    CoreLabel lastLeaf = (CoreLabel) leaves.get(leaves.size() - 1).label();
    int endIdx = lastLeaf.get(CoreAnnotations.IndexAnnotation.class);

    // Prefer spans that do not end with a comma.
    if (",".equals(tokens.get(endIdx - 1).word())) {
      endIdx -= 1;
    }

    IntPair span = new IntPair(beginIdx, endIdx);
    if (mentionSpanSet.contains(span) || insideNE(span, namedEntitySpanSet)) {
      continue;
    }

    // -1 is a placeholder mention id.
    mentions.add(new Mention(-1, beginIdx, endIdx, deps, new ArrayList<>(tokens.subList(beginIdx, endIdx)), node));
    mentionSpanSet.add(span);
  }
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
From the class TregexPoweredTreebankParserParams, method getAnnotationString.
/**
 * Build a string of annotations for the given tree.
 *
 * <p>Each enabled feature's compiled pattern is tested at {@code t} within
 * {@code root}; for every pattern that matches, the feature's annotation function
 * is applied to the matcher and its result appended. Features for which no compiled
 * pattern is registered are skipped.
 *
 * @param t The input tree (with non-language specific annotation
 *          already done, so you need to strip back to basic categories)
 * @param root The root of the current tree (can be null for words)
 * @return A (possibly empty) string of annotations to add to the
 *         given tree
 */
protected String getAnnotationString(Tree t, Tree root) {
  // Accumulate all annotations in this string
  StringBuilder annotationStr = new StringBuilder();
  for (String featureName : features) {
    Pair<TregexPattern, Function<TregexMatcher, String>> behavior = annotationPatterns.get(featureName);
    if (behavior == null) {
      // No compiled pattern under this name (for instance, one whose source failed
      // to parse is never registered), so there is nothing to apply.
      continue;
    }
    TregexMatcher m = behavior.first().matcher(root);
    if (m.matchesAt(t)) {
      annotationStr.append(behavior.second().apply(m));
    }
  }
  return annotationStr.toString();
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
From the class TregexPoweredTreebankParserParams, method compileAnnotations.
/**
 * Compile the {@link #annotations} collection given a
 * particular head finder. Subclasses should call this method at
 * least once before the class is used, and whenever the head finder
 * is changed.
 *
 * <p>Previously compiled patterns are discarded first. A pattern that fails to
 * parse is logged with its position in iteration order and left out of the
 * compiled set; the remaining patterns are still compiled.
 *
 * @param hf the head finder handed to the pattern compiler
 */
protected void compileAnnotations(HeadFinder hf) {
  TregexPatternCompiler compiler = new TregexPatternCompiler(hf);
  annotationPatterns.clear();
  // Position of the current entry in iteration order. Counted independently of
  // annotationPatterns.size(), which stops advancing once a pattern has failed.
  int nth = 0;
  for (Map.Entry<String, Pair<String, Function<TregexMatcher, String>>> annotation : annotations.entrySet()) {
    nth++;
    TregexPattern compiled;
    try {
      compiled = compiler.compile(annotation.getValue().first());
    } catch (TregexParseException e) {
      log.info("Parse exception on annotation pattern #" + nth + " initialization: " + e);
      continue;
    }
    Pair<TregexPattern, Function<TregexMatcher, String>> behavior = new Pair<>(compiled, annotation.getValue().second());
    annotationPatterns.put(annotation.getKey(), behavior);
  }
}
Aggregations