Use of edu.stanford.nlp.trees.tregex.TregexMatcher in the CoreNLP project by stanfordnlp.
From the class RuleBasedCorefMentionFinder, method extractEnumerations.
/**
 * Extracts enumeration (coordinated NP) mentions from a sentence and appends them
 * to {@code mentions}, skipping spans that are already mentions or that lie inside
 * a named entity.
 *
 * @param s the sentence, annotated with tokens, a parse tree, and enhanced dependencies
 * @param mentions output list; newly found mentions are appended (updated in place)
 * @param mentionSpanSet token spans already covered by a mention; updated in place
 * @param namedEntitySpanSet token spans covered by named entities; read only
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
TregexPattern tgrepPattern = enumerationsMentionPattern;
TregexMatcher matcher = tgrepPattern.matcher(tree);
Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
while (matcher.find()) {
  // Record both conjuncts of the enumeration (nodes named m1 and m2 in the pattern).
  // The original dead call to matcher.getMatch() has been removed.
  addMentionSpan(matcher.getNode("m1"), spanToMentionSubTree);
  addMentionSpan(matcher.getNode("m2"), spanToMentionSubTree);
}
for (Map.Entry<IntPair, Tree> entry : spanToMentionSubTree.entrySet()) {
  IntPair mSpan = entry.getKey();
  if (!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
    int dummyMentionId = -1; // real mention ids are assigned later in the pipeline
    Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency, new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), entry.getValue());
    mentions.add(m);
    mentionSpanSet.add(mSpan);
  }
}
}

/**
 * Computes the token span of {@code subTree}'s yield and records it in {@code spans}.
 * IndexAnnotation is 1-based, so begin is converted to a 0-based inclusive index
 * while end stays as an exclusive index.
 */
private static void addMentionSpan(Tree subTree, Map<IntPair, Tree> spans) {
List<Tree> leaves = subTree.getLeaves();
int beginIdx = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
int endIdx = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
spans.put(new IntPair(beginIdx, endIdx), subTree);
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in the CoreNLP project by stanfordnlp.
From the class Mention, method setNumber.
/**
 * Matches an NP that directly contains a coordinated NP enumeration.
 * Compiled once: recompiling a Tregex pattern on every call is expensive,
 * and setNumber runs once per mention.
 */
private static final TregexPattern ENUMERATION_MENTION_PATTERN =
    TregexPattern.compile("NP < (NP=tmp $.. (/,|CC/ $.. NP))");

/**
 * Sets the grammatical number (SINGULAR / PLURAL / UNKNOWN) of this mention,
 * in order of preference: pronoun lists for pronominal mentions, PLURAL for
 * list mentions, NER type for named mentions, POS tag otherwise, then the
 * Bergsma number word lists as a fallback, and finally an enumeration check
 * that can upgrade the result to PLURAL.
 *
 * @param dict dictionaries providing pronoun and singular/plural word lists
 */
protected void setNumber(Dictionaries dict) {
if (mentionType == MentionType.PRONOMINAL) {
  if (dict.pluralPronouns.contains(headString)) {
    number = Number.PLURAL;
  } else if (dict.singularPronouns.contains(headString)) {
    number = Number.SINGULAR;
  } else {
    number = Number.UNKNOWN;
  }
} else if (mentionType == MentionType.LIST) {
  number = Number.PLURAL;
} else if (!nerString.equals("O") && mentionType != MentionType.NOMINAL) {
  // Named entity: assume singular unless it is an organization.
  if (!(nerString.equals("ORGANIZATION") || nerString.startsWith("ORG"))) {
    number = Number.SINGULAR;
  } else {
    // ORGs can be both plural and singular
    number = Number.UNKNOWN;
  }
} else {
  // Fall back on the POS tag: NNS/NNPS end in S and are plural.
  String tag = headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
  if (tag.startsWith("N") && tag.endsWith("S")) {
    number = Number.PLURAL;
  } else if (tag.startsWith("N")) {
    number = Number.SINGULAR;
  } else {
    number = Number.UNKNOWN;
  }
}
if (mentionType != MentionType.PRONOMINAL) {
  if (number == Number.UNKNOWN) {
    // Bergsma & Lin (2006) number word lists as a last resort.
    if (dict.singularWords.contains(headString)) {
      number = Number.SINGULAR;
      SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tSINGULAR:\t" + headString);
    } else if (dict.pluralWords.contains(headString)) {
      number = Number.PLURAL;
      SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tPLURAL:\t" + headString);
    }
  }
  TregexMatcher m = ENUMERATION_MENTION_PATTERN.matcher(this.mentionSubTree);
  while (m.find()) {
    // Intentional identity comparison: plural only if this mention itself is the
    // coordinated NP (node "tmp") and its text contains a literal " and ".
    if (this.mentionSubTree == m.getNode("tmp") && this.spanToString().toLowerCase().contains(" and ")) {
      number = Number.PLURAL;
    }
  }
}
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in the CoreNLP project by stanfordnlp.
From the class SplitCanditoTrees, method outputSplits.
/**
 * Right now this outputs trees in PTB format. It outputs one tree
 * at a time until we have output enough trees to fill the given
 * file, then moves on to the next file. Trees are output in the
 * order given in the <code>ids</code> list.
 * <br>
 * Trees have their words replaced with the words' lemmas, if those
 * lemmas exist.
 *
 * @param ids tree ids, in output order; ids missing from {@code treeMap} are logged and skipped
 * @param treeMap mapping from tree id to parse tree
 * @throws IOException if an output file cannot be created or written
 */
public static void outputSplits(List<String> ids, Map<String, Tree> treeMap) throws IOException {
Queue<Integer> fSizeQueue = new LinkedList<>(Arrays.asList(fSizes));
Queue<String> fNameQueue = new LinkedList<>(Arrays.asList(fNames));
// Bad trees in the Candito set: (ROOT (SENT (" ") (. .))) and (ROOT (SENT (. .)))
TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
final TreeTransformer tt = new FTBCorrector();
int size = fSizeQueue.remove();
String filename = fNameQueue.remove();
log.info("Outputting " + filename);
PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
// try/finally so the currently open writer is closed even if an exception
// escapes the loop (the original code leaked the stream in that case).
try {
  int outputCount = 0;
  for (String id : ids) {
    if (!treeMap.containsKey(id)) {
      log.info("Missing id: " + id);
      continue;
    }
    Tree tree = treeMap.get(id);
    TregexMatcher m = pBadTree.matcher(tree);
    TregexMatcher m2 = pBadTree2.matcher(tree);
    if (m.find() || m2.find()) {
      log.info("Discarding tree: " + tree.toString());
      continue;
    }
    // Punctuation normalization, etc.
    Tree backupCopy = tree.deepCopy();
    tree = tt.transformTree(tree);
    if (tree.firstChild().children().length == 0) {
      // Some trees have only punctuation. Tregex will mangle these. Don't throw those away.
      log.info("Saving tree: " + tree.toString());
      log.info("Backup: " + backupCopy.toString());
      tree = backupCopy;
    }
    if (LEMMAS_AS_LEAVES || ADD_MORPHO_TO_LEAVES) {
      mungeLeaves(tree, LEMMAS_AS_LEAVES, ADD_MORPHO_TO_LEAVES);
    }
    if (CC_TAGSET) {
      replacePOSTags(tree);
    }
    if (MORFETTE_OUTPUT) {
      writer.println(treeToMorfette(tree));
    } else {
      writer.println(tree.toString());
    }
    ++outputCount;
    if (outputCount == size) {
      // Current split file is full: close it and start the next one.
      outputCount = 0;
      size = fSizeQueue.remove();
      filename = fNameQueue.remove();
      log.info("Outputting " + filename);
      writer.close();
      writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
    }
  }
} finally {
  writer.close();
}
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in the CoreNLP project by stanfordnlp.
From the class MWEFrequencyDist, method main.
/**
 * Reads a French treebank file and prints frequency statistics for multi-word
 * expressions (nodes whose label starts with "MW"): per-label token counts,
 * singleton ratios, total token count, and the number of unique POS sequences.
 *
 * @param args exactly one argument, the path of the tree file to analyze
 */
public static void main(String[] args) {
if (args.length != 1) {
  System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
  System.exit(-1);
}
final File treeFile = new File(args[0]);
TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
Set<String> uniquePOSSequences = Generics.newHashSet();
TreeReaderFactory trf = new FrenchTreeReaderFactory();
// try-with-resources: the original leaked the reader if an exception was thrown
// mid-stream; closing the TreeReader also closes the underlying reader.
try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
  final TregexPattern pMWE = TregexPattern.compile("/^MW/");
  for (Tree t; (t = tr.readTree()) != null; ) {
    //Count MWE statistics
    TregexMatcher m = pMWE.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      String label = match.value();
      List<CoreLabel> yield = match.taggedLabeledYield();
      StringBuilder termYield = new StringBuilder();
      StringBuilder posYield = new StringBuilder();
      for (CoreLabel cl : yield) {
        termYield.append(cl.word()).append(" ");
        posYield.append(cl.tag()).append(" ");
      }
      mweLabelToString.incrementCount(label, termYield.toString().trim());
      uniquePOSSequences.add(posYield.toString().trim());
    }
  }
  System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
  double nMWEs = mweLabelToString.totalCount();
  int nAllSingletons = 0;
  int nTokens = 0;
  for (String mweLabel : mweLabelToString.firstKeySet()) {
    int nSingletons = 0;
    double totalCount = mweLabelToString.totalCount(mweLabel);
    Counter<String> mc = mweLabelToString.getCounter(mweLabel);
    for (String term : mc.keySet()) {
      if (mc.getCount(term) == 1.0)
        nSingletons++;
      // Token count = words per term times the term's frequency.
      nTokens += term.split("\\s+").length * (int) mc.getCount(term);
    }
    nAllSingletons += nSingletons;
    System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
  }
  System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
  System.out.println("#tokens = " + nTokens);
  System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
} catch (TregexParseException | IOException e) {
  // UnsupportedEncodingException and FileNotFoundException are IOException
  // subclasses, so this multi-catch covers all four original catch blocks.
  e.printStackTrace();
}
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in the CoreNLP project by stanfordnlp.
From the class FTBCorrector, method main.
/**
 * Reads trees from the given file, discards known-bad punctuation-only trees,
 * applies FTBCorrector transformations to the rest, and prints the corrected
 * trees to stdout. A summary count is written to stderr.
 *
 * @param args exactly one argument, the path of the tree file to correct
 */
public static void main(String[] args) {
if (args.length != 1) {
  log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
  System.exit(-1);
}
TreeTransformer tt = new FTBCorrector();
File f = new File(args[0]);
// try-with-resources: the original leaked the reader on exception paths;
// closing the TreeReader also closes the underlying BufferedReader.
try (TreeReader tr = new FrenchTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")))) {
  //These bad trees in the Candito training set should be thrown out:
  // (ROOT (SENT (" ") (. .)))
  // (ROOT (SENT (. .)))
  TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
  TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
  int nTrees = 0;
  for (Tree t; (t = tr.readTree()) != null; nTrees++) {
    TregexMatcher m = pBadTree.matcher(t);
    TregexMatcher m2 = pBadTree2.matcher(t);
    if (m.find() || m2.find()) {
      log.info("Discarding tree: " + t.toString());
    } else {
      Tree fixedT = tt.transformTree(t);
      System.out.println(fixedT.toString());
    }
  }
  // NOTE(review): nTrees counts every tree read, including discarded ones,
  // so "Wrote" slightly overstates the output count. Behavior kept as-is.
  System.err.printf("Wrote %d trees%n", nTrees);
} catch (TregexParseException | IOException e) {
  // UnsupportedEncodingException and FileNotFoundException are IOException
  // subclasses, so this multi-catch covers all four original catch blocks.
  e.printStackTrace();
}
}
Aggregations