Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
The method main of the class MWEFrequencyDist.
/**
 * Prints frequency statistics for the multi-word expressions (MWEs) in a tree file.
 *
 * <p>Trees are read from the UTF-8 encoded file named by the single argument, using a
 * {@code FrenchTreeReaderFactory}. Every node whose label matches the Tregex pattern
 * {@code /^MW/} is counted under its label, keyed by the space-joined words of its yield.
 * For each label one tab-separated row is written to stdout: the label, its number of
 * occurrences, the number of distinct yields seen exactly once, the percentage of such
 * singletons, and the label's share of all MWEs. A TOTAL row, the token count and the
 * number of distinct POS tag sequences follow.
 *
 * <p>If reading fails, the stack trace is printed and no statistics are written.
 *
 * @param args exactly one element: the path of the tree file
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
    System.exit(-1);
  }

  final File treeFile = new File(args[0]);
  TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
  Set<String> uniquePOSSequences = Generics.newHashSet();
  TreeReaderFactory trf = new FrenchTreeReaderFactory();

  // try-with-resources: the file handle is released even when reading a tree fails.
  // The BufferedReader is its own resource so it is also closed if newTreeReader throws.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br)) {
    final TregexPattern pMWE = TregexPattern.compile("/^MW/");
    for (Tree t; (t = tr.readTree()) != null; ) {
      // Count MWE statistics
      TregexMatcher m = pMWE.matcher(t);
      while (m.findNextMatchingNode()) {
        Tree match = m.getMatch();
        String label = match.value();
        List<CoreLabel> yield = match.taggedLabeledYield();
        StringBuilder termYield = new StringBuilder();
        StringBuilder posYield = new StringBuilder();
        for (CoreLabel cl : yield) {
          termYield.append(cl.word()).append(" ");
          posYield.append(cl.tag()).append(" ");
        }
        // trim() drops the trailing separator appended by the loop above
        mweLabelToString.incrementCount(label, termYield.toString().trim());
        uniquePOSSequences.add(posYield.toString().trim());
      }
    }
  } catch (IOException | TregexParseException e) {
    // Covers a missing file, an unsupported encoding and read/close failures.
    e.printStackTrace();
    return;
  }

  System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
  double nMWEs = mweLabelToString.totalCount();
  int nAllSingletons = 0;
  int nTokens = 0;
  for (String mweLabel : mweLabelToString.firstKeySet()) {
    int nSingletons = 0;
    double totalCount = mweLabelToString.totalCount(mweLabel);
    Counter<String> mc = mweLabelToString.getCounter(mweLabel);
    for (String term : mc.keySet()) {
      if (mc.getCount(term) == 1.0) {
        nSingletons++;
      }
      // tokens contributed by this yield = words in the yield * number of occurrences
      nTokens += term.split("\\s+").length * (int) mc.getCount(term);
    }
    nAllSingletons += nSingletons;
    System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
  }

  // Avoid 0/0 (printed as NaN) when the file contains no MWE at all.
  double pctAllSingletons = nMWEs > 0.0 ? 100.0 * nAllSingletons / nMWEs : 0.0;
  System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, pctAllSingletons);
  System.out.println("#tokens = " + nTokens);
  System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
The method main of the class FTBCorrector.
/**
 * Applies {@code FTBCorrector} to every tree of a file and prints the results to stdout.
 *
 * <p>Trees are read from the UTF-8 encoded file named by the single argument, using a
 * {@code FrenchTreeReaderFactory}. A tree matched by either of the two "bad tree"
 * patterns below is logged and skipped. Every other tree is transformed and printed,
 * one per line. The number of trees actually written is reported on stderr.
 *
 * @param args exactly one element: the name of the tree file
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }

  TreeTransformer tt = new FTBCorrector();
  File f = new File(args[0]);
  TreeReaderFactory trf = new FrenchTreeReaderFactory();

  // try-with-resources: the file handle is released even when reading or transforming fails.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br)) {
    //These bad trees in the Candito training set should be thrown out:
    //  (ROOT (SENT (" ") (. .)))
    //  (ROOT (SENT (. .)))
    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");

    // Counts only the trees that are printed, so the "Wrote" message is accurate;
    // discarded trees are not included.
    int nWritten = 0;
    for (Tree t; (t = tr.readTree()) != null; ) {
      TregexMatcher m = pBadTree.matcher(t);
      TregexMatcher m2 = pBadTree2.matcher(t);
      if (m.find() || m2.find()) {
        log.info("Discarding tree: " + t.toString());
      } else {
        Tree fixedT = tt.transformTree(t);
        System.out.println(fixedT.toString());
        nWritten++;
      }
    }
    System.err.printf("Wrote %d trees%n", nWritten);
  } catch (IOException | TregexParseException e) {
    // Covers a missing file, an unsupported encoding and read/close failures.
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
The method countMWEStatistics of the class MWEPreprocessor.
/**
 * Accumulates MWE statistics for a single tree into the supplied counters.
 *
 * <p>The tree is first passed to {@code updateTagger} together with {@code unigramTagger}.
 * Then, for every node matched by the class-level pattern {@code pMWE}, the node label is
 * paired with the node's preterminal yield and with its terminal yield, and each pairing
 * is counted in both directions. When {@code RESOLVE_DUMMY_TAGS} is set, nodes labeled
 * {@code FrenchXMLTreeReader.MISSING_PHRASAL} are not counted.
 *
 * @param t the tree to scan
 * @param unigramTagger counter handed to {@code updateTagger}
 * @param labelPreterm incremented at (label, preterminal yield)
 * @param pretermLabel incremented at (preterminal yield, label)
 * @param labelTerm incremented at (label, terminal yield)
 * @param termLabel incremented at (terminal yield, label)
 */
public static void countMWEStatistics(Tree t, TwoDimensionalCounter<String, String> unigramTagger, TwoDimensionalCounter<String, String> labelPreterm, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> labelTerm, TwoDimensionalCounter<String, String> termLabel) {
  updateTagger(unigramTagger, t);

  // Walk over all MWE nodes of the tree
  for (TregexMatcher matcher = pMWE.matcher(t); matcher.findNextMatchingNode(); ) {
    final Tree mweNode = matcher.getMatch();
    final String mweLabel = mweNode.value();

    final boolean isDummy = RESOLVE_DUMMY_TAGS && mweLabel.equals(FrenchXMLTreeReader.MISSING_PHRASAL);
    if (!isDummy) {
      final String pretermYield = SentenceUtils.listToString(mweNode.preTerminalYield());
      final String termYield = SentenceUtils.listToString(mweNode.yield());

      labelPreterm.incrementCount(mweLabel, pretermYield);
      pretermLabel.incrementCount(pretermYield, mweLabel);
      labelTerm.incrementCount(mweLabel, termYield);
      termLabel.incrementCount(termYield, mweLabel);
    }
  }
}
Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.
The method main of the class RHSFrequency.
/**
 * Counts the right-hand sides of all treebank nodes with a given label.
 *
 * <p>Command line: options followed by a root label and a treebank path. Recognized
 * options, each taking one value:
 * <ul>
 *   <li>{@code -l lang} selects the parser parameters via {@code Language.valueOf};
 *       the default is {@code EnglishTreebankParserParams}.</li>
 *   <li>{@code -e enc} sets the input and output encoding; the default is UTF-8.</li>
 * </ul>
 * Options must come before the positional arguments: the treebank is created with the
 * language and encoding in effect when the first root label is read.
 *
 * <p>For every node matched by the Tregex pattern {@code "@" + label}, the space-joined
 * labels of its children are counted. The distinct child sequences are written to
 * {@code tlpp.pw()}, one per line as {@code rhs<TAB>count}, sorted by descending count.
 *
 * <p>On malformed arguments the usage message is printed and the JVM exits with -1.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;

  for (int i = 0; i < args.length; i++) {
    // Both an option and a positional label consume exactly one following argument.
    if (i + 1 >= args.length) {
      System.out.println(usage.toString());
      System.exit(-1);
      return;
    }

    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      rootMatch = TregexPattern.compile("@" + args[i++]);
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
          return;
        }
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }
      // No post-increment here: the loop header already advances past the path.
      // Incrementing twice would silently skip the argument that follows it.
      tb.loadPath(args[i]);
    }
  }

  // Only options were given: there is no treebank or label to work with.
  if (tb == null || rootMatch == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) {
        sb.append(kid.value()).append(" ");
      }
      // trim() drops the trailing separator appended by the loop above
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }

  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

  PrintWriter pw = tlpp.pw();
  for (String rhs : biggestKeys) {
    pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
  }
  pw.close();
}
Aggregations