Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The main method of the class NoPunctuationHeadFinder:
public static void main(String[] args) {
  // simple testing code
  Treebank treebank = new DiskTreebank();
  CategoryWordTag.suppressTerminalDetails = true;
  treebank.loadPath(args[0]);
  final HeadFinder chf = new NoPunctuationHeadFinder();
  treebank.apply(pt -> {
    pt.percolateHeads(chf);
    pt.pennPrint();
    System.out.println();
  });
}
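The same load-and-visit pattern generalizes to any TreeVisitor. Below is a minimal sketch (the class name TreebankYieldPrinter is hypothetical) that loads a treebank from disk and prints each tree's word yield instead of percolating heads:

import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Treebank;

public class TreebankYieldPrinter {
  public static void main(String[] args) {
    // DiskTreebank keeps trees on disk and reads them lazily on iteration,
    // so it handles treebanks too large to hold in memory.
    Treebank treebank = new DiskTreebank();
    treebank.loadPath(args[0]); // a file or directory of Penn-bracketed trees
    // Treebank.apply runs a TreeVisitor over every tree in the collection.
    treebank.apply(tree -> System.out.println(tree.yieldWords()));
  }
}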
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The run method of the class TreebankStats:
public void run(boolean pathsAreFiles, boolean displayWords, boolean displayOOV) {
  if (useSplit) {
    List<ObservedCorpusStats> allSplitStats = new ArrayList<>();
    makeVocab = true;
    for (Map.Entry<Split, Set<String>> split : splitFileLists.entrySet()) {
      DiskTreebank tb = tlpp.diskTreebank();
      FileFilter splitFilter = new SplitFilter(split.getValue());
      for (String path : pathNames) tb.loadPath(path, splitFilter);
      ObservedCorpusStats splitStats = gatherStats(tb, languageName.toString() + "." + split.getKey().toString());
      allSplitStats.add(splitStats);
      makeVocab = false;
    }
    display(aggregateStats(allSplitStats), displayWords, displayOOV);
    for (ObservedCorpusStats ocs : allSplitStats) display(ocs, displayWords, displayOOV);
  } else if (pathsAreFiles) {
    makeVocab = true;
    for (String path : pathNames) {
      DiskTreebank tb = tlpp.diskTreebank();
      tb.loadPath(path, pathname -> true);
      ObservedCorpusStats stats = gatherStats(tb, languageName.toString() + " " + path);
      display(stats, displayWords, displayOOV);
      makeVocab = false;
    }
  } else {
    trainVocab = Generics.newHashSet();
    DiskTreebank tb = tlpp.diskTreebank();
    for (String path : pathNames) tb.loadPath(path, pathname -> !pathname.isDirectory());
    ObservedCorpusStats allStats = gatherStats(tb, languageName.toString());
    display(allStats, displayWords, displayOOV);
  }
}
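The two-argument loadPath is what lets the split handling above restrict each pass to one split's files: it takes a plain java.io.FileFilter. A minimal sketch of the same idea, with a hypothetical filter that keeps only .mrg files:

import java.io.FileFilter;
import edu.stanford.nlp.trees.DiskTreebank;

public class MrgOnlyLoader {
  // Hypothetical helper: load only the .mrg files found under a path.
  public static DiskTreebank load(String path) {
    DiskTreebank tb = new DiskTreebank();
    // The filter is consulted for every file encountered while walking the path.
    FileFilter mrgOnly = f -> f.isFile() && f.getName().endsWith(".mrg");
    tb.loadPath(path, mrgOnly);
    return tb;
  }
}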
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The main method of the class RuleBranchingFactor:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage);
    System.exit(-1);
  }
  // Process command-line options
  Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
  String fileName = options.getProperty("");
  if (fileName == null || fileName.equals("")) {
    System.out.println(usage);
    System.exit(-1);
  }
  Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
  TreebankLangParserParams tlpp = language.params;
  String encoding = options.getProperty("e", "UTF-8");
  tlpp.setInputEncoding(encoding);
  tlpp.setOutputEncoding(encoding);
  DiskTreebank tb = tlpp.diskTreebank();
  tb.loadPath(fileName);
  // Statistics
  Counter<String> binaryRuleTypes = new ClassicCounter<>(20000);
  List<Integer> branchingFactors = new ArrayList<>(20000);
  int nTrees = 0;
  int nUnaryRules = 0;
  int nBinaryRules = 0;
  int binaryBranchingFactors = 0;
  // Read the treebank
  PrintWriter pw = tlpp.pw();
  for (Tree tree : tb) {
    if (tree.value().equals("ROOT")) {
      tree = tree.firstChild();
    }
    ++nTrees;
    for (Tree subTree : tree) {
      if (subTree.isPhrasal()) {
        if (subTree.numChildren() > 1) {
          ++nBinaryRules;
          branchingFactors.add(subTree.numChildren());
          binaryBranchingFactors += subTree.numChildren();
          binaryRuleTypes.incrementCount(treeToRuleString(subTree));
        } else {
          ++nUnaryRules;
        }
      }
    }
  }
  double mean = (double) binaryBranchingFactors / (double) nBinaryRules;
  System.out.printf("#trees:\t%d%n", nTrees);
  System.out.printf("#binary:\t%d%n", nBinaryRules);
  System.out.printf("#binary types:\t%d%n", binaryRuleTypes.keySet().size());
  System.out.printf("mean branching:\t%.4f%n", mean);
  System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
  System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(binaryRuleTypes));
  System.out.printf("#unaries:\t%d%n", nUnaryRules);
}
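The standardDeviation helper is referenced but not shown in the excerpt. A plausible implementation, consistent with the mean computed above and assuming java.util.List is imported by the surrounding class (the actual CoreNLP helper may differ in detail):

// Sketch of the helper used above: sample standard deviation of the
// branching factors around the given mean.
private static double standardDeviation(List<Integer> values, double mean) {
  double sumOfSquares = 0.0;
  for (int v : values) {
    double diff = v - mean;
    sumOfSquares += diff * diff;
  }
  // n - 1 for the sample estimate; guard against lists of size 0 or 1.
  return values.size() > 1 ? Math.sqrt(sumOfSquares / (values.size() - 1)) : 0.0;
}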
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The main method of the class ManipulateTopBracket:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage());
    System.exit(-1);
  }
  Properties options = StringUtils.argsToProperties(args, argDefs());
  Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
  TreebankLangParserParams tlpp = language.params;
  String encoding = options.getProperty("e", "UTF-8");
  boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
  tlpp.setInputEncoding(encoding);
  tlpp.setOutputEncoding(encoding);
  DiskTreebank tb = tlpp.diskTreebank();
  String[] files = options.getProperty("", "").split("\\s+");
  if (files.length > 0 && !files[0].isEmpty()) {
    for (String filename : files) {
      tb.loadPath(filename);
    }
  } else {
    log.info(usage());
    System.exit(-1);
  }
  PrintWriter pwo = tlpp.pw();
  String startSymbol = tlpp.treebankLanguagePack().startSymbol();
  TreeFactory tf = new LabeledScoredTreeFactory();
  int nTrees = 0;
  for (Tree t : tb) {
    if (removeBracket) {
      if (t.value().equals(startSymbol)) {
        t = t.firstChild();
      }
    } else if (!t.value().equals(startSymbol)) {
      // Add a bracket if it isn't already there
      t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
    }
    pwo.println(t.toString());
    nTrees++;
  }
  pwo.close();
  System.err.printf("Processed %d trees.%n", nTrees);
}
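The add/remove logic reduces to one TreeFactory call per tree. A self-contained sketch of the same transformation on a tree parsed from a string, using "ROOT" as a stand-in for whatever start symbol the language pack defines:

import java.util.Collections;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class TopBracketDemo {
  public static void main(String[] args) {
    // Tree.valueOf parses a Penn-bracketed string into a Tree.
    Tree t = Tree.valueOf("(S (NP (DT the) (NN cat)) (VP (VBD sat)))");
    TreeFactory tf = new LabeledScoredTreeFactory();
    // Add the top bracket, mirroring the add branch above.
    Tree wrapped = tf.newTreeNode("ROOT", Collections.singletonList(t));
    System.out.println(wrapped);
    // Removing it is the inverse: take the first (only) child.
    System.out.println(wrapped.firstChild());
  }
}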
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The main method of the class PunctFrequencyDist:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  String puncTag = null;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      // Positional arguments arrive as a pair: the punctuation tag, then a treebank path.
      puncTag = args[i++];
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i]);
    }
  }
  Counter<String> puncTypes = new ClassicCounter<>();
  for (Tree t : tb) {
    List<CoreLabel> yield = t.taggedLabeledYield();
    for (CoreLabel word : yield) {
      if (word.tag().equals(puncTag)) {
        puncTypes.incrementCount(word.word());
      }
    }
  }
  List<String> biggestKeys = new ArrayList<>(puncTypes.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));
  PrintWriter pw = tlpp.pw();
  for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) puncTypes.getCount(wordType));
  pw.close();
}
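The closing idiom, counting with a ClassicCounter and sorting keys by descending count, is reusable on its own. A minimal sketch with hypothetical token data:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class CounterSortDemo {
  public static void main(String[] args) {
    Counter<String> counts = new ClassicCounter<>();
    // Hypothetical tokens standing in for the punctuation words above.
    for (String tok : new String[] { ",", ".", ",", "!", "," }) {
      counts.incrementCount(tok);
    }
    // Sort keys by descending count, as the frequency dump above does.
    List<String> keys = new ArrayList<>(counts.keySet());
    Collections.sort(keys, Counters.toComparatorDescending(counts));
    for (String k : keys) {
      System.out.printf("%s\t%d%n", k, (int) counts.getCount(k));
    }
  }
}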