Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The class UNKPrinter, method main:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  Language lang = Language.English;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i]);
    }
  }
  PrintWriter pw = tlpp.pw();
  Options op = new Options();
  Options.LexOptions lexOptions = op.lexOptions;
  if (lang == Language.French) {
    lexOptions.useUnknownWordSignatures = 1;
    lexOptions.smartMutation = false;
    lexOptions.unknownSuffixSize = 2;
    lexOptions.unknownPrefixSize = 1;
  } else if (lang == Language.Arabic) {
    lexOptions.smartMutation = false;
    lexOptions.useUnknownWordSignatures = 9;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 1;
  }
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
  int computeAfter = (int) (0.50 * tb.size());
  Counter<String> vocab = new ClassicCounter<>();
  Counter<String> unkCounter = new ClassicCounter<>();
  int treeId = 0;
  for (Tree t : tb) {
    List<Label> yield = t.yield();
    int posId = 0;
    for (Label word : yield) {
      vocab.incrementCount(word.value());
      if (treeId > computeAfter && vocab.getCount(word.value()) < 2.0)
        // if (lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
        //   pw.println(word.value());
        unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId++));
    }
    treeId++;
  }
  List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
  for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
  pw.close();
}
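All of the tools on this page share one core pattern: obtain a DiskTreebank (directly or via TreebankLangParserParams.diskTreebank()), load a path, and iterate over the trees. Below is a minimal, self-contained sketch of just that pattern, using only calls that appear in the snippets above; the treebank path is a placeholder, and the size-printing loop body is only for illustration.

import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;

public class DiskTreebankDemo {
  public static void main(String[] args) {
    DiskTreebank tb = new DiskTreebank();   // default tree reader (English PTB conventions)
    tb.loadPath("/path/to/treebank");       // placeholder path
    for (Tree t : tb) {                     // trees are read from disk as iterated, not held in memory
      System.out.println(t.yield().size()); // print each sentence's length in tokens
    }
  }
}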
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The class BaseLexicon, method main:
/** Provides some testing and opportunities for exploration of the
 *  probabilities of a BaseLexicon. What's here currently probably
 *  only works for the English Penn Treebank, as it uses default
 *  constructors. Of the words given to test on, the first is treated
 *  as sentence-initial and the rest as not sentence-initial.
 *
 *  @param args The command line arguments:
 *     java BaseLexicon treebankPath fileRange unknownWordModel words*
 */
public static void main(String[] args) {
  if (args.length < 3) {
    log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
    return;
  }
  System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
  Treebank tb = new DiskTreebank();
  tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
  // TODO: change this interface so the lexicon creates its own indices?
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Options op = new Options();
  op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
  BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
  lex.initializeTraining(tb.size());
  lex.train(tb);
  lex.finishTraining();
  System.out.println("done.");
  System.out.println();
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(4);
  List<String> impos = new ArrayList<>();
  for (int i = 3; i < args.length; i++) {
    if (lex.isKnown(args[i])) {
      System.out.println(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
      for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
        IntTaggedWord iTW = it.next();
        System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
      }
    } else {
      String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
      System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
      impos.clear();
      List<String> lis = new ArrayList<>(tagIndex.objectsList());
      Collections.sort(lis);
      for (String tStr : lis) {
        IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
        double score = lex.score(iTW, 1, args[i], null);
        if (score == Float.NEGATIVE_INFINITY) {
          impos.add(tStr);
        } else {
          System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
        }
      }
      if (impos.size() > 0) {
        System.out.println(args[i] + " impossible tags: " + impos);
      }
    }
    System.out.println();
  }
}
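A trained lexicon can also be queried outside of main with the same calls shown above. The fragment below is a hedged sketch, not self-contained: it assumes the lex and wordIndex variables from the training code above, and "dog" is an arbitrary example word.

String w = "dog";  // arbitrary example word, assumed for illustration
if (lex.isKnown(w)) {
  // score every tagging the lexicon knows for this word: log P(w|t)
  for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(w), 0, null); it.hasNext(); ) {
    IntTaggedWord iTW = it.next();
    System.out.println(iTW + "\t" + lex.score(iTW, 0, w, null));
  }
} else {
  // unknown words are handled via the unknown-word model's signature
  System.out.println(lex.getUnknownWordModel().getSignature(w, 0));
}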
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The class VocabFrequency, method main:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i]);
    }
  }
  Counter<String> vocab = new ClassicCounter<>();
  for (Tree t : tb) {
    List<Label> yield = t.yield();
    for (Label word : yield) vocab.incrementCount(word.value());
  }
  List<String> biggestKeys = new ArrayList<>(vocab.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(vocab));
  PrintWriter pw = tlpp.pw();
  for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) vocab.getCount(wordType));
  pw.close();
}
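The count-then-rank idiom used here (a ClassicCounter plus Counters.toComparatorDescending) is independent of treebanks. A minimal, self-contained sketch with made-up tokens:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CounterDemo {
  public static void main(String[] args) {
    Counter<String> counts = new ClassicCounter<>();
    for (String tok : new String[] { "the", "dog", "the", "cat", "the" }) {
      counts.incrementCount(tok);  // yields the=3.0, dog=1.0, cat=1.0
    }
    List<String> keys = new ArrayList<>(counts.keySet());
    Collections.sort(keys, Counters.toComparatorDescending(counts));  // most frequent first
    for (String k : keys) {
      System.out.printf("%s\t%d%n", k, (int) counts.getCount(k));
    }
  }
}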
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The class CountTrees, method main:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage);
    System.exit(-1);
  }
  // Process command-line options
  Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
  String fileName = options.getProperty("");
  if (fileName == null || fileName.equals("")) {
    System.out.println(usage);
    System.exit(-1);
  }
  int maxLen = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
  boolean printTrees = PropertiesUtils.getBool(options, "p", false);
  boolean flattenTrees = PropertiesUtils.getBool(options, "f", false);
  boolean printPOS = PropertiesUtils.getBool(options, "a", false);
  boolean printTnT = PropertiesUtils.getBool(options, "t", false);
  Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
  TreebankLangParserParams tlpp = language.params;
  String encoding = options.getProperty("e", "UTF-8");
  tlpp.setInputEncoding(encoding);
  tlpp.setOutputEncoding(encoding);
  DiskTreebank tb = tlpp.diskTreebank();
  tb.loadPath(fileName);
  // Read the treebank
  PrintWriter pw = tlpp.pw();
  int numTrees = 0;
  for (Tree tree : tb) {
    if (tree.yield().size() > maxLen)
      continue;
    ++numTrees;
    if (printTrees) {
      pw.println(tree.toString());
    } else if (flattenTrees) {
      pw.println(SentenceUtils.listToString(tree.yield()));
    } else if (printPOS) {
      pw.println(SentenceUtils.listToString(tree.preTerminalYield()));
    } else if (printTnT) {
      List<CoreLabel> yield = tree.taggedLabeledYield();
      for (CoreLabel label : yield) {
        pw.printf("%s\t%s%n", label.word(), label.tag());
      }
      pw.println();
    }
  }
  System.err.printf("Read %d trees.%n", numTrees);
}
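The -t (TnT) branch flattens each tree into tab-separated word/tag pairs via taggedLabeledYield(). The same calls work on any Tree; below is a self-contained sketch using a literal bracketed tree (the sentence is made up; Tree.valueOf parses a Penn-bracketed string):

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import java.util.List;

public class TnTDemo {
  public static void main(String[] args) {
    Tree tree = Tree.valueOf("(S (NP (DT the) (NN dog)) (VP (VBD barked)))");
    List<CoreLabel> yield = tree.taggedLabeledYield();  // one CoreLabel per token
    for (CoreLabel label : yield) {
      System.out.printf("%s\t%s%n", label.word(), label.tag());  // e.g. "the<TAB>DT"
    }
  }
}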
Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.
The class RHSFrequency, method main:
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      rootMatch = TregexPattern.compile("@" + args[i++]);
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i++]);
    }
  }
  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) sb.append(kid.value()).append(" ");
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }
  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
  PrintWriter pw = tlpp.pw();
  for (String rhs : biggestKeys) pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
  pw.close();
}
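The Tregex calls above (compile, matcher, findNextMatchingNode, getMatch) work on any Tree, not just treebank trees. A self-contained sketch that collects the right-hand side of each NP in a literal tree (the tree itself is made up):

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

public class TregexDemo {
  public static void main(String[] args) {
    Tree t = Tree.valueOf("(S (NP (DT the) (NN dog)) (VP (VBD barked)))");
    TregexPattern pat = TregexPattern.compile("@NP");  // match nodes whose basic category is NP
    TregexMatcher m = pat.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) sb.append(kid.value()).append(' ');
      System.out.println("NP -> " + sb.toString().trim());  // prints "NP -> DT NN"
    }
  }
}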