Use of edu.stanford.nlp.parser.lexparser.Lexicon in project CoreNLP by stanfordnlp.
The class UNKPrinter, method main.
/**
 * Tallies unknown-word signatures over a treebank and prints them, one
 * {@code signature<TAB>count} pair per line, ordered by the comparator from
 * {@code Counters.toComparatorDescending}.
 *
 * <p>Only trees in the second half of the treebank (by iteration order) contribute,
 * and within those only tokens whose word form is being seen for the first time
 * across the whole treebank.
 *
 * <p>Recognised arguments:
 * <ul>
 *   <li>{@code -l <language>}: a {@code Language} constant name; selects the parser params.</li>
 *   <li>{@code -e <encoding>}: input/output encoding (default {@code UTF-8}). It is applied
 *       when the first treebank path is seen, so it must come before that path.</li>
 *   <li>any other argument not starting with {@code -}: a treebank path to load.</li>
 * </ul>
 * The usage message is printed and the JVM exits with status -1 when there are too few
 * arguments, an option is unknown or lacks its value, or no treebank path is given.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  Language lang = Language.English;

  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      // Every supported option consumes the following argument; a trailing option
      // would otherwise index past the end of the array.
      if (i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
      }
      switch (args[i]) {
        case "-l":
          lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          break;
      }
    } else {
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          // The treebank is created lazily on the first path so that any -l / -e
          // options given before it are taken into account.
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i]);
    }
  }

  // Only options were supplied: there is nothing to read.
  if (tb == null) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  Options op = new Options();
  Options.LexOptions lexOptions = op.lexOptions;
  if (lang == Language.French) {
    lexOptions.useUnknownWordSignatures = 1;
    lexOptions.smartMutation = false;
    lexOptions.unknownSuffixSize = 2;
    lexOptions.unknownPrefixSize = 1;
  } else if (lang == Language.Arabic) {
    lexOptions.smartMutation = false;
    lexOptions.useUnknownWordSignatures = 9;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 1;
  }

  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);

  // Trees with an index at or below this threshold only feed the vocabulary.
  int computeAfter = (int) (0.50 * tb.size());
  Counter<String> vocab = new ClassicCounter<>();
  Counter<String> unkCounter = new ClassicCounter<>();

  int treeId = 0;
  for (Tree t : tb) {
    List<Label> yield = t.yield();
    int posId = 0;
    for (Label word : yield) {
      String form = word.value();
      vocab.incrementCount(form);
      // A count below 2 right after the increment means this is the first occurrence.
      if (treeId > computeAfter && vocab.getCount(form) < 2.0) {
        unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(form, posId));
      }
      // Advance for every token so posId is the word's real index in the yield,
      // not the number of signatures computed so far in this tree.
      posId++;
    }
    treeId++;
  }

  List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));

  // try-with-resources closes the writer exactly once, even if printing fails.
  try (PrintWriter pw = tlpp.pw()) {
    for (String wordType : biggestKeys) {
      pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
    }
  }
}
Aggregations