Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp.
The main method of the class FactoredLexicon.
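The method below is a small training-and-evaluation driver: it loads a training and a development treebank for a supported language (Arabic or French), trains a FactoredLexicon over the selected morphological features, and then reports per-word tagging accuracy and an error breakdown on the development set. Two HashIndex instances supply the word and tag indices that the lexicon requires.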
// Imports required by this method (package paths as in the CoreNLP source tree):
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

// Note: log (the class logger), getOptions(Language), and getTuningSet(...)
// are other static members of FactoredLexicon not shown in this snippet.

/**
 * Trains a factored lexicon on a treebank and evaluates its tagging accuracy
 * on a development set.
 *
 * @param args language, comma-separated feature list, train_file, dev_file
 */
public static void main(String[] args) {
  if (args.length != 4) {
    System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
    System.exit(-1);
  }
  // Command line options
  Language language = Language.valueOf(args[0]);
  TreebankLangParserParams tlpp = language.params;
  Treebank trainTreebank = tlpp.diskTreebank();
  trainTreebank.loadPath(args[2]);
  Treebank devTreebank = tlpp.diskTreebank();
  devTreebank.loadPath(args[3]);
  MorphoFeatureSpecification morphoSpec;
  Options options = getOptions(language);
  if (language.equals(Language.Arabic)) {
    morphoSpec = new ArabicMorphoFeatureSpecification();
    String[] languageOptions = { "-arabicFactored" };
    tlpp.setOptionFlag(languageOptions, 0);
  } else if (language.equals(Language.French)) {
    morphoSpec = new FrenchMorphoFeatureSpecification();
    String[] languageOptions = { "-frenchFactored" };
    tlpp.setOptionFlag(languageOptions, 0);
  } else {
    throw new UnsupportedOperationException();
  }
  String featureList = args[1];
  String[] features = featureList.trim().split(",");
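  // Each feature name must match a MorphoFeatureType enum constant;
  // MorphoFeatureType.valueOf throws IllegalArgumentException otherwise.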
  for (String feature : features) {
    morphoSpec.activate(MorphoFeatureType.valueOf(feature));
  }
  System.out.println("Language: " + language.toString());
  System.out.println("Features: " + args[1]);
  // Create word and tag indices
  // Save the trees in a collection, since the training interface requires one
  System.out.print("Loading training trees...");
  List<Tree> trainTrees = new ArrayList<>(19000);
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
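  // Normalize each non-leaf node with the language pack's transformTree
  // before collecting the tree for training.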
  for (Tree tree : trainTreebank) {
    for (Tree subTree : tree) {
      if (!subTree.isLeaf()) {
        tlpp.transformTree(subTree, tree);
      }
    }
    trainTrees.add(tree);
  }
  System.out.printf("Done! (%d trees)%n", trainTrees.size());
  // Set up and train the lexicon.
  System.out.print("Collecting sufficient statistics for lexicon...");
  FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
  lexicon.initializeTraining(trainTrees.size());
  lexicon.train(trainTrees, null);
  lexicon.finishTraining();
  System.out.println("Done!");
  trainTrees = null;
  // Load the tuning set
  System.out.print("Loading tuning set...");
  List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
  System.out.printf("...Done! (%d events)%n", tuningSet.size());
  // Print the probabilities that we obtain
  // TODO(spenceg): Implement tagging accuracy with FactLex
  int nCorrect = 0;
  Counter<String> errors = new ClassicCounter<>();
  for (FactoredLexiconEvent event : tuningSet) {
    Iterator<IntTaggedWord> itr = lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
    Counter<Integer> logScores = new ClassicCounter<>();
    boolean noRules = true;
    int goldTagId = -1;
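    // Score every tagging the lexicon licenses for this word; remember the
    // gold tag's id if the lexicon proposes it.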
    while (itr.hasNext()) {
      noRules = false;
      IntTaggedWord iTW = itr.next();
      if (iTW.tag() == event.tagId()) {
        log.info("GOLD-");
        goldTagId = iTW.tag();
      }
      float tagScore = lexicon.score(iTW, event.getLoc(), event.word(), event.featureStr());
      logScores.incrementCount(iTW.tag(), tagScore);
    }
    if (noRules) {
      System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
    } else {
      // Score the tagging
      int hypTagId = Counters.argmax(logScores);
      if (hypTagId == goldTagId) {
        ++nCorrect;
      } else {
        String goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.get(goldTagId);
        errors.incrementCount(goldTag);
      }
    }
    log.info();
  }
  // Output accuracy
  double acc = (double) nCorrect / (double) tuningSet.size();
  System.err.printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
  log.info("% of errors by type:");
  List<String> biggestKeys = new ArrayList<>(errors.keySet());
  Collections.sort(biggestKeys, Counters.toComparator(errors, false, true));
  Counters.normalize(errors);
  for (String key : biggestKeys) {
    System.err.printf("%s\t%.2f%n", key, errors.getCount(key) * 100.0);
  }
}
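For reference, here is a minimal sketch of how this entry point could be invoked programmatically. The class name FactoredLexiconDemo and the treebank paths are hypothetical placeholders, and GEN and NUM are used only as example MorphoFeatureType constant names; substitute values appropriate to your data.

import edu.stanford.nlp.parser.lexparser.FactoredLexicon;

public class FactoredLexiconDemo {
  public static void main(String[] args) {
    // Equivalent to: java ...FactoredLexicon French GEN,NUM <train> <dev>
    FactoredLexicon.main(new String[] {
        "French",                   // language: this driver supports Arabic or French
        "GEN,NUM",                  // comma-separated MorphoFeatureType names (examples)
        "/path/to/train.treebank",  // hypothetical training treebank path
        "/path/to/dev.treebank"     // hypothetical dev/tuning treebank path
    });
  }
}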