Search in sources :

Example 6 with MorphoFeatureSpecification

Use of edu.stanford.nlp.international.morph.MorphoFeatureSpecification in the project CoreNLP by stanfordnlp.

The main method of the class FactoredLexicon.

/**
 * Command-line entry point: trains a {@code FactoredLexicon} on a training treebank and
 * reports how often its best-scoring tag matches the gold tag on a development treebank.
 *
 * <p>Only {@code Language.Arabic} and {@code Language.French} are handled; any other
 * language results in an {@link UnsupportedOperationException}. Progress messages go to
 * standard output; the accuracy figure and the per-tag error breakdown go to standard
 * error and the logger.
 *
 * @param args exactly four arguments, in order: the language name (parsed with
 *     {@code Language.valueOf}), a comma-separated list of {@code MorphoFeatureType}
 *     names to activate (whitespace around each name is ignored), the training treebank
 *     path, and the development treebank path. With any other argument count a usage
 *     message is printed and the JVM exits with status -1.
 * @throws UnsupportedOperationException if the language is neither Arabic nor French
 */
public static void main(String[] args) {
    if (args.length != 4) {
        System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
        System.exit(-1);
    }
    // Command line options
    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    Treebank trainTreebank = tlpp.diskTreebank();
    trainTreebank.loadPath(args[2]);
    Treebank devTreebank = tlpp.diskTreebank();
    devTreebank.loadPath(args[3]);
    MorphoFeatureSpecification morphoSpec;
    Options options = getOptions(language);
    if (language.equals(Language.Arabic)) {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        String[] languageOptions = { "-arabicFactored" };
        tlpp.setOptionFlag(languageOptions, 0);
    } else if (language.equals(Language.French)) {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        String[] languageOptions = { "-frenchFactored" };
        tlpp.setOptionFlag(languageOptions, 0);
    } else {
        // Name the offending language so the failure is diagnosable from the stack trace alone.
        throw new UnsupportedOperationException("Unsupported language: " + language + " (expected Arabic or French)");
    }
    String featureList = args[1];
    String[] features = featureList.trim().split(",");
    for (String feature : features) {
        // Trim each token: valueOf needs the exact constant name, so "NUM, GEN" would
        // otherwise fail on " GEN".
        morphoSpec.activate(MorphoFeatureType.valueOf(feature.trim()));
    }
    System.out.println("Language: " + language);
    System.out.println("Features: " + args[1]);
    // Save trees in a collection since the interface requires that....
    System.out.print("Loading training trees...");
    List<Tree> trainTrees = new ArrayList<>(19000);
    // The indices start empty and are handed to the lexicon below; this method never
    // adds entries to them directly.
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    for (Tree tree : trainTreebank) {
        // Apply the language-specific transformation to every non-leaf node of the tree.
        for (Tree subTree : tree) {
            if (!subTree.isLeaf()) {
                tlpp.transformTree(subTree, tree);
            }
        }
        trainTrees.add(tree);
    }
    System.out.printf("Done! (%d trees)%n", trainTrees.size());
    // Setup and train the lexicon.
    System.out.print("Collecting sufficient statistics for lexicon...");
    FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.initializeTraining(trainTrees.size());
    lexicon.train(trainTrees, null);
    lexicon.finishTraining();
    System.out.println("Done!");
    // Drop the reference so the training trees can be reclaimed before the tuning set is loaded.
    trainTrees = null;
    // Load the tuning set
    System.out.print("Loading tuning set...");
    List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
    System.out.printf("...Done! (%d events)%n", tuningSet.size());
    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    // Error tally keyed by gold tag ("UNSEEN" when the gold tag was never proposed).
    Counter<String> errors = new ClassicCounter<>();
    for (FactoredLexiconEvent event : tuningSet) {
        Iterator<IntTaggedWord> itr = lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
        Counter<Integer> logScores = new ClassicCounter<>();
        boolean noRules = true;
        // Stays -1 unless one of the candidate taggings carries the gold tag.
        int goldTagId = -1;
        while (itr.hasNext()) {
            noRules = false;
            IntTaggedWord iTW = itr.next();
            if (iTW.tag() == event.tagId()) {
                log.info("GOLD-");
                goldTagId = iTW.tag();
            }
            float tagScore = lexicon.score(iTW, event.getLoc(), event.word(), event.featureStr());
            logScores.incrementCount(iTW.tag(), tagScore);
        }
        if (noRules) {
            System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
        } else {
            // Score the tagging: the hypothesis is the tag with the highest accumulated score.
            int hypTagId = Counters.argmax(logScores);
            if (hypTagId == goldTagId) {
                ++nCorrect;
            } else {
                String goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.get(goldTagId);
                errors.incrementCount(goldTag);
            }
        }
        // NOTE(review): presumably terminates the log output for this event — confirm.
        log.info();
    }
    // Output accuracy. The denominator is every tuning event, including those that
    // produced no taggings, so an empty tuning set prints NaN.
    double acc = (double) nCorrect / (double) tuningSet.size();
    System.err.printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.info("% of errors by type:");
    List<String> biggestKeys = new ArrayList<>(errors.keySet());
    // NOTE(review): presumably orders keys by descending error count — confirm against
    // Counters.toComparator. The sort runs on raw counts, before normalization.
    Collections.sort(biggestKeys, Counters.toComparator(errors, false, true));
    Counters.normalize(errors);
    for (String key : biggestKeys) {
        System.err.printf("%s\t%.2f%n", key, errors.getCount(key) * 100.0);
    }
}
Also used : FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) Treebank(edu.stanford.nlp.trees.Treebank) ArrayList(java.util.ArrayList) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) HashIndex(edu.stanford.nlp.util.HashIndex) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter)

Aggregations

MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification)6 MorphoFeatures (edu.stanford.nlp.international.morph.MorphoFeatures)4 ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification)3 FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification)3 Language (edu.stanford.nlp.international.Language)2 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 Label (edu.stanford.nlp.ling.Label)2 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)2 Tree (edu.stanford.nlp.trees.Tree)2 Treebank (edu.stanford.nlp.trees.Treebank)2 ArrayList (java.util.ArrayList)2 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)1 TwoDimensionalIntCounter (edu.stanford.nlp.stats.TwoDimensionalIntCounter)1 HashIndex (edu.stanford.nlp.util.HashIndex)1 BufferedReader (java.io.BufferedReader)1 FileNotFoundException (java.io.FileNotFoundException)1 FileReader (java.io.FileReader)1 IOException (java.io.IOException)1 Map (java.util.Map)1 Set (java.util.Set)1