
Example 1 with ArabicMorphoFeatureSpecification

Use of edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification in project CoreNLP by stanfordnlp.

From the class TreebankFactoredLexiconStats, method main:

//  private static String stripTag(String tag) {
//    if (tag.startsWith("DT")) {
//      String newTag = tag.substring(2, tag.length());
//      return newTag.length() > 0 ? newTag : tag;
//    }
//    return tag;
//  }
/**
   * @param args language, treebank filename, and comma-separated feature list (see the usage message)
   */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
        System.exit(-1);
    }
    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    if (language.equals(Language.Arabic)) {
        String[] options = { "-arabicFactored" };
        tlpp.setOptionFlag(options, 0);
    } else {
        String[] options = { "-frenchFactored" };
        tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    String[] features = args[2].trim().split(",");
    for (String feature : features) {
        morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }
    // Counters
    Counter<String> wordTagCounter = new ClassicCounter<>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<>(500);
    //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<>(500);
    Counter<String> wordCounter = new ClassicCounter<>(30000);
    Counter<String> tagCounter = new ClassicCounter<>(300);
    Counter<String> lemmaCounter = new ClassicCounter<>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
    Counter<String> richTagCounter = new ClassicCounter<>(1000);
    Counter<String> reducedTagCounter = new ClassicCounter<>(500);
    Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
    Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
    TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
    TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
    TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);
    int numTrees = 0;
    for (Tree tree : tb) {
        for (Tree subTree : tree) {
            if (!subTree.isLeaf()) {
                tlpp.transformTree(subTree, tree);
            }
        }
        List<Label> pretermList = tree.preTerminalYield();
        List<Label> yield = tree.yield();
        assert yield.size() == pretermList.size();
        int yieldLen = yield.size();
        for (int i = 0; i < yieldLen; ++i) {
            String tag = pretermList.get(i).value();
            String word = yield.get(i).value();
            String morph = ((CoreLabel) yield.get(i)).originalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
            String lemma = lemmaTag.first();
            String richTag = lemmaTag.second();
            // WSGDEBUG
            if (tag.contains("MW"))
                lemma += "-MWE";
            lemmaCounter.incrementCount(lemma);
            lemmaTagCounter.incrementCount(lemma + tag);
            richTagCounter.incrementCount(richTag);
            String reducedTag = morphoSpec.strToFeatures(richTag).toString();
            reducedTagCounter.incrementCount(reducedTag);
            reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
            wordTagCounter.incrementCount(word + tag);
            morphTagCounter.incrementCount(morph + tag);
            morphCounter.incrementCount(morph);
            wordCounter.incrementCount(word);
            tagCounter.incrementCount(tag);
            reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            if (wordLemmaMap.containsKey(word)) {
                wordLemmaMap.get(word).add(lemma);
            } else {
                Set<String> lemmas = Generics.newHashSet(1);
                // Record the current lemma; otherwise the set stays empty and the word is
                // later reported under "NO LEMMAS FOR WORD".
                lemmas.add(lemma);
                wordLemmaMap.put(word, lemmas);
            }
            lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
            reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.incrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.out.println("Language: " + language.toString());
    System.out.printf("#trees:\t%d%n", numTrees);
    System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
    System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
    System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
    System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
    System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
    System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
    System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
    System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
    System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
    System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
    System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());
    // Extra
    System.out.println("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
        String word = wordLemmas.getKey();
        Set<String> lemmas = wordLemmas.getValue();
        if (lemmas.size() == 0) {
            sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.size() > 1) {
            sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
            continue;
        }
        String lemma = lemmas.iterator().next();
        Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
        if (reducedTags.size() > 1) {
            System.out.printf("%s --> %s%n", word, lemma);
            for (String reducedTag : reducedTags) {
                int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
                String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
                System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.out.println();
        }
    }
    System.out.println("==================");
    System.out.println(sbNoLemma.toString());
    System.out.println(sbMultLemmas.toString());
    System.out.println("==================");
    List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
    Collections.sort(tags);
    for (String tag : tags) {
        System.out.println(tag);
        Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
        for (String reducedTag : reducedTags) {
            int count = tagReducedTagCounter.getCount(tag, reducedTag);
            //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.out.printf("\t%s\t%d%n", reducedTag, count);
        }
        System.out.println();
    }
    System.out.println("==================");
}
Also used: ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification), FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification), MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification), Language (edu.stanford.nlp.international.Language), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), Treebank (edu.stanford.nlp.trees.Treebank), Tree (edu.stanford.nlp.trees.Tree), Label (edu.stanford.nlp.ling.Label), CoreLabel (edu.stanford.nlp.ling.CoreLabel), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), TwoDimensionalIntCounter (edu.stanford.nlp.stats.TwoDimensionalIntCounter), ArrayList (java.util.ArrayList), Map (java.util.Map), Set (java.util.Set)
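
For reference, here is a minimal standalone sketch of the feature-activation and tag-reduction pattern used above. It relies only on calls that appear in the example (activate, strToFeatures, MorphoFeatureType.valueOf); the class name MorphoSpecSketch and the rich tag value are hypothetical placeholders, and MorphoFeatureType is assumed to be the nested enum of MorphoFeatureSpecification.

import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
// MorphoFeatureType is assumed to be nested inside MorphoFeatureSpecification.
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;

public class MorphoSpecSketch {
    public static void main(String[] args) {
        // Build an Arabic feature specification and activate a comma-separated feature list,
        // mirroring how the statistics code parses args[2].
        MorphoFeatureSpecification spec = new ArabicMorphoFeatureSpecification();
        for (String feature : "NGEN,NNUM,DEF".split(",")) {
            spec.activate(MorphoFeatureType.valueOf(feature));
        }
        // Reduce a rich morphological tag to the active features only.
        // The tag value below is a made-up placeholder, not a real treebank analysis.
        String richTag = "DET+NOUN+NSUFF_FEM_SG";
        String reducedTag = spec.strToFeatures(richTag).toString();
        System.out.println(richTag + " -> " + (reducedTag.isEmpty() ? "NONE" : reducedTag));
    }
}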

Example 2 with ArabicMorphoFeatureSpecification

Use of edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification in project CoreNLP by stanfordnlp.

From the class IOBUtils, method tokenToDatums:

/**
   * Convert a token to a sequence of datums and add them to iobList.
   *
   * @param iobList
   * @param cl
   * @param token
   * @param tokType
   * @param tokenLabel
   * @param lastToken
   * @param applyRewriteRules
   * @param stripRewrites
   * @param tf a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)
   * @param origText the original string before tokenization (for determining original segment boundaries)
   */
private static void tokenToDatums(List<CoreLabel> iobList, CoreLabel cl, String token, TokenType tokType, CoreLabel tokenLabel, String lastToken, boolean applyRewriteRules, boolean stripRewrites, TokenizerFactory<CoreLabel> tf, String origText) {
    if (token.isEmpty())
        return;
    String lastLabel = ContinuationSymbol;
    String firstLabel = BeginSymbol;
    String rewritten = cl.get(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation.class);
    boolean crossRefRewrites = true;
    if (rewritten == null) {
        rewritten = token;
        crossRefRewrites = false;
    } else {
        rewritten = stripSegmentationMarkers(rewritten, tokType);
    }
    if (applyRewriteRules) {
        // Apply Arabic-specific re-write rules
        String rawToken = tokenLabel.word();
        String tag = tokenLabel.tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.activate(MorphoFeatureType.NGEN);
        featureSpec.activate(MorphoFeatureType.NNUM);
        featureSpec.activate(MorphoFeatureType.DEF);
        featureSpec.activate(MorphoFeatureType.TENSE);
        MorphoFeatures features = featureSpec.strToFeatures(tag);
        // Rule #1 : ت --> ة
        if (features.getValue(MorphoFeatureType.NGEN).equals("F") && features.getValue(MorphoFeatureType.NNUM).equals("SG") && rawToken.endsWith("ت-") && !stripRewrites) {
            lastLabel = RewriteSymbol;
        } else if (rawToken.endsWith("ة-")) {
            assert token.endsWith("ة");
            token = token.substring(0, token.length() - 1) + "ت";
            lastLabel = RewriteSymbol;
        }
        // Rule #2 : لل --> ل ال
        if (lastToken.equals("ل") && features.getValue(MorphoFeatureType.DEF).equals("D")) {
            if (rawToken.startsWith("-ال")) {
                if (!token.startsWith("ا"))
                    log.info("Bad REWAL: " + rawToken + " / " + token);
                token = token.substring(1);
                rewritten = rewritten.substring(1);
                if (!stripRewrites)
                    firstLabel = RewriteSymbol;
            } else if (rawToken.startsWith("-ل")) {
                if (!token.startsWith("ل"))
                    log.info("Bad REWAL: " + rawToken + " / " + token);
                if (!stripRewrites)
                    firstLabel = RewriteSymbol;
            } else {
                log.info("Ignoring REWAL: " + rawToken + " / " + token);
            }
        }
        // Rule #4 : ا --> ى
        if (rawToken.endsWith("ى-")) {
            if (features.getValue(MorphoFeatureType.TENSE) != null) {
                // verb: ى becomes ا
                token = token.substring(0, token.length() - 1) + "ا";
            } else {
                // assume preposition:
                token = token.substring(0, token.length() - 1) + "ي";
            }
            if (!stripRewrites)
                lastLabel = RewriteSymbol;
        } else if (rawToken.equals("علي-") || rawToken.equals("-علي-")) {
            if (!stripRewrites)
                lastLabel = RewriteSymbol;
        }
    }
    String origWord;
    if (origText == null) {
        origWord = tokenLabel.word();
    } else {
        origWord = origText.substring(cl.beginPosition(), cl.endPosition());
    }
    int origIndex = 0;
    while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
        ++origIndex;
    }
    // Create datums and add to iobList
    if (token.isEmpty())
        log.info("Rewriting resulted in empty token: " + tokenLabel.word());
    String firstChar = String.valueOf(token.charAt(0));
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.add(createDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    final int numChars = token.length();
    if (crossRefRewrites && rewritten.length() != numChars) {
        System.err.printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }
    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex) {
        while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
            ++origIndex;
        }
        if (origIndex >= origWord.length()) {
            origIndex = origWord.length() - 1;
        }
        String charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        String thisChar = String.valueOf(token.charAt(j));
        if (crossRefRewrites && !String.valueOf(rewritten.charAt(j)).equals(thisChar))
            charLabel = RewriteSymbol;
        if (ContinuationSymbol.equals(charLabel) && thisChar.equals("ى") && j != numChars - 1)
            // Assume all mid-word alef maqsura are supposed to be yah
            charLabel = RewriteSymbol;
        iobList.add(createDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }
    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (!iobList.isEmpty()) {
        iobList.get(iobList.size() - 1).setEndPosition(cl.endPosition());
    }
}
Also used: ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification), MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification), MorphoFeatures (edu.stanford.nlp.international.morph.MorphoFeatures)
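
As a small self-contained sketch of the morphological lookup that gates Rule #1 above: the class name RewriteRuleGateSketch and the POS tag string are hypothetical placeholders (in tokenToDatums the tag comes from tokenLabel.tag()), and MorphoFeatureType is assumed to be the nested enum of MorphoFeatureSpecification.

import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.international.morph.MorphoFeatures;

public class RewriteRuleGateSketch {
    public static void main(String[] args) {
        // Activate the same feature types that tokenToDatums activates before applying rewrite rules.
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.activate(MorphoFeatureType.NGEN);
        featureSpec.activate(MorphoFeatureType.NNUM);
        featureSpec.activate(MorphoFeatureType.DEF);
        featureSpec.activate(MorphoFeatureType.TENSE);
        // The tag below is a made-up placeholder, not a real Arabic treebank tag.
        MorphoFeatures features = featureSpec.strToFeatures("NOUN+NSUFF_FEM_SG");
        // Null-safe version of the feminine-singular check that triggers the ت/ة rewrite (Rule #1).
        boolean feminineSingular = "F".equals(features.getValue(MorphoFeatureType.NGEN))
                && "SG".equals(features.getValue(MorphoFeatureType.NNUM));
        System.out.println("feminine singular? " + feminineSingular);
    }
}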

Example 3 with ArabicMorphoFeatureSpecification

Use of edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification in project CoreNLP by stanfordnlp.

From the class FactoredLexicon, method main:

/**
   * @param args language, comma-separated feature list, training treebank, and development treebank (see the usage message)
   */
public static void main(String[] args) {
    if (args.length != 4) {
        System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
        System.exit(-1);
    }
    // Command line options
    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    Treebank trainTreebank = tlpp.diskTreebank();
    trainTreebank.loadPath(args[2]);
    Treebank devTreebank = tlpp.diskTreebank();
    devTreebank.loadPath(args[3]);
    MorphoFeatureSpecification morphoSpec;
    Options options = getOptions(language);
    if (language.equals(Language.Arabic)) {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        String[] languageOptions = { "-arabicFactored" };
        tlpp.setOptionFlag(languageOptions, 0);
    } else if (language.equals(Language.French)) {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        String[] languageOptions = { "-frenchFactored" };
        tlpp.setOptionFlag(languageOptions, 0);
    } else {
        throw new UnsupportedOperationException();
    }
    String featureList = args[1];
    String[] features = featureList.trim().split(",");
    for (String feature : features) {
        morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }
    System.out.println("Language: " + language.toString());
    System.out.println("Features: " + args[1]);
    // Create word and tag indices
    // Save trees in a collection since the interface requires that....
    System.out.print("Loading training trees...");
    List<Tree> trainTrees = new ArrayList<>(19000);
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    for (Tree tree : trainTreebank) {
        for (Tree subTree : tree) {
            if (!subTree.isLeaf()) {
                tlpp.transformTree(subTree, tree);
            }
        }
        trainTrees.add(tree);
    }
    System.out.printf("Done! (%d trees)%n", trainTrees.size());
    // Setup and train the lexicon.
    System.out.print("Collecting sufficient statistics for lexicon...");
    FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.initializeTraining(trainTrees.size());
    lexicon.train(trainTrees, null);
    lexicon.finishTraining();
    System.out.println("Done!");
    trainTrees = null;
    // Load the tuning set
    System.out.print("Loading tuning set...");
    List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
    System.out.printf("...Done! (%d events)%n", tuningSet.size());
    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    Counter<String> errors = new ClassicCounter<>();
    for (FactoredLexiconEvent event : tuningSet) {
        Iterator<IntTaggedWord> itr = lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
        Counter<Integer> logScores = new ClassicCounter<>();
        boolean noRules = true;
        int goldTagId = -1;
        while (itr.hasNext()) {
            noRules = false;
            IntTaggedWord iTW = itr.next();
            if (iTW.tag() == event.tagId()) {
                log.info("GOLD-");
                goldTagId = iTW.tag();
            }
            float tagScore = lexicon.score(iTW, event.getLoc(), event.word(), event.featureStr());
            logScores.incrementCount(iTW.tag(), tagScore);
        }
        if (noRules) {
            System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
        } else {
            // Score the tagging
            int hypTagId = Counters.argmax(logScores);
            if (hypTagId == goldTagId) {
                ++nCorrect;
            } else {
                String goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.get(goldTagId);
                errors.incrementCount(goldTag);
            }
        }
        log.info();
    }
    // Output accuracy
    double acc = (double) nCorrect / (double) tuningSet.size();
    System.err.printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.info("% of errors by type:");
    List<String> biggestKeys = new ArrayList<>(errors.keySet());
    Collections.sort(biggestKeys, Counters.toComparator(errors, false, true));
    Counters.normalize(errors);
    for (String key : biggestKeys) {
        System.err.printf("%s\t%.2f%n", key, errors.getCount(key) * 100.0);
    }
}
Also used: ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification), FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification), MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification), Language (edu.stanford.nlp.international.Language), Treebank (edu.stanford.nlp.trees.Treebank), Tree (edu.stanford.nlp.trees.Tree), HashIndex (edu.stanford.nlp.util.HashIndex), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), ArrayList (java.util.ArrayList)
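
A hypothetical driver showing how the command-line contract of this main might be exercised programmatically. The driver class name and file paths are placeholders, the feature names are the enum constants used in the examples above, and the package of FactoredLexicon is an assumption.

// FactoredLexicon is assumed to live in edu.stanford.nlp.parser.lexparser.
import edu.stanford.nlp.parser.lexparser.FactoredLexicon;

public class FactoredLexiconDriverSketch {
    public static void main(String[] args) {
        // Equivalent to:
        //   java FactoredLexicon Arabic NGEN,NNUM,DEF /path/to/train.trees /path/to/dev.trees
        FactoredLexicon.main(new String[] {
            "Arabic",
            "NGEN,NNUM,DEF",
            "/path/to/train.trees",   // placeholder training treebank
            "/path/to/dev.trees"      // placeholder tuning treebank
        });
    }
}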

Aggregations

ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification): 3
MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification): 3
Language (edu.stanford.nlp.international.Language): 2
FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification): 2
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2
Tree (edu.stanford.nlp.trees.Tree): 2
Treebank (edu.stanford.nlp.trees.Treebank): 2
ArrayList (java.util.ArrayList): 2
MorphoFeatures (edu.stanford.nlp.international.morph.MorphoFeatures): 1
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 1
Label (edu.stanford.nlp.ling.Label): 1
TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 1
TwoDimensionalIntCounter (edu.stanford.nlp.stats.TwoDimensionalIntCounter): 1
HashIndex (edu.stanford.nlp.util.HashIndex): 1
Map (java.util.Map): 1
Set (java.util.Set): 1