Search in sources :

Example 1 with GeneralizedCounter

Use of edu.stanford.nlp.stats.GeneralizedCounter in the project CoreNLP by stanfordnlp.

From the class ChineseCharacterBasedLexicon, the method finishTraining:

/**
 * Completes training by converting the collected {@code trainingSentences}
 * into the character-level distributions stored on this object.
 *
 * <p>The work happens in three stages:
 * <ol>
 *   <li>Count every canonical character symbol (plus one {@code END_WORD} per
 *       word) and record the rare symbols returned by
 *       {@code Counters.keysBelow(counts, 1.5)}; {@code knownChars} is set to
 *       a copy of the counted key set.</li>
 *   <li>Walk the sentences again and, for every character position (including
 *       the end-of-word position), increment tag-conditioned n-gram counters
 *       for histories of length 0 to {@code CONTEXT_LENGTH}. Rare symbols are
 *       additionally counted under their unknown-character class, and rare
 *       history symbols are replaced by that class.</li>
 *   <li>Build {@code POSDistribution} from the tag counts, a smoothed prior
 *       over symbols keyed by the empty list, and then one distribution per
 *       observed context, each using the distribution of the context minus
 *       its last element as its prior.</li>
 * </ol>
 *
 * <p>Words equal to {@code BOUNDARY} are skipped in both counting passes,
 * although their tags are still added to {@code tagIndex}.
 */
@Override
public void finishTraining() {
    Timing.tick("Counting characters...");
    ClassicCounter<Symbol> symbolCounts = new ClassicCounter<>();
    // Pass 1: raw symbol frequencies, used to identify rare characters.
    for (List<TaggedWord> sentence : trainingSentences) {
        for (TaggedWord token : sentence) {
            String text = token.word();
            if (!text.equals(BOUNDARY)) {
                for (char ch : text.toCharArray()) {
                    symbolCounts.incrementCount(Symbol.cannonicalSymbol(ch));
                }
                symbolCounts.incrementCount(Symbol.END_WORD);
            }
        }
    }
    Set<Symbol> rareSymbols = Counters.keysBelow(symbolCounts, 1.5);
    knownChars = Generics.newHashSet(symbolCounts.keySet());

    Timing.tick("Counting nGrams...");
    // Slot k holds counts keyed by (tag, k preceding symbols) -> current symbol.
    GeneralizedCounter[] ngramCounters = new GeneralizedCounter[CONTEXT_LENGTH + 1];
    for (int order = 0; order < ngramCounters.length; order++) {
        ngramCounters[order] = new GeneralizedCounter(order + 2);
    }
    ClassicCounter<String> tagCounts = new ClassicCounter<>();
    // Reused for every position: cleared, seeded with the tag, then grown backwards.
    List<Serializable> history = new ArrayList<>(CONTEXT_LENGTH + 1);

    // Pass 2: tag-conditioned character n-gram counts.
    for (List<TaggedWord> sentence : trainingSentences) {
        for (TaggedWord token : sentence) {
            String text = token.word();
            String tag = token.tag();
            tagIndex.add(tag);
            if (text.equals(BOUNDARY)) {
                continue;
            }
            tagCounts.incrementCount(tag);

            int len = text.length();
            // pos == len is the virtual end-of-word position.
            for (int pos = 0; pos <= len; pos++) {
                history.clear();
                history.add(tag);

                Symbol current;
                Symbol rareClass = null;
                if (pos == len) {
                    current = Symbol.END_WORD;
                } else {
                    current = Symbol.cannonicalSymbol(text.charAt(pos));
                    if (rareSymbols.contains(current)) {
                        rareClass = unknownCharClass(current);
                        symbolCounts.incrementCount(rareClass);
                    }
                }

                // Tag-specific unigram; rare symbols also feed the unknown-character model.
                ngramCounters[0].incrementCount(history, current);
                if (rareClass != null) {
                    ngramCounters[0].incrementCount(history, rareClass);
                }

                // Extend the history one symbol at a time, counting at each length.
                for (int back = 1; back <= CONTEXT_LENGTH; back++) {
                    int earlierPos = pos - back;
                    boolean beforeWordStart = earlierPos < 0;
                    if (beforeWordStart) {
                        history.add(Symbol.BEGIN_WORD);
                    } else {
                        Symbol earlier = Symbol.cannonicalSymbol(text.charAt(earlierPos));
                        if (rareSymbols.contains(earlier)) {
                            history.add(unknownCharClass(earlier));
                        } else {
                            history.add(earlier);
                        }
                    }

                    ngramCounters[back].incrementCount(history, current);
                    if (rareClass != null) {
                        ngramCounters[back].incrementCount(history, rareClass);
                    }

                    // Nothing precedes the word-start marker, so stop extending.
                    if (beforeWordStart) {
                        break;
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.getDistribution(tagCounts);

    Timing.tick("Creating character prior distribution...");
    charDistributions = Generics.newHashMap();
    int numberOfKeys = symbolCounts.size() + rareSymbols.size();
    Distribution<Symbol> basePrior = Distribution.goodTuringSmoothedCounter(symbolCounts, numberOfKeys);
    // The empty context is the prior used by every tag-only context.
    charDistributions.put(Collections.EMPTY_LIST, basePrior);

    // Ascending order guarantees each context's shorter prefix is already stored.
    for (int order = 0; order <= CONTEXT_LENGTH; order++) {
        Set<Map.Entry<List<Serializable>, ClassicCounter<Symbol>>> entries = ngramCounters[order].lowestLevelCounterEntrySet();
        Timing.tick("Creating " + entries.size() + " character " + (order + 1) + "-gram distributions...");
        for (Map.Entry<List<Serializable>, ClassicCounter<Symbol>> entry : entries) {
            List<Serializable> key = entry.getKey();
            ClassicCounter<Symbol> counts = entry.getValue();
            Distribution<Symbol> backoff = charDistributions.get(key.subList(0, key.size() - 1));
            double backoffWeight = backoff.getNumberOfKeys() / 200.0;
            Distribution<Symbol> smoothed = Distribution.dynamicCounterWithDirichletPrior(counts, backoff, backoffWeight);
            charDistributions.put(key, smoothed);
        }
    }
}
Also used : GeneralizedCounter(edu.stanford.nlp.stats.GeneralizedCounter) TaggedWord(edu.stanford.nlp.ling.TaggedWord) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) RadicalMap(edu.stanford.nlp.trees.international.pennchinese.RadicalMap)

Example 2 with GeneralizedCounter

Use of edu.stanford.nlp.stats.GeneralizedCounter in the project CoreNLP by stanfordnlp.

From the class ChineseMarkovWordSegmenter, the method initializeTraining:

/**
 * Prepares for a new round of training: delegates to {@code lex} and then
 * replaces {@code initial} and {@code ruleCounter} with fresh, empty counters.
 *
 * @param numTrees forwarded unchanged to {@code lex.initializeTraining}
 */
@Override
public void initializeTraining(double numTrees) {
    lex.initializeTraining(numTrees);
    // Depth-2 counter, matching the original construction.
    ruleCounter = new GeneralizedCounter(2);
    initial = new ClassicCounter<>();
}
Also used : GeneralizedCounter(edu.stanford.nlp.stats.GeneralizedCounter)

Aggregations

GeneralizedCounter (edu.stanford.nlp.stats.GeneralizedCounter): 2, TaggedWord (edu.stanford.nlp.ling.TaggedWord): 1, ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 1, RadicalMap (edu.stanford.nlp.trees.international.pennchinese.RadicalMap): 1