Example 21 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class TextTaggedFileReader, method primeNext:

void primeNext() {
    String line;
    try {
        line = reader.readLine();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (line == null) {
        next = null;
        return;
    }
    ++numSentences;
    next = new ArrayList<>();
    StringTokenizer st = new StringTokenizer(line);
    // loop over the word_tag tokens of the single sentence on this line
    while (st.hasMoreTokens()) {
        String token = st.nextToken();
        int indexUnd = token.lastIndexOf(tagSeparator);
        if (indexUnd < 0) {
            throw new IllegalArgumentException("Data format error: can't find delimiter \"" + tagSeparator + "\" in word \"" + token + "\" (line " + (numSentences + 1) + " of " + filename + ')');
        }
        String word = token.substring(0, indexUnd).intern();
        String tag = token.substring(indexUnd + 1).intern();
        next.add(new TaggedWord(word, tag));
    }
}
Also used: StringTokenizer (java.util.StringTokenizer), TaggedWord (edu.stanford.nlp.ling.TaggedWord), IOException (java.io.IOException)
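
The parsing convention above (tokens of the form word-separator-tag, split on the last occurrence of the separator) can be exercised in isolation. The following standalone sketch is not part of CoreNLP; the "_" separator and the sample sentence are assumptions for illustration, since the reader's actual tagSeparator is configurable.

import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class TaggedLineDemo {
    public static void main(String[] args) {
        // hypothetical input; "_" is an assumed separator for illustration
        String line = "The_DT dog_NN barks_VBZ";
        List<TaggedWord> sentence = new ArrayList<>();
        StringTokenizer st = new StringTokenizer(line);
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            // split on the LAST separator so a word containing "_" still parses
            int indexUnd = token.lastIndexOf('_');
            sentence.add(new TaggedWord(token.substring(0, indexUnd),
                                        token.substring(indexUnd + 1)));
        }
        System.out.println(sentence);  // prints [The/DT, dog/NN, barks/VBZ]
    }
}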

Example 22 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseCharacterBasedLexicon, method finishTraining:

@Override
public void finishTraining() {
    Timing.tick("Counting characters...");
    ClassicCounter<Symbol> charCounter = new ClassicCounter<>();
    // first find all chars that occur only once
    for (List<TaggedWord> labels : trainingSentences) {
        for (TaggedWord label : labels) {
            String word = label.word();
            if (word.equals(BOUNDARY)) {
                continue;
            }
            for (int j = 0, length = word.length(); j < length; j++) {
                Symbol sym = Symbol.cannonicalSymbol(word.charAt(j));
                charCounter.incrementCount(sym);
            }
            charCounter.incrementCount(Symbol.END_WORD);
        }
    }
    // with integer counts, "below 1.5" selects exactly the characters seen once
    Set<Symbol> singletons = Counters.keysBelow(charCounter, 1.5);
    knownChars = Generics.newHashSet(charCounter.keySet());
    Timing.tick("Counting nGrams...");
    GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[CONTEXT_LENGTH + 1];
    for (int i = 0; i <= CONTEXT_LENGTH; i++) {
        POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
    }
    ClassicCounter<String> POSCounter = new ClassicCounter<>();
    List<Serializable> context = new ArrayList<>(CONTEXT_LENGTH + 1);
    for (List<TaggedWord> words : trainingSentences) {
        for (TaggedWord taggedWord : words) {
            String word = taggedWord.word();
            String tag = taggedWord.tag();
            tagIndex.add(tag);
            if (word.equals(BOUNDARY)) {
                continue;
            }
            POSCounter.incrementCount(tag);
            // note i <= size: the final iteration emits Symbol.END_WORD
            for (int i = 0, size = word.length(); i <= size; i++) {
                Symbol sym;
                Symbol unknownCharClass = null;
                context.clear();
                context.add(tag);
                if (i < size) {
                    char thisCh = word.charAt(i);
                    sym = Symbol.cannonicalSymbol(thisCh);
                    if (singletons.contains(sym)) {
                        unknownCharClass = unknownCharClass(sym);
                        charCounter.incrementCount(unknownCharClass);
                    }
                } else {
                    sym = Symbol.END_WORD;
                }
                // POS-specific 1-gram
                POSspecificCharNGrams[0].incrementCount(context, sym);
                if (unknownCharClass != null) {
                    // for unknown ch model
                    POSspecificCharNGrams[0].incrementCount(context, unknownCharClass);
                }
                // this could be made faster using .sublist like in score
                for (int j = 1; j <= CONTEXT_LENGTH; j++) {
                    // poly grams
                    if (i - j < 0) {
                        context.add(Symbol.BEGIN_WORD);
                        POSspecificCharNGrams[j].incrementCount(context, sym);
                        if (unknownCharClass != null) {
                            // for unknown ch model
                            POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
                        }
                        break;
                    } else {
                        Symbol prev = Symbol.cannonicalSymbol(word.charAt(i - j));
                        if (singletons.contains(prev)) {
                            context.add(unknownCharClass(prev));
                        } else {
                            context.add(prev);
                        }
                        POSspecificCharNGrams[j].incrementCount(context, sym);
                        if (unknownCharClass != null) {
                            // for unknown ch model
                            POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
                        }
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.getDistribution(POSCounter);
    Timing.tick("Creating character prior distribution...");
    charDistributions = Generics.newHashMap();
    //    charDistributions = Generics.newHashMap();  // 1.5
    //    charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
    int numberOfKeys = charCounter.size() + singletons.size();
    Distribution<Symbol> prior = Distribution.goodTuringSmoothedCounter(charCounter, numberOfKeys);
    charDistributions.put(Collections.EMPTY_LIST, prior);
    for (int i = 0; i <= CONTEXT_LENGTH; i++) {
        Set<Map.Entry<List<Serializable>, ClassicCounter<Symbol>>> counterEntries = POSspecificCharNGrams[i].lowestLevelCounterEntrySet();
        Timing.tick("Creating " + counterEntries.size() + " character " + (i + 1) + "-gram distributions...");
        for (Map.Entry<List<Serializable>, ClassicCounter<Symbol>> entry : counterEntries) {
            context = entry.getKey();
            ClassicCounter<Symbol> c = entry.getValue();
            Distribution<Symbol> thisPrior = charDistributions.get(context.subList(0, context.size() - 1));
            double priorWeight = thisPrior.getNumberOfKeys() / 200.0;
            Distribution<Symbol> newDist = Distribution.dynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
            charDistributions.put(context, newDist);
        }
    }
}
Also used: GeneralizedCounter (edu.stanford.nlp.stats.GeneralizedCounter), TaggedWord (edu.stanford.nlp.ling.TaggedWord), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), RadicalMap (edu.stanford.nlp.trees.international.pennchinese.RadicalMap)
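
The singleton-detection step at the top of finishTraining is compact but easy to miss: with integer counts, Counters.keysBelow(charCounter, 1.5) returns exactly the characters seen once. Below is a minimal standalone sketch of just that step, using the real ClassicCounter and Counters APIs; the demo string is an assumption.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import java.util.Set;

public class SingletonDemo {
    public static void main(String[] args) {
        ClassicCounter<Character> charCounter = new ClassicCounter<>();
        for (char c : "abracadabra".toCharArray()) {
            charCounter.incrementCount(c);
        }
        // counts are integers here, so "at or below 1.5" means "seen exactly once"
        Set<Character> singletons = Counters.keysBelow(charCounter, 1.5);
        System.out.println(singletons);  // [c, d] (order unspecified)
    }
}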

Example 23 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseMaxentLexicon, method train:

/**
   * Add the given sentence to the statistics counted.  Can
   * be called multiple times with different sentences.
   */
@Override
public void train(List<TaggedWord> sentence, double weight) {
    featExtractor.train(sentence, weight);
    for (TaggedWord word : sentence) {
        datumCounter.incrementCount(word, weight);
        tagsForWord.add(word.word(), word.tag());
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
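
The bookkeeping in train() is simple to replicate standalone. The sketch below approximates the lexicon's datumCounter and tagsForWord fields with a ClassicCounter and a plain Map (their actual types in ChineseMaxentLexicon may differ); the demo sentence and weight are assumptions. As the Javadoc above notes, the method can be called repeatedly, with each call's counts scaled by its weight.

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TrainBookkeepingDemo {
    public static void main(String[] args) {
        // stand-ins for the lexicon's fields; not the real field types
        ClassicCounter<TaggedWord> datumCounter = new ClassicCounter<>();
        Map<String, Set<String>> tagsForWord = new HashMap<>();
        List<TaggedWord> sentence = Arrays.asList(
                new TaggedWord("dog", "NN"), new TaggedWord("barks", "VBZ"));
        double weight = 1.0;  // a later call could pass a fractional weight
        for (TaggedWord word : sentence) {
            datumCounter.incrementCount(word, weight);
            tagsForWord.computeIfAbsent(word.word(), k -> new HashSet<>())
                       .add(word.tag());
        }
        System.out.println(tagsForWord);  // e.g. {dog=[NN], barks=[VBZ]}
    }
}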

Example 24 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseWordFeatureExtractor, method train:

public void train(List<TaggedWord> sentence, double weight) {
    for (TaggedWord word : sentence) {
        String wordString = word.word();
        wordCounter.incrementCount(wordString, weight);
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
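
The excerpt only accumulates counts; how wordCounter is consumed is not shown here. One plausible downstream use of such a counter is frequency gating of word features, sketched below with the Counters.retainAbove API; the threshold value and the sample words are assumptions for illustration.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;

public class FrequencyGateDemo {
    public static void main(String[] args) {
        ClassicCounter<String> wordCounter = new ClassicCounter<>();
        wordCounter.incrementCount("的", 3.0);
        wordCounter.incrementCount("猫", 1.0);
        // drop words that were not seen often enough; 2.0 is an assumed threshold
        Counters.retainAbove(wordCounter, 2.0);
        System.out.println(wordCounter.keySet());  // [的]
    }
}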

Example 25 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class BaseLexicon, method trainUnannotated:

@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
    uwModelTrainer.incrementTreesRead(weight);
    int loc = 0;
    for (TaggedWord tw : sentence) {
        String baseTag = op.langpack().basicCategory(tw.tag());
        Counter<String> counts = baseTagCounts.get(baseTag);
        if (counts == null) {
            // this base tag was never seen in annotated data: nothing to redistribute
            ++loc;
            continue;
        }
        double totalCount = counts.totalCount();
        if (totalCount == 0) {
            ++loc;
            continue;
        }
        for (String tag : counts.keySet()) {
            TaggedWord newTW = new TaggedWord(tw.word(), tag);
            train(newTW, loc, weight * counts.getCount(tag) / totalCount);
        }
        ++loc;
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
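
The core of trainUnannotated is the proportional redistribution in the inner loop: an unannotated word contributes weight * count(tag) / totalCount to every tag its base tag has been seen with, so the shares sum to the original weight. A standalone sketch of just that arithmetic, with assumed counts:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class RedistributeDemo {
    public static void main(String[] args) {
        // assumed counts: tags previously observed with some base tag
        Counter<String> counts = new ClassicCounter<>();
        counts.incrementCount("NN", 3.0);
        counts.incrementCount("NNS", 1.0);
        double weight = 1.0;                       // the unannotated word's weight
        double totalCount = counts.totalCount();   // 4.0
        for (String tag : counts.keySet()) {
            double share = weight * counts.getCount(tag) / totalCount;
            System.out.println(tag + " receives " + share);  // NN 0.75, NNS 0.25
        }
    }
}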

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord): 43
HasWord (edu.stanford.nlp.ling.HasWord): 9
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 5
DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor): 5
MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger): 5
Tree (edu.stanford.nlp.trees.Tree): 5
ArrayList (java.util.ArrayList): 5
Label (edu.stanford.nlp.ling.Label): 4
WordTag (edu.stanford.nlp.ling.WordTag): 4
List (java.util.List): 4
HasTag (edu.stanford.nlp.ling.HasTag): 3
TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord): 3
File (java.io.File): 3
StringReader (java.io.StringReader): 3
Word (edu.stanford.nlp.ling.Word): 2
Morphology (edu.stanford.nlp.process.Morphology): 2
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2
GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure): 2
BufferedReader (java.io.BufferedReader): 2
IOException (java.io.IOException): 2