
Example 1 with Label

Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.

From the class FactoredLexicon, method train.

/**
 * This method should populate wordIndex, tagIndex, and morphIndex.
 */
@Override
public void train(Collection<Tree> trees, Collection<Tree> rawTrees) {
    double weight = 1.0;
    // Train the unknown word (uw) model on the words
    uwModelTrainer.train(trees, weight);
    final double numTrees = trees.size();
    Iterator<Tree> rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
    Iterator<Tree> treeItr = trees.iterator();
    // Train factored lexicon on lemmas and morph tags
    int treeId = 0;
    while (treeItr.hasNext()) {
        Tree tree = treeItr.next();
        // CoreLabels, with morph analysis in the originalText annotation
        List<Label> yield = rawTrees == null ? tree.yield() : rawTreesItr.next().yield();
        // Annotated, binarized tree for the tags (labels are usually CategoryWordTag)
        List<Label> pretermYield = tree.preTerminalYield();
        int yieldLen = yield.size();
        for (int i = 0; i < yieldLen; ++i) {
            String word = yield.get(i).value();
            // Words are just added to the word index as-is; no morphological processing here
            int wordId = wordIndex.addToIndex(word);
            String tag = pretermYield.get(i).value();
            int tagId = tagIndex.addToIndex(tag);
            // Use the word as backup if there is no lemma
            String featureStr = ((CoreLabel) yield.get(i)).originalText();
            Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureStr);
            String lemma = lemmaMorph.first();
            int lemmaId = wordIndex.addToIndex(lemma);
            String richMorphTag = lemmaMorph.second();
            String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
            reducedMorphTag = reducedMorphTag.isEmpty() ? NO_MORPH_ANALYSIS : reducedMorphTag;
            int morphId = morphIndex.addToIndex(reducedMorphTag);
            // Seen event counts
            wordTag.incrementCount(wordId, tagId);
            lemmaTag.incrementCount(lemmaId, tagId);
            morphTag.incrementCount(morphId, tagId);
            tagCounter.incrementCount(tagId);
            // Unseen event counts
            if (treeId > op.trainOptions.fractionBeforeUnseenCounting * numTrees) {
                if (!wordTag.firstKeySet().contains(wordId) || wordTag.getCounter(wordId).totalCount() < 2) {
                    wordTagUnseen.incrementCount(tagId);
                }
                if (!lemmaTag.firstKeySet().contains(lemmaId) || lemmaTag.getCounter(lemmaId).totalCount() < 2) {
                    lemmaTagUnseen.incrementCount(tagId);
                }
                if (!morphTag.firstKeySet().contains(morphId) || morphTag.getCounter(morphId).totalCount() < 2) {
                    morphTagUnseen.incrementCount(tagId);
                }
            }
        }
        ++treeId;
        if (DEBUG && (treeId % 100) == 0) {
            System.err.printf("[%d]", treeId);
        }
        if (DEBUG && (treeId % 10000) == 0) {
            log.info();
        }
    }
}
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), Label (edu.stanford.nlp.ling.Label), Tree (edu.stanford.nlp.trees.Tree)
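For reference, a minimal stand-alone sketch (not taken from the CoreNLP sources; the bracketed sentence and class name are invented) of the yield()/preTerminalYield() pattern that train() walks over, pairing each terminal Label with its preterminal tag Label:

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.Tree;
import java.util.List;

public class YieldSketch {
    public static void main(String[] args) {
        // Hypothetical bracketed parse, turned into a Tree
        Tree tree = Tree.valueOf("(S (NP (DT the) (NN lexicon)) (VP (VBZ trains)))");
        List<Label> words = tree.yield();            // terminal labels: the, lexicon, trains
        List<Label> tags = tree.preTerminalYield();  // preterminal labels: DT, NN, VBZ
        for (int i = 0; i < words.size(); i++) {
            System.out.println(words.get(i).value() + "/" + tags.get(i).value());
        }
    }
}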

Example 2 with Label

Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.

From the class ExhaustiveDependencyParser, method extractBestParse.

/**
 * Find the best (partial) parse within the parameter constraints.
 *  @param start Sentence index of start of span (fenceposts, from 0 up)
 *  @param end   Sentence index of end of span (right side fencepost)
 *  @param hWord Sentence index of the head word of the span (a word position, with start <= hWord < end)
 *  @param hTag  Tag assigned to hWord
 *  @return The best parse tree within the parameter constraints
 */
private Tree extractBestParse(int start, int end, int hWord, int hTag) {
    if (DEBUG) {
        log.info("Span " + start + " to " + end + " word " + wordIndex.get(words[hWord]) + "/" + hWord + " tag " + tagIndex.get(hTag) + "/" + hTag + " score " + iScore(start, end, hWord, hTag));
    }
    String headWordStr = wordIndex.get(words[hWord]);
    String headTagStr = tagIndex.get(hTag);
    Label headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
    int numTags = tagIndex.size();
    // deal with span 1
    if (end - start == 1) {
        Tree leaf = tf.newLeaf(new Word(headWordStr));
        return tf.newTreeNode(headLabel, Collections.singletonList(leaf));
    }
    // find backtrace
    List<Tree> children = new ArrayList<>();
    double bestScore = iScore(start, end, hWord, hTag);
    for (int split = start + 1; split < end; split++) {
        int binD = binDistance[hWord][split];
        if (hWord < split) {
            for (int aWord = split; aWord < end; aWord++) {
                for (int aTag = 0; aTag < numTags; aTag++) {
                    if (matches(iScore(start, split, hWord, hTag) + iScore(split, end, aWord, aTag)
                                + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)]
                                + headStop[aWord][dg.tagBin(aTag)][split]
                                + headStop[aWord][dg.tagBin(aTag)][end], bestScore)) {
                        if (DEBUG) {
                            String argWordStr = wordIndex.get(words[aWord]);
                            String argTagStr = tagIndex.get(aTag);
                            log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
                        }
                        // build it
                        children.add(extractBestParse(start, split, hWord, hTag));
                        children.add(extractBestParse(split, end, aWord, aTag));
                        return tf.newTreeNode(headLabel, children);
                    }
                }
            }
        } else {
            for (int aWord = start; aWord < split; aWord++) {
                for (int aTag = 0; aTag < numTags; aTag++) {
                    if (matches(iScore(start, split, aWord, aTag) + iScore(split, end, hWord, hTag)
                                + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)]
                                + headStop[aWord][dg.tagBin(aTag)][start]
                                + headStop[aWord][dg.tagBin(aTag)][split], bestScore)) {
                        if (DEBUG) {
                            String argWordStr = wordIndex.get(words[aWord]);
                            String argTagStr = tagIndex.get(aTag);
                            log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
                        }
                        children.add(extractBestParse(start, split, aWord, aTag));
                        children.add(extractBestParse(split, end, hWord, hTag));
                        // build it
                        return tf.newTreeNode(headLabel, children);
                    }
                }
            }
        }
    }
    log.info("Problem in ExhaustiveDependencyParser::extractBestParse");
    return null;
}
Also used: HasWord (edu.stanford.nlp.ling.HasWord), Word (edu.stanford.nlp.ling.Word), Label (edu.stanford.nlp.ling.Label), Tree (edu.stanford.nlp.trees.Tree), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
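A minimal stand-alone sketch (illustrative only; the word/tag pair is invented) of the span-1 base case above, building a one-word subtree with a CategoryWordTag label and a LabeledScoredTreeFactory:

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import java.util.Collections;

public class HeadLabelSketch {
    public static void main(String[] args) {
        TreeFactory tf = new LabeledScoredTreeFactory();
        // Hypothetical head word and tag
        Label headLabel = new CategoryWordTag("dog", "dog", "NN");
        Tree leaf = tf.newLeaf(new Word("dog"));
        Tree node = tf.newTreeNode(headLabel, Collections.singletonList(leaf));
        System.out.println(node);  // prints the one-word subtree
    }
}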

Example 3 with Label

Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.

From the class BaseUnknownWordModel, method score.

// todo [cdm 2010]: Recheck that this method really does the right thing in making a P(W|T) estimate....
public float score(IntTaggedWord itw, String word) {
    float logProb;
    // Label tag = itw.tagLabel();
    String tagStr = itw.tagString(tagIndex);
    Label tag = new Tag(tagStr);
    if (useEnd || useFirst || useFirstCap) {
        // The getSignature here doesn't use sentence position
        String end = getSignature(word, -1);
        if (useGT && !seenEnd.contains(end)) {
            logProb = scoreGT(tagStr);
        } else {
            if (!seenEnd.contains(end)) {
                end = unknown;
            }
            // System.out.println("using end-character model for for unknown word "+  word + " for tag " + tag);
            /* get the Counter of terminal rewrites for the relevant tag */
            ClassicCounter<String> wordProbs = tagHash.get(tag);
            /* if the proposed tag has never been seen before, issue a
         * warning and return probability 0
         */
            if (wordProbs == null) {
                log.info("Warning: proposed tag is unseen in training data:\t" + tagStr);
                logProb = Float.NEGATIVE_INFINITY;
            } else if (wordProbs.keySet().contains(end)) {
                logProb = (float) wordProbs.getCount(end);
            } else {
                logProb = (float) wordProbs.getCount(unknown);
            }
        }
    } else if (useGT) {
        logProb = scoreGT(tagStr);
    } else {
        log.info("Warning: no unknown word model in place!\nGiving the combination " + word + ' ' + tagStr + " zero probability.");
        // should never get this!
        logProb = Float.NEGATIVE_INFINITY;
    }
    // EncodingPrintWriter.out.println("Unknown word estimate for " + word + " as " + tag + ": " + logProb,encoding); //debugging
    return logProb;
}
Also used: Label (edu.stanford.nlp.ling.Label), Tag (edu.stanford.nlp.ling.Tag)
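A minimal stand-alone sketch (illustrative only; the tag, signature, and probability are invented) of the tagHash lookup that score() performs. The lookup in score() relies on Tag comparing equal on its String value, so a freshly constructed Tag retrieves the counter stored at training time:

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.stats.ClassicCounter;
import java.util.HashMap;
import java.util.Map;

public class TagHashSketch {
    public static void main(String[] args) {
        Map<Label, ClassicCounter<String>> tagHash = new HashMap<>();
        ClassicCounter<String> endings = new ClassicCounter<>();
        endings.setCount("-ing", Math.log(0.4));  // hypothetical log P(signature | tag)
        tagHash.put(new Tag("VBG"), endings);

        // A new Tag with the same value finds the same counter
        ClassicCounter<String> wordProbs = tagHash.get(new Tag("VBG"));
        float logProb = (wordProbs == null) ? Float.NEGATIVE_INFINITY
                                            : (float) wordProbs.getCount("-ing");
        System.out.println(logProb);
    }
}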

Example 4 with Label

Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.

From the class BaseUnknownWordModelTrainer, method finishTraining.

@Override
public UnknownWordModel finishTraining() {
    if (useGT) {
        unknownGTTrainer.finishTraining();
    }
    for (Map.Entry<Label, ClassicCounter<String>> entry : c.entrySet()) {
        /* outer iteration is over tags */
        Label key = entry.getKey();
        // counts for words given a tag
        ClassicCounter<String> wc = entry.getValue();
        if (!tagHash.containsKey(key)) {
            tagHash.put(key, new ClassicCounter<>());
        }
        /* the UNKNOWN sequence is assumed to be seen once in each tag */
        // This is sort of broken, but you can regard it as a Dirichlet prior.
        tc.incrementCount(key);
        wc.setCount(unknown, 1.0);
        /* inner iteration is over words */
        for (Map.Entry<String, Double> wEntry : wc.entrySet()) {
            String end = wEntry.getKey();
            // log P(sig | tag)
            double prob = Math.log(wEntry.getValue() / tc.getCount(key));
            tagHash.get(key).setCount(end, prob);
        // if (Test.verbose)
        // EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding);
        }
    }
    return model;
}
Also used: Label (edu.stanford.nlp.ling.Label), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), Map (java.util.Map)
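A minimal stand-alone sketch (illustrative only; counts are toy numbers, and the per-tag total is simplified to the counter's own total rather than tc.getCount(key)) of the count-to-log-probability normalization that finishTraining() performs for one tag:

import edu.stanford.nlp.stats.ClassicCounter;

public class NormalizeSketch {
    public static void main(String[] args) {
        ClassicCounter<String> wc = new ClassicCounter<>();  // signature counts for one tag
        wc.incrementCount("-ed", 6.0);
        wc.incrementCount("-s", 3.0);
        wc.setCount("UNKNOWN", 1.0);  // the once-per-tag UNKNOWN pseudo-count

        double tagTotal = wc.totalCount();  // stand-in for tc.getCount(key)
        ClassicCounter<String> logProbs = new ClassicCounter<>();
        for (String sig : wc.keySet()) {
            logProbs.setCount(sig, Math.log(wc.getCount(sig) / tagTotal));  // log P(sig | tag)
        }
        System.out.println(logProbs);
    }
}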

Example 5 with Label

Use of edu.stanford.nlp.ling.Label in project CoreNLP by stanfordnlp.

From the class ChineseUnknownWordModelTrainer, method finishTraining.

@Override
public UnknownWordModel finishTraining() {
    // Map<String,Float> unknownGT = null;
    if (useGT) {
        unknownGTTrainer.finishTraining();
    // unknownGT = unknownGTTrainer.unknownGT;
    }
    for (Label tagLab : c.keySet()) {
        // outer iteration is over tags as Labels
        // counts for words given a tag
        ClassicCounter<String> wc = c.get(tagLab);
        if (!tagHash.containsKey(tagLab)) {
            tagHash.put(tagLab, new ClassicCounter<>());
        }
        // The UNKNOWN first character is assumed to be seen once in each tag.
        // As in the base trainer, this is sort of broken, but it can be regarded as a Dirichlet prior.
        tc.incrementCount(tagLab);
        wc.setCount(unknown, 1.0);
        // inner iteration is over words  as strings
        for (String first : wc.keySet()) {
            // log P(first char | tag)
            double prob = Math.log(wc.getCount(first) / tc.getCount(tagLab));
            tagHash.get(tagLab).setCount(first, prob);
        // if (Test.verbose)
        // EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding);
        }
    }
    return model;
}
Also used: Label (edu.stanford.nlp.ling.Label)
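A minimal stand-alone sketch (illustrative only; the word, tag, and probability are invented) of how the first-character counters produced above would be consulted for an unseen Chinese word:

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.stats.ClassicCounter;
import java.util.HashMap;
import java.util.Map;

public class FirstCharSketch {
    public static void main(String[] args) {
        Map<Label, ClassicCounter<String>> tagHash = new HashMap<>();
        ClassicCounter<String> firstChars = new ClassicCounter<>();
        firstChars.setCount("狗", Math.log(0.05));  // hypothetical log P(first char | tag)
        tagHash.put(new Tag("NN"), firstChars);

        String unseenWord = "狗屋";                  // hypothetical unseen word
        String first = unseenWord.substring(0, 1);   // first-character signature
        double logProb = tagHash.get(new Tag("NN")).getCount(first);
        System.out.println(logProb);
    }
}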

Aggregations

Label (edu.stanford.nlp.ling.Label): 83
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 43
Tree (edu.stanford.nlp.trees.Tree): 26
HasWord (edu.stanford.nlp.ling.HasWord): 13
HasTag (edu.stanford.nlp.ling.HasTag): 10
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 8
TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 8
PrintWriter (java.io.PrintWriter): 8
ArrayList (java.util.ArrayList): 8
Language (edu.stanford.nlp.international.Language): 7
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 6
Treebank (edu.stanford.nlp.trees.Treebank): 6
TaggedWord (edu.stanford.nlp.ling.TaggedWord): 5
EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams): 5
TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 5
Map (java.util.Map): 5
Set (java.util.Set): 5
Tag (edu.stanford.nlp.ling.Tag): 4
Word (edu.stanford.nlp.ling.Word): 4
ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 4