Search in sources :

Example 1 with BasicDatum

use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.

the class NBLinearClassifierFactory method tuneSigma.

/**
 * Picks {@code sigma} by cross-validation over the supplied training data.
 *
 * <p>For each trial sigma the objective builds one classifier per fold from
 * {@code weights(...)}, then sums the negative log-probability that the
 * classifier assigns to the gold label of every example in that fold's test
 * range {@code [testMin, testMax)}. The sum over all folds is what the line
 * search minimizes.
 *
 * <p>When {@code data.length} is not a multiple of {@code folds}, the trailing
 * {@code data.length % folds} examples never fall inside a test range.
 *
 * @param data   one int array of feature indices per training example
 *               (each is mapped back through {@code featureIndex.objects})
 * @param labels label index of each training example, parallel to {@code data}
 */
private void tuneSigma(final int[][] data, final int[] labels) {
    Function<Double, Double> CVSigmaToPerplexity = trialSigma -> {
        double sumScore = 0.0;
        int foldSize, nbCV;
        logger.info("Trying sigma = " + trialSigma);
        if (data.length >= folds) {
            foldSize = data.length / folds;
            nbCV = folds;
        } else {
            // Fewer examples than folds: use one example per fold.
            foldSize = 1;
            nbCV = data.length;
        }
        for (int j = 0; j < nbCV; j++) {
            int testMin = j * foldSize;
            int testMax = testMin + foldSize;
            // NOTE(review): weights(...) presumably trains on everything outside
            // [testMin, testMax) -- confirm against its definition.
            LinearClassifier<L, F> c = new LinearClassifier<>(weights(data, labels, testMin, testMax, trialSigma, foldSize), featureIndex, labelIndex);
            // The loss is scoped to this fold. Previously a single accumulator
            // was shared across folds and added to sumScore after each one, so
            // fold j contributed the cumulative loss of folds 0..j and earlier
            // folds were counted several times.
            double score = 0.0;
            for (int i = testMin; i < testMax; i++) {
                score -= c.logProbabilityOf(new BasicDatum<>(featureIndex.objects(data[i]))).getCount(labelIndex.get(labels[i]));
            }
            sumScore += score;
        }
        System.err.printf(": %8g%n", sumScore);
        return sumScore;
    };
    // NOTE(review): the numeric arguments are presumably the tolerance and the
    // lower/upper search bounds for sigma -- confirm against GoldenSectionLineSearch.
    GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true);
    sigma = gsls.minimize(CVSigmaToPerplexity, 0.01, 0.0001, 2.0);
    System.out.println("Sigma used: " + sigma);
}
Also used : Redwood(edu.stanford.nlp.util.logging.Redwood) BasicDatum(edu.stanford.nlp.ling.BasicDatum) GoldenSectionLineSearch(edu.stanford.nlp.optimization.GoldenSectionLineSearch) Function(java.util.function.Function)

Example 2 with BasicDatum

use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.

the class CMMClassifier method makeDatum.

/**
 * Builds a single Datum for the element at position {@code loc} of the data list.
 *
 * <p>The list is wrapped in a {@code PaddedList} (using the {@code pad} field),
 * and every clique of every feature factory contributes its features for that
 * position. Each clique's features are passed through {@code addOtherClasses}
 * before being collected. The collected features are handed to
 * {@code printFeatures}, and the datum is labelled with the
 * {@code AnswerAnnotation} of the element at {@code loc}.
 *
 *  @param info A List of IN objects
 *  @param loc  The position in the info list to focus feature creation on
 *  @param featureFactories The factories that construct features out of the item
 *  @return A Datum (BasicDatum) representing this data instance
 */
public Datum<String, String> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
    PaddedList<IN> paddedInfo = new PaddedList<>(info, pad);
    Collection<String> allFeatures = new ArrayList<>();

    for (FeatureFactory<IN> factory : featureFactories) {
        for (Clique clique : factory.getCliques()) {
            Collection<String> cliqueFeatures = factory.getCliqueFeatures(paddedInfo, loc, clique);
            allFeatures.addAll(addOtherClasses(cliqueFeatures, paddedInfo, loc, clique));
        }
    }

    printFeatures(paddedInfo.get(loc), allFeatures);

    // The label comes from the unpadded list, so loc must be a valid index of info.
    CoreLabel focus = info.get(loc);
    return new BasicDatum<>(allFeatures, focus.get(CoreAnnotations.AnswerAnnotation.class));
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) ArrayList(java.util.ArrayList) Clique(edu.stanford.nlp.sequences.Clique) BasicDatum(edu.stanford.nlp.ling.BasicDatum)

Example 3 with BasicDatum

use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.

the class ChineseMaxentLexicon method ensureProbs.

/**
 * Makes sure {@code logProbs} holds the per-tag scores for the given word index.
 *
 * <p>If {@code word} equals {@code lastWord} the existing {@code logProbs} is
 * kept as is. Note that this check looks only at the word index, not at
 * {@code subtractTagScore}.
 *
 * <p>For a word listed in {@code functionWordTags}, every tag whose basic
 * category equals the listed tag gets score 0 and every other tag gets
 * negative infinity. For any other word the scores come from
 * {@code scorer.logProbabilityOf} applied to the word's features.
 *
 * @param word             index of the word in {@code wordIndex}
 * @param subtractTagScore if true, {@code log(tagDist.probabilityOf(tag))} is
 *                         subtracted from each classifier score (not applied
 *                         to function words)
 */
private void ensureProbs(int word, boolean subtractTagScore) {
    if (lastWord == word) {
        // Scores for this word index are already in place.
        return;
    }
    lastWord = word;

    if (!functionWordTags.containsKey(wordIndex.get(word))) {
        // Ordinary word: score its features with the classifier.
        Datum wordDatum = new BasicDatum(featExtractor.makeFeatures(wordIndex.get(word)));
        logProbs = scorer.logProbabilityOf(wordDatum);
        if (subtractTagScore) {
            Set<String> scoredTags = logProbs.keySet();
            for (String scoredTag : scoredTags) {
                logProbs.incrementCount(scoredTag, -Math.log(tagDist.probabilityOf(scoredTag)));
            }
        }
        return;
    }

    // Function word: only tags matching its listed tag are allowed.
    logProbs = new ClassicCounter<>();
    String requiredTag = functionWordTags.get(wordIndex.get(word));
    for (String candidate : tagIndex.objectsList()) {
        boolean allowed = ctlp.basicCategory(candidate).equals(requiredTag);
        logProbs.setCount(candidate, allowed ? 0 : Double.NEGATIVE_INFINITY);
    }
}
Also used : BasicDatum(edu.stanford.nlp.ling.BasicDatum) Datum(edu.stanford.nlp.ling.Datum)

Example 4 with BasicDatum

use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.

the class ChineseMaxentLexicon method finishTraining.

/**
 * Turns the collected word/tag counts into a weighted dataset and trains
 * {@code scorer} on it.
 *
 * <p>A tagged word is skipped when {@code trainOnLowCount} is set and its count
 * exceeds {@code trainCountThreshold}, or when its word is listed in
 * {@code functionWordTags}. Each remaining entry adds one to its tag's count
 * and contributes one datum, weighted by its count, or by 1 when
 * {@code trainByType} is set. {@code tagDist} is the Laplace-smoothed
 * distribution over those tag counts. {@code datumCounter} is released once
 * the dataset has been built.
 */
@Override
public void finishTraining() {
    IntCounter<String> tagCounter = new IntCounter<>();
    WeightedDataset trainingSet = new WeightedDataset(datumCounter.size());

    for (TaggedWord taggedWord : datumCounter.keySet()) {
        int observed = datumCounter.getIntCount(taggedWord);
        boolean overThreshold = trainOnLowCount && observed > trainCountThreshold;
        if (overThreshold || functionWordTags.containsKey(taggedWord.word())) {
            continue;
        }
        tagCounter.incrementCount(taggedWord.tag());
        // When training by type every distinct tagged word counts once.
        int weight = trainByType ? 1 : observed;
        trainingSet.add(new BasicDatum(featExtractor.makeFeatures(taggedWord.word()), taggedWord.tag()), weight);
    }

    // The raw counts are no longer needed once the dataset exists.
    datumCounter = null;
    tagDist = Distribution.laplaceSmoothedDistribution(tagCounter, tagCounter.size(), 0.5);
    tagCounter = null;

    applyThresholds(trainingSet);
    verbose("Making classifier...");

    QNMinimizer minimizer = new QNMinimizer();
    LinearClassifierFactory classifierFactory = new LinearClassifierFactory(minimizer);
    classifierFactory.setTol(tol);
    classifierFactory.setSigma(sigma);
    if (tuneSigma) {
        classifierFactory.setTuneSigmaHeldOut();
    }
    scorer = classifierFactory.trainClassifier(trainingSet);
    verbose("Done training.");
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) LinearClassifierFactory(edu.stanford.nlp.classify.LinearClassifierFactory) WeightedDataset(edu.stanford.nlp.classify.WeightedDataset) QNMinimizer(edu.stanford.nlp.optimization.QNMinimizer) BasicDatum(edu.stanford.nlp.ling.BasicDatum)

Aggregations

BasicDatum (edu.stanford.nlp.ling.BasicDatum): 4, LinearClassifierFactory (edu.stanford.nlp.classify.LinearClassifierFactory): 1, WeightedDataset (edu.stanford.nlp.classify.WeightedDataset): 1, CoreLabel (edu.stanford.nlp.ling.CoreLabel): 1, Datum (edu.stanford.nlp.ling.Datum): 1, TaggedWord (edu.stanford.nlp.ling.TaggedWord): 1, GoldenSectionLineSearch (edu.stanford.nlp.optimization.GoldenSectionLineSearch): 1, QNMinimizer (edu.stanford.nlp.optimization.QNMinimizer): 1, Clique (edu.stanford.nlp.sequences.Clique): 1, Redwood (edu.stanford.nlp.util.logging.Redwood): 1, ArrayList (java.util.ArrayList): 1, Function (java.util.function.Function): 1