Use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.
In the class NBLinearClassifierFactory, the method tuneSigma:
private void tuneSigma(final int[][] data, final int[] labels) {
  // Cross-validated objective: summed negative log-likelihood of held-out gold labels,
  // as a function of the trial sigma for the prior.
  Function<Double, Double> CVSigmaToPerplexity = trialSigma -> {
    double score = 0.0;
    double sumScore = 0.0;
    int foldSize, nbCV;
    logger.info("Trying sigma = " + trialSigma);
    // Use the configured number of folds when there is enough data; otherwise fall back to leave-one-out.
    if (data.length >= folds) {
      foldSize = data.length / folds;
      nbCV = folds;
    } else {
      foldSize = 1;
      nbCV = data.length;
    }
    for (int j = 0; j < nbCV; j++) {
      int testMin = j * foldSize;
      int testMax = testMin + foldSize;
      // Train on everything outside [testMin, testMax) and score the held-out items.
      LinearClassifier<L, F> c = new LinearClassifier<>(weights(data, labels, testMin, testMax, trialSigma, foldSize), featureIndex, labelIndex);
      for (int i = testMin; i < testMax; i++) {
        score -= c.logProbabilityOf(new BasicDatum<>(featureIndex.objects(data[i]))).getCount(labelIndex.get(labels[i]));
      }
      sumScore += score;
    }
    System.err.printf(": %8g%n", sumScore);
    return sumScore;
  };
  GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true);
  sigma = gsls.minimize(CVSigmaToPerplexity, 0.01, 0.0001, 2.0);
  System.out.println("Sigma used: " + sigma);
}
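The sigma search above only needs a one-argument objective: GoldenSectionLineSearch evaluates a Function<Double, Double> over the bracketed interval. Below is a minimal, self-contained sketch of that call with a toy quadratic objective standing in for the cross-validation score; the class name and numeric values are made up, and it assumes the same Function overload used in the snippet above.

import java.util.function.Function;
import edu.stanford.nlp.optimization.GoldenSectionLineSearch;

public class SigmaSearchSketch {
  public static void main(String[] args) {
    // Toy objective with its minimum at x = 0.5; tuneSigma instead plugs in the
    // cross-validated negative log-likelihood as a function of sigma.
    Function<Double, Double> objective = x -> (x - 0.5) * (x - 0.5);
    // The boolean flag and the tolerance/bracket arguments mirror the call above.
    GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true);
    double best = gsls.minimize(objective, 0.01, 0.0001, 2.0);
    System.out.println("minimizer found x = " + best);
  }
}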
Use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.
In the class CMMClassifier, the method makeDatum:
/** Make an individual Datum out of the data list info, focused at position loc.
*
* @param info A List of IN objects
* @param loc The position in the info list to focus feature creation on
* @param featureFactories The factories that construct features out of the item
* @return A Datum (BasicDatum) representing this data instance
*/
public Datum<String, String> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
  // Pad the sequence so feature extraction near the boundaries can look outside it.
  PaddedList<IN> pInfo = new PaddedList<>(info, pad);
  Collection<String> features = new ArrayList<>();
  for (FeatureFactory<IN> featureFactory : featureFactories) {
    List<Clique> cliques = featureFactory.getCliques();
    for (Clique c : cliques) {
      Collection<String> feats = featureFactory.getCliqueFeatures(pInfo, loc, c);
      feats = addOtherClasses(feats, pInfo, loc, c);
      features.addAll(feats);
    }
  }
  printFeatures(pInfo.get(loc), features);
  CoreLabel c = info.get(loc);
  // The collected features plus the gold answer at this position form the labeled datum.
  return new BasicDatum<>(features, c.get(CoreAnnotations.AnswerAnnotation.class));
}
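The Datum returned here is simply the feature collection paired with the gold answer at position loc. A minimal sketch of constructing and inspecting such a labeled BasicDatum outside the classifier follows; the feature strings and the LOCATION label are hypothetical.

import java.util.Arrays;
import java.util.Collection;
import edu.stanford.nlp.ling.BasicDatum;
import edu.stanford.nlp.ling.Datum;

public class MakeDatumSketch {
  public static void main(String[] args) {
    // Hypothetical features such as a FeatureFactory might emit for one position.
    Collection<String> features = Arrays.asList("WORD-Paris", "PREV-WORD-in", "SHAPE-Xxxxx");
    // The value of AnswerAnnotation becomes the datum's label.
    Datum<String, String> d = new BasicDatum<>(features, "LOCATION");
    System.out.println(d.asFeatures()); // the feature collection
    System.out.println(d.label());      // LOCATION
  }
}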
Use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.
In the class ChineseMaxentLexicon, the method ensureProbs:
private void ensureProbs(int word, boolean subtractTagScore) {
  if (word == lastWord) {
    return;
  }
  lastWord = word;
  // Function words get a fixed tag: log probability 0 (probability 1) for the listed
  // basic category and negative infinity for every other tag.
  if (functionWordTags.containsKey(wordIndex.get(word))) {
    logProbs = new ClassicCounter<>();
    String trueTag = functionWordTags.get(wordIndex.get(word));
    for (String tag : tagIndex.objectsList()) {
      if (ctlp.basicCategory(tag).equals(trueTag)) {
        logProbs.setCount(tag, 0);
      } else {
        logProbs.setCount(tag, Double.NEGATIVE_INFINITY);
      }
    }
    return;
  }
  // Otherwise ask the trained classifier for log P(tag | word features).
  Datum datum = new BasicDatum(featExtractor.makeFeatures(wordIndex.get(word)));
  logProbs = scorer.logProbabilityOf(datum);
  if (subtractTagScore) {
    // Divide out the tag prior in log space.
    Set<String> tagSet = logProbs.keySet();
    for (String tag : tagSet) {
      logProbs.incrementCount(tag, -Math.log(tagDist.probabilityOf(tag)));
    }
  }
}
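When subtractTagScore is set, the loop converts log P(tag | word) into log( P(word | tag) / P(word) ) by subtracting the log tag prior. A self-contained sketch of that adjustment with made-up counts, reusing only Counter and Distribution calls that also appear in these snippets:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Distribution;

public class TagPriorSketch {
  public static void main(String[] args) {
    // Toy log P(tag | word) values, as a classifier's logProbabilityOf might return.
    Counter<String> logProbs = new ClassicCounter<>();
    logProbs.setCount("NN", Math.log(0.7));
    logProbs.setCount("VV", Math.log(0.3));

    // Toy tag prior P(tag), smoothed the same way as tagDist in finishTraining below.
    ClassicCounter<String> tagCounts = new ClassicCounter<>();
    tagCounts.setCount("NN", 60);
    tagCounts.setCount("VV", 40);
    Distribution<String> tagDist = Distribution.laplaceSmoothedDistribution(tagCounts, 2, 0.5);

    // log P(tag | word) - log P(tag) = log( P(word | tag) / P(word) ) by Bayes' rule.
    for (String tag : logProbs.keySet()) {
      logProbs.incrementCount(tag, -Math.log(tagDist.probabilityOf(tag)));
    }
    System.out.println(logProbs);
  }
}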
Use of edu.stanford.nlp.ling.BasicDatum in project CoreNLP by stanfordnlp.
In the class ChineseMaxentLexicon, the method finishTraining:
@Override
public void finishTraining() {
  IntCounter<String> tagCounter = new IntCounter<>();
  WeightedDataset data = new WeightedDataset(datumCounter.size());
  for (TaggedWord word : datumCounter.keySet()) {
    int count = datumCounter.getIntCount(word);
    if (trainOnLowCount && count > trainCountThreshold) {
      continue;
    }
    if (functionWordTags.containsKey(word.word())) {
      continue;
    }
    tagCounter.incrementCount(word.tag());
    if (trainByType) {
      count = 1;
    }
    // Each (word features, tag) pair becomes a labeled BasicDatum weighted by its count.
    data.add(new BasicDatum(featExtractor.makeFeatures(word.word()), word.tag()), count);
  }
  datumCounter = null;
  // Smoothed tag prior used later by ensureProbs when subtracting the tag score.
  tagDist = Distribution.laplaceSmoothedDistribution(tagCounter, tagCounter.size(), 0.5);
  tagCounter = null;
  applyThresholds(data);
  verbose("Making classifier...");
  //new ResultStoringMonitor(5, "weights"));
  QNMinimizer minim = new QNMinimizer();
  // minim.shutUp();
  LinearClassifierFactory factory = new LinearClassifierFactory(minim);
  factory.setTol(tol);
  factory.setSigma(sigma);
  if (tuneSigma) {
    factory.setTuneSigmaHeldOut();
  }
  scorer = factory.trainClassifier(data);
  verbose("Done training.");
}
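End to end, finishTraining collects labeled BasicDatum instances into a dataset, configures a LinearClassifierFactory, and trains the maxent scorer. Below is a minimal sketch of that flow on a plain Dataset with toy features and tags; the WeightedDataset, thresholding, and sigma tuning from the real method are omitted, the factory setters mirror the snippet above, and the tolerance and sigma values are made up.

import java.util.Arrays;
import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.classify.LinearClassifierFactory;
import edu.stanford.nlp.ling.BasicDatum;

public class LexiconTrainingSketch {
  public static void main(String[] args) {
    // Toy stand-in for the training data: each (word features, tag) pair is a labeled BasicDatum.
    Dataset<String, String> data = new Dataset<>();
    data.add(new BasicDatum<>(Arrays.asList("word=walked", "suffix=ed"), "VBD"));
    data.add(new BasicDatum<>(Arrays.asList("word=walk", "suffix=lk"), "NN"));

    // Configure the factory roughly as in finishTraining (hypothetical tolerance and sigma).
    LinearClassifierFactory<String, String> factory = new LinearClassifierFactory<>();
    factory.setTol(1e-4);
    factory.setSigma(1.0);
    LinearClassifier<String, String> scorer = factory.trainClassifier(data);

    // The trained scorer can then classify an unlabeled BasicDatum.
    System.out.println(scorer.classOf(new BasicDatum<>(Arrays.asList("suffix=ed"))));
  }
}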