Use of edu.stanford.nlp.classify.WeightedDataset in project CoreNLP by stanfordnlp.
From the class ChineseMaxentLexicon, method finishTraining.
/**
 * Turns the tagged-word counts gathered in {@code datumCounter} into a weighted
 * dataset, derives the tag distribution {@code tagDist} from the retained
 * entries, and trains the classifier stored in {@code scorer}.
 *
 * <p>An entry of {@code datumCounter} is left out of training when either
 * <ul>
 *   <li>{@code trainOnLowCount} is set and its count is greater than
 *       {@code trainCountThreshold}, or</li>
 *   <li>its word is a key of {@code functionWordTags}.</li>
 * </ul>
 *
 * <p>Side effects: {@code datumCounter} is set to {@code null}, and
 * {@code tagDist} and {@code scorer} are overwritten.
 */
@Override
public void finishTraining() {
  IntCounter<String> tagCounts = new IntCounter<>();
  WeightedDataset trainingSet = new WeightedDataset(datumCounter.size());

  for (TaggedWord entry : datumCounter.keySet()) {
    int observed = datumCounter.getIntCount(entry);

    // Short-circuit order matters: the function-word lookup only runs when the
    // frequency filter did not already reject the entry.
    boolean aboveThreshold = trainOnLowCount && observed > trainCountThreshold;
    if (aboveThreshold || functionWordTags.containsKey(entry.word())) {
      continue;
    }

    // Each retained (word, tag) entry adds exactly one to its tag's tally,
    // independent of how often the entry was observed.
    tagCounts.incrementCount(entry.tag());

    // With trainByType every retained entry gets weight 1; otherwise its
    // observed count is used as the datum weight.
    int weight = trainByType ? 1 : observed;
    trainingSet.add(
        new BasicDatum(featExtractor.makeFeatures(entry.word()), entry.tag()),
        weight);
  }

  // The raw counts are not read again in this method; drop the reference.
  datumCounter = null;

  int distinctTags = tagCounts.size();
  tagDist = Distribution.laplaceSmoothedDistribution(tagCounts, distinctTags, 0.5);
  tagCounts = null;

  applyThresholds(trainingSet);

  verbose("Making classifier...");
  // Disabled alternatives carried over from the original code:
  //   new ResultStoringMonitor(5, "weights")
  //   optimizer.shutUp();
  QNMinimizer optimizer = new QNMinimizer();
  LinearClassifierFactory classifierFactory = new LinearClassifierFactory(optimizer);
  classifierFactory.setTol(tol);
  classifierFactory.setSigma(sigma);
  if (tuneSigma) {
    classifierFactory.setTuneSigmaHeldOut();
  }
  scorer = classifierFactory.trainClassifier(trainingSet);
  verbose("Done training.");
}
Aggregations