use of com.joliciel.talismane.machineLearning.ClassificationEventStream in project talismane by joliciel-informatique.
the class MaxentModelTrainer method trainModel.
/**
 * Trains a maxent model with a {@link GISTrainer} from the supplied corpus
 * events and wraps the result in a {@link MaximumEntropyModel}.
 * <p>
 * Smoothing takes precedence over the Gaussian prior: sigma is only applied
 * when smoothing is not positive and sigma is positive.
 *
 * @param corpusEventStream
 *            the events to train on; its attributes are copied onto the
 *            resulting model
 * @param descriptors
 *            descriptors handed to the resulting model as-is
 * @return the trained model, annotated with the training parameters
 * @throws IOException
 *             declared by the signature; NOTE(review): presumably raised
 *             while indexing the events or training - confirm
 */
@Override
public ClassificationModel trainModel(ClassificationEventStream corpusEventStream, Map<String, List<String>> descriptors) throws IOException {
    // Index the corpus events, wrapped in the OpenNLP event stream adapter.
    DataIndexer indexer = new TwoPassRealValueDataIndexer(new OpenNLPEventStream(corpusEventStream), cutoff);

    GISTrainer gisTrainer = new GISTrainer(true);
    if (this.getSmoothing() > 0) {
        gisTrainer.setSmoothing(true);
        gisTrainer.setSmoothingObservation(this.getSmoothing());
    } else if (this.getSigma() > 0) {
        gisTrainer.setGaussianSigma(this.getSigma());
    }

    MaxentModel trained = gisTrainer.trainModel(iterations, indexer, cutoff);
    MaximumEntropyModel result = new MaximumEntropyModel(trained, config, descriptors);

    // Record the training parameters on the model itself.
    result.addModelAttribute("cutoff", this.getCutoff());
    result.addModelAttribute("iterations", this.getIterations());
    result.addModelAttribute("sigma", this.getSigma());
    result.addModelAttribute("smoothing", this.getSmoothing());

    // Attributes coming from the event stream are added last.
    result.getModelAttributes().putAll(corpusEventStream.getAttributes());
    return result;
}
use of com.joliciel.talismane.machineLearning.ClassificationEventStream in project jochre by urieli.
the class Jochre method doCommandTrain.
/**
 * Train a letter guessing model and persist it to the letter model path
 * configured on the Jochre session.
 *
 * @param featureDescriptors
 *            the feature descriptors for training
 * @param criteria
 *            criteria for selecting images to include when training
 * @param reconstructLetters
 *            whether or not complete letters should be reconstructed for
 *            training, from merged/split letters
 * @throws RuntimeException
 *             if no letter model path is set on the session
 * @throws JochreException
 *             if {@code featureDescriptors} is null
 */
public void doCommandTrain(List<String> featureDescriptors, CorpusSelectionCriteria criteria, boolean reconstructLetters) {
    if (jochreSession.getLetterModelPath() == null)
        throw new RuntimeException("Missing argument: letterModel");
    if (featureDescriptors == null)
        throw new JochreException("features is required");

    LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
    Set<LetterFeature<?>> features = letterFeatureParser.getLetterFeatureSet(featureDescriptors);

    // When reconstructing letters, boundaries come from the training-corpus
    // splitter/merger pair; otherwise the original boundaries are used.
    BoundaryDetector boundaryDetector = null;
    if (reconstructLetters) {
        ShapeSplitter splitter = new TrainingCorpusShapeSplitter(jochreSession);
        ShapeMerger merger = new TrainingCorpusShapeMerger();
        boundaryDetector = new LetterByLetterBoundaryDetector(splitter, merger, jochreSession);
    } else {
        boundaryDetector = new OriginalBoundaryDetector();
    }

    LetterValidator letterValidator = new ComponentCharacterValidator(jochreSession);
    ClassificationEventStream corpusEventStream = new JochreLetterEventStream(features, boundaryDetector, letterValidator, criteria, jochreSession);

    File letterModelFile = new File(jochreSession.getLetterModelPath());
    // getParentFile() returns null for a bare file name such as "letters.zip";
    // in that case there is no directory to create.
    File letterModelDir = letterModelFile.getParentFile();
    if (letterModelDir != null) {
        letterModelDir.mkdirs();
    }

    ModelTrainerFactory modelTrainerFactory = new ModelTrainerFactory();
    ClassificationModelTrainer trainer = modelTrainerFactory.constructTrainer(jochreSession.getConfig());
    ClassificationModel letterModel = trainer.trainModel(corpusEventStream, featureDescriptors);
    letterModel.persist(letterModelFile);
}
use of com.joliciel.talismane.machineLearning.ClassificationEventStream in project jochre by urieli.
the class Jochre method doCommandTrainMerge.
/**
 * Train the letter merging model and persist it to the merge model path
 * configured on the Jochre session.
 *
 * @param featureDescriptors
 *            feature descriptors for training
 * @param multiplier
 *            if {@code > 0}, will be used to equalize the outcomes
 * @param criteria
 *            the criteria used to select the training corpus
 * @throws RuntimeException
 *             if no merge model path is set on the session
 * @throws JochreException
 *             if {@code featureDescriptors} is null
 */
public void doCommandTrainMerge(List<String> featureDescriptors, int multiplier, CorpusSelectionCriteria criteria) {
    if (jochreSession.getMergeModelPath() == null)
        throw new RuntimeException("Missing argument: mergeModel");
    if (featureDescriptors == null)
        throw new JochreException("features is required");

    File mergeModelFile = new File(jochreSession.getMergeModelPath());
    // getParentFile() returns null for a bare file name such as "merge.zip";
    // in that case there is no directory to create.
    File mergeModelDir = mergeModelFile.getParentFile();
    if (mergeModelDir != null) {
        mergeModelDir.mkdirs();
    }

    MergeFeatureParser mergeFeatureParser = new MergeFeatureParser();
    Set<MergeFeature<?>> mergeFeatures = mergeFeatureParser.getMergeFeatureSet(featureDescriptors);
    ClassificationEventStream corpusEventStream = new JochreMergeEventStream(criteria, mergeFeatures, jochreSession);
    if (multiplier > 0) {
        // Wrap the raw stream so outcomes are equalised using the multiplier.
        corpusEventStream = new OutcomeEqualiserEventStream(corpusEventStream, multiplier);
    }

    ModelTrainerFactory modelTrainerFactory = new ModelTrainerFactory();
    ClassificationModelTrainer trainer = modelTrainerFactory.constructTrainer(jochreSession.getConfig());
    ClassificationModel mergeModel = trainer.trainModel(corpusEventStream, featureDescriptors);
    mergeModel.persist(mergeModelFile);
}
use of com.joliciel.talismane.machineLearning.ClassificationEventStream in project jochre by urieli.
the class Jochre method doCommandTrainSplits.
/**
 * Train the letter splitting model and persist it to the split model path
 * configured on the Jochre session.
 *
 * @param featureDescriptors
 *            the feature descriptors for training this model
 * @param criteria
 *            the criteria used to select the training corpus
 * @throws RuntimeException
 *             if no split model path is set on the session
 * @throws JochreException
 *             if {@code featureDescriptors} is null
 */
public void doCommandTrainSplits(List<String> featureDescriptors, CorpusSelectionCriteria criteria) {
    if (jochreSession.getSplitModelPath() == null)
        throw new RuntimeException("Missing argument: splitModel");
    if (featureDescriptors == null)
        throw new JochreException("features is required");

    File splitModelFile = new File(jochreSession.getSplitModelPath());
    // getParentFile() returns null for a bare file name such as "split.zip";
    // in that case there is no directory to create.
    File splitModelDir = splitModelFile.getParentFile();
    if (splitModelDir != null) {
        splitModelDir.mkdirs();
    }

    SplitFeatureParser splitFeatureParser = new SplitFeatureParser();
    Set<SplitFeature<?>> splitFeatures = splitFeatureParser.getSplitFeatureSet(featureDescriptors);
    ClassificationEventStream corpusEventStream = new JochreSplitEventStream(criteria, splitFeatures, jochreSession);

    ModelTrainerFactory modelTrainerFactory = new ModelTrainerFactory();
    ClassificationModelTrainer trainer = modelTrainerFactory.constructTrainer(jochreSession.getConfig());
    ClassificationModel splitModel = trainer.trainModel(corpusEventStream, featureDescriptors);
    splitModel.persist(splitModelFile);
}
Aggregations