use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.
the class RandomForestRegressionFromSparkExample method main.
/**
 * Run example.
 */
public static void main(String[] args) throws FileNotFoundException {
    System.out.println();
    System.out.println(">>> Random Forest regression model loaded from Spark through serialization over partitioned dataset usage example started.");
    // Start Ignite grid.
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Ignite grid started.");
        IgniteCache<Integer, Vector> dataCache = null;
        try {
            // Read the Titanic dataset (rows with nulls dropped) into a cache.
            dataCache = TitanicUtils.readPassengersWithoutNulls(ignite);
            final Vectorizer<Integer, Vector, Integer, Double> vectorizer =
                new DummyVectorizer<Integer>(0, 1, 5, 6).labeled(4);
            // SPARK_MDL_PATH and env are fields of the enclosing example class (not shown in this excerpt).
            ModelsComposition mdl = (ModelsComposition)SparkModelParser.parse(
                SPARK_MDL_PATH, SupportedSparkModels.RANDOM_FOREST_REGRESSION, env);
            System.out.println(">>> Random Forest regression model: " + mdl);
            System.out.println(">>> ---------------------------------");
            System.out.println(">>> | Prediction\t| Ground Truth\t|");
            System.out.println(">>> ---------------------------------");
            try (QueryCursor<Cache.Entry<Integer, Vector>> observations = dataCache.query(new ScanQuery<>())) {
                for (Cache.Entry<Integer, Vector> observation : observations) {
                    LabeledVector<Double> lv = vectorizer.apply(observation.getKey(), observation.getValue());
                    Vector inputs = lv.features();
                    double groundTruth = lv.label();
                    double prediction = mdl.predict(inputs);
                    System.out.printf(">>> | %.4f\t\t| %.4f\t\t|\n", prediction, groundTruth);
                }
            }
            System.out.println(">>> ---------------------------------");
        }
        finally {
            dataCache.destroy();
        }
    }
}
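The snippet refers to SPARK_MDL_PATH and env, which are fields of the enclosing example class and are not shown above. A minimal sketch of how such fields might be declared, assuming a default learning environment and a purely illustrative model path (both hypothetical, not copied from the Ignite sources):

// Hypothetical field declarations for the enclosing example class.
// The path is a placeholder for a directory holding a Spark-serialized random forest model.
private static final String SPARK_MDL_PATH = "/path/to/spark_rf_regression_model";

// SparkModelParser.parse(...) expects a LearningEnvironment as its third argument;
// a default one can presumably be built like this.
private static final LearningEnvironment env = LearningEnvironmentBuilder.defaultBuilder().buildForTrainer();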
use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.
the class TargetEncoderExample method main.
/**
 * Run example.
 */
public static void main(String[] args) {
    System.out.println();
    System.out.println(">>> Train Gradient Boosting Decision Tree model on amazon-employee-access-challenge_train.csv dataset.");
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        try {
            IgniteCache<Integer, Object[]> dataCache = new SandboxMLCache(ignite)
                .fillObjectCacheWithCategoricalData(MLSandboxDatasets.AMAZON_EMPLOYEE_ACCESS);
            Set<Integer> featuresIndexies = new HashSet<>(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9));
            Set<Integer> targetEncodedfeaturesIndexies = new HashSet<>(Arrays.asList(1, 5, 6));
            Integer targetIndex = 0;
            final Vectorizer<Integer, Object[], Integer, Object> vectorizer =
                new ObjectArrayVectorizer<Integer>(featuresIndexies.toArray(new Integer[0])).labeled(targetIndex);
            // Encode categorical string features as numeric indices.
            Preprocessor<Integer, Object[]> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>()
                .withEncoderType(EncoderType.STRING_ENCODER)
                .withEncodedFeature(0)
                .withEncodedFeatures(featuresIndexies)
                .fit(ignite, dataCache, vectorizer);
            // Replace the selected categorical features with smoothed target statistics.
            Preprocessor<Integer, Object[]> targetEncoderProcessor = new EncoderTrainer<Integer, Object[]>()
                .withEncoderType(EncoderType.TARGET_ENCODER)
                .labeled(0)
                .withEncodedFeatures(targetEncodedfeaturesIndexies)
                .minSamplesLeaf(1)
                .minCategorySize(1L)
                .smoothing(1d)
                .fit(ignite, dataCache, strEncoderPreprocessor);
            Preprocessor<Integer, Object[]> lbEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>()
                .withEncoderType(EncoderType.LABEL_ENCODER)
                .fit(ignite, dataCache, targetEncoderProcessor);
            GDBTrainer trainer = new GDBBinaryClassifierOnTreesTrainer(0.5, 500, 4, 0.)
                .withCheckConvergenceStgyFactory(new MedianOfMedianConvergenceCheckerFactory(0.1));
            // Train model.
            ModelsComposition mdl = trainer.fit(ignite, dataCache, lbEncoderPreprocessor);
            System.out.println("\n>>> Trained model: " + mdl);
            double accuracy = Evaluator.evaluate(dataCache, mdl, lbEncoderPreprocessor, new Accuracy());
            System.out.println("\n>>> Accuracy " + accuracy);
            System.out.println("\n>>> Test Error " + (1 - accuracy));
            System.out.println(">>> Train Gradient Boosting Decision Tree model on amazon-employee-access-challenge_train.csv dataset.");
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
    finally {
        System.out.flush();
    }
}
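At prediction time the same encoder chain has to be applied to raw rows before they reach the model. A minimal sketch of scoring one row, assuming a hypothetical key/row pair in the same Object[] column layout as the cached dataset (the values below are illustrative only):

// Hypothetical raw row in the same Object[] layout as the cache entries.
Integer key = 42;
Object[] row = new Object[] {"1", "39353", "85475", "117961", "118300", "123472", "117905", "117906", "290919", "117908"};

// The chained preprocessors (string encoder -> target encoder -> label encoder)
// turn the raw row into a labeled vector; the model consumes its feature part.
Vector features = lbEncoderPreprocessor.apply(key, row).features();
double prediction = mdl.predict(features);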
use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.
the class GDBTrainerTest method testClassifier.
/**
 * Checks that a GDB binary classifier fits the step-wise sample exactly and converges early.
 */
private void testClassifier(BiFunction<GDBTrainer, Map<Integer, double[]>, IgniteModel<Vector, Double>> fitter) {
    int sampleSize = 100;
    double[] xs = new double[sampleSize];
    double[] ys = new double[sampleSize];
    // The label flips between -1 and 1 on every tenth step of x.
    for (int i = 0; i < sampleSize; i++) {
        xs[i] = i;
        ys[i] = ((int)(xs[i] / 10.0) % 2) == 0 ? -1.0 : 1.0;
    }
    Map<Integer, double[]> learningSample = new HashMap<>();
    for (int i = 0; i < sampleSize; i++)
        learningSample.put(i, new double[] {xs[i], ys[i]});
    GDBTrainer trainer = new GDBBinaryClassifierOnTreesTrainer(0.3, 500, 3, 0.0)
        .withUsingIdx(true)
        .withCheckConvergenceStgyFactory(new MeanAbsValueConvergenceCheckerFactory(0.3));
    IgniteModel<Vector, Double> mdl = fitter.apply(trainer, learningSample);
    int errorsCnt = 0;
    for (int j = 0; j < sampleSize; j++) {
        double x = xs[j];
        double y = ys[j];
        double p = mdl.predict(VectorUtils.of(x));
        if (p != y)
            errorsCnt++;
    }
    assertEquals(0, errorsCnt);
    assertTrue(mdl instanceof ModelsComposition);
    ModelsComposition composition = (ModelsComposition)mdl;
    composition.getModels().forEach(m -> assertTrue(m instanceof DecisionTreeModel));
    // The convergence checker should stop training well before the 500-iteration cap.
    assertTrue(composition.getModels().size() < 500);
    assertTrue(composition.getPredictionsAggregator() instanceof WeightedPredictionsAggregator);
    // With the stub checker (which never reports convergence) all 500 models are trained.
    trainer = trainer.withCheckConvergenceStgyFactory(new ConvergenceCheckerStubFactory());
    assertEquals(500, ((ModelsComposition)fitter.apply(trainer, learningSample)).getModels().size());
}
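The test is parameterized by a fitter so the same assertions can exercise different fit entry points. The harness wiring is not part of this excerpt; one plausible invocation, assuming a partition count and a DoubleArrayVectorizer that takes the label from the last array element, might look like this:

// Hypothetical harness call: fit from the local map split into 'parts' partitions,
// reading double[] rows with the label in the last position.
int parts = 4;
testClassifier((trainer, lrnSample) ->
    trainer.fit(lrnSample, parts, new DoubleArrayVectorizer<Integer>().labeled(Vectorizer.LabelCoordinate.LAST)));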
use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.
the class TrainerTransformers method runOnEnsemble.
/**
 * This method accepts a function which, for a given dataset builder and the index of a model in the ensemble,
 * generates the task of training that model.
 *
 * @param trainingTaskGenerator Training task generator.
 * @param datasetBuilder Dataset builder.
 * @param ensembleSize Size of ensemble.
 * @param subsampleRatio Ratio (subsample size) / (initial dataset size).
 * @param featuresVectorSize Dimensionality of feature vector.
 * @param featureSubspaceDim Dimensionality of feature subspace.
 * @param extractor Feature extractor.
 * @param aggregator Aggregator of models.
 * @param environment Environment.
 * @param <K> Type of keys in dataset builder.
 * @param <V> Type of values in dataset builder.
 * @param <M> Type of model.
 * @return Composition of models trained on bagged dataset.
 */
private static <K, V, M extends IgniteModel<Vector, Double>> ModelsComposition runOnEnsemble(
    IgniteTriFunction<DatasetBuilder<K, V>, Integer, IgniteBiFunction<K, V, Vector>, IgniteSupplier<M>> trainingTaskGenerator,
    DatasetBuilder<K, V> datasetBuilder,
    int ensembleSize,
    double subsampleRatio,
    int featuresVectorSize,
    int featureSubspaceDim,
    IgniteBiFunction<K, V, Vector> extractor,
    PredictionsAggregator aggregator,
    LearningEnvironment environment) {
    MLLogger log = environment.logger(datasetBuilder.getClass());
    log.log(MLLogger.VerboseLevel.LOW, "Start learning.");
    // Generate a random feature-subspace mapping per ensemble member if subspace sampling is requested.
    List<int[]> mappings = null;
    if (featuresVectorSize > 0 && featureSubspaceDim != featuresVectorSize) {
        mappings = IntStream.range(0, ensembleSize)
            .mapToObj(modelIdx -> getMapping(
                featuresVectorSize,
                featureSubspaceDim,
                environment.randomNumbersGenerator().nextLong() + modelIdx))
            .collect(Collectors.toList());
    }
    long startTs = System.currentTimeMillis();
    List<IgniteSupplier<M>> tasks = new ArrayList<>();
    List<IgniteBiFunction<K, V, Vector>> extractors = new ArrayList<>();
    if (mappings != null) {
        for (int[] mapping : mappings)
            extractors.add(wrapExtractor(extractor, mapping));
    }
    // Each ensemble member trains on its own bootstrapped (bagged) view of the upstream data.
    for (int i = 0; i < ensembleSize; i++) {
        DatasetBuilder<K, V> newBuilder = datasetBuilder.withUpstreamTransformer(BaggingUpstreamTransformer.builder(subsampleRatio, i));
        tasks.add(trainingTaskGenerator.apply(newBuilder, i, mappings != null ? extractors.get(i) : extractor));
    }
    List<ModelWithMapping<Vector, Double, M>> models = environment.parallelismStrategy()
        .submit(tasks)
        .stream()
        .map(Promise::unsafeGet)
        .map(ModelWithMapping<Vector, Double, M>::new)
        .collect(Collectors.toList());
    // If we need to do projection, do it.
    if (mappings != null) {
        for (int i = 0; i < models.size(); i++)
            models.get(i).setMapping(VectorUtils.getProjector(mappings.get(i)));
    }
    double learningTime = (double)(System.currentTimeMillis() - startTs) / 1000.0;
    log.log(MLLogger.VerboseLevel.LOW, "The training time was %.2fs.", learningTime);
    log.log(MLLogger.VerboseLevel.LOW, "Learning finished.");
    return new ModelsComposition(models, aggregator);
}
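getMapping and wrapExtractor are private helpers of TrainerTransformers that are not shown above. Conceptually, each mapping is a random draw of featureSubspaceDim coordinates out of featuresVectorSize, seeded per ensemble member. A self-contained sketch of that idea (a hypothetical helper, not the actual Ignite implementation):

// Hypothetical sketch of a feature-subspace mapping: pick 'subspaceDim' coordinate
// indices out of 'featureVectorSize', seeded for reproducibility per ensemble member.
static int[] sketchGetMapping(int featureVectorSize, int subspaceDim, long seed) {
    java.util.Random rnd = new java.util.Random(seed);
    // Sampling with replacement for simplicity; the real helper may sample differently.
    return java.util.stream.IntStream.generate(() -> rnd.nextInt(featureVectorSize))
        .limit(subspaceDim)
        .toArray();
}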
use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.
the class GDBLearningStrategy method update.
/**
 * Gets the state of the model passed in the arguments and compares it with the training parameters of the trainer.
 * If they match, the trainer updates the model according to the new data and returns the updated model; otherwise
 * it trains a new model.
 *
 * @param mdlToUpdate Learned model.
 * @param datasetBuilder Dataset builder.
 * @param preprocessor Upstream preprocessor.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @return Updated models list.
 */
public <K, V> List<IgniteModel<Vector, Double>> update(GDBModel mdlToUpdate,
    DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    if (trainerEnvironment == null)
        throw new IllegalStateException("Learning environment builder is not set.");
    List<IgniteModel<Vector, Double>> models = initLearningState(mdlToUpdate);
    ConvergenceChecker<K, V> convCheck = checkConvergenceStgyFactory
        .create(sampleSize, externalLbToInternalMapping, loss, datasetBuilder, preprocessor);
    DatasetTrainer<? extends IgniteModel<Vector, Double>, Double> trainer = baseMdlTrainerBuilder.get();
    for (int i = 0; i < cntOfIterations; i++) {
        double[] weights = Arrays.copyOf(compositionWeights, models.size());
        WeightedPredictionsAggregator aggregator = new WeightedPredictionsAggregator(weights, meanLbVal);
        ModelsComposition currComposition = new ModelsComposition(models, aggregator);
        if (convCheck.isConverged(envBuilder, datasetBuilder, currComposition))
            break;
        // Re-label each vector with the negative gradient of the loss at the current
        // composition's answer, so the next base model is fit to the pseudo-residuals.
        Vectorizer<K, V, Serializable, Double> extractor = new Vectorizer.VectorizerAdapter<K, V, Serializable, Double>() {
            /** {@inheritDoc} */
            @Override public LabeledVector<Double> extract(K k, V v) {
                LabeledVector<Double> labeledVector = preprocessor.apply(k, v);
                Vector features = labeledVector.features();
                Double realAnswer = externalLbToInternalMapping.apply(labeledVector.label());
                Double mdlAnswer = currComposition.predict(features);
                return new LabeledVector<>(features, -loss.gradient(sampleSize, realAnswer, mdlAnswer));
            }
        };
        long startTs = System.currentTimeMillis();
        models.add(trainer.fit(datasetBuilder, extractor));
        double learningTime = (double)(System.currentTimeMillis() - startTs) / 1000.0;
        trainerEnvironment.logger(getClass()).log(MLLogger.VerboseLevel.LOW, "One model training time was %.2fs", learningTime);
    }
    return models;
}
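The extractor above re-labels every sample with the negative loss gradient at the current composition's answer, which is the classic gradient-boosting pseudo-residual step. For illustration, a squared-error loss in the shape used by this code could be sketched as follows (the gradient signature matches the loss.gradient(sampleSize, lb, mdlAnswer) call above; the error method and the import path are assumptions):

// Sketch of a squared-error loss for GDB; the interface location and the error(...)
// method are assumptions based on how 'loss' is used in update(...) above.
import org.apache.ignite.ml.composition.boosting.loss.Loss;

public class SquaredErrorLossSketch implements Loss {
    /** L(y, f) = (y - f)^2 / N. */
    @Override public double error(long sampleSize, double lb, double mdlAnswer) {
        return Math.pow(mdlAnswer - lb, 2) / sampleSize;
    }

    /** dL/df = 2 * (f - y) / N; its negation is the pseudo-residual the next model is fit to. */
    @Override public double gradient(long sampleSize, double lb, double mdlAnswer) {
        return 2.0 * (mdlAnswer - lb) / sampleSize;
    }
}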