Search in sources :

Example 1 with ModelsComposition

use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.

Source: the main method of the RandomForestRegressionFromSparkExample class.

/**
 * Runs the example: loads a random forest regression model previously trained and
 * serialized by Spark, then evaluates it row-by-row against the Titanic passengers
 * dataset held in an Ignite cache, printing prediction vs. ground truth.
 *
 * @param args Command line arguments (unused).
 * @throws FileNotFoundException If the dataset file cannot be read.
 */
public static void main(String[] args) throws FileNotFoundException {
    System.out.println();
    System.out.println(">>> Random Forest regression model loaded from Spark through serialization over partitioned dataset usage example started.");
    // Start ignite grid.
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Ignite grid started.");
        IgniteCache<Integer, Vector> dataCache = null;
        try {
            dataCache = TitanicUtils.readPassengersWithoutNulls(ignite);

            // Use columns 0, 1, 5, 6 as features and column 4 as the label.
            final Vectorizer<Integer, Vector, Integer, Double> vectorizer =
                new DummyVectorizer<Integer>(0, 1, 5, 6).labeled(4);

            ModelsComposition mdl = (ModelsComposition)SparkModelParser.parse(
                SPARK_MDL_PATH, SupportedSparkModels.RANDOM_FOREST_REGRESSION, env);

            System.out.println(">>> Random Forest regression model: " + mdl);
            System.out.println(">>> ---------------------------------");
            System.out.println(">>> | Prediction\t| Ground Truth\t|");
            System.out.println(">>> ---------------------------------");

            // Scan the whole cache and score every observation with the loaded model.
            try (QueryCursor<Cache.Entry<Integer, Vector>> observations = dataCache.query(new ScanQuery<>())) {
                for (Cache.Entry<Integer, Vector> observation : observations) {
                    LabeledVector<Double> lv = vectorizer.apply(observation.getKey(), observation.getValue());
                    Vector inputs = lv.features();
                    double groundTruth = lv.label();

                    double prediction = mdl.predict(inputs);

                    System.out.printf(">>> | %.4f\t\t| %.4f\t\t|\n", prediction, groundTruth);
                }
            }

            System.out.println(">>> ---------------------------------");
        } finally {
            // Null-check: readPassengersWithoutNulls may throw before dataCache is
            // assigned; calling destroy() on null would mask the original exception
            // with an NPE.
            if (dataCache != null)
                dataCache.destroy();
        }
    }
}
Also used : ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) Ignite(org.apache.ignite.Ignite) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) IgniteCache(org.apache.ignite.IgniteCache) Cache(javax.cache.Cache)

Example 2 with ModelsComposition

use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.

Source: the main method of the TargetEncoderExample class.

/**
 * Runs the example: encodes categorical features of the Amazon employee access
 * dataset (string, target and label encoding), trains a gradient boosted decision
 * tree binary classifier on the encoded data and reports its accuracy.
 *
 * @param args Command line arguments (unused).
 */
public static void main(String[] args) {
    System.out.println();
    System.out.println(">>> Train Gradient Boosting Decision Tree model on amazon-employee-access-challenge_train.csv dataset.");
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        IgniteCache<Integer, Object[]> dataCache = null;
        try {
            dataCache = new SandboxMLCache(ignite).fillObjectCacheWithCategoricalData(MLSandboxDatasets.AMAZON_EMPLOYEE_ACCESS);

            // Columns 1..9 are features; a subset of them gets target encoding.
            Set<Integer> featuresIndexies = new HashSet<>(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9));
            Set<Integer> targetEncodedfeaturesIndexies = new HashSet<>(Arrays.asList(1, 5, 6));
            Integer targetIndex = 0;

            final Vectorizer<Integer, Object[], Integer, Object> vectorizer =
                new ObjectArrayVectorizer<Integer>(featuresIndexies.toArray(new Integer[0])).labeled(targetIndex);

            // Preprocessing chain: string encoding -> target encoding -> label encoding.
            Preprocessor<Integer, Object[]> strEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>().withEncoderType(EncoderType.STRING_ENCODER).withEncodedFeature(0).withEncodedFeatures(featuresIndexies).fit(ignite, dataCache, vectorizer);

            Preprocessor<Integer, Object[]> targetEncoderProcessor = new EncoderTrainer<Integer, Object[]>().withEncoderType(EncoderType.TARGET_ENCODER).labeled(0).withEncodedFeatures(targetEncodedfeaturesIndexies).minSamplesLeaf(1).minCategorySize(1L).smoothing(1d).fit(ignite, dataCache, strEncoderPreprocessor);

            Preprocessor<Integer, Object[]> lbEncoderPreprocessor = new EncoderTrainer<Integer, Object[]>().withEncoderType(EncoderType.LABEL_ENCODER).fit(ignite, dataCache, targetEncoderProcessor);

            GDBTrainer trainer = new GDBBinaryClassifierOnTreesTrainer(0.5, 500, 4, 0.).withCheckConvergenceStgyFactory(new MedianOfMedianConvergenceCheckerFactory(0.1));

            // Train model.
            ModelsComposition mdl = trainer.fit(ignite, dataCache, lbEncoderPreprocessor);
            System.out.println("\n>>> Trained model: " + mdl);

            double accuracy = Evaluator.evaluate(dataCache, mdl, lbEncoderPreprocessor, new Accuracy());
            System.out.println("\n>>> Accuracy " + accuracy);
            System.out.println("\n>>> Test Error " + (1 - accuracy));
            System.out.println(">>> Train Gradient Boosting Decision Tree model on amazon-employee-access-challenge_train.csv dataset.");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Release the example's cache (was leaked in the original: the cache
            // was never destroyed, unlike the sibling Spark-import example).
            if (dataCache != null)
                dataCache.destroy();
        }
    } finally {
        System.out.flush();
    }
}
Also used : SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) GDBTrainer(org.apache.ignite.ml.composition.boosting.GDBTrainer) FileNotFoundException(java.io.FileNotFoundException) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) GDBBinaryClassifierOnTreesTrainer(org.apache.ignite.ml.tree.boosting.GDBBinaryClassifierOnTreesTrainer) Accuracy(org.apache.ignite.ml.selection.scoring.metric.classification.Accuracy) MedianOfMedianConvergenceCheckerFactory(org.apache.ignite.ml.composition.boosting.convergence.median.MedianOfMedianConvergenceCheckerFactory) Ignite(org.apache.ignite.Ignite) EncoderTrainer(org.apache.ignite.ml.preprocessing.encoding.EncoderTrainer) HashSet(java.util.HashSet)

Example 3 with ModelsComposition

use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.

Source: the testClassifier method of the GDBTrainerTest class.

/**
 * Checks that a GDB binary classifier produced by the given fitter perfectly
 * separates a step-wise labeled sample, yields a well-formed tree composition,
 * and runs all iterations when convergence checking is stubbed out.
 */
private void testClassifier(BiFunction<GDBTrainer, Map<Integer, double[]>, IgniteModel<Vector, Double>> fitter) {
    int sampleSize = 100;

    // Sample of 100 points where the label flips every 10 consecutive x values
    // between -1.0 and 1.0. Each entry stores {feature, label}.
    Map<Integer, double[]> learningSample = new HashMap<>();
    for (int idx = 0; idx < sampleSize; idx++) {
        double feature = idx;
        double lb = ((int)(feature / 10.0) % 2) == 0 ? -1.0 : 1.0;
        learningSample.put(idx, new double[] { feature, lb });
    }

    GDBTrainer trainer = new GDBBinaryClassifierOnTreesTrainer(0.3, 500, 3, 0.0)
        .withUsingIdx(true)
        .withCheckConvergenceStgyFactory(new MeanAbsValueConvergenceCheckerFactory(0.3));

    IgniteModel<Vector, Double> mdl = fitter.apply(trainer, learningSample);

    // Every training point must be classified correctly.
    int errorsCnt = 0;
    for (double[] row : learningSample.values()) {
        if (mdl.predict(VectorUtils.of(row[0])) != row[1])
            errorsCnt++;
    }
    assertEquals(0, errorsCnt);

    // The model must be a weighted composition of decision trees that converged
    // before exhausting the iteration budget.
    assertTrue(mdl instanceof ModelsComposition);
    ModelsComposition composition = (ModelsComposition)mdl;
    composition.getModels().forEach(m -> assertTrue(m instanceof DecisionTreeModel));
    assertTrue(composition.getModels().size() < 500);
    assertTrue(composition.getPredictionsAggregator() instanceof WeightedPredictionsAggregator);

    // With a stub convergence checker the trainer must run all 500 iterations.
    trainer = trainer.withCheckConvergenceStgyFactory(new ConvergenceCheckerStubFactory());
    assertEquals(500, ((ModelsComposition)fitter.apply(trainer, learningSample)).getModels().size());
}
Also used : HashMap(java.util.HashMap) DecisionTreeModel(org.apache.ignite.ml.tree.DecisionTreeModel) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) GDBBinaryClassifierOnTreesTrainer(org.apache.ignite.ml.tree.boosting.GDBBinaryClassifierOnTreesTrainer) MeanAbsValueConvergenceCheckerFactory(org.apache.ignite.ml.composition.boosting.convergence.mean.MeanAbsValueConvergenceCheckerFactory) ConvergenceCheckerStubFactory(org.apache.ignite.ml.composition.boosting.convergence.simple.ConvergenceCheckerStubFactory) Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Example 4 with ModelsComposition

use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.

Source: the runOnEnsemble method of the TrainerTransformers class.

/**
 * This method accepts a function which, for a given dataset builder and index of a model
 * in the ensemble, generates the task of training this model, then trains all ensemble
 * members (optionally on random feature subspaces) and aggregates them.
 *
 * @param trainingTaskGenerator Training task generator.
 * @param datasetBuilder Dataset builder.
 * @param ensembleSize Size of ensemble.
 * @param subsampleRatio Ratio (subsample size) / (initial dataset size).
 * @param featuresVectorSize Dimensionality of feature vector.
 * @param featureSubspaceDim Dimensionality of feature subspace.
 * @param extractor Feature extractor.
 * @param aggregator Aggregator of models.
 * @param environment Environment.
 * @param <K> Type of keys in dataset builder.
 * @param <V> Type of values in dataset builder.
 * @param <M> Type of model.
 * @return Composition of models trained on bagged dataset.
 */
private static <K, V, M extends IgniteModel<Vector, Double>> ModelsComposition runOnEnsemble(IgniteTriFunction<DatasetBuilder<K, V>, Integer, IgniteBiFunction<K, V, Vector>, IgniteSupplier<M>> trainingTaskGenerator, DatasetBuilder<K, V> datasetBuilder, int ensembleSize, double subsampleRatio, int featuresVectorSize, int featureSubspaceDim, IgniteBiFunction<K, V, Vector> extractor, PredictionsAggregator aggregator, LearningEnvironment environment) {
    MLLogger log = environment.logger(datasetBuilder.getClass());
    log.log(MLLogger.VerboseLevel.LOW, "Start learning.");

    // Feature subspace mappings are only built when an actual projection is requested
    // (known feature count and a strictly different subspace dimensionality).
    List<int[]> mappings = null;
    if (featuresVectorSize > 0 && featureSubspaceDim != featuresVectorSize) {
        mappings = IntStream.range(0, ensembleSize).mapToObj(modelIdx -> getMapping(featuresVectorSize, featureSubspaceDim, environment.randomNumbersGenerator().nextLong() + modelIdx)).collect(Collectors.toList());
    }

    // Primitive long instead of boxed Long: avoids pointless autoboxing.
    long startTs = System.currentTimeMillis();

    List<IgniteSupplier<M>> tasks = new ArrayList<>();
    List<IgniteBiFunction<K, V, Vector>> extractors = new ArrayList<>();
    if (mappings != null) {
        for (int[] mapping : mappings) extractors.add(wrapExtractor(extractor, mapping));
    }

    // One training task per ensemble member, each on its own bagged (subsampled) upstream.
    for (int i = 0; i < ensembleSize; i++) {
        DatasetBuilder<K, V> newBuilder = datasetBuilder.withUpstreamTransformer(BaggingUpstreamTransformer.builder(subsampleRatio, i));
        tasks.add(trainingTaskGenerator.apply(newBuilder, i, mappings != null ? extractors.get(i) : extractor));
    }

    // Train members in parallel according to the environment's parallelism strategy.
    List<ModelWithMapping<Vector, Double, M>> models = environment.parallelismStrategy().submit(tasks).stream().map(Promise::unsafeGet).map(ModelWithMapping<Vector, Double, M>::new).collect(Collectors.toList());

    // If we need to do projection, do it.
    if (mappings != null) {
        for (int i = 0; i < models.size(); i++) models.get(i).setMapping(VectorUtils.getProjector(mappings.get(i)));
    }

    double learningTime = (double)(System.currentTimeMillis() - startTs) / 1000.0;
    log.log(MLLogger.VerboseLevel.LOW, "The training time was %.2fs.", learningTime);
    log.log(MLLogger.VerboseLevel.LOW, "Learning finished.");
    return new ModelsComposition(models, aggregator);
}
Also used : IgniteSupplier(org.apache.ignite.ml.math.functions.IgniteSupplier) IgniteBiFunction(org.apache.ignite.ml.math.functions.IgniteBiFunction) ArrayList(java.util.ArrayList) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) Promise(org.apache.ignite.ml.environment.parallelism.Promise) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) MLLogger(org.apache.ignite.ml.environment.logging.MLLogger)

Example 5 with ModelsComposition

use of org.apache.ignite.ml.composition.ModelsComposition in project ignite by apache.

Source: the update method of the GDBLearningStrategy class.

/**
 * Gets the state of the model given in the arguments, compares it with the training parameters
 * of the trainer and, if they fit, updates the model according to the new data and returns the
 * new model; otherwise a new model is trained from scratch.
 *
 * @param mdlToUpdate Learned model.
 * @param datasetBuilder Dataset builder.
 * @param preprocessor Upstream preprocessor.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @return Updated models list.
 */
public <K, V> List<IgniteModel<Vector, Double>> update(GDBModel mdlToUpdate, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    if (trainerEnvironment == null)
        throw new IllegalStateException("Learning environment builder is not set.");
    // Seed the ensemble with the state extracted from the model being updated.
    List<IgniteModel<Vector, Double>> models = initLearningState(mdlToUpdate);
    ConvergenceChecker<K, V> convCheck = checkConvergenceStgyFactory.create(sampleSize, externalLbToInternalMapping, loss, datasetBuilder, preprocessor);
    DatasetTrainer<? extends IgniteModel<Vector, Double>, Double> trainer = baseMdlTrainerBuilder.get();
    for (int i = 0; i < cntOfIterations; i++) {
        // Aggregate all models trained so far into the current boosting composition.
        double[] weights = Arrays.copyOf(compositionWeights, models.size());
        WeightedPredictionsAggregator aggregator = new WeightedPredictionsAggregator(weights, meanLbVal);
        ModelsComposition currComposition = new ModelsComposition(models, aggregator);
        // Stop early once the convergence strategy reports the ensemble has converged.
        if (convCheck.isConverged(envBuilder, datasetBuilder, currComposition))
            break;
        // The next base model is trained against the anti-gradient of the loss taken at the
        // current composition's predictions (captured via effectively-final currComposition).
        Vectorizer<K, V, Serializable, Double> extractor = new Vectorizer.VectorizerAdapter<K, V, Serializable, Double>() {

            /**
             * {@inheritDoc}
             */
            @Override
            public LabeledVector<Double> extract(K k, V v) {
                LabeledVector<Double> labeledVector = preprocessor.apply(k, v);
                Vector features = labeledVector.features();
                Double realAnswer = externalLbToInternalMapping.apply(labeledVector.label());
                Double mdlAnswer = currComposition.predict(features);
                // Pseudo-label = negative gradient of the loss at the current prediction.
                return new LabeledVector<>(features, -loss.gradient(sampleSize, realAnswer, mdlAnswer));
            }
        };
        long startTs = System.currentTimeMillis();
        models.add(trainer.fit(datasetBuilder, extractor));
        double learningTime = (double) (System.currentTimeMillis() - startTs) / 1000.0;
        trainerEnvironment.logger(getClass()).log(MLLogger.VerboseLevel.LOW, "One model training time was %.2fs", learningTime);
    }
    return models;
}
Also used : Serializable(java.io.Serializable) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) IgniteModel(org.apache.ignite.ml.IgniteModel) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector)

Aggregations

ModelsComposition (org.apache.ignite.ml.composition.ModelsComposition)12 Vector (org.apache.ignite.ml.math.primitives.vector.Vector)11 Ignite (org.apache.ignite.Ignite)7 Cache (javax.cache.Cache)4 IgniteCache (org.apache.ignite.IgniteCache)4 WeightedPredictionsAggregator (org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator)4 SandboxMLCache (org.apache.ignite.examples.ml.util.SandboxMLCache)3 LabeledVector (org.apache.ignite.ml.structures.LabeledVector)3 HashMap (java.util.HashMap)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 IgniteModel (org.apache.ignite.ml.IgniteModel)2 MeanAbsValueConvergenceCheckerFactory (org.apache.ignite.ml.composition.boosting.convergence.mean.MeanAbsValueConvergenceCheckerFactory)2 FeatureMeta (org.apache.ignite.ml.dataset.feature.FeatureMeta)2 DecisionTreeModel (org.apache.ignite.ml.tree.DecisionTreeModel)2 GDBBinaryClassifierOnTreesTrainer (org.apache.ignite.ml.tree.boosting.GDBBinaryClassifierOnTreesTrainer)2 FileNotFoundException (java.io.FileNotFoundException)1 Serializable (java.io.Serializable)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 TrainerTest (org.apache.ignite.ml.common.TrainerTest)1