Search in sources :

Example 76 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class LabeledDatasetPartitionDataBuilderOnHeap method build.

/**
 * {@inheritDoc}
 * <p>
 * Drains the upstream iterator, runs every entry through the preprocessor and copies the resulting features and
 * labels into on-heap arrays sized by {@code upstreamDataSize}.
 */
@Override
public LabeledVectorSet<LabeledVector> build(LearningEnvironment env, Iterator<UpstreamEntry<K, V>> upstreamData, long upstreamDataSize, C ctx) {
    // Math.toIntExact throws ArithmeticException when the partition size does not fit into an int.
    final int rowsCnt = Math.toIntExact(upstreamDataSize);
    double[] labels = new double[rowsCnt];
    double[][] features = null;
    // Number of feature columns; stays negative until the first row has been seen.
    int featuresCnt = -1;

    for (int rowIdx = 0; upstreamData.hasNext(); rowIdx++) {
        UpstreamEntry<K, V> upstreamEntry = upstreamData.next();
        LabeledVector<Double> labeled = preprocessor.apply(upstreamEntry.getKey(), upstreamEntry.getValue());
        Vector featureVec = labeled.features();

        if (featuresCnt >= 0) {
            assert featureVec.size() == featuresCnt : "X extractor must return exactly " + featuresCnt + " columns";
        } else {
            // The first row defines the width of the feature matrix.
            featuresCnt = featureVec.size();
            features = new double[rowsCnt][featuresCnt];
        }

        features[rowIdx] = featureVec.asArray();
        labels[rowIdx] = labeled.label();
    }

    return new LabeledVectorSet<>(features, labels);
}
Also used : LabeledVectorSet(org.apache.ignite.ml.structures.LabeledVectorSet) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Example 77 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class StandardScalerTrainer method computeSum.

/**
 * Computes sum, squared sum and row count.
 * <p>
 * Each partition accumulates per-feature sums and squared sums over its upstream entries; the partial results are
 * then combined with {@code StandardScalerData.merge}, skipping {@code null} partials.
 *
 * @param envBuilder Learning environment builder passed to the dataset builder.
 * @param datasetBuilder Dataset builder used to build the intermediate dataset.
 * @param basePreprocessor Preprocessor whose output features are accumulated.
 * @return Merged statistics over all partitions.
 * @throws RuntimeException Wrapping any exception raised while building, computing or closing the dataset.
 */
private StandardScalerData computeSum(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    try (Dataset<EmptyContext, StandardScalerData> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        (env, upstream, upstreamSize, ctx) -> {
            // Accumulators are created lazily because the feature count is only known after the first row.
            double[] totals = null;
            double[] squaredTotals = null;
            long rowsCnt = 0;

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entry = upstream.next();
                Vector features = basePreprocessor.apply(entry.getKey(), entry.getValue()).features();
                int featuresCnt = features.size();

                if (totals != null) {
                    assert totals.length == featuresCnt : "Base preprocessor must return exactly " + totals.length + " features";
                } else {
                    totals = new double[featuresCnt];
                    squaredTotals = new double[featuresCnt];
                }

                rowsCnt++;

                for (int idx = 0; idx < featuresCnt; idx++) {
                    double val = features.get(idx);
                    totals[idx] += val;
                    squaredTotals[idx] += val * val;
                }
            }

            return new StandardScalerData(totals, squaredTotals, rowsCnt);
        },
        learningEnvironment(basePreprocessor))) {
        // Identity map step; the reducer tolerates a missing partial on either side.
        return dataset.compute(
            data -> data,
            (left, right) -> {
                if (left == null)
                    return right;

                return right == null ? left : left.merge(right);
            });
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Example 78 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class TrainerTransformers method runOnEnsemble.

/**
 * This method accepts function which for given dataset builder and index of model in ensemble generates
 * task of training this model.
 * <p>
 * When {@code featuresVectorSize > 0} and {@code featureSubspaceDim != featuresVectorSize}, a feature mapping is
 * generated for every model; the extractor is then wrapped with that mapping for training and the matching
 * projector is attached to the trained model.
 *
 * @param trainingTaskGenerator Training task generator.
 * @param datasetBuilder Dataset builder.
 * @param ensembleSize Size of ensemble.
 * @param subsampleRatio Ratio (subsample size) / (initial dataset size).
 * @param featuresVectorSize Dimensionality of feature vector.
 * @param featureSubspaceDim Dimensionality of feature subspace.
 * @param extractor Feature extractor; used as is when no feature mappings are generated, otherwise wrapped
 *      per model with that model's mapping.
 * @param aggregator Aggregator of models.
 * @param environment Environment.
 * @param <K> Type of keys in dataset builder.
 * @param <V> Type of values in dataset builder.
 * @param <M> Type of model.
 * @return Composition of models trained on bagged dataset.
 */
private static <K, V, M extends IgniteModel<Vector, Double>> ModelsComposition runOnEnsemble(IgniteTriFunction<DatasetBuilder<K, V>, Integer, IgniteBiFunction<K, V, Vector>, IgniteSupplier<M>> trainingTaskGenerator, DatasetBuilder<K, V> datasetBuilder, int ensembleSize, double subsampleRatio, int featuresVectorSize, int featureSubspaceDim, IgniteBiFunction<K, V, Vector> extractor, PredictionsAggregator aggregator, LearningEnvironment environment) {
    MLLogger log = environment.logger(datasetBuilder.getClass());
    log.log(MLLogger.VerboseLevel.LOW, "Start learning.");
    // One feature-index mapping per model; stays null when no feature subspace projection is requested.
    List<int[]> mappings = null;
    if (featuresVectorSize > 0 && featureSubspaceDim != featuresVectorSize) {
        mappings = IntStream.range(0, ensembleSize).mapToObj(modelIdx -> getMapping(featuresVectorSize, featureSubspaceDim, environment.randomNumbersGenerator().nextLong() + modelIdx)).collect(Collectors.toList());
    }
    // Primitive long: the timestamp is only used for arithmetic, so boxing it is pointless.
    long startTs = System.currentTimeMillis();
    List<IgniteSupplier<M>> tasks = new ArrayList<>();
    List<IgniteBiFunction<K, V, Vector>> extractors = new ArrayList<>();
    if (mappings != null) {
        for (int[] mapping : mappings) {
            extractors.add(wrapExtractor(extractor, mapping));
        }
    }
    for (int i = 0; i < ensembleSize; i++) {
        // Every model gets its own bagging transformer built from the subsample ratio and the model index.
        DatasetBuilder<K, V> newBuilder = datasetBuilder.withUpstreamTransformer(BaggingUpstreamTransformer.builder(subsampleRatio, i));
        tasks.add(trainingTaskGenerator.apply(newBuilder, i, mappings != null ? extractors.get(i) : extractor));
    }
    List<ModelWithMapping<Vector, Double, M>> models = environment.parallelismStrategy().submit(tasks).stream().map(Promise::unsafeGet).map(ModelWithMapping<Vector, Double, M>::new).collect(Collectors.toList());
    // If we need to do projection, do it.
    if (mappings != null) {
        for (int i = 0; i < models.size(); i++) {
            models.get(i).setMapping(VectorUtils.getProjector(mappings.get(i)));
        }
    }
    double learningTime = (double) (System.currentTimeMillis() - startTs) / 1000.0;
    log.log(MLLogger.VerboseLevel.LOW, "The training time was %.2fs.", learningTime);
    log.log(MLLogger.VerboseLevel.LOW, "Learning finished.");
    return new ModelsComposition(models, aggregator);
}
Also used : IgniteSupplier(org.apache.ignite.ml.math.functions.IgniteSupplier) IgniteBiFunction(org.apache.ignite.ml.math.functions.IgniteBiFunction) ArrayList(java.util.ArrayList) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) Promise(org.apache.ignite.ml.environment.parallelism.Promise) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) MLLogger(org.apache.ignite.ml.environment.logging.MLLogger)

Example 79 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class Evaluator method initEvaluationContexts.

/**
 * Inits evaluation contexts for metrics.
 * <p>
 * If none of the freshly initialized contexts reports {@code needToCompute()}, the contexts are created locally
 * for every metric and no pass over the dataset is made. Otherwise one context per distinct aggregator class
 * is built on each partition, fed with every labeled vector of that partition, and the per-partition contexts
 * are merged key by key.
 *
 * @param dataset Dataset.
 * @param metrics Metrics.
 * @return Computed contexts, keyed by aggregator class.
 */
@SuppressWarnings("unchecked")
private static Map<Class, EvaluationContext> initEvaluationContexts(Dataset<EmptyContext, FeatureMatrixWithLabelsOnHeapData> dataset, Metric... metrics) {
    long nonEmptyCtxsCnt = Arrays.stream(metrics).map(x -> x.makeAggregator().createInitializedContext()).filter(x -> ((EvaluationContext) x).needToCompute()).count();
    if (nonEmptyCtxsCnt == 0) {
        HashMap<Class, EvaluationContext> res = new HashMap<>();
        for (Metric m : metrics) {
            MetricStatsAggregator<Double, ?, ?> aggregator = m.makeAggregator();
            res.put(aggregator.getClass(), (EvaluationContext) aggregator.createInitializedContext());
        }
        // Return only after ALL metrics are registered; returning from inside the loop would leave every
        // metric but the first one without a context.
        return res;
    }
    return dataset.compute(data -> {
        // Keep a single aggregator per aggregator class, so metrics sharing a class share one context.
        Map<Class, MetricStatsAggregator> aggrs = new HashMap<>();
        for (Metric m : metrics) {
            MetricStatsAggregator<Double, ?, ?> aggregator = m.makeAggregator();
            if (!aggrs.containsKey(aggregator.getClass()))
                aggrs.put(aggregator.getClass(), aggregator);
        }
        Map<Class, EvaluationContext> aggrToEvCtx = new HashMap<>();
        aggrs.forEach((clazz, aggr) -> aggrToEvCtx.put(clazz, (EvaluationContext) aggr.createInitializedContext()));
        for (int i = 0; i < data.getLabels().length; i++) {
            LabeledVector<Double> vector = VectorUtils.of(data.getFeatures()[i]).labeled(data.getLabels()[i]);
            aggrToEvCtx.values().forEach(ctx -> ctx.aggregate(vector));
        }
        return aggrToEvCtx;
    }, (left, right) -> {
        // Either side may be null; a missing side contributes nothing to the merge.
        if (left == null && right == null)
            return new HashMap<>();
        if (left == null)
            return right;
        if (right == null)
            return left;
        HashMap<Class, EvaluationContext> res = new HashMap<>();
        for (Class key : left.keySet()) {
            EvaluationContext ctx1 = left.get(key);
            EvaluationContext ctx2 = right.get(key);
            A.ensure(ctx1 != null && ctx2 != null, "ctx1 != null && ctx2 != null");
            res.put(key, ctx1.mergeWith(ctx2));
        }
        return res;
    });
}
Also used : FeatureMatrixWithLabelsOnHeapDataBuilder(org.apache.ignite.ml.dataset.primitive.FeatureMatrixWithLabelsOnHeapDataBuilder) Metric(org.apache.ignite.ml.selection.scoring.metric.Metric) Arrays(java.util.Arrays) IgniteBiPredicate(org.apache.ignite.lang.IgniteBiPredicate) EvaluationContext(org.apache.ignite.ml.selection.scoring.evaluator.context.EvaluationContext) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) Preprocessor(org.apache.ignite.ml.preprocessing.Preprocessor) HashMap(java.util.HashMap) MetricStatsAggregator(org.apache.ignite.ml.selection.scoring.evaluator.aggregator.MetricStatsAggregator) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) LearningEnvironment(org.apache.ignite.ml.environment.LearningEnvironment) MetricName(org.apache.ignite.ml.selection.scoring.metric.MetricName) Map(java.util.Map) Cache(javax.cache.Cache) LocalDatasetBuilder(org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder) EmptyContextBuilder(org.apache.ignite.ml.dataset.primitive.builder.context.EmptyContextBuilder) LearningEnvironmentBuilder(org.apache.ignite.ml.environment.LearningEnvironmentBuilder) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) A(org.apache.ignite.internal.util.typedef.internal.A) FeatureMatrixWithLabelsOnHeapData(org.apache.ignite.ml.dataset.primitive.FeatureMatrixWithLabelsOnHeapData) CacheBasedDatasetBuilder(org.apache.ignite.ml.dataset.impl.cache.CacheBasedDatasetBuilder) IgniteModel(org.apache.ignite.ml.IgniteModel) DatasetBuilder(org.apache.ignite.ml.dataset.DatasetBuilder) KNNModel(org.apache.ignite.ml.knn.KNNModel) IgniteCache(org.apache.ignite.IgniteCache) Ignition(org.apache.ignite.Ignition) VectorUtils(org.apache.ignite.ml.math.primitives.vector.VectorUtils) Dataset(org.apache.ignite.ml.dataset.Dataset) QueryCursor(org.apache.ignite.cache.query.QueryCursor) ScanQuery(org.apache.ignite.cache.query.ScanQuery) HashMap(java.util.HashMap) 
MetricStatsAggregator(org.apache.ignite.ml.selection.scoring.evaluator.aggregator.MetricStatsAggregator) Metric(org.apache.ignite.ml.selection.scoring.metric.Metric) EvaluationContext(org.apache.ignite.ml.selection.scoring.evaluator.context.EvaluationContext)

Example 80 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class LabeledDatasetLoader method parseFeatures.

/**
 * Builds the feature vector for one row of already split text data.
 * <p>
 * Feature {@code j} is parsed from {@code rowData[j + 1]}; {@code rowData[0]} is not read here (presumably it
 * holds the label - confirm against the caller).
 *
 * @param pathToFile Path to the file being parsed; only used when reporting a parsing failure.
 * @param isFallOnBadData If {@code true}, a wrong number of columns or an unparsable value raises an exception;
 *      otherwise the affected features are set to the value returned by {@code fillMissedData()}.
 * @param colSize Number of features to read.
 * @param rowIdx Index of the row; only used when reporting a parsing failure.
 * @param rowData Tokens of the row.
 * @return Vector obtained from {@code LabeledVectorSet.emptyVector(colSize)} with indexes
 *      {@code 0..colSize-1} set.
 * @throws CardinalityException If {@code isFallOnBadData} is set and the row does not have exactly
 *      {@code colSize + 1} tokens.
 * @throws FileParsingException If {@code isFallOnBadData} is set and a token is not a parsable double.
 */
@NotNull
private static Vector parseFeatures(Path pathToFile, boolean isFallOnBadData, int colSize, int rowIdx, String[] rowData) {
    final Vector vec = LabeledVectorSet.emptyVector(colSize);
    if (isFallOnBadData && rowData.length != colSize + 1)
        throw new CardinalityException(colSize + 1, rowData.length);
    double missedData = fillMissedData();
    for (int j = 0; j < colSize; j++) {
        int tokIdx = j + 1;
        // Short row: test the bound explicitly instead of catching ArrayIndexOutOfBoundsException.
        if (tokIdx >= rowData.length) {
            vec.set(j, missedData);
            continue;
        }
        double feature;
        try {
            feature = Double.parseDouble(rowData[tokIdx]);
        } catch (NumberFormatException e) {
            if (isFallOnBadData)
                throw new FileParsingException(rowData[tokIdx], rowIdx, pathToFile);
            // Lenient mode: substitute the placeholder for the unparsable token.
            feature = missedData;
        }
        vec.set(j, feature);
    }
    return vec;
}
Also used : FileParsingException(org.apache.ignite.ml.math.exceptions.datastructures.FileParsingException) CardinalityException(org.apache.ignite.ml.math.exceptions.math.CardinalityException) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) NotNull(org.jetbrains.annotations.NotNull)

Aggregations

Vector (org.apache.ignite.ml.math.primitives.vector.Vector)265 DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)95 Test (org.junit.Test)94 Ignite (org.apache.ignite.Ignite)78 LabeledVector (org.apache.ignite.ml.structures.LabeledVector)49 HashMap (java.util.HashMap)39 SandboxMLCache (org.apache.ignite.examples.ml.util.SandboxMLCache)38 DummyVectorizer (org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer)26 FileNotFoundException (java.io.FileNotFoundException)22 TrainerTest (org.apache.ignite.ml.common.TrainerTest)22 DecisionTreeClassificationTrainer (org.apache.ignite.ml.tree.DecisionTreeClassificationTrainer)21 DecisionTreeModel (org.apache.ignite.ml.tree.DecisionTreeModel)21 Serializable (java.io.Serializable)19 IgniteCache (org.apache.ignite.IgniteCache)18 EncoderTrainer (org.apache.ignite.ml.preprocessing.encoding.EncoderTrainer)16 Cache (javax.cache.Cache)15 DoubleArrayVectorizer (org.apache.ignite.ml.dataset.feature.extractor.impl.DoubleArrayVectorizer)15 EuclideanDistance (org.apache.ignite.ml.math.distances.EuclideanDistance)14 ArrayList (java.util.ArrayList)12 ModelsComposition (org.apache.ignite.ml.composition.ModelsComposition)12