
Example 1 with PartitionDataBuilder

Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.

The class ANNClassificationTrainer, method getCentroidStat.

/**
 * Computes per-centroid statistics of labels for the given centroids over the dataset.
 *
 * @param datasetBuilder Dataset builder.
 * @param vectorizer Upstream vectorizer.
 * @param centers Centroids.
 * @return Statistics of labels grouped by the closest centroid.
 */
private <K, V> CentroidStat getCentroidStat(DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> vectorizer, List<Vector> centers) {
    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder = new LabeledDatasetPartitionDataBuilderOnHeap<>(vectorizer);
    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), partDataBuilder, learningEnvironment())) {
        return dataset.compute(data -> {
            CentroidStat res = new CentroidStat();
            for (int i = 0; i < data.rowSize(); i++) {
                final IgniteBiTuple<Integer, Double> closestCentroid = findClosestCentroid(centers, data.getRow(i));
                int centroidIdx = closestCentroid.get1();
                double lb = data.label(i);
                // add new label to label set
                res.labels().add(lb);
                ConcurrentHashMap<Double, Integer> centroidStat = res.centroidStat.get(centroidIdx);
                if (centroidStat == null) {
                    centroidStat = new ConcurrentHashMap<>();
                    centroidStat.put(lb, 1);
                    res.centroidStat.put(centroidIdx, centroidStat);
                } else {
                    int cnt = centroidStat.getOrDefault(lb, 0);
                    centroidStat.put(lb, cnt + 1);
                }
                res.counts.merge(centroidIdx, 1, (IgniteBiFunction<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);
            }
            return res;
        }, (a, b) -> {
            if (a == null)
                return b == null ? new CentroidStat() : b;
            if (b == null)
                return a;
            return a.merge(b);
        });
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
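The compute call above follows the general map-reduce contract of Dataset: the first argument maps each partition's LabeledVectorSet to a partial result, the second merges partial results and must tolerate nulls from partitions that contribute nothing. A minimal sketch under the same types, not taken from the Ignite sources, that simply counts rows:

// Illustrative only: total number of labeled rows across all partitions.
int totalRows = dataset.compute(
    data -> data.rowSize(),          // map: per-partition row count
    (a, b) -> {
        if (a == null)               // a partition may contribute no result
            return b == null ? 0 : b;
        if (b == null)
            return a;
        return a + b;                // reduce: combine partial counts
    });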
Also used: Arrays (java.util.Arrays), Vector (org.apache.ignite.ml.math.primitives.vector.Vector), Preprocessor (org.apache.ignite.ml.preprocessing.Preprocessor), KMeansTrainer (org.apache.ignite.ml.clustering.kmeans.KMeansTrainer), LabeledVector (org.apache.ignite.ml.structures.LabeledVector), SingleLabelDatasetTrainer (org.apache.ignite.ml.trainers.SingleLabelDatasetTrainer), EuclideanDistance (org.apache.ignite.ml.math.distances.EuclideanDistance), JsonIgnore (com.fasterxml.jackson.annotation.JsonIgnore), LabeledVectorSet (org.apache.ignite.ml.structures.LabeledVectorSet), PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder), LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder), EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext), LabeledDatasetPartitionDataBuilderOnHeap (org.apache.ignite.ml.structures.partition.LabeledDatasetPartitionDataBuilderOnHeap), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), DistanceMeasure (org.apache.ignite.ml.math.distances.DistanceMeasure), DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder), Collectors (java.util.stream.Collectors), MapUtil (org.apache.ignite.ml.math.util.MapUtil), Serializable (java.io.Serializable), IgniteBiTuple (org.apache.ignite.lang.IgniteBiTuple), List (java.util.List), TreeMap (java.util.TreeMap), ConcurrentSkipListSet (java.util.concurrent.ConcurrentSkipListSet), IgniteBiFunction (org.apache.ignite.ml.math.functions.IgniteBiFunction), KMeansModel (org.apache.ignite.ml.clustering.kmeans.KMeansModel), Dataset (org.apache.ignite.ml.dataset.Dataset), NotNull (org.jetbrains.annotations.NotNull)

Example 2 with PartitionDataBuilder

Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.

The class KNNUtils, method buildDataset.

/**
 * Builds dataset.
 *
 * @param envBuilder Learning environment builder.
 * @param datasetBuilder Dataset builder.
 * @param vectorizer Upstream vectorizer.
 * @return Dataset.
 */
@Nullable
public static <K, V, C extends Serializable> Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> buildDataset(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> vectorizer) {
    LearningEnvironment environment = envBuilder.buildForTrainer();
    environment.initDeployingContext(vectorizer);
    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder = new LabeledDatasetPartitionDataBuilderOnHeap<>(vectorizer);
    Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = null;
    if (datasetBuilder != null) {
        dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), partDataBuilder, environment);
    }
    return dataset;
}
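For context, here is a hedged sketch of how buildDataset might be called with a purely local upstream. LocalDatasetBuilder (org.apache.ignite.ml.dataset.impl.local), DoubleArrayVectorizer and Vectorizer (org.apache.ignite.ml.dataset.feature.extractor) and LearningEnvironmentBuilder.defaultBuilder() are assumed to be available in the Ignite version at hand; this snippet is not taken from the Ignite repository.

// Hedged usage sketch: two rows keyed by Integer, last array element used as the label.
Map<Integer, double[]> data = new HashMap<>();
data.put(0, new double[] {1.0, 2.0, 0.0});
data.put(1, new double[] {3.0, 4.0, 1.0});

DatasetBuilder<Integer, double[]> datasetBuilder = new LocalDatasetBuilder<>(data, 2);
Preprocessor<Integer, double[]> vectorizer =
    new DoubleArrayVectorizer<Integer>().labeled(Vectorizer.LabelCoordinate.LAST);

try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset =
    KNNUtils.buildDataset(LearningEnvironmentBuilder.defaultBuilder(), datasetBuilder, vectorizer)) {
    // The dataset is now ready for compute(...) calls as in the other examples;
    // it is null only when the dataset builder itself is null.
} catch (Exception e) {
    throw new RuntimeException(e);
}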
Also used: LearningEnvironment (org.apache.ignite.ml.environment.LearningEnvironment), Nullable (org.jetbrains.annotations.Nullable), LabeledVector (org.apache.ignite.ml.structures.LabeledVector), Dataset (org.apache.ignite.ml.dataset.Dataset), Preprocessor (org.apache.ignite.ml.preprocessing.Preprocessor), LabeledVectorSet (org.apache.ignite.ml.structures.LabeledVectorSet), DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder), PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder), LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder), Serializable (java.io.Serializable), EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext), LabeledDatasetPartitionDataBuilderOnHeap (org.apache.ignite.ml.structures.partition.LabeledDatasetPartitionDataBuilderOnHeap)

Example 3 with PartitionDataBuilder

Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.

The class LocalDatasetBuilder, method build.

/**
 * {@inheritDoc}
 */
@Override
public <C extends Serializable, D extends AutoCloseable> LocalDataset<C, D> build(LearningEnvironmentBuilder envBuilder, PartitionContextBuilder<K, V, C> partCtxBuilder, PartitionDataBuilder<K, V, C, D> partDataBuilder, LearningEnvironment learningEnvironment) {
    List<C> ctxList = new ArrayList<>();
    List<D> dataList = new ArrayList<>();
    List<UpstreamEntry<K, V>> entriesList = new ArrayList<>();
    upstreamMap.entrySet().stream().filter(en -> filter.apply(en.getKey(), en.getValue())).map(en -> new UpstreamEntry<>(en.getKey(), en.getValue())).forEach(entriesList::add);
    int partSize = Math.max(1, entriesList.size() / partitions);
    // Three independent iterators over the same entry list: one is used only to count the
    // entries after transformation, the other two feed the context and data builders.
    Iterator<UpstreamEntry<K, V>> firstKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> secondKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> thirdKeysIter = entriesList.iterator();
    int ptr = 0;
    List<LearningEnvironment> envs = IntStream.range(0, partitions).boxed().map(envBuilder::buildForWorker).collect(Collectors.toList());
    for (int part = 0; part < partitions; part++) {
        int cntBeforeTransform = part == partitions - 1 ? entriesList.size() - ptr : Math.min(partSize, entriesList.size() - ptr);
        LearningEnvironment env = envs.get(part);
        // The transformer is deep-copied so that the counting pass and the context/data
        // building passes apply the same transformation to the same window of entries.
        UpstreamTransformer transformer1 = upstreamTransformerBuilder.build(env);
        UpstreamTransformer transformer2 = Utils.copy(transformer1);
        UpstreamTransformer transformer3 = Utils.copy(transformer1);
        int cnt = (int) transformer1.transform(Utils.asStream(new IteratorWindow<>(thirdKeysIter, k -> k, cntBeforeTransform))).count();
        Iterator<UpstreamEntry> iter = transformer2.transform(Utils.asStream(new IteratorWindow<>(firstKeysIter, k -> k, cntBeforeTransform)).map(x -> (UpstreamEntry) x)).iterator();
        Iterator<UpstreamEntry<K, V>> convertedBack = Utils.asStream(iter).map(x -> (UpstreamEntry<K, V>) x).iterator();
        C ctx = cntBeforeTransform > 0 ? partCtxBuilder.build(env, convertedBack, cnt) : null;
        Iterator<UpstreamEntry> iter1 = transformer3.transform(Utils.asStream(new IteratorWindow<>(secondKeysIter, k -> k, cntBeforeTransform))).iterator();
        Iterator<UpstreamEntry<K, V>> convertedBack1 = Utils.asStream(iter1).map(x -> (UpstreamEntry<K, V>) x).iterator();
        D data = cntBeforeTransform > 0 ? partDataBuilder.build(env, convertedBack1, cnt, ctx) : null;
        ctxList.add(ctx);
        dataList.add(data);
        ptr += cntBeforeTransform;
    }
    return new LocalDataset<>(envs, ctxList, dataList);
}
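The partDataBuilder.build(env, upstream, cnt, ctx) calls above are the whole PartitionDataBuilder contract: given a learning environment, an iterator over the partition's upstream entries, their count and the partition context, return an AutoCloseable partition data object. As a hedged sketch, assuming the interface can be written as a lambda the same way PartitionContextBuilder is in the other examples, a custom builder that keeps only per-column sums could look like this (SumData is a hypothetical holder, not an Ignite class):

// Hypothetical partition data holder; the dataset API only requires AutoCloseable.
class SumData implements AutoCloseable {
    final double[] columnSums;
    SumData(double[] columnSums) { this.columnSums = columnSums; }
    @Override public void close() { /* nothing to release */ }
}

// Custom PartitionDataBuilder as a lambda: consume the partition's entries once and
// keep only the per-column sums as the partition data.
PartitionDataBuilder<Integer, double[], EmptyContext, SumData> sumBuilder =
    (env, upstream, upstreamSize, ctx) -> {
        double[] sums = null;
        while (upstream.hasNext()) {
            double[] row = upstream.next().getValue();
            if (sums == null)
                sums = new double[row.length];
            for (int i = 0; i < row.length; i++)
                sums[i] += row[i];
        }
        return new SumData(sums == null ? new double[0] : sums);
    };

Passed to build(...) in place of a LabeledDatasetPartitionDataBuilderOnHeap, such a builder would yield a LocalDataset whose per-partition data holds only those sums.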
Also used: IntStream (java.util.stream.IntStream), UpstreamTransformer (org.apache.ignite.ml.dataset.UpstreamTransformer), IgniteBiPredicate (org.apache.ignite.lang.IgniteBiPredicate), Iterator (java.util.Iterator), IgniteFunction (org.apache.ignite.ml.math.functions.IgniteFunction), DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder), Collectors (java.util.stream.Collectors), Serializable (java.io.Serializable), ArrayList (java.util.ArrayList), List (java.util.List), Utils (org.apache.ignite.ml.util.Utils), PartitionContextBuilder (org.apache.ignite.ml.dataset.PartitionContextBuilder), LearningEnvironment (org.apache.ignite.ml.environment.LearningEnvironment), Map (java.util.Map), PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder), UpstreamTransformerBuilder (org.apache.ignite.ml.dataset.UpstreamTransformerBuilder), LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder), UpstreamEntry (org.apache.ignite.ml.dataset.UpstreamEntry)

Example 4 with PartitionDataBuilder

Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.

The class SVMLinearClassificationTrainer, method updateModel.

/**
 * {@inheritDoc}
 */
@Override
protected <K, V> SVMLinearClassificationModel updateModel(SVMLinearClassificationModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    assert datasetBuilder != null;
    IgniteFunction<Double, Double> lbTransformer = lb -> {
        if (lb == 0.0)
            return -1.0;
        else
            return lb;
    };
    IgniteFunction<LabeledVector<Double>, LabeledVector<Double>> func = lv -> new LabeledVector<>(lv.features(), lbTransformer.apply(lv.label()));
    PatchedPreprocessor<K, V, Double, Double> patchedPreprocessor = new PatchedPreprocessor<>(func, preprocessor);
    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder = new LabeledDatasetPartitionDataBuilderOnHeap<>(patchedPreprocessor);
    Vector weights;
    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), partDataBuilder, learningEnvironment())) {
        if (mdl == null) {
            final int cols = dataset.compute(org.apache.ignite.ml.structures.Dataset::colSize, (a, b) -> {
                if (a == null)
                    return b == null ? 0 : b;
                if (b == null)
                    return a;
                return b;
            });
            final int weightVectorSizeWithIntercept = cols + 1;
            weights = initializeWeightsWithZeros(weightVectorSizeWithIntercept);
        } else
            weights = getStateVector(mdl);
        for (int i = 0; i < this.getAmountOfIterations(); i++) {
            Vector deltaWeights = calculateUpdates(weights, dataset);
            if (deltaWeights == null)
                return getLastTrainedModelOrThrowEmptyDatasetException(mdl);
            // plus() returns a new vector; weights is reassigned rather than mutated in place
            weights = weights.plus(deltaWeights);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return new SVMLinearClassificationModel(weights.copyOfRange(1, weights.size()), weights.get(0));
}
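Two details in the method above are easy to miss: the PatchedPreprocessor only rewrites the label of each LabeledVector produced by the wrapped preprocessor, mapping the {0, 1} encoding to the {-1, 1} encoding the SVM update expects, and the weight vector stores the intercept in component 0 with the feature weights in components 1..cols, which is why the size is cols + 1 and why the model is assembled from copyOfRange(1, size) and get(0) at the end. The remapping itself, shown with illustrative values only:

// Same transformer as above: label 0.0 becomes -1.0, any other label passes through.
IgniteFunction<Double, Double> lbTransformer = lb -> lb == 0.0 ? -1.0 : lb;

lbTransformer.apply(0.0);   // -1.0
lbTransformer.apply(1.0);   //  1.0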
Also used: IgniteFunction (org.apache.ignite.ml.math.functions.IgniteFunction), Vector (org.apache.ignite.ml.math.primitives.vector.Vector), Preprocessor (org.apache.ignite.ml.preprocessing.Preprocessor), Random (java.util.Random), DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder), SparseVector (org.apache.ignite.ml.math.primitives.vector.impl.SparseVector), LabeledVector (org.apache.ignite.ml.structures.LabeledVector), Dataset (org.apache.ignite.ml.dataset.Dataset), SingleLabelDatasetTrainer (org.apache.ignite.ml.trainers.SingleLabelDatasetTrainer), LabeledVectorSet (org.apache.ignite.ml.structures.LabeledVectorSet), PatchedPreprocessor (org.apache.ignite.ml.preprocessing.developer.PatchedPreprocessor), PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder), NotNull (org.jetbrains.annotations.NotNull), DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector), UpstreamEntry (org.apache.ignite.ml.dataset.UpstreamEntry), EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext), LabeledDatasetPartitionDataBuilderOnHeap (org.apache.ignite.ml.structures.partition.LabeledDatasetPartitionDataBuilderOnHeap)

Example 5 with PartitionDataBuilder

Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.

The class ComputeUtils, method getData.

/**
 * Extracts partition {@code data} from the local storage; if it is not found there, recovers the
 * {@code data} from the partition {@code upstream} and {@code context}. Be aware that this method
 * should be called from the node where the partition is placed.
 *
 * @param ignite Ignite instance.
 * @param upstreamCacheName Name of an {@code upstream} cache.
 * @param filter Filter for {@code upstream} data.
 * @param transformerBuilder Builder of upstream transformers.
 * @param datasetCacheName Name of a partition {@code context} cache.
 * @param datasetId Dataset ID.
 * @param partDataBuilder Partition data builder.
 * @param env Learning environment.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @param <C> Type of a partition {@code context}.
 * @param <D> Type of a partition {@code data}.
 * @return Partition {@code data}.
 */
public static <K, V, C extends Serializable, D extends AutoCloseable> D getData(Ignite ignite, String upstreamCacheName, IgniteBiPredicate<K, V> filter, UpstreamTransformerBuilder transformerBuilder, String datasetCacheName, UUID datasetId, PartitionDataBuilder<K, V, C, D> partDataBuilder, LearningEnvironment env, boolean isKeepBinary) {
    PartitionDataStorage dataStorage = (PartitionDataStorage) ignite.cluster().nodeLocalMap().computeIfAbsent(String.format(DATA_STORAGE_KEY_TEMPLATE, datasetId), key -> new PartitionDataStorage());
    final int part = env.partition();
    return dataStorage.computeDataIfAbsent(part, () -> {
        IgniteCache<Integer, C> learningCtxCache = ignite.cache(datasetCacheName);
        C ctx = learningCtxCache.get(part);
        IgniteCache<K, V> upstreamCache = ignite.cache(upstreamCacheName);
        if (isKeepBinary)
            upstreamCache = upstreamCache.withKeepBinary();
        ScanQuery<K, V> qry = new ScanQuery<>();
        qry.setLocal(true);
        qry.setPartition(part);
        qry.setFilter(filter);
        UpstreamTransformer transformer = transformerBuilder.build(env);
        UpstreamTransformer transformerCp = Utils.copy(transformer);
        long cnt = computeCount(upstreamCache, qry, transformer);
        if (cnt > 0) {
            try (QueryCursor<UpstreamEntry<K, V>> cursor = upstreamCache.query(qry, e -> new UpstreamEntry<>(e.getKey(), e.getValue()))) {
                Iterator<UpstreamEntry<K, V>> it = cursor.iterator();
                Stream<UpstreamEntry> transformedStream = transformerCp.transform(Utils.asStream(it, cnt).map(x -> (UpstreamEntry) x));
                it = Utils.asStream(transformedStream.iterator()).map(x -> (UpstreamEntry<K, V>) x).iterator();
                Iterator<UpstreamEntry<K, V>> iter = new IteratorWithConcurrentModificationChecker<>(it, cnt, "Cache expected to be not modified during dataset data building [partition=" + part + ']');
                return partDataBuilder.build(env, iter, cnt, ctx);
            }
        }
        return null;
    });
}
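Conceptually, getData is a per-partition compute-if-absent: partition data is built at most once per dataset and partition on the owning node and then served from node-local storage. A minimal sketch of that pattern with plain JDK types, not Ignite code, with a no-op placeholder standing in for the ScanQuery plus partDataBuilder.build step:

// Sketch of the caching behaviour PartitionDataStorage.computeDataIfAbsent provides above.
ConcurrentMap<Integer, AutoCloseable> localPartData = new ConcurrentHashMap<>();
int part = 0;   // partition index; env.partition() in the real method

AutoCloseable dataForPart = localPartData.computeIfAbsent(part, p -> {
    // Stand-in for running the local ScanQuery over the upstream cache and feeding the
    // resulting iterator into partDataBuilder.build(env, iter, cnt, ctx) as shown above.
    return () -> { };   // AutoCloseable is a functional interface, so a no-op placeholder works
});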
Also used: UpstreamTransformer (org.apache.ignite.ml.dataset.UpstreamTransformer), Arrays (java.util.Arrays), DeployingContext (org.apache.ignite.ml.environment.deploy.DeployingContext), IgniteBiPredicate (org.apache.ignite.lang.IgniteBiPredicate), IgniteFunction (org.apache.ignite.ml.math.functions.IgniteFunction), Affinity (org.apache.ignite.cache.affinity.Affinity), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), ConcurrentMap (java.util.concurrent.ConcurrentMap), IgniteCallable (org.apache.ignite.lang.IgniteCallable), PartitionContextBuilder (org.apache.ignite.ml.dataset.PartitionContextBuilder), LearningEnvironment (org.apache.ignite.ml.environment.LearningEnvironment), Map (java.util.Map), PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder), UpstreamTransformerBuilder (org.apache.ignite.ml.dataset.UpstreamTransformerBuilder), LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder), UpstreamEntry (org.apache.ignite.ml.dataset.UpstreamEntry), ClusterGroup (org.apache.ignite.cluster.ClusterGroup), IgniteFuture (org.apache.ignite.lang.IgniteFuture), Iterator (java.util.Iterator), Collection (java.util.Collection), IgniteException (org.apache.ignite.IgniteException), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), UUID (java.util.UUID), Ignite (org.apache.ignite.Ignite), IgniteCache (org.apache.ignite.IgniteCache), Serializable (java.io.Serializable), LockSupport (java.util.concurrent.locks.LockSupport), Stream (java.util.stream.Stream), Ignition (org.apache.ignite.Ignition), Utils (org.apache.ignite.ml.util.Utils), QueryCursor (org.apache.ignite.cache.query.QueryCursor), BitSet (java.util.BitSet), ScanQuery (org.apache.ignite.cache.query.ScanQuery), GridPeerDeployAware (org.apache.ignite.internal.util.lang.GridPeerDeployAware)

Aggregations

PartitionDataBuilder (org.apache.ignite.ml.dataset.PartitionDataBuilder): 6
DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder): 5
Serializable (java.io.Serializable): 4
Dataset (org.apache.ignite.ml.dataset.Dataset): 4
EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext): 4
Preprocessor (org.apache.ignite.ml.preprocessing.Preprocessor): 4
Map (java.util.Map): 3
Collectors (java.util.stream.Collectors): 3
UpstreamEntry (org.apache.ignite.ml.dataset.UpstreamEntry): 3
LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder): 3
IgniteFunction (org.apache.ignite.ml.math.functions.IgniteFunction): 3
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 3
ArrayList (java.util.ArrayList): 2
Arrays (java.util.Arrays): 2
Iterator (java.util.Iterator): 2
List (java.util.List): 2
Random (java.util.Random): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
IntStream (java.util.stream.IntStream): 2
IgniteBiPredicate (org.apache.ignite.lang.IgniteBiPredicate): 2