Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.
The class ANNClassificationTrainer, method getCentroidStat.
/**
 * Computes per-centroid label statistics over the given dataset.
 *
 * @param datasetBuilder Dataset builder.
 * @param vectorizer Upstream vectorizer.
 * @param centers Centroid centers.
 * @return Aggregated centroid statistics.
 */
private <K, V> CentroidStat getCentroidStat(DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> vectorizer,
    List<Vector> centers) {
    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder =
        new LabeledDatasetPartitionDataBuilderOnHeap<>(vectorizer);

    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        partDataBuilder,
        learningEnvironment()
    )) {
        return dataset.compute(data -> {
            CentroidStat res = new CentroidStat();

            for (int i = 0; i < data.rowSize(); i++) {
                final IgniteBiTuple<Integer, Double> closestCentroid = findClosestCentroid(centers, data.getRow(i));

                int centroidIdx = closestCentroid.get1();

                double lb = data.label(i);

                // Add the new label to the label set.
                res.labels().add(lb);

                ConcurrentHashMap<Double, Integer> centroidStat = res.centroidStat.get(centroidIdx);

                if (centroidStat == null) {
                    centroidStat = new ConcurrentHashMap<>();
                    centroidStat.put(lb, 1);
                    res.centroidStat.put(centroidIdx, centroidStat);
                }
                else {
                    int cnt = centroidStat.getOrDefault(lb, 0);
                    centroidStat.put(lb, cnt + 1);
                }

                res.counts.merge(centroidIdx, 1, (IgniteBiFunction<Integer, Integer, Integer>)(i1, i2) -> i1 + i2);
            }
            return res;
        }, (a, b) -> {
            if (a == null)
                return b == null ? new CentroidStat() : b;
            if (b == null)
                return a;
            return a.merge(b);
        });
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
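The reducer passed to dataset.compute above must be null-safe, because partitions that hold no data contribute null partial results. A minimal, dependency-free sketch of the same merge pattern, with a hypothetical Stat class standing in for CentroidStat (not the Ignite class itself):

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-in for CentroidStat: per-centroid label counts.
class Stat {
    final Map<Integer, Map<Double, Integer>> perCentroid = new HashMap<>();

    /** Folds the other statistics into this one. */
    Stat merge(Stat other) {
        other.perCentroid.forEach((centroid, counts) ->
            counts.forEach((lb, cnt) ->
                perCentroid.computeIfAbsent(centroid, k -> new HashMap<>())
                    .merge(lb, cnt, Integer::sum)));
        return this;
    }

    /** Null-safe reduce, mirroring the (a, b) lambda above. */
    static Stat reduce(Stat a, Stat b) {
        if (a == null)
            return b == null ? new Stat() : b;
        if (b == null)
            return a;
        return a.merge(b);
    }
}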
Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.
The class KNNUtils, method buildDataset.
/**
* Builds dataset.
*
* @param envBuilder Learning environment builder.
* @param datasetBuilder Dataset builder.
* @param vectorizer Upstream vectorizer.
* @return Dataset.
*/
@Nullable public static <K, V, C extends Serializable> Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> buildDataset(
    LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> vectorizer) {
    LearningEnvironment environment = envBuilder.buildForTrainer();

    environment.initDeployingContext(vectorizer);

    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder =
        new LabeledDatasetPartitionDataBuilderOnHeap<>(vectorizer);

    Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = null;

    if (datasetBuilder != null) {
        dataset = datasetBuilder.build(
            envBuilder,
            (env, upstream, upstreamSize) -> new EmptyContext(),
            partDataBuilder,
            environment
        );
    }
    return dataset;
}
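A minimal usage sketch, assuming an in-memory upstream map of double[] rows whose last element is the label; LocalDatasetBuilder, DoubleArrayVectorizer and LearningEnvironmentBuilder.defaultBuilder() are stock Ignite ML building blocks, but the exact wiring here is illustrative:

import java.util.HashMap;
import java.util.Map;
import org.apache.ignite.ml.dataset.Dataset;
import org.apache.ignite.ml.dataset.DatasetBuilder;
import org.apache.ignite.ml.dataset.feature.extractor.Vectorizer;
import org.apache.ignite.ml.dataset.feature.extractor.impl.DoubleArrayVectorizer;
import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder;
import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
import org.apache.ignite.ml.environment.LearningEnvironmentBuilder;
import org.apache.ignite.ml.structures.LabeledVector;
import org.apache.ignite.ml.structures.LabeledVectorSet;

// Two labeled points; the last array element is the label.
Map<Integer, double[]> upstream = new HashMap<>();
upstream.put(0, new double[] {1.0, 2.0, 0.0});
upstream.put(1, new double[] {3.0, 4.0, 1.0});

DatasetBuilder<Integer, double[]> datasetBuilder = new LocalDatasetBuilder<>(upstream, 2);

Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = KNNUtils.buildDataset(
    LearningEnvironmentBuilder.defaultBuilder(),
    datasetBuilder,
    new DoubleArrayVectorizer<Integer>().labeled(Vectorizer.LabelCoordinate.LAST));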
Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.
The class LocalDatasetBuilder, method build.
/** {@inheritDoc} */
@Override public <C extends Serializable, D extends AutoCloseable> LocalDataset<C, D> build(
    LearningEnvironmentBuilder envBuilder, PartitionContextBuilder<K, V, C> partCtxBuilder,
    PartitionDataBuilder<K, V, C, D> partDataBuilder, LearningEnvironment learningEnvironment) {
    List<C> ctxList = new ArrayList<>();
    List<D> dataList = new ArrayList<>();

    List<UpstreamEntry<K, V>> entriesList = new ArrayList<>();

    upstreamMap.entrySet().stream()
        .filter(en -> filter.apply(en.getKey(), en.getValue()))
        .map(en -> new UpstreamEntry<>(en.getKey(), en.getValue()))
        .forEach(entriesList::add);

    int partSize = Math.max(1, entriesList.size() / partitions);

    Iterator<UpstreamEntry<K, V>> firstKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> secondKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> thirdKeysIter = entriesList.iterator();

    int ptr = 0;

    List<LearningEnvironment> envs = IntStream.range(0, partitions).boxed()
        .map(envBuilder::buildForWorker)
        .collect(Collectors.toList());

    for (int part = 0; part < partitions; part++) {
        int cntBeforeTransform = part == partitions - 1
            ? entriesList.size() - ptr
            : Math.min(partSize, entriesList.size() - ptr);

        LearningEnvironment env = envs.get(part);

        UpstreamTransformer transformer1 = upstreamTransformerBuilder.build(env);
        UpstreamTransformer transformer2 = Utils.copy(transformer1);
        UpstreamTransformer transformer3 = Utils.copy(transformer1);

        int cnt = (int)transformer1.transform(
            Utils.asStream(new IteratorWindow<>(thirdKeysIter, k -> k, cntBeforeTransform))).count();

        Iterator<UpstreamEntry> iter = transformer2.transform(
            Utils.asStream(new IteratorWindow<>(firstKeysIter, k -> k, cntBeforeTransform))
                .map(x -> (UpstreamEntry)x)).iterator();

        Iterator<UpstreamEntry<K, V>> convertedBack = Utils.asStream(iter)
            .map(x -> (UpstreamEntry<K, V>)x).iterator();

        C ctx = cntBeforeTransform > 0 ? partCtxBuilder.build(env, convertedBack, cnt) : null;

        Iterator<UpstreamEntry> iter1 = transformer3.transform(
            Utils.asStream(new IteratorWindow<>(secondKeysIter, k -> k, cntBeforeTransform))).iterator();

        Iterator<UpstreamEntry<K, V>> convertedBack1 = Utils.asStream(iter1)
            .map(x -> (UpstreamEntry<K, V>)x).iterator();

        D data = cntBeforeTransform > 0 ? partDataBuilder.build(env, convertedBack1, cnt, ctx) : null;

        ctxList.add(ctx);
        dataList.add(data);

        ptr += cntBeforeTransform;
    }
    return new LocalDataset<>(envs, ctxList, dataList);
}
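The sizing logic above gives each partition max(1, n / partitions) entries, with the last partition absorbing the remainder. A dependency-free sketch of just that arithmetic:

// n = 10 entries over 4 partitions: partSize = 2, so partition sizes are 2, 2, 2, 4.
int n = 10, partitions = 4;
int partSize = Math.max(1, n / partitions);
int ptr = 0;

for (int part = 0; part < partitions; part++) {
    int cnt = part == partitions - 1 ? n - ptr : Math.min(partSize, n - ptr);
    System.out.println("partition " + part + " -> " + cnt + " entries");
    ptr += cnt;
}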
Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.
The class Deltas, method updateModel.
/** {@inheritDoc} */
@Override protected <K, V> SVMLinearClassificationModel updateModel(SVMLinearClassificationModel mdl,
    DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    assert datasetBuilder != null;

    // Map the {0, 1} labels onto {-1, +1} as required by the SVM formulation.
    IgniteFunction<Double, Double> lbTransformer = lb -> {
        if (lb == 0.0)
            return -1.0;
        else
            return lb;
    };

    IgniteFunction<LabeledVector<Double>, LabeledVector<Double>> func =
        lv -> new LabeledVector<>(lv.features(), lbTransformer.apply(lv.label()));

    PatchedPreprocessor<K, V, Double, Double> patchedPreprocessor = new PatchedPreprocessor<>(func, preprocessor);

    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partDataBuilder =
        new LabeledDatasetPartitionDataBuilderOnHeap<>(patchedPreprocessor);

    Vector weights;

    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        partDataBuilder,
        learningEnvironment()
    )) {
        if (mdl == null) {
            final int cols = dataset.compute(org.apache.ignite.ml.structures.Dataset::colSize, (a, b) -> {
                if (a == null)
                    return b == null ? 0 : b;
                if (b == null)
                    return a;
                return b;
            });

            final int weightVectorSizeWithIntercept = cols + 1;

            weights = initializeWeightsWithZeros(weightVectorSizeWithIntercept);
        }
        else
            weights = getStateVector(mdl);

        for (int i = 0; i < this.getAmountOfIterations(); i++) {
            Vector deltaWeights = calculateUpdates(weights, dataset);

            if (deltaWeights == null)
                return getLastTrainedModelOrThrowEmptyDatasetException(mdl);

            // Creates a new vector.
            weights = weights.plus(deltaWeights);
        }
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
    return new SVMLinearClassificationModel(weights.copyOfRange(1, weights.size()), weights.get(0));
}
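This method is normally reached through the trainer's fit/update entry points rather than called directly. A minimal usage sketch, assuming the surrounding class is Ignite's SVMLinearClassificationTrainer and reusing a datasetBuilder and vectorizer like those in the earlier example (the hyper-parameter values are illustrative):

import org.apache.ignite.ml.svm.SVMLinearClassificationModel;
import org.apache.ignite.ml.svm.SVMLinearClassificationTrainer;

SVMLinearClassificationTrainer trainer = new SVMLinearClassificationTrainer()
    .withAmountOfIterations(100) // Outer optimization iterations.
    .withLambda(0.4);            // Regularization parameter.

SVMLinearClassificationModel mdl = trainer.fit(datasetBuilder, vectorizer);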
Use of org.apache.ignite.ml.dataset.PartitionDataBuilder in project ignite by apache.
The class ComputeUtils, method getData.
/**
 * Extracts partition {@code data} from the local storage; if it is not found there, recovers the {@code data}
 * from the partition {@code upstream} and {@code context}. Be aware that this method should be called on the
 * node where the partition is placed.
 *
 * @param ignite Ignite instance.
 * @param upstreamCacheName Name of the {@code upstream} cache.
 * @param filter Filter for {@code upstream} data.
 * @param transformerBuilder Builder of upstream transformers.
 * @param datasetCacheName Name of the partition {@code context} cache.
 * @param datasetId Dataset ID.
 * @param partDataBuilder Partition data builder.
 * @param env Learning environment.
 * @param isKeepBinary Whether the {@code upstream} cache should be read with binary objects kept.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @param <C> Type of a partition {@code context}.
 * @param <D> Type of a partition {@code data}.
 * @return Partition {@code data}.
 */
public static <K, V, C extends Serializable, D extends AutoCloseable> D getData(Ignite ignite,
    String upstreamCacheName, IgniteBiPredicate<K, V> filter, UpstreamTransformerBuilder transformerBuilder,
    String datasetCacheName, UUID datasetId, PartitionDataBuilder<K, V, C, D> partDataBuilder,
    LearningEnvironment env, boolean isKeepBinary) {
    PartitionDataStorage dataStorage = (PartitionDataStorage)ignite.cluster().nodeLocalMap()
        .computeIfAbsent(String.format(DATA_STORAGE_KEY_TEMPLATE, datasetId), key -> new PartitionDataStorage());

    final int part = env.partition();

    return dataStorage.computeDataIfAbsent(part, () -> {
        IgniteCache<Integer, C> learningCtxCache = ignite.cache(datasetCacheName);
        C ctx = learningCtxCache.get(part);

        IgniteCache<K, V> upstreamCache = ignite.cache(upstreamCacheName);

        if (isKeepBinary)
            upstreamCache = upstreamCache.withKeepBinary();

        ScanQuery<K, V> qry = new ScanQuery<>();
        qry.setLocal(true);
        qry.setPartition(part);
        qry.setFilter(filter);

        UpstreamTransformer transformer = transformerBuilder.build(env);
        UpstreamTransformer transformerCp = Utils.copy(transformer);

        long cnt = computeCount(upstreamCache, qry, transformer);

        if (cnt > 0) {
            try (QueryCursor<UpstreamEntry<K, V>> cursor =
                upstreamCache.query(qry, e -> new UpstreamEntry<>(e.getKey(), e.getValue()))) {
                Iterator<UpstreamEntry<K, V>> it = cursor.iterator();

                Stream<UpstreamEntry> transformedStream =
                    transformerCp.transform(Utils.asStream(it, cnt).map(x -> (UpstreamEntry)x));

                it = Utils.asStream(transformedStream.iterator()).map(x -> (UpstreamEntry<K, V>)x).iterator();

                Iterator<UpstreamEntry<K, V>> iter = new IteratorWithConcurrentModificationChecker<>(it, cnt,
                    "Cache expected to be not modified during dataset data building [partition=" + part + ']');

                return partDataBuilder.build(env, iter, cnt, ctx);
            }
        }
        return null;
    });
}
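At its core, getData is a node-local compute-if-absent cache keyed by partition index: the data is rebuilt from the upstream cache only on a miss. A simplified, dependency-free sketch of that pattern (LocalPartitionStore is hypothetical; the real PartitionDataStorage additionally handles locking and null results for empty partitions):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Supplier;

// Hypothetical simplified stand-in for PartitionDataStorage.
class LocalPartitionStore<D> {
    private final Map<Integer, D> parts = new ConcurrentHashMap<>();

    /** Returns cached partition data, building it via {@code loader} on a miss. */
    D computeDataIfAbsent(int part, Supplier<D> loader) {
        return parts.computeIfAbsent(part, p -> loader.get());
    }
}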