Use of org.apache.ignite.ml.dataset.PartitionContextBuilder in project ignite by apache.
From the class MinMaxScalerTrainer, method fit.
/** {@inheritDoc} */
@Override public MinMaxScalerPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder,
    Preprocessor<K, V> basePreprocessor) {
    PartitionContextBuilder<K, V, EmptyContext> ctxBuilder = (env, upstream, upstreamSize) -> new EmptyContext();

    try (Dataset<EmptyContext, MinMaxScalerPartitionData> dataset = datasetBuilder.build(
        envBuilder,
        ctxBuilder,
        (env, upstream, upstreamSize, ctx) -> {
            double[] min = null;
            double[] max = null;

            // Single pass over the partition upstream: track per-feature minimum and maximum.
            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector row = basePreprocessor.apply(entity.getKey(), entity.getValue());

                if (min == null) {
                    min = new double[row.size()];
                    Arrays.fill(min, Double.MAX_VALUE);
                }
                else
                    assert min.length == row.size() : "Base preprocessor must return exactly " + min.length + " features";

                if (max == null) {
                    max = new double[row.size()];
                    Arrays.fill(max, -Double.MAX_VALUE);
                }
                else
                    assert max.length == row.size() : "Base preprocessor must return exactly " + max.length + " features";

                for (int i = 0; i < row.size(); i++) {
                    if (row.get(i) < min[i])
                        min[i] = row.get(i);
                    if (row.get(i) > max[i])
                        max[i] = row.get(i);
                }
            }

            return new MinMaxScalerPartitionData(min, max);
        },
        learningEnvironment(basePreprocessor)
    )) {
        // Reduce per-partition [min, max] pairs into global per-feature bounds.
        double[][] minMax = dataset.compute(
            data -> data.getMin() != null ? new double[][] {data.getMin(), data.getMax()} : null,
            (a, b) -> {
                if (a == null)
                    return b;
                if (b == null)
                    return a;

                double[][] res = new double[2][];

                res[0] = new double[a[0].length];
                for (int i = 0; i < res[0].length; i++)
                    res[0][i] = Math.min(a[0][i], b[0][i]);

                res[1] = new double[a[1].length];
                for (int i = 0; i < res[1].length; i++)
                    res[1][i] = Math.max(a[1][i], b[1][i]);

                return res;
            }
        );

        return new MinMaxScalerPreprocessor<>(minMax[0], minMax[1], basePreprocessor);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
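The EmptyContext builder above is the common case when no shared per-partition state is needed. Below is a minimal sketch of a stateful alternative, relying only on the three-argument lambda shape used in the example; the RowCountContext class is hypothetical and not part of Ignite.

// Hypothetical context carrying per-partition state; not part of Ignite.
class RowCountContext implements Serializable {
    private final long rows;

    RowCountContext(long rows) {
        this.rows = rows;
    }

    long rows() {
        return rows;
    }
}

// Same shape as the EmptyContext lambda above, but the context now keeps the upstream size
// that the dataset infrastructure passes to the builder.
PartitionContextBuilder<K, V, RowCountContext> rowCountCtxBuilder =
    (env, upstream, upstreamSize) -> new RowCountContext(upstreamSize);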
Use of org.apache.ignite.ml.dataset.PartitionContextBuilder in project ignite by apache.
From the class ComputeUtils, method initContext.
/**
 * Initializes partition {@code context} by loading it from a partition {@code upstream}.
 *
 * @param ignite Ignite instance.
 * @param upstreamCacheName Name of an {@code upstream} cache.
 * @param transformerBuilder Upstream transformer builder.
 * @param filter Filter for {@code upstream} data.
 * @param datasetCacheName Name of a partition {@code context} cache.
 * @param ctxBuilder Partition {@code context} builder.
 * @param envBuilder Environment builder.
 * @param retries Number of retries for the case when one of the partitions is not found on the node.
 * @param interval Interval between retries.
 * @param isKeepBinary Support of binary objects.
 * @param deployingCtx Deploy context.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @param <C> Type of a partition {@code context}.
 */
public static <K, V, C extends Serializable> void initContext(Ignite ignite, String upstreamCacheName,
    UpstreamTransformerBuilder transformerBuilder, IgniteBiPredicate<K, V> filter, String datasetCacheName,
    PartitionContextBuilder<K, V, C> ctxBuilder, LearningEnvironmentBuilder envBuilder, int retries, int interval,
    boolean isKeepBinary, DeployingContext deployingCtx) {
    affinityCallWithRetries(ignite, Arrays.asList(datasetCacheName, upstreamCacheName), part -> {
        Ignite locIgnite = Ignition.localIgnite();
        LearningEnvironment env = envBuilder.buildForWorker(part);

        IgniteCache<K, V> locUpstreamCache = locIgnite.cache(upstreamCacheName);
        if (isKeepBinary)
            locUpstreamCache = locUpstreamCache.withKeepBinary();

        // Local scan query restricted to the given partition of the upstream cache.
        ScanQuery<K, V> qry = new ScanQuery<>();
        qry.setLocal(true);
        qry.setPartition(part);
        qry.setFilter(filter);

        C ctx;
        UpstreamTransformer transformer = transformerBuilder.build(env);
        UpstreamTransformer transformerCp = Utils.copy(transformer);

        long cnt = computeCount(locUpstreamCache, qry, transformer);

        try (QueryCursor<UpstreamEntry<K, V>> cursor =
            locUpstreamCache.query(qry, e -> new UpstreamEntry<>(e.getKey(), e.getValue()))) {
            Iterator<UpstreamEntry<K, V>> it = cursor.iterator();
            Stream<UpstreamEntry> transformedStream = transformerCp.transform(Utils.asStream(it, cnt).map(x -> (UpstreamEntry)x));
            it = Utils.asStream(transformedStream.iterator()).map(x -> (UpstreamEntry<K, V>)x).iterator();

            Iterator<UpstreamEntry<K, V>> iter = new IteratorWithConcurrentModificationChecker<>(it, cnt,
                "Cache expected to be not modified during dataset data building [partition=" + part + ']');

            // The context is built from the transformed, count-checked upstream iterator.
            ctx = ctxBuilder.build(env, iter, cnt);
        }

        IgniteCache<Integer, C> datasetCache = locIgnite.cache(datasetCacheName);

        datasetCache.put(part, ctx);

        return part;
    }, retries, interval, deployingCtx);
}
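initContext hands the context builder a filtered, transformed iterator wrapped in a concurrent-modification checker, and then stores the result in the dataset cache keyed by partition index, which is why the context type must be Serializable. The following hedged sketch shows a builder that folds over that iterator exactly once; the FeatureSumContext class and the Integer/double[] upstream types are illustrative assumptions, not Ignite code.

// Hypothetical context builder that folds over the partition upstream exactly once,
// matching how initContext invokes ctxBuilder.build(env, iter, cnt).
PartitionContextBuilder<Integer, double[], FeatureSumContext> sumCtxBuilder = (env, upstream, upstreamSize) -> {
    double sum = 0;

    while (upstream.hasNext()) {
        UpstreamEntry<Integer, double[]> entry = upstream.next();
        for (double v : entry.getValue())
            sum += v;
    }

    // The resulting context ends up in the dataset cache (datasetCache.put(part, ctx)),
    // so it has to be Serializable.
    return new FeatureSumContext(sum);
};

// Hypothetical Serializable context type used only for this illustration.
class FeatureSumContext implements Serializable {
    private final double sum;

    FeatureSumContext(double sum) {
        this.sum = sum;
    }

    double sum() {
        return sum;
    }
}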
Use of org.apache.ignite.ml.dataset.PartitionContextBuilder in project ignite by apache.
From the class LocalDatasetBuilder, method build.
/** {@inheritDoc} */
@Override public <C extends Serializable, D extends AutoCloseable> LocalDataset<C, D> build(
    LearningEnvironmentBuilder envBuilder,
    PartitionContextBuilder<K, V, C> partCtxBuilder,
    PartitionDataBuilder<K, V, C, D> partDataBuilder,
    LearningEnvironment learningEnvironment) {
    List<C> ctxList = new ArrayList<>();
    List<D> dataList = new ArrayList<>();

    // Materialize the filtered upstream map as a list of entries.
    List<UpstreamEntry<K, V>> entriesList = new ArrayList<>();
    upstreamMap.entrySet().stream()
        .filter(en -> filter.apply(en.getKey(), en.getValue()))
        .map(en -> new UpstreamEntry<>(en.getKey(), en.getValue()))
        .forEach(entriesList::add);

    int partSize = Math.max(1, entriesList.size() / partitions);

    // Three independent iterators over the same entry list: one to count transformed entries,
    // one for the context builder and one for the data builder.
    Iterator<UpstreamEntry<K, V>> firstKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> secondKeysIter = entriesList.iterator();
    Iterator<UpstreamEntry<K, V>> thirdKeysIter = entriesList.iterator();

    int ptr = 0;

    List<LearningEnvironment> envs = IntStream.range(0, partitions).boxed()
        .map(envBuilder::buildForWorker)
        .collect(Collectors.toList());

    for (int part = 0; part < partitions; part++) {
        int cntBeforeTransform = part == partitions - 1
            ? entriesList.size() - ptr
            : Math.min(partSize, entriesList.size() - ptr);

        LearningEnvironment env = envs.get(part);

        UpstreamTransformer transformer1 = upstreamTransformerBuilder.build(env);
        UpstreamTransformer transformer2 = Utils.copy(transformer1);
        UpstreamTransformer transformer3 = Utils.copy(transformer1);

        int cnt = (int)transformer1.transform(
            Utils.asStream(new IteratorWindow<>(thirdKeysIter, k -> k, cntBeforeTransform))).count();

        Iterator<UpstreamEntry> iter = transformer2.transform(
            Utils.asStream(new IteratorWindow<>(firstKeysIter, k -> k, cntBeforeTransform)).map(x -> (UpstreamEntry)x)).iterator();
        Iterator<UpstreamEntry<K, V>> convertedBack = Utils.asStream(iter).map(x -> (UpstreamEntry<K, V>)x).iterator();

        C ctx = cntBeforeTransform > 0 ? partCtxBuilder.build(env, convertedBack, cnt) : null;

        Iterator<UpstreamEntry> iter1 = transformer3.transform(
            Utils.asStream(new IteratorWindow<>(secondKeysIter, k -> k, cntBeforeTransform))).iterator();
        Iterator<UpstreamEntry<K, V>> convertedBack1 = Utils.asStream(iter1).map(x -> (UpstreamEntry<K, V>)x).iterator();

        D data = cntBeforeTransform > 0 ? partDataBuilder.build(env, convertedBack1, cnt, ctx) : null;

        ctxList.add(ctx);
        dataList.add(data);

        ptr += cntBeforeTransform;
    }

    return new LocalDataset<>(envs, ctxList, dataList);
}
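For reference, here is a minimal sketch of driving this method directly with lambda builders. It assumes the LocalDatasetBuilder(Map, partitions) constructor, LearningEnvironmentBuilder.defaultBuilder() and buildForTrainer() as found in recent Ignite releases, plus a hypothetical DoublesData class; treat those as assumptions rather than a verified recipe.

// Hypothetical partition data holding a plain double[]; the D type parameter must be AutoCloseable.
class DoublesData implements AutoCloseable {
    final double[] vals;

    DoublesData(double[] vals) {
        this.vals = vals;
    }

    @Override public void close() {
        // Nothing to release for an on-heap array.
    }
}

Map<Integer, Double> upstream = new HashMap<>();
for (int i = 0; i < 100; i++)
    upstream.put(i, (double)i);

LearningEnvironmentBuilder envBuilder = LearningEnvironmentBuilder.defaultBuilder();

LocalDataset<EmptyContext, DoublesData> dataset = new LocalDatasetBuilder<>(upstream, 10).build(
    envBuilder,
    (env, upstreamIter, upstreamSize) -> new EmptyContext(),
    (env, upstreamIter, upstreamSize, ctx) -> {
        double[] vals = new double[Math.toIntExact(upstreamSize)];
        for (int i = 0; upstreamIter.hasNext(); i++)
            vals[i] = upstreamIter.next().getValue();
        return new DoublesData(vals);
    },
    envBuilder.buildForTrainer()
);

// Same map/reduce pattern as MinMaxScalerTrainer above; the reduce stays null-safe
// because the reduction may start from a null identity.
double total = dataset.compute(
    data -> Arrays.stream(data.vals).sum(),
    (a, b) -> a == null ? b : (b == null ? a : a + b));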
Use of org.apache.ignite.ml.dataset.PartitionContextBuilder in project ignite by apache.
From the class ImputerTrainer, method fit.
/** {@inheritDoc} */
@Override public ImputerPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder,
    Preprocessor<K, V> basePreprocessor) {
    PartitionContextBuilder<K, V, EmptyContext> builder = (env, upstream, upstreamSize) -> new EmptyContext();

    try (Dataset<EmptyContext, ImputerPartitionData> dataset = datasetBuilder.build(
        envBuilder,
        builder,
        (env, upstream, upstreamSize, ctx) -> {
            double[] sums = null;
            int[] counts = null;
            double[] maxs = null;
            double[] mins = null;
            Map<Double, Integer>[] valuesByFreq = null;

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector row = basePreprocessor.apply(entity.getKey(), entity.getValue());

                switch (imputingStgy) {
                    case MEAN:
                        sums = updateTheSums(row, sums);
                        counts = updateTheCounts(row, counts);
                        break;
                    case MOST_FREQUENT:
                        valuesByFreq = updateFrequenciesByGivenRow(row, valuesByFreq);
                        break;
                    case LEAST_FREQUENT:
                        valuesByFreq = updateFrequenciesByGivenRow(row, valuesByFreq);
                        break;
                    case MAX:
                        maxs = updateTheMaxs(row, maxs);
                        break;
                    case MIN:
                        mins = updateTheMins(row, mins);
                        break;
                    case COUNT:
                        counts = updateTheCounts(row, counts);
                        break;
                    default:
                        throw new UnsupportedOperationException("The chosen strategy is not supported");
                }
            }

            ImputerPartitionData partData;

            switch (imputingStgy) {
                case MEAN:
                    partData = new ImputerPartitionData().withSums(sums).withCounts(counts);
                    break;
                case MOST_FREQUENT:
                    partData = new ImputerPartitionData().withValuesByFrequency(valuesByFreq);
                    break;
                case LEAST_FREQUENT:
                    partData = new ImputerPartitionData().withValuesByFrequency(valuesByFreq);
                    break;
                case MAX:
                    partData = new ImputerPartitionData().withMaxs(maxs);
                    break;
                case MIN:
                    partData = new ImputerPartitionData().withMins(mins);
                    break;
                case COUNT:
                    partData = new ImputerPartitionData().withCounts(counts);
                    break;
                default:
                    throw new UnsupportedOperationException("The chosen strategy is not supported");
            }

            return partData;
        },
        learningEnvironment(basePreprocessor)
    )) {
        Vector imputingValues;

        switch (imputingStgy) {
            case MEAN:
                imputingValues = VectorUtils.of(calculateImputingValuesBySumsAndCounts(dataset));
                break;
            case MOST_FREQUENT:
                imputingValues = VectorUtils.of(calculateImputingValuesByTheMostFrequentValues(dataset));
                break;
            case LEAST_FREQUENT:
                imputingValues = VectorUtils.of(calculateImputingValuesByTheLeastFrequentValues(dataset));
                break;
            case MAX:
                imputingValues = VectorUtils.of(calculateImputingValuesByMaxValues(dataset));
                break;
            case MIN:
                imputingValues = VectorUtils.of(calculateImputingValuesByMinValues(dataset));
                break;
            case COUNT:
                imputingValues = VectorUtils.of(calculateImputingValuesByCounts(dataset));
                break;
            default:
                throw new UnsupportedOperationException("The chosen strategy is not supported");
        }

        return new ImputerPreprocessor<>(imputingValues, basePreprocessor);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
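For the MEAN strategy, the per-partition sums and counts collected above are reduced with the same Dataset.compute(map, reduce) pattern used by MinMaxScalerTrainer. Below is a hedged sketch of such a reduction; the getSums()/getCounts() accessors on ImputerPartitionData are assumed, and this helper is illustrative rather than the actual calculateImputingValuesBySumsAndCounts implementation (for instance, it ignores the handling of missing values during accumulation).

// Illustrative sketch: reduce per-partition sums and counts into global per-feature means.
static double[] meanImputingValues(Dataset<EmptyContext, ImputerPartitionData> dataset) {
    double[][] sumsAndCounts = dataset.compute(
        // Assumed accessors: getSums() and getCounts() expose the per-partition accumulators.
        data -> data.getSums() == null ? null : new double[][] {
            data.getSums(),
            Arrays.stream(data.getCounts()).asDoubleStream().toArray()
        },
        (a, b) -> {
            if (a == null)
                return b;
            if (b == null)
                return a;

            double[][] res = new double[2][a[0].length];
            for (int i = 0; i < a[0].length; i++) {
                res[0][i] = a[0][i] + b[0][i];
                res[1][i] = a[1][i] + b[1][i];
            }
            return res;
        }
    );

    double[] means = new double[sumsAndCounts[0].length];
    for (int i = 0; i < means.length; i++)
        means[i] = sumsAndCounts[0][i] / sumsAndCounts[1][i];

    return means;
}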