use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class LabeledDatasetPartitionDataBuilderOnHeap method build.
/**
 * {@inheritDoc}
 */
@Override
public LabeledVectorSet<LabeledVector> build(LearningEnvironment env, Iterator<UpstreamEntry<K, V>> upstreamData, long upstreamDataSize, C ctx) {
    // Math.toIntExact throws ArithmeticException when the upstream size does not fit into an int.
    final int rowCnt = Math.toIntExact(upstreamDataSize);

    double[] labels = new double[rowCnt];
    double[][] features = null;
    int colCnt = -1;

    for (int rowIdx = 0; upstreamData.hasNext(); rowIdx++) {
        UpstreamEntry<K, V> upstreamEntry = upstreamData.next();
        LabeledVector<Double> labeled = preprocessor.apply(upstreamEntry.getKey(), upstreamEntry.getValue());
        Vector featureRow = labeled.features();

        if (colCnt >= 0) {
            assert featureRow.size() == colCnt : "X extractor must return exactly " + colCnt + " columns";
        } else {
            // The first row seen fixes the column count and triggers allocation of the feature matrix.
            colCnt = featureRow.size();
            features = new double[rowCnt][colCnt];
        }

        features[rowIdx] = featureRow.asArray();
        labels[rowIdx] = labeled.label();
    }

    // NOTE: features stays null when the upstream iterator yields no entries.
    return new LabeledVectorSet<>(features, labels);
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class StandardScalerTrainer method computeSum.
/**
 * Builds a dataset whose per-partition data holds, for every feature index, the sum and the sum of squares of
 * the values produced by the base preprocessor, together with the number of processed rows, and then reduces
 * the partitions into a single {@link StandardScalerData} via {@code merge}.
 *
 * @param envBuilder Learning environment builder.
 * @param datasetBuilder Dataset builder.
 * @param basePreprocessor Preprocessor whose feature vectors are accumulated.
 * @return Reduced statistics; any exception raised while building or computing is rethrown wrapped in a
 *      {@link RuntimeException}.
 */
private StandardScalerData computeSum(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    try (Dataset<EmptyContext, StandardScalerData> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        (env, upstream, upstreamSize, ctx) -> {
            double[] totals = null;
            double[] squaredTotals = null;
            long rowCnt = 0;

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entry = upstream.next();
                Vector features = basePreprocessor.apply(entry.getKey(), entry.getValue()).features();
                int featuresCnt = features.size();

                if (totals != null) {
                    assert totals.length == featuresCnt : "Base preprocessor must return exactly " + totals.length + " features";
                } else {
                    // Accumulators are sized lazily from the first row.
                    totals = new double[featuresCnt];
                    squaredTotals = new double[featuresCnt];
                }

                rowCnt++;

                for (int featureIdx = 0; featureIdx < featuresCnt; featureIdx++) {
                    double val = features.get(featureIdx);

                    totals[featureIdx] += val;
                    squaredTotals[featureIdx] += val * val;
                }
            }

            // Both arrays remain null when the partition had no rows.
            return new StandardScalerData(totals, squaredTotals, rowCnt);
        },
        learningEnvironment(basePreprocessor))) {
        // A null operand is treated as "nothing to merge" and the other operand is returned as is.
        return dataset.compute(
            data -> data,
            (left, right) -> left == null ? right : (right == null ? left : left.merge(right))
        );
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class TrainerTransformers method runOnEnsemble.
/**
 * This method accepts function which for given dataset builder and index of model in ensemble generates
 * task of training this model.
 *
 * @param trainingTaskGenerator Training task generator.
 * @param datasetBuilder Dataset builder.
 * @param ensembleSize Size of ensemble.
 * @param subsampleRatio Ratio (subsample size) / (initial dataset size).
 * @param featuresVectorSize Dimensionality of feature vector.
 * @param featureSubspaceDim Dimensionality of feature subspace.
 * @param extractor Function producing the feature vector for an upstream key/value pair; it is wrapped with a
 *      per-model mapping when a feature subspace is used.
 * @param aggregator Aggregator of models.
 * @param environment Environment.
 * @param <K> Type of keys in dataset builder.
 * @param <V> Type of values in dataset builder.
 * @param <M> Type of model.
 * @return Composition of models trained on bagged dataset.
 */
private static <K, V, M extends IgniteModel<Vector, Double>> ModelsComposition runOnEnsemble(IgniteTriFunction<DatasetBuilder<K, V>, Integer, IgniteBiFunction<K, V, Vector>, IgniteSupplier<M>> trainingTaskGenerator, DatasetBuilder<K, V> datasetBuilder, int ensembleSize, double subsampleRatio, int featuresVectorSize, int featureSubspaceDim, IgniteBiFunction<K, V, Vector> extractor, PredictionsAggregator aggregator, LearningEnvironment environment) {
    MLLogger log = environment.logger(datasetBuilder.getClass());
    log.log(MLLogger.VerboseLevel.LOW, "Start learning.");

    // Per-model feature mappings are generated only when a proper feature subspace is requested.
    List<int[]> mappings = null;
    if (featuresVectorSize > 0 && featureSubspaceDim != featuresVectorSize) {
        mappings = IntStream.range(0, ensembleSize)
            .mapToObj(modelIdx -> getMapping(featuresVectorSize, featureSubspaceDim, environment.randomNumbersGenerator().nextLong() + modelIdx))
            .collect(Collectors.toList());
    }

    // Primitive long: the timestamp is only used for arithmetic, so boxing it is pointless.
    long startTs = System.currentTimeMillis();

    List<IgniteSupplier<M>> tasks = new ArrayList<>();
    List<IgniteBiFunction<K, V, Vector>> extractors = new ArrayList<>();
    if (mappings != null) {
        for (int[] mapping : mappings) {
            extractors.add(wrapExtractor(extractor, mapping));
        }
    }

    for (int i = 0; i < ensembleSize; i++) {
        DatasetBuilder<K, V> newBuilder = datasetBuilder.withUpstreamTransformer(BaggingUpstreamTransformer.builder(subsampleRatio, i));
        tasks.add(trainingTaskGenerator.apply(newBuilder, i, mappings != null ? extractors.get(i) : extractor));
    }

    List<ModelWithMapping<Vector, Double, M>> models = environment.parallelismStrategy().submit(tasks).stream()
        .map(Promise::unsafeGet)
        .map(ModelWithMapping<Vector, Double, M>::new)
        .collect(Collectors.toList());

    // If we need to do projection, do it.
    if (mappings != null) {
        for (int i = 0; i < models.size(); i++) {
            models.get(i).setMapping(VectorUtils.getProjector(mappings.get(i)));
        }
    }

    double learningTime = (double) (System.currentTimeMillis() - startTs) / 1000.0;
    log.log(MLLogger.VerboseLevel.LOW, "The training time was %.2fs.", learningTime);
    log.log(MLLogger.VerboseLevel.LOW, "Learning finished.");

    return new ModelsComposition(models, aggregator);
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class Evaluator method initEvaluationContexts.
/**
 * Inits evaluation contexts for metrics.
 *
 * If none of the metrics' freshly initialized contexts reports {@code needToCompute()}, an initialized context
 * is returned for every metric without touching the dataset. Otherwise the contexts are aggregated over all
 * rows of every partition and merged pairwise.
 *
 * @param dataset Dataset.
 * @param metrics Metrics.
 * @return Computed contexts, keyed by the class of the aggregator that produced them.
 */
@SuppressWarnings("unchecked")
private static Map<Class, EvaluationContext> initEvaluationContexts(Dataset<EmptyContext, FeatureMatrixWithLabelsOnHeapData> dataset, Metric... metrics) {
    long nonEmptyCtxsCnt = Arrays.stream(metrics).map(x -> x.makeAggregator().createInitializedContext()).filter(x -> ((EvaluationContext) x).needToCompute()).count();

    if (nonEmptyCtxsCnt == 0) {
        HashMap<Class, EvaluationContext> res = new HashMap<>();

        for (Metric m : metrics) {
            MetricStatsAggregator<Double, ?, ?> aggregator = m.makeAggregator();
            res.put(aggregator.getClass(), (EvaluationContext) aggregator.createInitializedContext());
        }

        // The return must be outside the loop: returning from inside it registered only the first
        // metric's context and silently dropped the contexts of all remaining metrics.
        return res;
    }

    return dataset.compute(data -> {
        // One aggregator per aggregator class; the first one encountered wins.
        Map<Class, MetricStatsAggregator> aggrs = new HashMap<>();
        for (Metric m : metrics) {
            MetricStatsAggregator<Double, ?, ?> aggregator = m.makeAggregator();
            if (!aggrs.containsKey(aggregator.getClass()))
                aggrs.put(aggregator.getClass(), aggregator);
        }

        Map<Class, EvaluationContext> aggrToEvCtx = new HashMap<>();
        aggrs.forEach((clazz, aggr) -> aggrToEvCtx.put(clazz, (EvaluationContext) aggr.createInitializedContext()));

        // Feed every labeled row of the partition into every context.
        for (int i = 0; i < data.getLabels().length; i++) {
            LabeledVector<Double> vector = VectorUtils.of(data.getFeatures()[i]).labeled(data.getLabels()[i]);
            aggrToEvCtx.values().forEach(ctx -> ctx.aggregate(vector));
        }

        return aggrToEvCtx;
    }, (left, right) -> {
        if (left == null && right == null)
            return new HashMap<>();
        if (left == null)
            return right;
        if (right == null)
            return left;

        // Both sides are expected to contain a context for every key of the left map.
        HashMap<Class, EvaluationContext> res = new HashMap<>();
        for (Class key : left.keySet()) {
            EvaluationContext ctx1 = left.get(key);
            EvaluationContext ctx2 = right.get(key);
            A.ensure(ctx1 != null && ctx2 != null, "ctx1 != null && ctx2 != null");
            res.put(key, ctx1.mergeWith(ctx2));
        }

        return res;
    });
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class LabeledDatasetLoader method parseFeatures.
/**
 * Parses one row of string tokens into a feature vector of {@code colSize} elements.
 *
 * Feature {@code j} is read from {@code rowData[j + 1]}; element 0 of the row is not read here (presumably it
 * holds the label - confirm against the caller). Tokens that are missing or are not valid doubles are replaced
 * by the value returned from {@code fillMissedData()} unless {@code isFallOnBadData} is set.
 *
 * @param pathToFile Path of the file being parsed; used only for error reporting.
 * @param isFallOnBadData When {@code true}, a row of unexpected length or an unparsable token raises an
 *      exception instead of being substituted.
 * @param colSize Number of features to parse.
 * @param rowIdx Index of the row; used only for error reporting.
 * @param rowData Tokens of the row.
 * @return Vector of parsed features.
 * @throws CardinalityException If {@code isFallOnBadData} is set and the row does not hold exactly
 *      {@code colSize + 1} tokens.
 * @throws FileParsingException If {@code isFallOnBadData} is set and a token is not a valid double.
 */
@NotNull
private static Vector parseFeatures(Path pathToFile, boolean isFallOnBadData, int colSize, int rowIdx, String[] rowData) {
    final Vector vec = LabeledVectorSet.emptyVector(colSize);

    if (isFallOnBadData && rowData.length != colSize + 1)
        throw new CardinalityException(colSize + 1, rowData.length);

    double missedData = fillMissedData();

    for (int j = 0; j < colSize; j++) {
        int tokIdx = j + 1;

        // Short row: check the bound explicitly instead of catching ArrayIndexOutOfBoundsException.
        // This is only reachable when isFallOnBadData is false, because of the length check above.
        if (tokIdx >= rowData.length) {
            vec.set(j, missedData);

            continue;
        }

        try {
            vec.set(j, Double.parseDouble(rowData[tokIdx]));
        } catch (NumberFormatException e) {
            if (isFallOnBadData)
                throw new FileParsingException(rowData[tokIdx], rowIdx, pathToFile);

            vec.set(j, missedData);
        }
    }

    return vec;
}
Aggregations