Use of org.apache.ignite.ml.preprocessing.encoding.label.LabelEncoderPreprocessor in the Apache Ignite project.
Class EncoderTrainer, method fit.
/**
 * {@inheritDoc}
 *
 * <p>Builds a dataset whose partition data holds the statistics required by the configured
 * {@code encoderType} (label frequencies, target counters or per-feature category frequencies)
 * and wraps the result in the matching encoder preprocessor.</p>
 *
 * @param envBuilder Learning environment builder handed to the dataset builder.
 * @param datasetBuilder Builder used to create the dataset of partition statistics.
 * @param basePreprocessor Preprocessor applied to each upstream entry before statistics are updated.
 * @return Encoder preprocessor matching {@code encoderType}.
 * @throws RuntimeException If no handled indices are set for a non-label encoder, or as a wrapper
 *      around any exception raised while building or processing the dataset.
 */
@Override
public EncoderPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    boolean noHandledIndices = handledIndices.isEmpty();

    // Only the label encoder can work without explicitly handled feature indices.
    if (noHandledIndices && encoderType != EncoderType.LABEL_ENCODER)
        throw new RuntimeException("Add indices of handled features");

    try (Dataset<EmptyContext, EncoderPartitionData> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        (env, upstream, upstreamSize, ctx) -> {
            EncoderPartitionData partition = new EncoderPartitionData();

            if (encoderType == EncoderType.LABEL_ENCODER) {
                Map<String, Integer> labelStats = null;

                while (upstream.hasNext()) {
                    UpstreamEntry<K, V> entry = upstream.next();
                    LabeledVector<Double> vec = basePreprocessor.apply(entry.getKey(), entry.getValue());

                    labelStats = updateLabelFrequenciesForNextRow(vec, labelStats);
                }

                partition.withLabelFrequencies(labelStats);

                return partition;
            }

            if (encoderType == EncoderType.TARGET_ENCODER) {
                TargetCounter[] counters = null;

                while (upstream.hasNext()) {
                    UpstreamEntry<K, V> entry = upstream.next();
                    LabeledVector<Double> vec = basePreprocessor.apply(entry.getKey(), entry.getValue());

                    counters = updateTargetCountersForNextRow(vec, counters);
                }

                partition.withTargetCounters(counters);

                return partition;
            }

            // Every remaining encoder type works on per-feature category frequencies;
            // the array will contain non-null values for handled indices.
            Map<String, Integer>[] featureStats = null;

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entry = upstream.next();
                LabeledVector<Double> vec = basePreprocessor.apply(entry.getKey(), entry.getValue());

                featureStats = updateFeatureFrequenciesForNextRow(vec, featureStats);
            }

            partition.withCategoryFrequencies(featureStats);

            return partition;
        },
        learningEnvironment(basePreprocessor)
    )) {
        EncoderPreprocessor<K, V> fitted;

        switch (encoderType) {
            case ONE_HOT_ENCODER:
                fitted = new OneHotEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
                break;

            case STRING_ENCODER:
                fitted = new StringEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
                break;

            case LABEL_ENCODER:
                fitted = new LabelEncoderPreprocessor<>(calculateEncodingValuesForLabelsByFrequencies(dataset), basePreprocessor);
                break;

            case FREQUENCY_ENCODER:
                fitted = new FrequencyEncoderPreprocessor<>(calculateEncodingFrequencies(dataset), basePreprocessor, handledIndices);
                break;

            case TARGET_ENCODER:
                fitted = new TargetEncoderPreprocessor<>(calculateTargetEncodingFrequencies(dataset), basePreprocessor, handledIndices);
                break;

            default:
                throw new IllegalStateException("Define the type of the resulting prerocessor.");
        }

        return fitted;
    }
    catch (Exception e) {
        // Any failure (including the IllegalStateException above) is surfaced as a RuntimeException.
        throw new RuntimeException(e);
    }
}
Use of org.apache.ignite.ml.preprocessing.encoding.label.LabelEncoderPreprocessor in the Apache Ignite project.
Class LabelEncoderPreprocessorTest, method testApply.
/**
 * Tests {@code apply()} method.
 *
 * <p>Each row's string label is expected to be replaced by the numeric value assigned to it in
 * the encoding map ({@code "A" -> 1}, {@code "B" -> 0}).</p>
 */
@Test
public void testApply() {
    Map<Integer, Object[]> data = new HashMap<>();
    data.put(0, new Object[] {1, "A"});
    data.put(1, new Object[] {2, "B"});
    data.put(2, new Object[] {3, "B"});

    final Vectorizer<Integer, Object[], Integer, Object> vectorizer = new ObjectArrayVectorizer<Integer>(0).labeled(1);

    // Typed map populated explicitly: avoids the raw type and the anonymous HashMap subclass
    // created by double-brace initialization.
    Map<String, Integer> encodingValues = new HashMap<>();
    encodingValues.put("A", 1);
    encodingValues.put("B", 0);

    LabelEncoderPreprocessor<Integer, Object[]> preprocessor =
        new LabelEncoderPreprocessor<Integer, Object[]>(encodingValues, vectorizer);

    double[] postProcessedData = new double[] {1.0, 0.0, 0.0};

    for (int i = 0; i < data.size(); i++) {
        assertEquals(postProcessedData[i], (Double) preprocessor.apply(i, data.get(i)).label(), 1e-8);
    }
}
Aggregations