Search in sources:

Example 1 with QuantileDiscretizerModelDataConverter

Use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by Alibaba.

In class Preprocessing, method generateQuantileDiscretizerModel:

/**
 * Builds the operator that produces the quantile-discretizer model.
 *
 * <p>Three cases are distinguished:
 * <ul>
 *   <li>a vector column is configured in {@code params}: the sampled input is linked to a
 *       {@code VectorTrain};</li>
 *   <li>at least one feature column is not listed as categorical: the sampled input is linked
 *       to a {@code QuantileDiscretizerTrainBatchOp} over those columns;</li>
 *   <li>otherwise: an operator carrying the model schema but emitting no rows is returned.</li>
 * </ul>
 *
 * @param input  operator providing the training data
 * @param params parameters; the vector column, feature/categorical columns,
 *               {@code ZERO_AS_MISSING} and max-bins settings are read from it
 * @return the operator producing the quantile-discretizer model
 */
public static BatchOperator<?> generateQuantileDiscretizerModel(BatchOperator<?> input, Params params) {
    // Case 1: vector input.
    if (params.contains(HasVectorCol.VECTOR_COL)) {
        return sample(input, params).linkTo(
            new VectorTrain(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
                .setMLEnvironmentId(input.getMLEnvironmentId())
                .setVectorCol(params.get(HasVectorCol.VECTOR_COL))
                .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
    }

    // Continuous columns are the feature columns minus the categorical ones.
    String[] continuousColNames = ArrayUtils.removeElements(
        params.get(HasFeatureCols.FEATURE_COLS),
        params.get(HasCategoricalCols.CATEGORICAL_COLS));
    boolean hasContinuousCols = continuousColNames != null && continuousColNames.length > 0;

    // Case 2: train on the continuous columns.
    if (hasContinuousCols) {
        return sample(input, params).linkTo(
            new QuantileDiscretizerTrainBatchOp(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
                .setMLEnvironmentId(input.getMLEnvironmentId())
                .setSelectedCols(continuousColNames)
                .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
    }

    // Case 3: nothing to discretize; produce a data set with the model schema and no rows.
    QuantileDiscretizerModelDataConverter schemaSource = new QuantileDiscretizerModelDataConverter();
    MapPartitionFunction<Integer, Row> emitNothing = new MapPartitionFunction<Integer, Row>() {

        private static final long serialVersionUID = 2328781103352773618L;

        @Override
        public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
            // Intentionally collects nothing.
        }
    };
    return new DataSetWrapperBatchOp(
        MLEnvironmentFactory.get(input.getMLEnvironmentId())
            .getExecutionEnvironment()
            .fromElements(1)
            .mapPartition(emitNothing),
        schemaSource.getModelSchema().getFieldNames(),
        schemaSource.getModelSchema().getFieldTypes())
        .setMLEnvironmentId(input.getMLEnvironmentId());
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) Collector(org.apache.flink.util.Collector) QuantileDiscretizerTrainParams(com.alibaba.alink.params.feature.QuantileDiscretizerTrainParams) SISOMapperParams(com.alibaba.alink.params.mapper.SISOMapperParams) Params(org.apache.flink.ml.api.misc.param.Params) QuantileDiscretizerTrainBatchOp(com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp) QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter) DataSetWrapperBatchOp(com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)

Example 2 with QuantileDiscretizerModelDataConverter

use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.

the class PreprocessingTest method sparse.

/**
 * Loads a serialized two-feature quantile model and checks the per-feature bucket count,
 * the missing-value index and the index reported by {@code Preprocessing.zeroIndex}.
 */
@Test
public void sparse() {
    // Row 0 is the model meta; the following rows each describe one feature's split points.
    Row[] rows = new Row[] {
        Row.of(0L, "{\"vectorCol\":\"\\\"vector\\\"\",\"MLEnvironmentId\":\"0\",\"version\":\"\\\"v2\\\"\"," + "\"numBuckets\":\"128\"}\n"),
        Row.of(1048576L, "[{\"featureName\":\"0\",\"featureType\":\"DOUBLE\",\"splitsArray\":[0.0,1.0,4.0,5.0]," + "\"isLeftOpen\":true}]\n"),
        Row.of(2097152L, "[{\"featureName\":\"1\",\"featureType\":\"DOUBLE\",\"splitsArray\":[2.0,3.0,4.0]," + "\"isLeftOpen\":true}]\n")
    };
    List<Row> model = Arrays.asList(rows);
    QuantileDiscretizerModelDataConverter quantileModel = new QuantileDiscretizerModelDataConverter();
    quantileModel.load(model);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
    // messages report the right value as "expected".
    Assert.assertEquals(5, quantileModel.getFeatureSize("0"));
    Assert.assertEquals(5, quantileModel.missingIndex("0"));
    Assert.assertEquals(4, quantileModel.getFeatureSize("1"));
    // The original repeated the getFeatureSize("1") assertion here. This checks the missing
    // index instead, mirroring feature "0" where it equals the feature size.
    // NOTE(review): expected value 4 is inferred from that symmetry — confirm.
    Assert.assertEquals(4, quantileModel.missingIndex("1"));
    Assert.assertEquals(0, Preprocessing.zeroIndex(quantileModel, "0"));
    Assert.assertEquals(0, Preprocessing.zeroIndex(quantileModel, "1"));
}
Also used : QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter) Row(org.apache.flink.types.Row) Test(org.junit.Test)

Example 3 with QuantileDiscretizerModelDataConverter

use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.

the class TreeInitObj method initialMapping.

/**
 * Loads the quantile-discretizer model from its serialized rows.
 *
 * @param quantileModel serialized model rows
 * @return the loaded converter, or {@code null} when {@code quantileModel} is empty
 */
private static QuantileDiscretizerModelDataConverter initialMapping(List<Row> quantileModel) {
    // An empty row list means there is no model to load.
    if (quantileModel.isEmpty()) {
        return null;
    }
    QuantileDiscretizerModelDataConverter converter = new QuantileDiscretizerModelDataConverter();
    converter.load(quantileModel);
    return converter;
}
Also used : QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter)

Example 4 with QuantileDiscretizerModelDataConverter

use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.

the class TreeInitObj method calc.

/**
 * Builds the local tree-training state on the first step and stores it in the context.
 *
 * <p>Reads the objects {@code "treeInput"}, {@code "quantileModel"},
 * {@code "stringIndexerModel"} and {@code "labels"} from {@code context}, creates a
 * {@code RegObj} or {@code ClassifierObj} depending on the configured tree type, copies the
 * local rows into flat feature/label arrays, and publishes the histogram buffer under
 * {@code "allReduce"} and the tree object under {@code "treeObj"}.
 *
 * @param context the computation context; nothing is done unless its step number is 1
 */
@Override
public void calc(ComContext context) {
    // Initialization happens only on the first step.
    if (context.getStepNo() != 1) {
        return;
    }
    List<Row> dataRows = context.getObj("treeInput");
    List<Row> quantileModel = context.getObj("quantileModel");
    List<Row> stringIndexerModel = context.getObj("stringIndexerModel");
    List<Object[]> labels = context.getObj("labels");
    // A missing input is treated as zero local rows.
    int nLocalRow = dataRows == null ? 0 : dataRows.size();

    // Work on a copy of the params extended with task-local information.
    Params localParams = params.clone();
    localParams.set(TASK_ID, context.getTaskId());
    localParams.set(NUM_OF_SUBTASKS, context.getNumTask());
    localParams.set(N_LOCAL_ROW, nLocalRow);

    QuantileDiscretizerModelDataConverter quantileDiscretizerModel = initialMapping(quantileModel);

    List<String> lookUpColNames = new ArrayList<>();
    if (params.get(RandomForestTrainParams.CATEGORICAL_COLS) != null) {
        lookUpColNames.addAll(Arrays.asList(params.get(RandomForestTrainParams.CATEGORICAL_COLS)));
    }
    Map<String, Integer> categoricalColsSize = TreeUtil.extractCategoricalColsSize(stringIndexerModel, lookUpColNames.toArray(new String[0]));

    // The tree type does not change within this method, so it is evaluated once instead of
    // once per row and once per branch.
    final boolean isRegression = Criteria.isRegression(params.get(TreeUtil.TREE_TYPE));

    if (!isRegression) {
        // For classification the label column is registered with the length of labels.get(0).
        categoricalColsSize.put(params.get(RandomForestTrainParams.LABEL_COL), labels.get(0).length);
    }
    FeatureMeta[] featureMetas = TreeUtil.getFeatureMeta(params.get(RandomForestTrainParams.FEATURE_COLS), categoricalColsSize);
    FeatureMeta labelMeta = TreeUtil.getLabelMeta(params.get(RandomForestTrainParams.LABEL_COL), params.get(RandomForestTrainParams.FEATURE_COLS).length, categoricalColsSize);

    TreeObj treeObj;
    if (isRegression) {
        treeObj = new RegObj(localParams, quantileDiscretizerModel, featureMetas, labelMeta);
    } else {
        treeObj = new ClassifierObj(localParams, quantileDiscretizerModel, featureMetas, labelMeta);
    }

    int nFeatureCol = localParams.get(RandomForestTrainParams.FEATURE_COLS).length;
    // Flat feature array: the value of feature i for row r is stored at i * nLocalRow + r.
    int[] data = new int[nFeatureCol * nLocalRow];
    double[] regLabels = null;
    int[] classifyLabels = null;
    if (isRegression) {
        regLabels = new double[nLocalRow];
    } else {
        classifyLabels = new int[nLocalRow];
    }

    for (int rowIdx = 0; rowIdx < nLocalRow; ++rowIdx) {
        // Fetch the row once rather than once per field.
        Row row = dataRows.get(rowIdx);
        for (int i = 0; i < nFeatureCol; ++i) {
            data[i * nLocalRow + rowIdx] = (int) row.getField(i);
        }
        // The label is the field right after the feature fields.
        if (isRegression) {
            regLabels[rowIdx] = (double) row.getField(nFeatureCol);
        } else {
            classifyLabels[rowIdx] = (int) row.getField(nFeatureCol);
        }
    }

    treeObj.setFeatures(data);
    if (isRegression) {
        treeObj.setLabels(regLabels);
    } else {
        treeObj.setLabels(classifyLabels);
    }

    // The same buffer instance is stored in the context and handed to the tree object.
    double[] histBuffer = new double[treeObj.getMaxHistBufferSize()];
    context.putObj("allReduce", histBuffer);
    treeObj.setHist(histBuffer);
    treeObj.initialRoot();
    context.putObj("treeObj", treeObj);
}
Also used : ArrayList(java.util.ArrayList) RandomForestTrainParams(com.alibaba.alink.params.classification.RandomForestTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter) FeatureMeta(com.alibaba.alink.operator.common.tree.FeatureMeta) Row(org.apache.flink.types.Row)

Example 5 with QuantileDiscretizerModelDataConverter

use of com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter in project Alink by alibaba.

the class QuantileDiscretizerTrainBatchOp method transformFeatureBinsToModel.

/**
 * Converts feature-bin calculators into quantile-discretizer model rows.
 *
 * <p>Each calculator is converted to a {@code ContinuousRanges} keyed by its feature name;
 * the feature names, in iteration order, are stored as the selected columns in the model
 * meta. The resulting model is written to {@code out}.
 *
 * @param values the feature-bin calculators, one per feature
 * @param out    collector receiving the serialized model rows
 */
public static void transformFeatureBinsToModel(Iterable<FeatureBinsCalculator> values, Collector<Row> out) {
    Map<String, ContinuousRanges> rangesByFeature = new HashMap<>();
    List<String> featureNames = new ArrayList<>();
    for (FeatureBinsCalculator calculator : values) {
        String featureName = calculator.getFeatureName();
        rangesByFeature.put(featureName, FeatureBinsCalculatorTransformer.toContinuousFeatureInterval(calculator));
        featureNames.add(featureName);
    }

    String[] selectedCols = featureNames.toArray(new String[0]);
    Params meta = new Params().set(QuantileDiscretizerTrainParams.SELECTED_COLS, selectedCols);

    QuantileDiscretizerModelDataConverter converter = new QuantileDiscretizerModelDataConverter(rangesByFeature, meta);
    converter.save(converter, out);
}
Also used : FeatureBinsCalculator(com.alibaba.alink.operator.common.feature.binning.FeatureBinsCalculator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) QuantileDiscretizerTrainParams(com.alibaba.alink.params.feature.QuantileDiscretizerTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter) ContinuousRanges(com.alibaba.alink.operator.common.feature.ContinuousRanges)

Aggregations

QuantileDiscretizerModelDataConverter (com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter)7 Params (org.apache.flink.ml.api.misc.param.Params)3 Row (org.apache.flink.types.Row)3 FeatureMeta (com.alibaba.alink.operator.common.tree.FeatureMeta)2 QuantileDiscretizerTrainParams (com.alibaba.alink.params.feature.QuantileDiscretizerTrainParams)2 ArrayList (java.util.ArrayList)2 Test (org.junit.Test)2 QuantileDiscretizerTrainBatchOp (com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp)1 DataSetWrapperBatchOp (com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)1 ContinuousRanges (com.alibaba.alink.operator.common.feature.ContinuousRanges)1 FeatureBinsCalculator (com.alibaba.alink.operator.common.feature.binning.FeatureBinsCalculator)1 RandomForestTrainParams (com.alibaba.alink.params.classification.RandomForestTrainParams)1 SISOMapperParams (com.alibaba.alink.params.mapper.SISOMapperParams)1 HashMap (java.util.HashMap)1 MapPartitionFunction (org.apache.flink.api.common.functions.MapPartitionFunction)1 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)1 Collector (org.apache.flink.util.Collector)1