Search in sources:

Example 1 with QuantileDiscretizerTrainBatchOp

use of com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp in project Alink by alibaba.

In the class Preprocessing, the method generateQuantileDiscretizerModel.

/**
 * Builds the operator that yields the quantile-discretizer model for {@code input}.
 *
 * <p>Three cases are handled, in this order:
 * <ol>
 *   <li>{@code params} contains {@code HasVectorCol.VECTOR_COL}: the sampled input is linked to a
 *       {@code VectorTrain} configured with that vector column.</li>
 *   <li>Otherwise, the continuous columns are the feature columns with the categorical columns
 *       removed. If at least one remains, the sampled input is linked to a
 *       {@code QuantileDiscretizerTrainBatchOp} over those columns.</li>
 *   <li>If none remain, an operator wrapping a data set that emits no rows is returned, carrying
 *       the field names and types of the quantile-discretizer model schema.</li>
 * </ol>
 *
 * <p>In the two training cases the bucket count is taken from {@code HasMaxBins.MAX_BINS} and the
 * {@code ZERO_AS_MISSING} setting is forwarded from {@code params}.
 *
 * @param input  source operator; its ML environment id is propagated to the created operators
 * @param params parameters read for the vector/feature/categorical columns, the bin count and
 *               the zero-as-missing flag
 * @return the operator producing the (possibly empty) quantile-discretizer model
 */
public static BatchOperator<?> generateQuantileDiscretizerModel(BatchOperator<?> input, Params params) {
    // Case 1: vector input, handled by the vector trainer.
    if (params.contains(HasVectorCol.VECTOR_COL)) {
        return sample(input, params).linkTo(
            new VectorTrain(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
                .setMLEnvironmentId(input.getMLEnvironmentId())
                .setVectorCol(params.get(HasVectorCol.VECTOR_COL))
                .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
    }

    // Continuous columns = feature columns minus the categorical ones.
    String[] numericCols = ArrayUtils.removeElements(
        params.get(HasFeatureCols.FEATURE_COLS),
        params.get(HasCategoricalCols.CATEGORICAL_COLS));
    boolean nothingToDiscretize = numericCols == null || numericCols.length == 0;

    // Case 2: at least one continuous column, so train on the sampled table.
    if (!nothingToDiscretize) {
        return sample(input, params).linkTo(
            new QuantileDiscretizerTrainBatchOp(new Params().set(ZERO_AS_MISSING, params.get(ZERO_AS_MISSING)))
                .setMLEnvironmentId(input.getMLEnvironmentId())
                .setSelectedCols(numericCols)
                .setNumBuckets(params.get(HasMaxBins.MAX_BINS)));
    }

    // Case 3: no continuous columns, so hand back an empty model with the right schema.
    QuantileDiscretizerModelDataConverter schemaSource = new QuantileDiscretizerModelDataConverter();
    MapPartitionFunction<Integer, Row> emitNothing = new MapPartitionFunction<Integer, Row>() {

        private static final long serialVersionUID = 2328781103352773618L;

        @Override
        public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
            // Intentionally collects nothing: the resulting data set has no rows.
        }
    };
    return new DataSetWrapperBatchOp(
        MLEnvironmentFactory.get(input.getMLEnvironmentId())
            .getExecutionEnvironment()
            .fromElements(1)
            .mapPartition(emitNothing),
        schemaSource.getModelSchema().getFieldNames(),
        schemaSource.getModelSchema().getFieldTypes())
        .setMLEnvironmentId(input.getMLEnvironmentId());
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) Collector(org.apache.flink.util.Collector) QuantileDiscretizerTrainParams(com.alibaba.alink.params.feature.QuantileDiscretizerTrainParams) SISOMapperParams(com.alibaba.alink.params.mapper.SISOMapperParams) Params(org.apache.flink.ml.api.misc.param.Params) QuantileDiscretizerTrainBatchOp(com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp) QuantileDiscretizerModelDataConverter(com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter) DataSetWrapperBatchOp(com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)

Example 2 with QuantileDiscretizerTrainBatchOp

use of com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp in project Alink by alibaba.

In the class PipelineSaveAndLoadTest, the method test2.

/**
 * Trains a quantile-discretizer model on the iris data set and writes it to an Ak file, then
 * fits a three-stage pipeline (a quantile discretizer, a binarizer, and a quantile-discretizer
 * model that reads the Ak file written earlier) and saves the fitted pipeline model.
 */
@Test
public void test2() throws Exception {
    String discretizerModelPath = "/tmp/model2.csv";

    CsvSourceBatchOp irisSource = new CsvSourceBatchOp()
        .setSchemaStr("sepal_length double, sepal_width double, petal_length double, petal_width double, category string")
        .setFilePath("https://alink-test-data.oss-cn-hangzhou.aliyuncs.com/iris.csv");

    // Step 1: train a stand-alone discretizer model and write it out.
    QuantileDiscretizerTrainBatchOp discretizerTrainer = new QuantileDiscretizerTrainBatchOp()
        .setNumBuckets(2)
        .setSelectedCols("petal_length")
        .linkFrom(irisSource);
    discretizerTrainer.link(
        new AkSinkBatchOp()
            .setFilePath(discretizerModelPath)
            .setOverwriteSink(true));
    BatchOperator.execute();

    // Step 2: assemble a pipeline whose last stage reads the model file written in step 1.
    String savedPipelinePath = "/tmp/model23424.csv";
    QuantileDiscretizer sepalLengthDiscretizer = new QuantileDiscretizer()
        .setNumBuckets(2)
        .setSelectedCols("sepal_length");
    Binarizer petalWidthBinarizer = new Binarizer()
        .setSelectedCol("petal_width")
        .setThreshold(1.0);
    AkSourceBatchOp storedModel = new AkSourceBatchOp()
        .setFilePath(discretizerModelPath);
    QuantileDiscretizerModel petalLengthModel = new QuantileDiscretizerModel()
        .setSelectedCols("petal_length")
        .setModelData(storedModel);

    // Step 3: fit the pipeline on the source data and save the fitted model.
    Pipeline pipeline = new Pipeline(sepalLengthDiscretizer, petalWidthBinarizer, petalLengthModel);
    PipelineModel fittedPipeline = pipeline.fit(irisSource);
    fittedPipeline.save(savedPipelinePath, true);
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) QuantileDiscretizerModel(com.alibaba.alink.pipeline.feature.QuantileDiscretizerModel) QuantileDiscretizerTrainBatchOp(com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) Binarizer(com.alibaba.alink.pipeline.feature.Binarizer) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) QuantileDiscretizer(com.alibaba.alink.pipeline.feature.QuantileDiscretizer) Test(org.junit.Test)

Aggregations

QuantileDiscretizerTrainBatchOp (com.alibaba.alink.operator.batch.feature.QuantileDiscretizerTrainBatchOp)2 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)1 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)1 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)1 DataSetWrapperBatchOp (com.alibaba.alink.operator.batch.source.DataSetWrapperBatchOp)1 QuantileDiscretizerModelDataConverter (com.alibaba.alink.operator.common.feature.QuantileDiscretizerModelDataConverter)1 QuantileDiscretizerTrainParams (com.alibaba.alink.params.feature.QuantileDiscretizerTrainParams)1 SISOMapperParams (com.alibaba.alink.params.mapper.SISOMapperParams)1 Binarizer (com.alibaba.alink.pipeline.feature.Binarizer)1 QuantileDiscretizer (com.alibaba.alink.pipeline.feature.QuantileDiscretizer)1 QuantileDiscretizerModel (com.alibaba.alink.pipeline.feature.QuantileDiscretizerModel)1 MapPartitionFunction (org.apache.flink.api.common.functions.MapPartitionFunction)1 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)1 Params (org.apache.flink.ml.api.misc.param.Params)1 Collector (org.apache.flink.util.Collector)1 Test (org.junit.Test)1