Search in sources:

Example 1 with StandardScaler

use of com.alibaba.alink.pipeline.dataproc.StandardScaler in project Alink by alibaba.

the class Chap14 method c_3.

static void c_3() throws Exception {
    // Read the avazu-small CSV training data from OSS.
    CsvSourceBatchOp trainBatchData = new CsvSourceBatchOp()
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv")
        .setSchemaStr(SCHEMA_STRING);

    // Feature engineering pipeline: standardize the numeric columns, then
    // hash categorical + numeric columns into a single sparse feature vector.
    StandardScaler scaler = new StandardScaler()
        .setSelectedCols(NUMERICAL_COL_NAMES);
    FeatureHasher hasher = new FeatureHasher()
        .setSelectedCols(ArrayUtils.addAll(CATEGORY_COL_NAMES, NUMERICAL_COL_NAMES))
        .setCategoricalCols(CATEGORY_COL_NAMES)
        .setOutputCol(VEC_COL_NAME)
        .setNumFeatures(NUM_HASH_FEATURES);
    Pipeline feature_pipeline = new Pipeline().add(scaler).add(hasher);

    // Fit and persist the feature pipeline model only if it was not saved before.
    if (!new File(DATA_DIR + FEATURE_MODEL_FILE).exists()) {
        feature_pipeline.fit(trainBatchData).save(DATA_DIR + FEATURE_MODEL_FILE);
        BatchOperator.execute();
    }
}
Also used : FeatureHasher(com.alibaba.alink.pipeline.feature.FeatureHasher) StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) File(java.io.File) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 2 with StandardScaler

use of com.alibaba.alink.pipeline.dataproc.StandardScaler in project Alink by alibaba.

the class FTRLExample method main.

public static void main(String[] args) throws Exception {
    // Schema of the avazu CTR data set (label column is "click").
    String schemaStr = "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, " + "site_category string, app_id string, app_domain string, app_category string, device_id string, " + "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, " + "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int";

    // Batch source used to fit the feature pipeline and the initial model.
    CsvSourceBatchOp trainBatchData = new CsvSourceBatchOp()
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv")
        .setSchemaStr(schemaStr);
    trainBatchData.firstN(10).print();

    String labelColName = "click";
    String[] selectedColNames = new String[] { "C1", "banner_pos", "site_category", "app_domain", "app_category", "device_type", "device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "site_id", "site_domain", "device_id", "device_model" };
    String[] categoryColNames = new String[] { "C1", "banner_pos", "site_category", "app_domain", "app_category", "device_type", "device_conn_type", "site_id", "site_domain", "device_id", "device_model" };
    String[] numericalColNames = new String[] { "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21" };

    // Name of the vector column produced by feature engineering.
    String vecColName = "vec";
    int numHashFeatures = 30000;

    // Feature engineering pipeline: standardize numeric columns, then hash
    // all selected columns into one sparse vector of numHashFeatures dims.
    Pipeline featurePipeline = new Pipeline()
        .add(new StandardScaler().setSelectedCols(numericalColNames))
        .add(new FeatureHasher()
            .setSelectedCols(selectedColNames)
            .setCategoricalCols(categoryColNames)
            .setOutputCol(vecColName)
            .setNumFeatures(numHashFeatures));

    // Fit the feature pipeline model on the batch data.
    PipelineModel featurePipelineModel = featurePipeline.fit(trainBatchData);

    // Stream source for online training/evaluation.
    CsvSourceStreamOp data = new CsvSourceStreamOp()
        .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-ctr-train-8M.csv")
        .setSchemaStr(schemaStr)
        .setIgnoreFirstLine(true);

    // Split the stream 50/50 into a train branch and an eval branch.
    SplitStreamOp splitter = new SplitStreamOp().setFraction(0.5).linkFrom(data);

    // Train the initial (warm-start) model with batch logistic regression.
    LogisticRegressionTrainBatchOp lr = new LogisticRegressionTrainBatchOp()
        .setVectorCol(vecColName)
        .setLabelCol(labelColName)
        .setWithIntercept(true)
        .setMaxIter(10);
    BatchOperator<?> initModel = featurePipelineModel.transform(trainBatchData).link(lr);

    // Continuously refine the model online with FTRL, starting from initModel.
    FtrlTrainStreamOp model = new FtrlTrainStreamOp(initModel)
        .setVectorCol(vecColName)
        .setLabelCol(labelColName)
        .setWithIntercept(true)
        .setAlpha(0.1)
        .setBeta(0.1)
        .setL1(0.01)
        .setL2(0.01)
        .setTimeInterval(10)
        .setVectorSize(numHashFeatures)
        .linkFrom(featurePipelineModel.transform(splitter));

    // Score the held-out branch with the continuously updated FTRL model.
    FtrlPredictStreamOp predictResult = new FtrlPredictStreamOp(initModel)
        .setVectorCol(vecColName)
        .setPredictionCol("pred")
        .setReservedCols(new String[] { labelColName })
        .setPredictionDetailCol("details")
        .linkFrom(model, featurePipelineModel.transform(splitter.getSideOutput(0)));

    // Windowed binary-classification evaluation; extract key metrics as JSON.
    predictResult
        .link(new EvalBinaryClassStreamOp()
            .setLabelCol(labelColName)
            .setPredictionCol("pred")
            .setPredictionDetailCol("details")
            .setTimeInterval(10))
        .link(new JsonValueStreamOp()
            .setSelectedCol("Data")
            .setReservedCols(new String[] { "Statistics" })
            .setOutputCols(new String[] { "Accuracy", "AUC", "ConfusionMatrix" })
            .setJsonPath(new String[] { "$.Accuracy", "$.AUC", "$.ConfusionMatrix" }))
        .print();
}
Also used : JsonValueStreamOp(com.alibaba.alink.operator.stream.dataproc.JsonValueStreamOp) LogisticRegressionTrainBatchOp(com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp) FtrlPredictStreamOp(com.alibaba.alink.operator.stream.onlinelearning.FtrlPredictStreamOp) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) SplitStreamOp(com.alibaba.alink.operator.stream.dataproc.SplitStreamOp) FeatureHasher(com.alibaba.alink.pipeline.feature.FeatureHasher) FtrlTrainStreamOp(com.alibaba.alink.operator.stream.onlinelearning.FtrlTrainStreamOp) StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) EvalBinaryClassStreamOp(com.alibaba.alink.operator.stream.evaluation.EvalBinaryClassStreamOp) CsvSourceStreamOp(com.alibaba.alink.operator.stream.source.CsvSourceStreamOp)

Example 3 with StandardScaler

use of com.alibaba.alink.pipeline.dataproc.StandardScaler in project Alink by alibaba.

the class StandardScalerTest method test.

@Test
public void test() throws Exception {
    // Use parameterized (wildcard) operator types instead of raw types.
    BatchOperator<?> batchData = new TableSourceBatchOp(GenerateData.getBatchTable());
    StreamOperator<?> streamData = new TableSourceStreamOp(GenerateData.getStreamTable());

    // Train a scaler that both centers (mean) and scales (std) f0 and f1.
    StandardScalerTrainBatchOp op = new StandardScalerTrainBatchOp()
        .setWithMean(true)
        .setWithStd(true)
        .setSelectedCols("f0", "f1")
        .linkFrom(batchData);

    // withMean=true, withStd=true: output is (x - mean) / std.
    // Anonymous Consumer classes replaced with lambdas (behavior unchanged).
    new StandardScalerPredictBatchOp()
        .setOutputCols("f0_1", "f1_1")
        .linkFrom(op, batchData)
        .lazyCollect((List<Row> rows) -> {
            rows.sort(compare);
            assertEquals(rows.get(0), Row.of(null, null, null, null));
            assertRow(rows.get(1), Row.of(-1., -3., -0.9272, -1.1547));
            assertRow(rows.get(2), Row.of(1., 2., -0.1325, 0.5774));
            assertRow(rows.get(3), Row.of(4., 2., 1.0596, 0.5774));
        });
    new StandardScalerPredictStreamOp(op).setOutputCols("f0_1", "f1_1").linkFrom(streamData).print();

    // withMean=true, withStd=false: output is x - mean.
    StandardScalerModel model1 = new StandardScaler()
        .setWithMean(true)
        .setWithStd(false)
        .setSelectedCols("f0", "f1")
        .setOutputCols("f0_1", "f1_1")
        .fit(batchData);
    model1.transform(batchData).lazyCollect((List<Row> rows) -> {
        rows.sort(compare);
        assertEquals(rows.get(0), Row.of(null, null, null, null));
        assertRow(rows.get(1), Row.of(-1., -3., -2.3333, -3.3333));
        assertRow(rows.get(2), Row.of(1., 2., -0.3333, 1.6666));
        assertRow(rows.get(3), Row.of(4., 2., 2.6666, 1.6666));
    });
    model1.transform(streamData).print();

    // withMean=false, withStd=true: output is x / std.
    StandardScalerModel model2 = new StandardScaler()
        .setWithMean(false)
        .setWithStd(true)
        .setSelectedCols("f0", "f1")
        .setOutputCols("f0_1", "f1_1")
        .fit(batchData);
    model2.transform(batchData).lazyCollect((List<Row> rows) -> {
        rows.sort(compare);
        assertEquals(rows.get(0), Row.of(null, null, null, null));
        assertRow(rows.get(1), Row.of(-1., -3., -0.3974, -1.0392));
        assertRow(rows.get(2), Row.of(1., 2., 0.3974, 0.6928));
        assertRow(rows.get(3), Row.of(4., 2., 1.5894, 0.6928));
    });
    model2.transform(streamData).print();

    // withMean=false, withStd=false: output equals the input unchanged.
    StandardScalerModel model3 = new StandardScaler()
        .setWithMean(false)
        .setWithStd(false)
        .setSelectedCols("f0", "f1")
        .setOutputCols("f0_1", "f1_1")
        .fit(batchData);
    model3.transform(batchData).lazyCollect((List<Row> rows) -> {
        rows.sort(compare);
        assertEquals(rows.get(0), Row.of(null, null, null, null));
        assertRow(rows.get(1), Row.of(-1., -3., -1., -3.));
        assertRow(rows.get(2), Row.of(1., 2., 1., 2.));
        assertRow(rows.get(3), Row.of(4., 2., 4., 2.));
    });
    model3.transform(streamData).print();

    StreamOperator.execute();
}
Also used : StandardScalerModel(com.alibaba.alink.pipeline.dataproc.StandardScalerModel) StandardScalerPredictStreamOp(com.alibaba.alink.operator.stream.dataproc.StandardScalerPredictStreamOp) TableSourceBatchOp(com.alibaba.alink.operator.batch.source.TableSourceBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) List(java.util.List) TableSourceStreamOp(com.alibaba.alink.operator.stream.source.TableSourceStreamOp) Row(org.apache.flink.types.Row) StreamOperator(com.alibaba.alink.operator.stream.StreamOperator) Test(org.junit.Test)

Example 4 with StandardScaler

use of com.alibaba.alink.pipeline.dataproc.StandardScaler in project Alink by alibaba.

the class Chap25 method dnnReg.

public static void dnnReg(BatchOperator<?> train_set, BatchOperator<?> test_set) throws Exception {
    BatchOperator.setParallelism(1);

    // Pipeline: standardize features -> assemble into a vector -> convert the
    // vector to a tensor -> train a 5-layer dense Keras regressor on "quality".
    Pipeline pipeline = new Pipeline()
        .add(new StandardScaler().setSelectedCols(Chap16.FEATURE_COL_NAMES))
        .add(new VectorAssembler()
            .setSelectedCols(Chap16.FEATURE_COL_NAMES)
            .setOutputCol("vec"))
        .add(new VectorToTensor()
            .setSelectedCol("vec")
            .setOutputCol("tensor")
            .setReservedCols("quality"))
        .add(new KerasSequentialRegressor()
            .setTensorCol("tensor")
            .setLabelCol("quality")
            .setPredictionCol("pred")
            .setLayers("Dense(64, activation='relu')", "Dense(64, activation='relu')", "Dense(64, activation='relu')", "Dense(64, activation='relu')", "Dense(64, activation='relu')")
            .setNumEpochs(20));

    // Fit on the train set, score the test set, print prediction statistics
    // and regression metrics.
    pipeline.fit(train_set)
        .transform(test_set)
        .lazyPrintStatistics()
        .link(new EvalRegressionBatchOp()
            .setLabelCol("quality")
            .setPredictionCol("pred")
            .lazyPrintMetrics());

    BatchOperator.execute();
}
Also used : StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) EvalRegressionBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalRegressionBatchOp) KerasSequentialRegressor(com.alibaba.alink.pipeline.regression.KerasSequentialRegressor) VectorToTensor(com.alibaba.alink.pipeline.dataproc.VectorToTensor) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 5 with StandardScaler

use of com.alibaba.alink.pipeline.dataproc.StandardScaler in project Alink by alibaba.

the class Chap07 method c_3_1.

static void c_3_1() throws Exception {
    // Read the original data and print its summary statistics.
    CsvSourceBatchOp source = new CsvSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE)
        .setSchemaStr(SCHEMA_STRING);
    source.lazyPrintStatistics("< Origin data >");

    // Standardize the feature columns and print statistics of the result
    // (means should be ~0, standard deviations ~1 after scaling).
    new StandardScaler()
        .setSelectedCols(FEATURE_COL_NAMES)
        .fit(source)
        .transform(source)
        .lazyPrintStatistics("< after Standard Scale >");

    BatchOperator.execute();
}
Also used : VectorStandardScaler(com.alibaba.alink.pipeline.dataproc.vector.VectorStandardScaler) StandardScaler(com.alibaba.alink.pipeline.dataproc.StandardScaler) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)

Aggregations

StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler)6 Pipeline (com.alibaba.alink.pipeline.Pipeline)4 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)3 FeatureHasher (com.alibaba.alink.pipeline.feature.FeatureHasher)2 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)1 LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp)1 VectorToColumnsBatchOp (com.alibaba.alink.operator.batch.dataproc.format.VectorToColumnsBatchOp)1 EvalRegressionBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalRegressionBatchOp)1 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)1 TableSourceBatchOp (com.alibaba.alink.operator.batch.source.TableSourceBatchOp)1 StreamOperator (com.alibaba.alink.operator.stream.StreamOperator)1 JsonValueStreamOp (com.alibaba.alink.operator.stream.dataproc.JsonValueStreamOp)1 SplitStreamOp (com.alibaba.alink.operator.stream.dataproc.SplitStreamOp)1 StandardScalerPredictStreamOp (com.alibaba.alink.operator.stream.dataproc.StandardScalerPredictStreamOp)1 EvalBinaryClassStreamOp (com.alibaba.alink.operator.stream.evaluation.EvalBinaryClassStreamOp)1 FtrlPredictStreamOp (com.alibaba.alink.operator.stream.onlinelearning.FtrlPredictStreamOp)1 FtrlTrainStreamOp (com.alibaba.alink.operator.stream.onlinelearning.FtrlTrainStreamOp)1 CsvSourceStreamOp (com.alibaba.alink.operator.stream.source.CsvSourceStreamOp)1 TableSourceStreamOp (com.alibaba.alink.operator.stream.source.TableSourceStreamOp)1 PipelineModel (com.alibaba.alink.pipeline.PipelineModel)1