Search in sources:

Example 21 with EvalBinaryClassBatchOp

Use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by Alibaba.

From the class Chap11, method c_4.

/**
 * Fits logistic regression twice — once on the full training split and once
 * on a stratified sample of it — and links each model's predictions on the
 * test split into a binary-classification evaluator that lazily prints its
 * metrics.
 *
 * <p>The train/test split and the stratified sample are written to disk only
 * when they are not already present.
 *
 * @throws Exception propagated from the underlying operator calls
 */
static void c_4() throws Exception {
    final String trainPath = DATA_DIR + TRAIN_FILE;
    final String testPath = DATA_DIR + TEST_FILE;
    final String samplePath = DATA_DIR + TRAIN_SAMPLE_FILE;

    // Show statistics for the full data set plus the row count per label value.
    AkSourceBatchOp fullSource = new AkSourceBatchOp().setFilePath(DATA_DIR + FEATURE_LABEL_FILE);
    fullSource
        .lazyPrintStatistics()
        .groupBy("label", "label, COUNT(*) AS cnt")
        .print();

    Utils.splitTrainTestIfNotExist(fullSource, trainPath, testPath, 0.8);

    AkSourceBatchOp trainSource = new AkSourceBatchOp().setFilePath(trainPath);
    AkSourceBatchOp testSource = new AkSourceBatchOp().setFilePath(testPath);

    // Every column except the label is used as a feature.
    String[] featureColNames = ArrayUtils.removeElement(trainSource.getColNames(), LABEL_COL_NAME);

    // Run 1: train on the complete training split.
    LogisticRegression fullDataClassifier = new LogisticRegression()
        .setFeatureCols(featureColNames)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
    fullDataClassifier
        .fit(trainSource)
        .transform(testSource)
        .link(
            new EvalBinaryClassBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression")
        );
    BatchOperator.execute();

    // Create the stratified sample file once; label 0 is kept at 5%, label 1 in full.
    File sampleFile = new File(samplePath);
    if (!sampleFile.exists()) {
        trainSource
            .link(
                new StratifiedSampleBatchOp()
                    .setStrataRatios("0:0.05,1:1.0")
                    .setStrataCol(LABEL_COL_NAME)
            )
            .link(new AkSinkBatchOp().setFilePath(samplePath));
        BatchOperator.execute();
    }

    // Run 2: same estimator settings, trained on the stratified sample.
    AkSourceBatchOp sampleSource = new AkSourceBatchOp().setFilePath(samplePath);
    LogisticRegression sampledClassifier = new LogisticRegression()
        .setFeatureCols(featureColNames)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
    sampledClassifier
        .fit(sampleSource)
        .transform(testSource)
        .link(
            new EvalBinaryClassBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression with Stratified Sample")
        );
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) StratifiedSampleBatchOp(com.alibaba.alink.operator.batch.dataproc.StratifiedSampleBatchOp) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 22 with EvalBinaryClassBatchOp

Use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by Alibaba.

From the class Chap20, method c_1.

/**
 * Grid-searches the L1 parameter of a logistic-regression pipeline with
 * 5-fold cross-validation, scoring candidates by AUC, then links the best
 * model's predictions on the test data into a binary-classification
 * evaluator that treats label value "2" as the positive class.
 *
 * @throws Exception propagated from the underlying operator calls
 */
static void c_1() throws Exception {
    BatchOperator<?> trainData = new AkSourceBatchOp()
        .setFilePath(Chap10.DATA_DIR + Chap10.TRAIN_FILE)
        .select(Chap10.CLAUSE_CREATE_FEATURES);
    BatchOperator<?> testData = new AkSourceBatchOp()
        .setFilePath(Chap10.DATA_DIR + Chap10.TEST_FILE)
        .select(Chap10.CLAUSE_CREATE_FEATURES);

    // Every column of the derived training table except the label is a feature.
    final String[] featureCols = ArrayUtils.removeElement(trainData.getColNames(), Chap10.LABEL_COL_NAME);

    LogisticRegression lr = new LogisticRegression()
        .setFeatureCols(featureCols)
        .setLabelCol(Chap10.LABEL_COL_NAME)
        .setPredictionCol(Chap10.PREDICTION_COL_NAME)
        .setPredictionDetailCol(Chap10.PRED_DETAIL_COL_NAME);
    Pipeline pipeline = new Pipeline().add(lr);

    // Candidate L1 values, one per decade from 1e-7 up to 10.
    Double[] l1Candidates = new Double[] { 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0 };
    ParamGrid paramGrid = new ParamGrid().addGrid(lr, LogisticRegression.L_1, l1Candidates);

    GridSearchCV gridSearch = new GridSearchCV()
        .setNumFolds(5)
        .setEstimator(pipeline)
        .setParamGrid(paramGrid)
        .setTuningEvaluator(
            new BinaryClassificationTuningEvaluator()
                .setLabelCol(Chap10.LABEL_COL_NAME)
                .setPredictionDetailCol(Chap10.PRED_DETAIL_COL_NAME)
                .setTuningBinaryClassMetric(TuningBinaryClassMetric.AUC)
        )
        .enableLazyPrintTrainInfo();

    GridSearchCVModel bestModel = gridSearch.fit(trainData);
    bestModel
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("2")
                .setLabelCol(Chap10.LABEL_COL_NAME)
                .setPredictionDetailCol(Chap10.PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("GridSearchCV")
        );
    BatchOperator.execute();
}
Also used : ParamGrid(com.alibaba.alink.pipeline.tuning.ParamGrid) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) GridSearchCV(com.alibaba.alink.pipeline.tuning.GridSearchCV) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) GridSearchCVModel(com.alibaba.alink.pipeline.tuning.GridSearchCVModel) BinaryClassificationTuningEvaluator(com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 23 with EvalBinaryClassBatchOp

Use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by Alibaba.

From the class Chap20, method c_2.

/**
 * Runs a 20-iteration random search with an 80/20 train-validation split over
 * GBDT hyper-parameters (tree count, max depth, max bins, learning rate),
 * scoring candidates by F1, then links the best model's predictions on the
 * test data into a binary-classification evaluator that treats label value
 * "1" as the positive class. The elapsed time of the whole run is printed to
 * standard output.
 *
 * @throws Exception propagated from the underlying operator calls
 */
static void c_2() throws Exception {
    Stopwatch sw = new Stopwatch();
    sw.start();

    AlinkGlobalConfiguration.setPrintProcessInfo(true);

    // Declared as BatchOperator<?> rather than the raw type, so the compiler
    // keeps generic type checking on these operators.
    BatchOperator<?> trainSample = new AkSourceBatchOp().setFilePath(Chap11.DATA_DIR + Chap11.TRAIN_SAMPLE_FILE);
    BatchOperator<?> testData = new AkSourceBatchOp().setFilePath(Chap11.DATA_DIR + Chap11.TEST_FILE);

    // Every column of the training sample except the label is a feature.
    final String[] featuresColNames = ArrayUtils.removeElement(trainSample.getColNames(), Chap11.LABEL_COL_NAME);

    GbdtClassifier gbdt = new GbdtClassifier()
        .setFeatureCols(featuresColNames)
        .setLabelCol(Chap11.LABEL_COL_NAME)
        .setPredictionCol(Chap11.PREDICTION_COL_NAME)
        .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME);

    RandomSearchTVSplit randomSearch = new RandomSearchTVSplit()
        .setNumIter(20)
        .setTrainRatio(0.8)
        .setEstimator(gbdt)
        .setParamDist(
            new ParamDist()
                .addDist(gbdt, GbdtClassifier.NUM_TREES, ValueDist.randArray(new Integer[] { 50, 100 }))
                .addDist(gbdt, GbdtClassifier.MAX_DEPTH, ValueDist.randInteger(4, 10))
                .addDist(gbdt, GbdtClassifier.MAX_BINS, ValueDist.randArray(new Integer[] { 64, 128, 256, 512 }))
                .addDist(gbdt, GbdtClassifier.LEARNING_RATE, ValueDist.randArray(new Double[] { 0.3, 0.1, 0.01 }))
        )
        .setTuningEvaluator(
            new BinaryClassificationTuningEvaluator()
                .setLabelCol(Chap11.LABEL_COL_NAME)
                .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME)
                .setTuningBinaryClassMetric(TuningBinaryClassMetric.F1)
        )
        .enableLazyPrintTrainInfo();

    RandomSearchTVSplitModel bestModel = randomSearch.fit(trainSample);
    bestModel
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("1")
                .setLabelCol(Chap11.LABEL_COL_NAME)
                .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics()
        );
    BatchOperator.execute();

    sw.stop();
    System.out.println(sw.getElapsedTimeSpan());
}
Also used : ParamDist(com.alibaba.alink.pipeline.tuning.ParamDist) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) GbdtClassifier(com.alibaba.alink.pipeline.classification.GbdtClassifier) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) RandomSearchTVSplit(com.alibaba.alink.pipeline.tuning.RandomSearchTVSplit) BinaryClassificationTuningEvaluator(com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) RandomSearchTVSplitModel(com.alibaba.alink.pipeline.tuning.RandomSearchTVSplitModel) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Aggregations

EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)23 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)18 LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression)9 Pipeline (com.alibaba.alink.pipeline.Pipeline)8 Row (org.apache.flink.types.Row)5 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)4 VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler)4 DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer)3 RegexTokenizer (com.alibaba.alink.pipeline.nlp.RegexTokenizer)3 BinaryClassificationTuningEvaluator (com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator)3 File (java.io.File)3 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)2 LogisticRegressionPredictBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionPredictBatchOp)2 LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp)2 NaiveBayesModelInfo (com.alibaba.alink.operator.batch.classification.NaiveBayesModelInfo)2 NaiveBayesPredictBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesPredictBatchOp)2 NaiveBayesTrainBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesTrainBatchOp)2 JsonValueBatchOp (com.alibaba.alink.operator.batch.dataproc.JsonValueBatchOp)2 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)2 BinaryClassMetrics (com.alibaba.alink.operator.common.evaluation.BinaryClassMetrics)2