Search in sources :

Example 6 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class GbdtBatchOpTest method linkFromSimple.

/**
 * Trains a two-tree GBDT model on a small in-memory data set, predicts on the
 * same rows, evaluates the prediction detail column as a binary classifier
 * (positive label "1") and asserts that the resulting AUC is 1.0.
 *
 * @throws Exception declared by the test; propagated from the operators it invokes.
 */
@Test
public void linkFromSimple() throws Exception {
    Row[] testArray = new Row[] { Row.of(1, 2, 0), Row.of(1, 2, 0), Row.of(0, 3, 1), Row.of(0, 2, 0), Row.of(1, 3, 1), Row.of(4, 3, 1), Row.of(4, 4, 1) };
    String[] colNames = new String[] { "col0", "col1", "label" };
    MemSourceBatchOp memSourceBatchOp = new MemSourceBatchOp(Arrays.asList(testArray), colNames);
    GbdtTrainBatchOp gbdtTrainBatchOp = new GbdtTrainBatchOp()
        .setFeatureCols(colNames[0], colNames[1])
        .setLabelCol(colNames[2])
        .setMinSamplesPerLeaf(1)
        .setNumTrees(2);
    BatchOperator<?> model = gbdtTrainBatchOp.linkFrom(memSourceBatchOp);
    double auc = new GbdtPredictBatchOp()
        .setPredictionCol("pred_col")
        .setPredictionDetailCol("pred_detail")
        .linkFrom(model, memSourceBatchOp)
        .link(new EvalBinaryClassBatchOp()
            .setLabelCol(colNames[2])
            .setPositiveLabelValueString("1")
            .setPredictionDetailCol("pred_detail"))
        .collectMetrics()
        .getAuc();
    // JUnit's signature is assertEquals(expected, actual, delta); passing the
    // expected value first keeps the failure message ("expected:<..> but was:<..>") truthful.
    Assert.assertEquals(1.0, auc, 1e-6);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) BaseGbdtTrainBatchOp(com.alibaba.alink.operator.common.tree.parallelcart.BaseGbdtTrainBatchOp) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) Test(org.junit.Test)

Example 7 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class FmRecommImplicitTest method eval.

/**
 * Evaluates a prediction table as a binary classifier and prints AUC, accuracy
 * and F1 to standard output.
 *
 * <p>The input is first projected to {@code label} plus a JSON-formatted
 * {@code p_detail} column built from column {@code p}: key "1" maps to
 * {@code p} and key "0" maps to {@code 1-p}.
 *
 * @param pred operator whose output provides the {@code label} and {@code p} columns
 */
private static void eval(BatchOperator<?> pred) {
    String projection = "label, concat('{\"0\":', cast((1-p) as varchar), ',\"1\":', cast(p as varchar), '}') as p_detail";
    BatchOperator<?> withDetail = pred.select(projection);

    BinaryClassMetrics metrics = new EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol("p_detail")
        .linkFrom(withDetail)
        .collectMetrics();

    String summary = String.format("auc=%f,acc=%f,f1=%f", metrics.getAuc(), metrics.getAccuracy(), metrics.getF1());
    System.out.println(summary);
}
Also used : BinaryClassMetrics(com.alibaba.alink.operator.common.evaluation.BinaryClassMetrics) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 8 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap23 method c_2.

/**
 * Prepares the train/test data files when the train file does not exist yet,
 * then trains and evaluates two text-classification pipelines that differ only
 * in their vectorizer ({@code DocCountVectorizer} vs. {@code DocHashCountVectorizer}).
 *
 * <p>Each pipeline tokenizes the text column with the pattern {@code \W+},
 * vectorizes it with word counts, fits a logistic regression (30 iterations)
 * on the train set, transforms the test set and lazily prints binary
 * classification metrics with "pos" as the positive label.
 *
 * @throws IllegalStateException if one of the raw-data label folders cannot be listed
 * @throws Exception propagated from file reading or from the executed operators
 */
static void c_2() throws Exception {
    if (!new File(DATA_DIR + TRAIN_FILE).exists()) {
        ArrayList<Row> trainRows = loadLabeledRows("train");
        ArrayList<Row> testRows = loadLabeledRows("test");
        new MemSourceBatchOp(trainRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE));
        new MemSourceBatchOp(testRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE));
        BatchOperator.execute();
    }
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    trainSet.lazyPrint(2);

    // Pipeline 1: vocabulary-based word counts.
    new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME).enableLazyPrintTransformData(1))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("pos").setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME).lazyPrintMetrics("DocCountVectorizer"));
    BatchOperator.execute();

    // Pipeline 2: hash-based word counts.
    new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocHashCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME).enableLazyPrintTransformData(1))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("pos").setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME).lazyPrintMetrics("DocHashCountVectorizer"));
    BatchOperator.execute();
}

/**
 * Reads every file under {@code ORIGIN_DATA_DIR/<subset>/pos} and
 * {@code ORIGIN_DATA_DIR/<subset>/neg} into rows of (label, file content).
 *
 * @param subset name of the data subfolder, e.g. "train" or "test"
 * @return the rows for the "pos" folder followed by those for the "neg" folder
 * @throws IllegalStateException if a label folder is missing or cannot be listed
 * @throws Exception propagated from {@code readFileContent}
 */
private static ArrayList<Row> loadLabeledRows(String subset) throws Exception {
    ArrayList<Row> rows = new ArrayList<>();
    for (String label : new String[] { "pos", "neg" }) {
        File subfolder = new File(ORIGIN_DATA_DIR + subset + File.separator + label);
        // listFiles() returns null (not an empty array) when the path is not a
        // directory or an I/O error occurs; fail with a message naming the path.
        File[] files = subfolder.listFiles();
        if (files == null) {
            throw new IllegalStateException("Cannot list data folder: " + subfolder.getAbsolutePath());
        }
        for (File f : files) {
            rows.add(Row.of(label, readFileContent(f)));
        }
    }
    return rows;
}
Also used : ArrayList(java.util.ArrayList) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) Row(org.apache.flink.types.Row) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) File(java.io.File) DocHashCountVectorizer(com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer)

Example 9 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap23 method c_3.

/**
 * Trains and evaluates two logistic-regression text classifiers on n-gram
 * features and lazily prints their binary classification metrics
 * (positive label "pos").
 *
 * <p>The first pipeline assembles word-count vectors of the tokens and of the
 * 2-grams; the second additionally assembles 3-gram counts (vocabulary size
 * 10000). Each pipeline is fitted on the train set, applied to the test set
 * and followed by its own {@code BatchOperator.execute()} call.
 *
 * @throws Exception propagated from the executed operators
 */
static void c_3() throws Exception {
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    // Tokens + 2-grams.
    Pipeline bigramPipeline = new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME))
        .add(new NGram().setN(2).setSelectedCol(TXT_COL_NAME).setOutputCol("v_2").enableLazyPrintTransformData(1, "2-gram"))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol("v_2").setOutputCol("v_2"))
        .add(new VectorAssembler().setSelectedCols(VECTOR_COL_NAME, "v_2").setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    BatchOperator<?> bigramPredictions = bigramPipeline.fit(trainSet).transform(testSet);
    bigramPredictions.link(
        new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2"));
    BatchOperator.execute();

    // Tokens + 2-grams + 3-grams.
    Pipeline trigramPipeline = new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME))
        .add(new NGram().setN(2).setSelectedCol(TXT_COL_NAME).setOutputCol("v_2"))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol("v_2").setOutputCol("v_2"))
        .add(new NGram().setN(3).setSelectedCol(TXT_COL_NAME).setOutputCol("v_3"))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setVocabSize(10000).setSelectedCol("v_3").setOutputCol("v_3"))
        .add(new VectorAssembler().setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3").setOutputCol(VECTOR_COL_NAME))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    BatchOperator<?> trigramPredictions = trigramPipeline.fit(trainSet).transform(testSet);
    trigramPredictions.link(
        new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) NGram(com.alibaba.alink.pipeline.nlp.NGram) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 10 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class FmClassifierTest method testFmSparse.

/**
 * Trains an FM classifier on five sparse-vector samples, predicts on the same
 * data, evaluates the "details" column as a binary classifier, extracts
 * metrics from the evaluation JSON and asserts the first output field is
 * 0.8 within a tolerance of 0.01.
 */
@Test
public void testFmSparse() {
    BatchOperator<?> trainData = new MemSourceBatchOp(new Object[][] { { "1:1.1 3:2.0", 1.0 }, { "2:2.1 10:3.1", 1.0 }, { "3:3.1 7:2.2", 1.0 }, { "1:1.2 5:3.2", 0.0 }, { "3:1.2 7:4.2", 0.0 } }, new String[] { "vec", "label" });
    FmClassifierTrainBatchOp adagrad = new FmClassifierTrainBatchOp()
        .setVectorCol("vec")
        .setLabelCol("label")
        .setNumEpochs(10)
        .setInitStdev(0.01)
        .setLearnRate(0.01)
        .setEpsilon(0.0001)
        .linkFrom(trainData);
    BatchOperator<?> result = new FmPredictBatchOp()
        .setVectorCol("vec")
        .setPredictionCol("pred")
        .setPredictionDetailCol("details")
        .linkFrom(adagrad, trainData);
    List<Row> eval = new EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol("details")
        .linkFrom(result)
        .link(new JsonValueBatchOp()
            .setSelectedCol("Data")
            .setReservedCols(new String[] { "Statistics" })
            .setOutputCols(new String[] { "Accuracy", "AUC", "ConfusionMatrix" })
            .setJsonPath("$.Accuracy", "$.AUC", "$.ConfusionMatrix"))
        .collect();
    // NOTE(review): field 0 is presumably the "Accuracy" value - confirm the
    // output column order of JsonValueBatchOp when reserved columns are set.
    double firstField = Double.parseDouble(eval.get(0).getField(0).toString());
    // JUnit's signature is assertEquals(expected, actual, delta); expected goes first
    // so that a failure message reports the two values the right way round.
    Assert.assertEquals(0.8, firstField, 0.01);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) JsonValueBatchOp(com.alibaba.alink.operator.batch.dataproc.JsonValueBatchOp) Row(org.apache.flink.types.Row) FmPredictBatchOp(com.alibaba.alink.operator.common.fm.FmPredictBatchOp) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) Test(org.junit.Test)

Aggregations

EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) 23, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp) 18, LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression) 9, Pipeline (com.alibaba.alink.pipeline.Pipeline) 8, Row (org.apache.flink.types.Row) 5, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp) 4, VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) 4, DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer) 3, RegexTokenizer (com.alibaba.alink.pipeline.nlp.RegexTokenizer) 3, BinaryClassificationTuningEvaluator (com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) 3, File (java.io.File) 3, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator) 2, LogisticRegressionPredictBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionPredictBatchOp) 2, LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp) 2, NaiveBayesModelInfo (com.alibaba.alink.operator.batch.classification.NaiveBayesModelInfo) 2, NaiveBayesPredictBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesPredictBatchOp) 2, NaiveBayesTrainBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesTrainBatchOp) 2, JsonValueBatchOp (com.alibaba.alink.operator.batch.dataproc.JsonValueBatchOp) 2, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) 2, BinaryClassMetrics (com.alibaba.alink.operator.common.evaluation.BinaryClassMetrics) 2