Search in sources :

Example 11 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap23 method c_1.

/**
 * Compares several binary classifiers on the LibSVM-formatted "labeledBow.feat" data.
 *
 * <p>The train and test files are read from the "train" and "test" sub-directories of
 * {@code ORIGIN_DATA_DIR}. The numeric label is mapped to 'pos' when it is greater than 5 and
 * to 'neg' otherwise. Four experiments are then run and their binary-class metrics are lazily
 * printed: NaiveBayesTextClassifier (Multinomial), Binarizer + NaiveBayesTextClassifier
 * (Bernoulli), LogisticRegression, and a 6-fold grid search over LogisticRegression MAX_ITER
 * tuned by AUC.
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_1() throws Exception {
    // Load the training split and show one row plus the per-label counts.
    BatchOperator<?> trainSet = new LibSvmSourceBatchOp()
        .setFilePath(ORIGIN_DATA_DIR + "train" + File.separator + "labeledBow.feat")
        .setStartIndex(0);
    trainSet.lazyPrint(1, "train_set");
    trainSet
        .groupBy("label", "label, COUNT(label) AS cnt")
        .orderBy("label", 100)
        .lazyPrint(-1, "labels of train_set");

    // Load the test split.
    BatchOperator<?> testSet = new LibSvmSourceBatchOp()
        .setFilePath(ORIGIN_DATA_DIR + "test" + File.separator + "labeledBow.feat")
        .setStartIndex(0);

    // Both splits get the same projection: binary label plus the renamed vector column.
    String binaryLabelProjection =
        "CASE WHEN label>5 THEN 'pos' ELSE 'neg' END AS label, " + "features AS " + VECTOR_COL_NAME;
    trainSet = trainSet.select(binaryLabelProjection);
    testSet = testSet.select(binaryLabelProjection);
    trainSet.lazyPrint(1, "train_set");

    // Experiment 1: multinomial naive Bayes.
    new NaiveBayesTextClassifier()
        .setModelType("Multinomial")
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .enableLazyPrintModelInfo()
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("NaiveBayesTextClassifier + Multinomial")
        );
    BatchOperator.execute();

    // Experiment 2: binarize the vector column first, then Bernoulli naive Bayes.
    new Pipeline()
        .add(
            new Binarizer()
                .setSelectedCol(VECTOR_COL_NAME)
                .enableLazyPrintTransformData(1, "After Binarizer")
        )
        .add(
            new NaiveBayesTextClassifier()
                .setModelType("Bernoulli")
                .setVectorCol(VECTOR_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .enableLazyPrintModelInfo()
        )
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("Binarizer + NaiveBayesTextClassifier + Bernoulli")
        );
    BatchOperator.execute();

    // Experiment 3: logistic regression with train/model info printed.
    new LogisticRegression()
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .enableLazyPrintTrainInfo("< LR train info >")
        .enableLazyPrintModelInfo("< LR model info >")
        .fit(trainSet)
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression")
        );
    BatchOperator.execute();

    // Experiment 4: grid search over the iteration count, scored by AUC with 6 folds.
    AlinkGlobalConfiguration.setPrintProcessInfo(true);
    LogisticRegression tunedLr = new LogisticRegression()
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
    Integer[] maxIterCandidates = new Integer[] { 10, 20, 30, 40, 50, 60, 80, 100 };
    GridSearchCV cvSearch = new GridSearchCV()
        .setEstimator(new Pipeline().add(tunedLr))
        .setParamGrid(
            new ParamGrid().addGrid(tunedLr, LogisticRegression.MAX_ITER, maxIterCandidates)
        )
        .setTuningEvaluator(
            new BinaryClassificationTuningEvaluator()
                .setLabelCol(LABEL_COL_NAME)
                .setPositiveLabelValueString("pos")
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .setTuningBinaryClassMetric(TuningBinaryClassMetric.AUC)
        )
        .setNumFolds(6)
        .enableLazyPrintTrainInfo();
    GridSearchCVModel tunedModel = cvSearch.fit(trainSet);
    tunedModel
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression")
        );
    BatchOperator.execute();
}
Also used : ParamGrid(com.alibaba.alink.pipeline.tuning.ParamGrid) LibSvmSourceBatchOp(com.alibaba.alink.operator.batch.source.LibSvmSourceBatchOp) GridSearchCV(com.alibaba.alink.pipeline.tuning.GridSearchCV) NaiveBayesTextClassifier(com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) GridSearchCVModel(com.alibaba.alink.pipeline.tuning.GridSearchCVModel) Binarizer(com.alibaba.alink.pipeline.feature.Binarizer) BinaryClassificationTuningEvaluator(com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 12 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap23 method c_4.

/**
 * Builds (if not already saved) a text-classification pipeline that combines word counts of
 * unigrams, 2-grams and 3-grams and feeds them to LogisticRegression, then uses the saved model
 * for batch evaluation, stream prediction, and single-row local prediction.
 *
 * <p>The pipeline is trained and saved only when no file exists at
 * {@code DATA_DIR + PIPELINE_MODEL}; afterwards the model is always loaded from that path.
 *
 * @throws Exception propagated from the Alink operators, job execution, or local prediction
 */
static void c_4() throws Exception {
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    String modelPath = DATA_DIR + PIPELINE_MODEL;

    boolean modelMissing = !new File(modelPath).exists();
    if (modelMissing) {
        new Pipeline()
            .add(
                new RegexTokenizer()
                    .setPattern("\\W+")
                    .setSelectedCol(TXT_COL_NAME)
            )
            .add(
                new DocCountVectorizer()
                    .setFeatureType("WORD_COUNT")
                    .setSelectedCol(TXT_COL_NAME)
                    .setOutputCol(VECTOR_COL_NAME)
            )
            .add(
                new NGram()
                    .setN(2)
                    .setSelectedCol(TXT_COL_NAME)
                    .setOutputCol("v_2")
            )
            .add(
                new DocCountVectorizer()
                    .setFeatureType("WORD_COUNT")
                    .setVocabSize(50000)
                    .setSelectedCol("v_2")
                    .setOutputCol("v_2")
            )
            .add(
                new NGram()
                    .setN(3)
                    .setSelectedCol(TXT_COL_NAME)
                    .setOutputCol("v_3")
            )
            .add(
                new DocCountVectorizer()
                    .setFeatureType("WORD_COUNT")
                    .setVocabSize(10000)
                    .setSelectedCol("v_3")
                    .setOutputCol("v_3")
            )
            .add(
                // Merge the three count vectors back into the main vector column.
                new VectorAssembler()
                    .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
                    .setOutputCol(VECTOR_COL_NAME)
            )
            .add(
                new LogisticRegression()
                    .setMaxIter(30)
                    .setVectorCol(VECTOR_COL_NAME)
                    .setLabelCol(LABEL_COL_NAME)
                    .setPredictionCol(PREDICTION_COL_NAME)
                    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            )
            .fit(trainSet)
            .save(modelPath);
        BatchOperator.execute();
    }

    // Batch evaluation with the persisted model.
    PipelineModel savedModel = PipelineModel.load(modelPath);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    savedModel
        .transform(testSet)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("pos")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("NGram 2 and 3")
        );
    BatchOperator.execute();

    // Stream prediction on the same test file, printing a 0.001 sample.
    AkSourceStreamOp testStream = new AkSourceStreamOp().setFilePath(DATA_DIR + TEST_FILE);
    savedModel
        .transform(testStream)
        .sample(0.001)
        .select(PREDICTION_COL_NAME + ", " + LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .print();
    StreamOperator.execute();

    String review =
        "Oh dear. good cast, but to write and direct is an art and to write wit and direct wit is a bit of a "
            + "task. Even doing good comedy you have to get the timing and moment right. Im not putting it all down "
            + "there were parts where i laughed loud but that was at very few times. The main focus to me was on the "
            + "fast free flowing dialogue, that made some people in the film annoying. It may sound great while "
            + "reading the script in your head but getting that out and to the camera is a different task. And the "
            + "hand held camera work does give energy to few parts of the film. Overall direction was good but the "
            + "script was not all that to me, but I'm sure you was reading the script in your head it would sound good"
            + ". Sorry.";

    // Local predictor obtained from the in-memory pipeline model.
    LocalPredictor fromModel = savedModel.collectLocalPredictor("review string");
    System.out.println(fromModel.getOutputSchema());
    Row firstResult = fromModel.map(Row.of(review));
    System.out.println(firstResult.getField(4));

    // Local predictor built directly from the saved model file.
    LocalPredictor fromFile = new LocalPredictor(modelPath, "review string");
    System.out.println(fromFile.getOutputSchema());
    Row secondResult = fromFile.map(Row.of(review));
    System.out.println(secondResult.getField(4));
}
Also used : LocalPredictor(com.alibaba.alink.pipeline.LocalPredictor) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) NGram(com.alibaba.alink.pipeline.nlp.NGram) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) AkSourceStreamOp(com.alibaba.alink.operator.stream.source.AkSourceStreamOp) Row(org.apache.flink.types.Row) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) File(java.io.File)

Example 13 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap08 method c_9.

/**
 * Trains an FmClassifier (10 epochs, learn rate 0.5, 2 factors) on the training file and
 * lazily prints binary-class metrics for the test file under the title "FM", treating "1" as
 * the positive label.
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_9() throws Exception {
    AkSourceBatchOp trainData = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testData = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    new FmClassifier()
        .setNumEpochs(10)
        .setLearnRate(0.5)
        .setNumFactor(2)
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .enableLazyPrintTrainInfo()
        .enableLazyPrintModelInfo()
        .fit(trainData)
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("1")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("FM")
        );

    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) FmClassifier(com.alibaba.alink.pipeline.classification.FmClassifier) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 14 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap08 method c_8.

/**
 * Assembles the feature columns into a vector, expands it with degree-2 polynomial terms, and
 * then evaluates three classifiers on the expanded vector: LinearSvm, LogisticRegression, and
 * LogisticRegression using the Newton optimization method. "1" is the positive label, and all
 * three metric reports are produced by a single job execution at the end.
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_8() throws Exception {
    BatchOperator<?> trainData = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    BatchOperator<?> testData = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    // The feature-expansion pipeline is fitted on the training data and applied to both splits.
    String assembledCol = VEC_COL_NAME + "_0";
    PipelineModel expander = new Pipeline()
        .add(
            new VectorAssembler()
                .setSelectedCols(FEATURE_COL_NAMES)
                .setOutputCol(assembledCol)
        )
        .add(
            new VectorPolynomialExpand()
                .setSelectedCol(assembledCol)
                .setOutputCol(VEC_COL_NAME)
                .setDegree(2)
        )
        .fit(trainData);
    trainData = expander.transform(trainData);
    testData = expander.transform(testData);
    trainData.lazyPrint(1);

    new LinearSvm()
        .setVectorCol(VEC_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(trainData)
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("1")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LinearSVM")
        );

    new LogisticRegression()
        .setVectorCol(VEC_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(trainData)
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("1")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression")
        );

    new LogisticRegression()
        .setOptimMethod(OptimMethod.Newton)
        .setVectorCol(VEC_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .fit(trainData)
        .transform(testData)
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("1")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics("LogisticRegression + OptimMethod.Newton")
        );

    BatchOperator.execute();
}
Also used : VectorPolynomialExpand(com.alibaba.alink.pipeline.dataproc.vector.VectorPolynomialExpand) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) LinearSvm(com.alibaba.alink.pipeline.classification.LinearSvm) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 15 with EvalBinaryClassBatchOp

use of com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp in project Alink by alibaba.

the class Chap09 method c_4_b.

/**
 * Trains a naive Bayes model on the two categorical features "odor" and "gill_color", prints
 * the per-category information the model holds for each of those features, shows ten
 * predictions, and lazily prints binary-class metrics with "p" as the positive label.
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_4_b() throws Exception {
    AkSourceBatchOp trainData = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testData = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    NaiveBayesTrainBatchOp nbTrainer = new NaiveBayesTrainBatchOp()
        .setFeatureCols("odor", "gill_color")
        .setCategoricalCols("odor", "gill_color")
        .setLabelCol(LABEL_COL_NAME);
    NaiveBayesPredictBatchOp nbPredictor = new NaiveBayesPredictBatchOp()
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME);

    trainData.link(nbTrainer);
    nbPredictor.linkFrom(nbTrainer, testData);

    // Callback that renders, for each feature, one line per entry of its category map.
    Consumer<NaiveBayesModelInfo> printCategoryInfo = new Consumer<NaiveBayesModelInfo>() {

        @Override
        public void accept(NaiveBayesModelInfo modelInfo) {
            StringBuilder report = new StringBuilder();
            String[] featureNames = new String[] { "odor", "gill_color" };
            for (String featureName : featureNames) {
                HashMap<Object, HashMap<Object, Double>> categoryInfo =
                    modelInfo.getCategoryFeatureInfo().get(featureName);
                report.append("\nfeature:");
                report.append(featureName);
                for (Entry<Object, HashMap<Object, Double>> item : categoryInfo.entrySet()) {
                    report.append("\n");
                    report.append(item.getKey());
                    report.append(" : ");
                    report.append(item.getValue().toString());
                }
            }
            System.out.println(report.toString());
        }
    };
    nbTrainer.lazyCollectModelInfo(printCategoryInfo);

    nbPredictor
        .lazyPrint(10, "< Prediction >")
        .link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("p")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics()
        );

    BatchOperator.execute();
}
Also used : HashMap(java.util.HashMap) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) Entry(java.util.Map.Entry) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) NaiveBayesModelInfo(com.alibaba.alink.operator.batch.classification.NaiveBayesModelInfo) NaiveBayesPredictBatchOp(com.alibaba.alink.operator.batch.classification.NaiveBayesPredictBatchOp) NaiveBayesTrainBatchOp(com.alibaba.alink.operator.batch.classification.NaiveBayesTrainBatchOp)

Aggregations

EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 23, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 18, LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 9, Pipeline (com.alibaba.alink.pipeline.Pipeline): 8, Row (org.apache.flink.types.Row): 5, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 4, VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler): 4, DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer): 3, RegexTokenizer (com.alibaba.alink.pipeline.nlp.RegexTokenizer): 3, BinaryClassificationTuningEvaluator (com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator): 3, File (java.io.File): 3, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 2, LogisticRegressionPredictBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionPredictBatchOp): 2, LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp): 2, NaiveBayesModelInfo (com.alibaba.alink.operator.batch.classification.NaiveBayesModelInfo): 2, NaiveBayesPredictBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesPredictBatchOp): 2, NaiveBayesTrainBatchOp (com.alibaba.alink.operator.batch.classification.NaiveBayesTrainBatchOp): 2, JsonValueBatchOp (com.alibaba.alink.operator.batch.dataproc.JsonValueBatchOp): 2, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 2, BinaryClassMetrics (com.alibaba.alink.operator.common.evaluation.BinaryClassMetrics): 2