Search in sources :

Example 6 with Stopwatch

use of com.alibaba.alink.common.utils.Stopwatch in project Alink by alibaba.

the class Chap13 method c_5.

/**
 * Trains and evaluates decision-tree and random-forest classifiers on the column-format
 * train/test tables and prints the elapsed time of every run.
 *
 * <p>When the column-format training file is absent, the sparse-vector training and test
 * sets are first expanded into 784 double columns ({@code c_0} .. {@code c_783}), keeping
 * the label column, and written out through {@code AkSinkBatchOp}.
 *
 * @throws Exception propagated from the Alink operators or {@code BatchOperator.execute()}
 */
static void c_5() throws Exception {
    BatchOperator.setParallelism(4);

    // Only the training table is probed; when it is missing, both tables are (re)generated.
    File tableTrainFile = new File(DATA_DIR + TABLE_TRAIN_FILE);
    if (!tableTrainFile.exists()) {
        AkSourceBatchOp sparseTrain = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);
        AkSourceBatchOp sparseTest = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TEST_FILE);

        // Builds "c_0 double, c_1 double, ..., c_783 double".
        StringBuilder schemaBuilder = new StringBuilder();
        for (int col = 0; col < 784; col++) {
            if (col > 0) {
                schemaBuilder.append(", ");
            }
            schemaBuilder.append("c_").append(col).append(" double");
        }
        String schema = schemaBuilder.toString();

        BatchOperator<?> trainColumns = new VectorToColumns()
            .setVectorCol(VECTOR_COL_NAME)
            .setSchemaStr(schema)
            .setReservedCols(LABEL_COL_NAME)
            .transform(sparseTrain);
        trainColumns.link(new AkSinkBatchOp().setFilePath(DATA_DIR + TABLE_TRAIN_FILE));

        BatchOperator<?> testColumns = new VectorToColumns()
            .setVectorCol(VECTOR_COL_NAME)
            .setSchemaStr(schema)
            .setReservedCols(LABEL_COL_NAME)
            .transform(sparseTest);
        testColumns.link(new AkSinkBatchOp().setFilePath(DATA_DIR + TABLE_TEST_FILE));

        // Run the two conversion jobs registered above.
        BatchOperator.execute();
    }

    AkSourceBatchOp trainTable = new AkSourceBatchOp().setFilePath(DATA_DIR + TABLE_TRAIN_FILE);
    AkSourceBatchOp testTable = new AkSourceBatchOp().setFilePath(DATA_DIR + TABLE_TEST_FILE);

    // Every column except the label is used as a feature.
    final String[] featureCols = ArrayUtils.removeElement(trainTable.getColNames(), LABEL_COL_NAME);
    trainTable.lazyPrint(5);

    Stopwatch timer = new Stopwatch();

    // One decision tree per tree type.
    TreeType[] treeTypes = { TreeType.GINI, TreeType.INFOGAIN, TreeType.INFOGAINRATIO };
    for (TreeType criterion : treeTypes) {
        timer.reset();
        timer.start();

        BatchOperator<?> treePredictions = new DecisionTreeClassifier()
            .setTreeType(criterion)
            .setFeatureCols(featureCols)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .enableLazyPrintModelInfo()
            .fit(trainTable)
            .transform(testTable);
        EvalMultiClassBatchOp treeEvaluation = new EvalMultiClassBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics("DecisionTreeClassifier " + criterion.toString());
        treePredictions.link(treeEvaluation);

        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }

    // Random forests of increasing size.
    int[] forestSizes = { 2, 4, 8, 16, 32, 64, 128 };
    for (int treeCount : forestSizes) {
        timer.reset();
        timer.start();

        BatchOperator<?> forestPredictions = new RandomForestClassifier()
            .setSubsamplingRatio(0.6)
            .setNumTreesOfInfoGain(treeCount)
            .setFeatureCols(featureCols)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .enableLazyPrintModelInfo()
            .fit(trainTable)
            .transform(testTable);
        EvalMultiClassBatchOp forestEvaluation = new EvalMultiClassBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics("RandomForestClassifier : " + treeCount);
        forestPredictions.link(forestEvaluation);

        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }
}
Also used : TreeType(com.alibaba.alink.params.shared.tree.HasIndividualTreeType.TreeType) EvalMultiClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) RandomForestClassifier(com.alibaba.alink.pipeline.classification.RandomForestClassifier) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) VectorToColumns(com.alibaba.alink.pipeline.dataproc.format.VectorToColumns) DecisionTreeClassifier(com.alibaba.alink.pipeline.classification.DecisionTreeClassifier) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File)

Example 7 with Stopwatch

use of com.alibaba.alink.common.utils.Stopwatch in project Alink by alibaba.

the class Chap18 method c_1.

/**
 * Fits three clustering pipelines (two KMeans variants and BisectingKMeans, all with k = 10)
 * on both the dense and the sparse training set, evaluates the clustering against the label
 * column, and prints the elapsed time of each of the six runs.
 *
 * @throws Exception propagated from the Alink operators or {@code BatchOperator.execute()}
 */
static void c_1() throws Exception {
    AkSourceBatchOp denseInput = new AkSourceBatchOp().setFilePath(DATA_DIR + DENSE_TRAIN_FILE);
    AkSourceBatchOp sparseInput = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);
    Stopwatch timer = new Stopwatch();

    ArrayList<Tuple2<String, Pipeline>> candidates = new ArrayList<>();

    // No distance type is set here; the run is merely labelled "EUCLIDEAN".
    Pipeline kMeansUnsetDistance = new Pipeline().add(
        new KMeans()
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    candidates.add(new Tuple2<>("KMeans EUCLIDEAN", kMeansUnsetDistance));

    Pipeline kMeansCosine = new Pipeline().add(
        new KMeans()
            .setDistanceType(DistanceType.COSINE)
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    candidates.add(new Tuple2<>("KMeans COSINE", kMeansCosine));

    Pipeline bisecting = new Pipeline().add(
        new BisectingKMeans()
            .setK(10)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    candidates.add(new Tuple2<>("BisectingKMeans", bisecting));

    for (Tuple2<String, Pipeline> candidate : candidates) {
        String title = candidate.f0;
        Pipeline pipeline = candidate.f1;

        // Dense run: fit and predict on the same data set.
        timer.reset();
        timer.start();
        BatchOperator<?> densePredictions = pipeline.fit(denseInput).transform(denseInput);
        EvalClusterBatchOp denseEvaluation = new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics(title + " DENSE");
        densePredictions.link(denseEvaluation);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());

        // Sparse run: same pipeline, sparse-vector data set.
        timer.reset();
        timer.start();
        BatchOperator<?> sparsePredictions = pipeline.fit(sparseInput).transform(sparseInput);
        EvalClusterBatchOp sparseEvaluation = new EvalClusterBatchOp()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintMetrics(title + " SPARSE");
        sparsePredictions.link(sparseEvaluation);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) BisectingKMeans(com.alibaba.alink.pipeline.clustering.BisectingKMeans) KMeans(com.alibaba.alink.pipeline.clustering.KMeans) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) ArrayList(java.util.ArrayList) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 8 with Stopwatch

use of com.alibaba.alink.common.utils.Stopwatch in project Alink by alibaba.

the class Chap19 method c_4.

/**
 * Times six KNN (k = 3) classification runs and prints the elapsed time after each one:
 * plain KNN, KNN preceded by a freshly trained PCA (k = 39, covariance calculation), and
 * KNN preceded by a PCA model loaded from {@code PCA_MODEL_FILE} — each on the dense and
 * on the sparse train/test pair.
 *
 * @throws Exception propagated from the Alink operators or {@code BatchOperator.execute()}
 */
static void c_4() throws Exception {
    AkSourceBatchOp denseTrain = new AkSourceBatchOp().setFilePath(DATA_DIR + DENSE_TRAIN_FILE);
    AkSourceBatchOp denseTest = new AkSourceBatchOp().setFilePath(DATA_DIR + DENSE_TEST_FILE);
    AkSourceBatchOp sparseTrain = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TRAIN_FILE);
    AkSourceBatchOp sparseTest = new AkSourceBatchOp().setFilePath(DATA_DIR + SPARSE_TEST_FILE);
    Stopwatch timer = new Stopwatch();

    // Run 1: plain KNN, dense vectors.
    timer.reset();
    timer.start();
    BatchOperator<?> knnDense = new KnnClassifier()
        .setK(3)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .fit(denseTrain)
        .transform(denseTest);
    EvalMultiClassBatchOp knnDenseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("KnnClassifier Dense");
    knnDense.link(knnDenseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 2: plain KNN, sparse vectors.
    timer.reset();
    timer.start();
    BatchOperator<?> knnSparse = new KnnClassifier()
        .setK(3)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .fit(sparseTrain)
        .transform(sparseTest);
    EvalMultiClassBatchOp knnSparseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("KnnClassifier Sparse");
    knnSparse.link(knnSparseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 3: PCA trained in the pipeline (output overwrites the vector column), then KNN; dense.
    timer.reset();
    timer.start();
    Pipeline pcaKnnDense = new Pipeline()
        .add(new PCA()
            .setK(39)
            .setCalculationType(CalculationType.COV)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(VECTOR_COL_NAME))
        .add(new KnnClassifier()
            .setK(3)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    BatchOperator<?> pcaKnnDenseOut = pcaKnnDense.fit(denseTrain).transform(denseTest);
    EvalMultiClassBatchOp pcaKnnDenseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Knn with PCA Dense");
    pcaKnnDenseOut.link(pcaKnnDenseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 4: PCA trained in the pipeline, then KNN; sparse.
    timer.reset();
    timer.start();
    Pipeline pcaKnnSparse = new Pipeline()
        .add(new PCA()
            .setK(39)
            .setCalculationType(CalculationType.COV)
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(VECTOR_COL_NAME))
        .add(new KnnClassifier()
            .setK(3)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    BatchOperator<?> pcaKnnSparseOut = pcaKnnSparse.fit(sparseTrain).transform(sparseTest);
    EvalMultiClassBatchOp pcaKnnSparseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Knn with PCA Sparse");
    pcaKnnSparseOut.link(pcaKnnSparseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 5: PCA model loaded from PCA_MODEL_FILE, then KNN; dense.
    timer.reset();
    timer.start();
    Pipeline loadedPcaKnnDense = new Pipeline()
        .add(new PCAModel()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(VECTOR_COL_NAME)
            .setModelData(new AkSourceBatchOp().setFilePath(DATA_DIR + PCA_MODEL_FILE)))
        .add(new KnnClassifier()
            .setK(3)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    BatchOperator<?> loadedPcaKnnDenseOut = loadedPcaKnnDense.fit(denseTrain).transform(denseTest);
    EvalMultiClassBatchOp loadedPcaKnnDenseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Knn PCAModel Dense");
    loadedPcaKnnDenseOut.link(loadedPcaKnnDenseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Run 6: PCA model loaded from PCA_MODEL_FILE, then KNN; sparse.
    timer.reset();
    timer.start();
    Pipeline loadedPcaKnnSparse = new Pipeline()
        .add(new PCAModel()
            .setVectorCol(VECTOR_COL_NAME)
            .setPredictionCol(VECTOR_COL_NAME)
            .setModelData(new AkSourceBatchOp().setFilePath(DATA_DIR + PCA_MODEL_FILE)))
        .add(new KnnClassifier()
            .setK(3)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME));
    BatchOperator<?> loadedPcaKnnSparseOut = loadedPcaKnnSparse.fit(sparseTrain).transform(sparseTest);
    EvalMultiClassBatchOp loadedPcaKnnSparseEval = new EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics("Knn PCAModel Sparse");
    loadedPcaKnnSparseOut.link(loadedPcaKnnSparseEval);
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : PCAModel(com.alibaba.alink.pipeline.feature.PCAModel) EvalMultiClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) KnnClassifier(com.alibaba.alink.pipeline.classification.KnnClassifier) Pipeline(com.alibaba.alink.pipeline.Pipeline) PCA(com.alibaba.alink.pipeline.feature.PCA)

Example 9 with Stopwatch

use of com.alibaba.alink.common.utils.Stopwatch in project Alink by alibaba.

the class Chap20 method c_3.

/**
 * Runs a 4-fold grid search over KMeans (k in 2..6, Euclidean and cosine distance), scored
 * by the {@code RI} cluster metric, then evaluates the resulting model on the same source
 * data and prints the total elapsed time.
 *
 * @throws Exception propagated from the Alink operators or {@code BatchOperator.execute()}
 */
static void c_3() throws Exception {
    // The stopwatch covers the whole method, including search setup.
    Stopwatch timer = new Stopwatch();
    timer.start();

    AlinkGlobalConfiguration.setPrintProcessInfo(true);

    AkSourceBatchOp vectors = new AkSourceBatchOp().setFilePath(Chap17.DATA_DIR + Chap17.VECTOR_FILE);

    KMeans estimator = new KMeans()
        .setVectorCol(Chap17.VECTOR_COL_NAME)
        .setPredictionCol(Chap17.PREDICTION_COL_NAME);

    // 5 values of k x 2 distance types.
    ParamGrid searchGrid = new ParamGrid()
        .addGrid(estimator, KMeans.K, new Integer[] { 2, 3, 4, 5, 6 })
        .addGrid(estimator, KMeans.DISTANCE_TYPE, new DistanceType[] { DistanceType.EUCLIDEAN, DistanceType.COSINE });

    ClusterTuningEvaluator scorer = new ClusterTuningEvaluator()
        .setVectorCol(Chap17.VECTOR_COL_NAME)
        .setPredictionCol(Chap17.PREDICTION_COL_NAME)
        .setLabelCol(Chap17.LABEL_COL_NAME)
        .setTuningClusterMetric(TuningClusterMetric.RI);

    GridSearchCV gridSearch = new GridSearchCV()
        .setNumFolds(4)
        .setEstimator(estimator)
        .setParamGrid(searchGrid)
        .setTuningEvaluator(scorer)
        .enableLazyPrintTrainInfo();

    GridSearchCVModel tunedModel = gridSearch.fit(vectors);

    BatchOperator<?> clustered = tunedModel.transform(vectors);
    EvalClusterBatchOp evaluation = new EvalClusterBatchOp()
        .setLabelCol(Chap17.LABEL_COL_NAME)
        .setVectorCol(Chap17.VECTOR_COL_NAME)
        .setPredictionCol(Chap17.PREDICTION_COL_NAME)
        .lazyPrintMetrics();
    clustered.link(evaluation);

    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : ParamGrid(com.alibaba.alink.pipeline.tuning.ParamGrid) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) KMeans(com.alibaba.alink.pipeline.clustering.KMeans) ClusterTuningEvaluator(com.alibaba.alink.pipeline.tuning.ClusterTuningEvaluator) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) GridSearchCV(com.alibaba.alink.pipeline.tuning.GridSearchCV) DistanceType(com.alibaba.alink.params.shared.clustering.HasKMeansDistanceType.DistanceType) GridSearchCVModel(com.alibaba.alink.pipeline.tuning.GridSearchCVModel) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 10 with Stopwatch

use of com.alibaba.alink.common.utils.Stopwatch in project Alink by alibaba.

the class Chap20 method c_2.

/**
 * Runs a 20-iteration random search with an 80/20 train/validation split over GBDT
 * hyper-parameters (number of trees, max depth, max bins, learning rate), scored by F1,
 * then evaluates the resulting model on the test data and prints the total elapsed time.
 *
 * @throws Exception propagated from the Alink operators or {@code BatchOperator.execute()}
 */
static void c_2() throws Exception {
    // The stopwatch covers the whole method, including search setup.
    Stopwatch timer = new Stopwatch();
    timer.start();

    AlinkGlobalConfiguration.setPrintProcessInfo(true);

    BatchOperator<?> trainSample = new AkSourceBatchOp().setFilePath(Chap11.DATA_DIR + Chap11.TRAIN_SAMPLE_FILE);
    BatchOperator<?> testSet = new AkSourceBatchOp().setFilePath(Chap11.DATA_DIR + Chap11.TEST_FILE);

    // Every column except the label is used as a feature.
    final String[] featureCols = ArrayUtils.removeElement(trainSample.getColNames(), Chap11.LABEL_COL_NAME);

    GbdtClassifier estimator = new GbdtClassifier()
        .setFeatureCols(featureCols)
        .setLabelCol(Chap11.LABEL_COL_NAME)
        .setPredictionCol(Chap11.PREDICTION_COL_NAME)
        .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME);

    // Distributions the random search samples from.
    ParamDist searchSpace = new ParamDist()
        .addDist(estimator, GbdtClassifier.NUM_TREES, ValueDist.randArray(new Integer[] { 50, 100 }))
        .addDist(estimator, GbdtClassifier.MAX_DEPTH, ValueDist.randInteger(4, 10))
        .addDist(estimator, GbdtClassifier.MAX_BINS, ValueDist.randArray(new Integer[] { 64, 128, 256, 512 }))
        .addDist(estimator, GbdtClassifier.LEARNING_RATE, ValueDist.randArray(new Double[] { 0.3, 0.1, 0.01 }));

    BinaryClassificationTuningEvaluator scorer = new BinaryClassificationTuningEvaluator()
        .setLabelCol(Chap11.LABEL_COL_NAME)
        .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME)
        .setTuningBinaryClassMetric(TuningBinaryClassMetric.F1);

    RandomSearchTVSplit search = new RandomSearchTVSplit()
        .setNumIter(20)
        .setTrainRatio(0.8)
        .setEstimator(estimator)
        .setParamDist(searchSpace)
        .setTuningEvaluator(scorer)
        .enableLazyPrintTrainInfo();

    RandomSearchTVSplitModel tunedModel = search.fit(trainSample);

    BatchOperator<?> predictions = tunedModel.transform(testSet);
    EvalBinaryClassBatchOp evaluation = new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("1")
        .setLabelCol(Chap11.LABEL_COL_NAME)
        .setPredictionDetailCol(Chap11.PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics();
    predictions.link(evaluation);

    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : ParamDist(com.alibaba.alink.pipeline.tuning.ParamDist) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) GbdtClassifier(com.alibaba.alink.pipeline.classification.GbdtClassifier) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) RandomSearchTVSplit(com.alibaba.alink.pipeline.tuning.RandomSearchTVSplit) BinaryClassificationTuningEvaluator(com.alibaba.alink.pipeline.tuning.BinaryClassificationTuningEvaluator) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) RandomSearchTVSplitModel(com.alibaba.alink.pipeline.tuning.RandomSearchTVSplitModel) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Aggregations

Stopwatch (com.alibaba.alink.common.utils.Stopwatch)10 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)8 EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)3 Pipeline (com.alibaba.alink.pipeline.Pipeline)3 KMeans (com.alibaba.alink.pipeline.clustering.KMeans)3 EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp)2 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)2 File (java.io.File)2 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)1 EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)1 PcaPredictBatchOp (com.alibaba.alink.operator.batch.feature.PcaPredictBatchOp)1 PcaTrainBatchOp (com.alibaba.alink.operator.batch.feature.PcaTrainBatchOp)1 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)1 MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp)1 DistanceType (com.alibaba.alink.params.shared.clustering.HasKMeansDistanceType.DistanceType)1 TreeType (com.alibaba.alink.params.shared.tree.HasIndividualTreeType.TreeType)1 DecisionTreeClassifier (com.alibaba.alink.pipeline.classification.DecisionTreeClassifier)1 GbdtClassifier (com.alibaba.alink.pipeline.classification.GbdtClassifier)1 KnnClassifier (com.alibaba.alink.pipeline.classification.KnnClassifier)1 RandomForestClassifier (com.alibaba.alink.pipeline.classification.RandomForestClassifier)1