Search in sources :

Example 1 with SplitBatchOp

use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by alibaba.

the class ALSExample method main.

/**
 * Trains an ALS recommender on a MovieLens ratings CSV and prints the actual
 * rating next to the predicted rating for the held-out rows.
 *
 * <p>The ratings are loaded from a remote CSV file and passed through a
 * {@link SplitBatchOp} configured with a fraction of 0.8. The splitter's main
 * output is used as training data and its side output 0 as test data.
 *
 * @param args command-line arguments; not used
 * @throws Exception propagated from the Alink operators
 */
public static void main(String[] args) throws Exception {
    String url = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/movielens_ratings.csv";
    String schema = "userid bigint, movieid bigint, rating double, timestamp string";
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(url).setSchemaStr(schema);

    // Main output of the splitter feeds training; side output 0 feeds prediction.
    SplitBatchOp splitter = new SplitBatchOp().setFraction(0.8);
    splitter.linkFrom(data);
    BatchOperator<?> trainData = splitter;
    BatchOperator<?> testData = splitter.getSideOutput(0);

    AlsTrainBatchOp als = new AlsTrainBatchOp()
        .setUserCol("userid")
        .setItemCol("movieid")
        .setRateCol("rating")
        .setNumIter(10)
        .setRank(10)
        .setLambda(0.1);
    BatchOperator<?> model = als.linkFrom(trainData);

    AlsRateRecommBatchOp predictor = new AlsRateRecommBatchOp()
        .setUserCol("userid")
        .setItemCol("movieid")
        .setRecommCol("prediction_result");

    // Keep only the true rating and the predicted one for side-by-side comparison.
    BatchOperator<?> predictionResult = predictor.linkFrom(model, testData).select("rating, prediction_result");
    predictionResult.print();
}
Also used : AlsTrainBatchOp(com.alibaba.alink.operator.batch.recommendation.AlsTrainBatchOp) AlsRateRecommBatchOp(com.alibaba.alink.operator.batch.recommendation.AlsRateRecommBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) SplitBatchOp(com.alibaba.alink.operator.batch.dataproc.SplitBatchOp)

Example 2 with SplitBatchOp

use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by alibaba.

the class BaseTuning method findBestTVSplit.

/**
 * Selects the best pipeline candidate using a single train/validation split.
 *
 * <p>The input is shuffled and fed to a {@link SplitBatchOp} configured with
 * {@code ratio}. Every candidate is fitted on the splitter's main output and
 * scored by {@code tuningEvaluator} on the transformed side output 0. A
 * candidate whose evaluation throws, or whose metric is NaN, is recorded in
 * the report but never chosen as best.
 *
 * @param in         the data to split
 * @param ratio      the fraction passed to the splitter
 * @param candidates the pipeline candidates to try
 * @return the best pipeline together with the report of all attempts
 * @throws RuntimeException if no candidate produced a usable metric, or if a
 *                          candidate could not be cloned
 */
protected Tuple2<Pipeline, Report> findBestTVSplit(BatchOperator<?> in, double ratio, PipelineCandidatesBase candidates) {
    int candidateCount = candidates.size();

    // Build the splitter, then the shuffled source, then link them.
    SplitBatchOp splitter = new SplitBatchOp().setFraction(ratio).setMLEnvironmentId(getMLEnvironmentId());
    TableSourceBatchOp shuffledSource = new TableSourceBatchOp(
        DataSetConversionUtil.toTable(in.getMLEnvironmentId(), shuffle(in.getDataSet()), in.getSchema())
    ).setMLEnvironmentId(getMLEnvironmentId());
    SplitBatchOp split = splitter.linkFrom(shuffledSource);

    int bestIdx = -1;
    double bestMetric = 0.;
    ArrayList<Double> experienceScores = new ArrayList<>(candidateCount);
    List<Report.ReportElement> reportElements = new ArrayList<>();

    for (int idx = 0; idx < candidateCount; idx++) {
        Tuple2<Pipeline, List<Tuple3<Integer, ParamInfo, Object>>> candidate;
        try {
            candidate = candidates.get(idx, experienceScores);
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }

        // The metric stays NaN when the evaluation fails.
        double metric = Double.NaN;
        boolean evaluationFailed = false;
        String failureTrace = null;
        try {
            metric = tuningEvaluator.evaluate(candidate.f0.fit(split).transform(split.getSideOutput(0)));
        } catch (Exception ex) {
            evaluationFailed = true;
            failureTrace = ExceptionUtils.stringifyException(ex);
            if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
                System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f, exception: %s", idx, bestMetric, metric, failureTrace));
            }
        }

        // Every candidate contributes exactly one score, successful or not.
        experienceScores.add(idx, metric);

        if (evaluationFailed) {
            reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric, failureTrace));
        } else if (Double.isNaN(metric)) {
            reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric, "Metric is nan."));
        } else {
            reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric));
            // The first usable metric always wins; later ones must beat the current best.
            if (bestIdx == -1
                || (tuningEvaluator.isLargerBetter() && bestMetric < metric)
                || (!tuningEvaluator.isLargerBetter() && bestMetric > metric)) {
                bestMetric = metric;
                bestIdx = idx;
            }
            if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
                System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f", idx, bestMetric, metric));
            }
        }
    }

    if (bestIdx < 0) {
        throw new RuntimeException("Can not find a best model. Report: " + new Report(tuningEvaluator, reportElements).toPrettyJson());
    }

    try {
        Pipeline bestPipeline = candidates.get(bestIdx, experienceScores).f0;
        return Tuple2.of(bestPipeline, new Report(tuningEvaluator, reportElements));
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) TableSourceBatchOp(com.alibaba.alink.operator.batch.source.TableSourceBatchOp) SplitBatchOp(com.alibaba.alink.operator.batch.dataproc.SplitBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline) List(java.util.List) ParamInfo(org.apache.flink.ml.api.misc.param.ParamInfo)

Example 3 with SplitBatchOp

use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by alibaba.

the class Utils method splitTrainTestIfNotExist.

/**
 * Splits {@code source} into a train part and a test part and writes each to
 * an Ak file, unless output is already present.
 *
 * <p>The split runs only when neither {@code trainFilePath} nor
 * {@code testFilePath} exists; if either file exists the method returns
 * without doing anything.
 *
 * @param source        the data to split
 * @param trainFilePath destination of the splitter's main output
 * @param testFilePath  destination of the splitter's side output 0
 * @param ratio         the fraction passed to the splitter
 * @throws Exception propagated from the Alink operators
 */
public static void splitTrainTestIfNotExist(BatchOperator<?> source, String trainFilePath, String testFilePath, double ratio) throws Exception {
    // Short-circuits exactly like the original: the test file is only probed
    // when the train file is absent.
    boolean outputAlreadyPresent = new File(trainFilePath).exists() || new File(testFilePath).exists();
    if (outputAlreadyPresent) {
        return;
    }

    SplitBatchOp splitter = new SplitBatchOp().setFraction(ratio);
    source.link(splitter);

    // Main output goes to the train file, side output 0 to the test file.
    splitter.link(new AkSinkBatchOp().setFilePath(trainFilePath));
    BatchOperator<?> testPart = splitter.getSideOutput(0);
    testPart.link(new AkSinkBatchOp().setFilePath(testFilePath));

    BatchOperator.execute();
}
Also used : AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) SplitBatchOp(com.alibaba.alink.operator.batch.dataproc.SplitBatchOp)

Example 4 with SplitBatchOp

use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by alibaba.

the class Chap07 method c_2.

/**
 * Demonstrates {@link SplitBatchOp}: prints the schema of the source, of the
 * splitter's main output and of its side output 0, prints the number of side
 * outputs, then writes both outputs to Ak files (overwriting existing files)
 * with lazily printed statistics for each.
 *
 * @throws Exception propagated from the Alink operators
 */
static void c_2() throws Exception {
    CsvSourceBatchOp ratings = new CsvSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE)
        .setSchemaStr(SCHEMA_STRING);

    System.out.println("schema of source:");
    System.out.println(ratings.getSchema());

    SplitBatchOp splitter = new SplitBatchOp().setFraction(0.9);
    ratings.link(splitter);

    System.out.println("schema of spliter's main output:");
    System.out.println(splitter.getSchema());

    System.out.println("count of spliter's side outputs:");
    System.out.println(splitter.getSideOutputCount());

    System.out.println("schema of spliter's side output :");
    BatchOperator<?> sideOutput = splitter.getSideOutput(0);
    System.out.println(sideOutput.getSchema());

    // Main output is written to the train file, side output 0 to the test file.
    splitter
        .lazyPrintStatistics("< Main Output >")
        .link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE).setOverwriteSink(true));
    sideOutput
        .lazyPrintStatistics("< Side Output >")
        .link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE).setOverwriteSink(true));

    BatchOperator.execute();
}
Also used : AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp) SplitBatchOp(com.alibaba.alink.operator.batch.dataproc.SplitBatchOp)

Aggregations

SplitBatchOp (com.alibaba.alink.operator.batch.dataproc.SplitBatchOp): 4, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 2, CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp): 2, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 1, AlsRateRecommBatchOp (com.alibaba.alink.operator.batch.recommendation.AlsRateRecommBatchOp): 1, AlsTrainBatchOp (com.alibaba.alink.operator.batch.recommendation.AlsTrainBatchOp): 1, TableSourceBatchOp (com.alibaba.alink.operator.batch.source.TableSourceBatchOp): 1, Pipeline (com.alibaba.alink.pipeline.Pipeline): 1, File (java.io.File): 1, ArrayList (java.util.ArrayList): 1, List (java.util.List): 1, ParamInfo (org.apache.flink.ml.api.misc.param.ParamInfo): 1