Use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by Alibaba.
Class ALSExample, method main.
/**
 * Example entry point: trains an ALS model on part of the MovieLens ratings
 * file and prints the actual rating next to the predicted rating for the rest.
 *
 * <p>The ratings CSV is read from a remote URL, passed through a
 * {@link SplitBatchOp} configured with fraction 0.8, the main output is used
 * for training and side output 0 for prediction.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared because the underlying job may fail (for example
 *                   while reading the source or when printing the result)
 */
public static void main(String[] args) throws Exception {
    String url = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/movielens_ratings.csv";
    String schema = "userid bigint, movieid bigint, rating double, timestamp string";

    // Wildcard-typed operators instead of raw types, so generic checks stay enabled.
    BatchOperator<?> data = new CsvSourceBatchOp().setFilePath(url).setSchemaStr(schema);

    SplitBatchOp splitter = new SplitBatchOp().setFraction(0.8);
    splitter.linkFrom(data);
    // Main output feeds training; side output 0 is held out for prediction.
    BatchOperator<?> trainData = splitter;
    BatchOperator<?> testData = splitter.getSideOutput(0);

    AlsTrainBatchOp als = new AlsTrainBatchOp()
        .setUserCol("userid")
        .setItemCol("movieid")
        .setRateCol("rating")
        .setNumIter(10)
        .setRank(10)
        .setLambda(0.1);
    BatchOperator<?> model = als.linkFrom(trainData);

    AlsRateRecommBatchOp predictor = new AlsRateRecommBatchOp()
        .setUserCol("userid")
        .setItemCol("movieid")
        .setRecommCol("prediction_result");
    // Keep only the true rating and the predicted one for a side-by-side view.
    BatchOperator<?> predictionResult = predictor.linkFrom(model, testData).select("rating, prediction_result");
    predictionResult.print();
}
Use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by Alibaba.
Class BaseTuning, method findBestTVSplit.
/**
 * Evaluates every candidate pipeline on a single train/validation split and
 * returns the best one together with a report of all attempts.
 *
 * <p>The input is passed through {@code shuffle(...)} and then through a
 * {@link SplitBatchOp} configured with {@code ratio}. Each candidate is fitted
 * on the splitter's main output and evaluated on its side output 0. A candidate
 * whose evaluation throws, or whose metric is NaN, is recorded in the report
 * and skipped when choosing the best.
 *
 * @param in         data to split
 * @param ratio      fraction handed to {@code SplitBatchOp.setFraction}
 * @param candidates source of candidate pipelines
 * @return the best candidate's pipeline and the report of all candidates
 * @throws RuntimeException if no candidate produced a usable metric, or if
 *                          fetching a candidate throws
 *                          {@link CloneNotSupportedException}
 */
protected Tuple2<Pipeline, Report> findBestTVSplit(BatchOperator<?> in, double ratio, PipelineCandidatesBase candidates) {
    int candidateCount = candidates.size();

    BatchOperator<?> shuffledInput = new TableSourceBatchOp(
        DataSetConversionUtil.toTable(in.getMLEnvironmentId(), shuffle(in.getDataSet()), in.getSchema()))
        .setMLEnvironmentId(getMLEnvironmentId());
    SplitBatchOp split = new SplitBatchOp()
        .setFraction(ratio)
        .setMLEnvironmentId(getMLEnvironmentId())
        .linkFrom(shuffledInput);

    int winnerIdx = -1;
    double winnerScore = 0.;
    // One score per candidate, in candidate order (NaN for failed ones);
    // handed back to candidates.get(...) on every call.
    ArrayList<Double> scores = new ArrayList<>(candidateCount);
    List<Report.ReportElement> elements = new ArrayList<>();

    for (int idx = 0; idx < candidateCount; idx++) {
        Tuple2<Pipeline, List<Tuple3<Integer, ParamInfo, Object>>> candidate;
        try {
            candidate = candidates.get(idx, scores);
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }

        double score = Double.NaN;
        try {
            score = tuningEvaluator.evaluate(candidate.f0.fit(split).transform(split.getSideOutput(0)));
        } catch (Exception ex) {
            // A failing candidate must not abort the search: record it and move on.
            String failure = ExceptionUtils.stringifyException(ex);
            if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
                System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f, exception: %s", idx, winnerScore, score, failure));
            }
            scores.add(idx, score);
            elements.add(new Report.ReportElement(candidate.f0, candidate.f1, score, failure));
            continue;
        }

        scores.add(idx, score);
        if (Double.isNaN(score)) {
            elements.add(new Report.ReportElement(candidate.f0, candidate.f1, score, "Metric is nan."));
            continue;
        }
        elements.add(new Report.ReportElement(candidate.f0, candidate.f1, score));

        // The first usable score always wins; afterwards the direction of
        // "better" is decided by the evaluator.
        boolean takesLead = winnerIdx == -1
            || (tuningEvaluator.isLargerBetter() ? winnerScore < score : winnerScore > score);
        if (takesLead) {
            winnerScore = score;
            winnerIdx = idx;
        }

        if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
            System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f", idx, winnerScore, score));
        }
    }

    if (winnerIdx < 0) {
        throw new RuntimeException("Can not find a best model. Report: " + new Report(tuningEvaluator, elements).toPrettyJson());
    }

    try {
        Pipeline winner = candidates.get(winnerIdx, scores).f0;
        Report report = new Report(tuningEvaluator, elements);
        return Tuple2.of(winner, report);
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
}
Use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by Alibaba.
Class Utils, method splitTrainTestIfNotExist.
/**
 * Splits {@code source} with a {@link SplitBatchOp} and writes the two parts
 * to the given files, but only when neither file exists yet.
 *
 * <p>The splitter's main output goes to {@code trainFilePath} and its side
 * output 0 goes to {@code testFilePath}. If either file is already present the
 * method returns without doing anything.
 *
 * @param source        operator providing the data to split
 * @param trainFilePath destination of the splitter's main output
 * @param testFilePath  destination of the splitter's side output 0
 * @param ratio         fraction handed to {@code SplitBatchOp.setFraction}
 * @throws Exception if running the batch job fails
 */
public static void splitTrainTestIfNotExist(BatchOperator<?> source, String trainFilePath, String testFilePath, double ratio) throws Exception {
    // Nothing to do as soon as one of the two target files is already there.
    if (new File(trainFilePath).exists() || new File(testFilePath).exists()) {
        return;
    }

    SplitBatchOp splitter = new SplitBatchOp().setFraction(ratio);
    source.link(splitter);

    splitter.link(new AkSinkBatchOp().setFilePath(trainFilePath));
    splitter.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(testFilePath));

    // Both sinks are only wired up above; this call runs the job.
    BatchOperator.execute();
}
Use of com.alibaba.alink.operator.batch.dataproc.SplitBatchOp in project Alink by Alibaba.
Class Chap07, method c_2.
/**
 * Demonstrates {@link SplitBatchOp}: prints the schemas of the source, of the
 * splitter's main output and of its side output, then writes both outputs to
 * files (overwriting existing ones) with statistics printed lazily for each.
 *
 * @throws Exception if running the batch job fails
 */
static void c_2() throws Exception {
    CsvSourceBatchOp origin = new CsvSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE)
        .setSchemaStr(SCHEMA_STRING);

    System.out.println("schema of source:");
    System.out.println(origin.getSchema());

    SplitBatchOp splitter = new SplitBatchOp().setFraction(0.9);
    origin.link(splitter);

    System.out.println("schema of spliter's main output:");
    System.out.println(splitter.getSchema());

    System.out.println("count of spliter's side outputs:");
    System.out.println(splitter.getSideOutputCount());

    System.out.println("schema of spliter's side output :");
    System.out.println(splitter.getSideOutput(0).getSchema());

    // Main output -> training file.
    BatchOperator<?> mainWithStats = splitter.lazyPrintStatistics("< Main Output >");
    mainWithStats.link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE).setOverwriteSink(true));

    // Side output 0 -> test file.
    BatchOperator<?> sideWithStats = splitter.getSideOutput(0).lazyPrintStatistics("< Side Output >");
    sideWithStats.link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE).setOverwriteSink(true));

    BatchOperator.execute();
}
Aggregations