use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.
the class DataSetDiskDownloader method downloadUserFile.
/**
 * Downloads the file at {@code uri} on subtask 0 and broadcasts its bytes to all workers, each of
 * which writes the received bytes to a local file.
 *
 * <p>A {@code null} or empty {@code uri} is delegated to {@code handleEmptyFile()}.
 *
 * @param uri location of the file to download. The local file name is the text after the last
 *            {@code '\'} when the uri contains a backslash, otherwise the text after the last
 *            {@code '/'}.
 * @return an operator with a single String column {@code targetPath}; every subtask of the final
 *         stage emits one row holding the path of the file it wrote. The result can be used as a
 *         barrier for other dataset operations.
 */
public static BatchOperator downloadUserFile(String uri) {
    ExecutionEnvironment env = MLEnvironmentFactory.getDefault().getExecutionEnvironment();
    if (null == uri || uri.length() == 0) {
        return handleEmptyFile();
    }
    // expand data parallelism by partitionBy
    DataSet<Integer> dataSetWithMaxParallelism = env.fromElements(Tuple2.of(1, 1))
        .partitionCustom(new Partitioner<Integer>() {
            @Override
            public int partition(Integer key, int numPartitions) {
                return key % numPartitions;
            }
        }, 0)
        .map(new MapFunction<Tuple2<Integer, Integer>, Integer>() {
            @Override
            public Integer map(Tuple2<Integer, Integer> value) throws Exception {
                return value.f1;
            }
        });
    DataSet<Row> filePaths = dataSetWithMaxParallelism
        .mapPartition(new RichMapPartitionFunction<Integer, Tuple2<Integer, byte[]>>() {
            String targetFileName;
            String targetDir;

            @Override
            public void open(Configuration configuration) throws Exception {
                // Only task zero downloads, so only task zero prepares a download location
                // and removes any file already sitting at the target path.
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                if (taskId == 0) {
                    targetFileName = uri.contains("\\")
                        ? uri.substring(uri.lastIndexOf('\\') + 1)
                        : uri.substring(uri.lastIndexOf('/') + 1);
                    targetDir = PythonFileUtils.createTempDir("temp_user_files_").toString();
                    Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                    File file = localPath.toFile();
                    if (file.exists()) {
                        file.delete();
                    }
                }
            }

            @Override
            public void mapPartition(Iterable<Integer> values, Collector<Tuple2<Integer, byte[]>> out)
                throws Exception {
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                int numTasks = getRuntimeContext().getNumberOfParallelSubtasks();
                if (taskId == 0) {
                    DownloadUtils.resumableDownloadHttpFile(uri, targetDir, targetFileName);
                    // read from local disk and send to other workers
                    Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                    File fileOnDisk = localPath.toFile();
                    // 64KB
                    final int buffSize = 64 * 1024;
                    byte[] buffer = new byte[buffSize];
                    // try-with-resources: the stream is closed even if reading or collecting fails.
                    try (FileInputStream fis = new FileInputStream(fileOnDisk)) {
                        int read;
                        while ((read = fis.read(buffer, 0, buffSize)) != -1) {
                            // Copy the chunk: the read buffer is reused on the next iteration.
                            byte[] toSend = new byte[read];
                            System.arraycopy(buffer, 0, toSend, 0, read);
                            // Tag the chunk with each target subtask index; the custom
                            // partitioner downstream routes on that index.
                            for (int idx = 0; idx < numTasks; idx++) {
                                out.collect(Tuple2.of(idx, toSend));
                            }
                        }
                    }
                    LOG.info("Downloading on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
                    // NOTE(review): the downloaded copy in targetDir is not removed here.
                } else {
                    LOG.info("No downloading on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
                }
            }
        })
        .partitionCustom(new Partitioner<Integer>() {
            @Override
            public int partition(Integer key, int numPartitions) {
                // The key already is the target subtask index.
                return key;
            }
        }, 0)
        .mapPartition(new RichMapPartitionFunction<Tuple2<Integer, byte[]>, Row>() {
            String targetFileName;
            String targetDir;

            @Override
            public void open(Configuration configuration) throws Exception {
                targetFileName = uri.contains("\\")
                    ? uri.substring(uri.lastIndexOf('\\') + 1)
                    : uri.substring(uri.lastIndexOf('/') + 1);
                targetDir = PythonFileUtils.createTempDir("temp_user_files_").toString();
                Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                File file = localPath.toFile();
                // The output stream below appends, so start from a clean target path.
                if (file.exists()) {
                    file.delete();
                }
            }

            @Override
            public void mapPartition(Iterable<Tuple2<Integer, byte[]>> values, Collector<Row> out)
                throws Exception {
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                // write to disk
                Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                File outputFile = localPath.toFile();
                // try-with-resources: the stream is closed even if a write fails.
                try (FileOutputStream fos = new FileOutputStream(outputFile, true)) {
                    for (Tuple2<Integer, byte[]> val : values) {
                        fos.write(val.f1, 0, val.f1.length);
                    }
                }
                LOG.info("Write to disk on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
                Row row = new Row(1);
                row.setField(0, targetDir + File.separator + targetFileName);
                out.collect(row);
            }
        });
    BatchOperator modelSource = BatchOperator.fromTable(DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        filePaths,
        new String[] { "targetPath" },
        new TypeInformation[] { TypeInformation.of(String.class) }));
    return modelSource;
}
use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.
the class DataSetDiskDownloader method handleEmptyFile.
/**
 * Builds the result used when no uri is given: an operator with a single String column
 * {@code targetPath} whose producing function never emits a row.
 */
private static BatchOperator handleEmptyFile() {
    final String[] columnNames = new String[] { "targetPath" };
    final TypeInformation[] columnTypes = new TypeInformation[] { TypeInformation.of(String.class) };

    DataSet<Row> noRows = MLEnvironmentFactory.getDefault()
        .getExecutionEnvironment()
        .fromElements(0)
        .mapPartition(new MapPartitionFunction<Integer, Row>() {
            @Override
            public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
                // Drain the input without emitting anything, just to avoid a bug in Blink.
                for (Integer ignored : values) {
                    // intentionally empty
                }
            }
        });

    return BatchOperator.fromTable(DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, noRows, columnNames, columnTypes));
}
use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.
the class Chap05 method c_2_2.
/**
 * Prints a filtered slice of the ratings and of the items, then prints the result of joining
 * them with the inner, left outer, right outer and full outer join operators.
 */
static void c_2_2() throws Exception {
    // Every join below uses the same predicate and the same projection.
    final String joinPredicate = "item_id = movie_id";
    final String selectClause = "user_id, item_id, title, rating";

    BatchOperator<?> ratingSource = Chap24.getSourceRatings();
    BatchOperator<?> itemSource = Chap24.getSourceItems();

    BatchOperator leftSide = ratingSource
        .filter("user_id<3 AND item_id<4")
        .select("user_id, item_id, rating");
    BatchOperator rightSide = itemSource
        .select("item_id AS movie_id, title")
        .filter("movie_id < 6 AND MOD(movie_id, 2) = 1");

    System.out.println("# left_ratings #");
    leftSide.print();

    System.out.println("\n# right_movies #");
    rightSide.print();

    System.out.println("# JOIN #");
    new JoinBatchOp()
        .setJoinPredicate(joinPredicate)
        .setSelectClause(selectClause)
        .linkFrom(leftSide, rightSide)
        .print();

    System.out.println("\n# LEFT OUTER JOIN #");
    new LeftOuterJoinBatchOp()
        .setJoinPredicate(joinPredicate)
        .setSelectClause(selectClause)
        .linkFrom(leftSide, rightSide)
        .print();

    System.out.println("\n# RIGHT OUTER JOIN #");
    new RightOuterJoinBatchOp()
        .setJoinPredicate(joinPredicate)
        .setSelectClause(selectClause)
        .linkFrom(leftSide, rightSide)
        .print();

    System.out.println("\n# FULL OUTER JOIN #");
    new FullOuterJoinBatchOp()
        .setJoinPredicate(joinPredicate)
        .setSelectClause(selectClause)
        .linkFrom(leftSide, rightSide)
        .print();
}
use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.
the class Chap05 method c_2_3.
/**
 * Prints two overlapping user_id ranges of the users data, then prints the result of the
 * union, intersect and minus operators (plain and "All" variants) applied to them.
 */
static void c_2_3() throws Exception {
    BatchOperator<?> userSource = Chap24.getSourceUsers();

    BatchOperator lowIdUsers = userSource.filter("user_id<5");
    System.out.println("# users_1_4 #");
    lowIdUsers.print();

    BatchOperator midIdUsers = userSource.filter("user_id>2 AND user_id<7");
    System.out.println("\n# users_3_6 #");
    midIdUsers.print();

    new UnionAllBatchOp().linkFrom(lowIdUsers, midIdUsers).print();

    new UnionBatchOp().linkFrom(lowIdUsers, midIdUsers).print();

    new IntersectBatchOp().linkFrom(lowIdUsers, midIdUsers).print();

    // The "All" variants are fed inputs built with UnionAll: the first range unioned with
    // itself, and the first range unioned with the second.
    BatchOperator<?> intersectLeft = new UnionAllBatchOp().linkFrom(lowIdUsers, lowIdUsers);
    BatchOperator<?> intersectRight = new UnionAllBatchOp().linkFrom(lowIdUsers, midIdUsers);
    new IntersectAllBatchOp().linkFrom(intersectLeft, intersectRight).print();

    new MinusBatchOp().linkFrom(lowIdUsers, midIdUsers).print();

    BatchOperator<?> minusLeft = new UnionAllBatchOp().linkFrom(lowIdUsers, lowIdUsers);
    BatchOperator<?> minusRight = new UnionAllBatchOp().linkFrom(lowIdUsers, midIdUsers);
    new MinusAllBatchOp().linkFrom(minusLeft, minusRight).print();
}
use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.
the class Chap09 method c_5.
/**
 * For each of the GINI, INFOGAIN and INFOGAINRATIO tree types: trains a decision tree on the
 * training file, registers a lazy print of the model info and a callback that saves the tree as
 * a .jpg image, predicts on the test file and registers a lazy print of the binary
 * classification metrics. A single {@code BatchOperator.execute()} call follows the loop.
 */
static void c_5() throws Exception {
    BatchOperator trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    BatchOperator testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    TreeType[] treeTypes = new TreeType[] { TreeType.GINI, TreeType.INFOGAIN, TreeType.INFOGAINRATIO };
    for (TreeType treeType : treeTypes) {
        // Title shared by the model-info printout and the metrics printout.
        final String title = "< " + treeType.toString() + " >";

        // Callback that writes the tree image; an I/O failure is only printed, not rethrown.
        Consumer<DecisionTreeModelInfo> saveImage = new Consumer<DecisionTreeModelInfo>() {
            @Override
            public void accept(DecisionTreeModelInfo modelInfo) {
                try {
                    modelInfo.saveTreeAsImage(DATA_DIR + "tree_" + treeType.toString() + ".jpg", true);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };

        BatchOperator<?> trainer = new DecisionTreeTrainBatchOp()
            .setTreeType(treeType)
            .setFeatureCols(FEATURE_COL_NAMES)
            .setCategoricalCols(FEATURE_COL_NAMES)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintModelInfo(title)
            .lazyCollectModelInfo(saveImage);
        BatchOperator<?> model = trainSet.link(trainer);

        DecisionTreePredictBatchOp predictor = new DecisionTreePredictBatchOp()
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
        predictor.linkFrom(model, testSet);

        predictor.link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("p")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics(title));
    }

    BatchOperator.execute();
}
Aggregations