Search in sources :

Example 1 with BatchOperator

use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.

the class DataSetDiskDownloader method downloadUserFile.

/**
 * Downloads the file at {@code uri} on subtask 0 and distributes its bytes to every parallel subtask,
 * each of which writes its own local copy.
 *
 * <p>Subtask 0 downloads the file into a fresh temp directory, reads it back in 64KB chunks and emits
 * every chunk once per subtask, keyed by the destination subtask index. A custom partitioner routes each
 * chunk to the subtask whose index equals the key; the receiving subtask appends the chunks to a file in
 * its own temp directory and emits a single row holding that file's path.
 *
 * @param uri location of the file to download; when {@code null} or empty the result of
 *            {@code handleEmptyFile()} is returned instead and nothing is downloaded.
 * @return an operator with one String column {@code targetPath}; each receiving subtask contributes one
 *         row containing the local path it wrote. It can serve as a barrier for later dataset operations.
 */
public static BatchOperator downloadUserFile(String uri) {
    ExecutionEnvironment env = MLEnvironmentFactory.getDefault().getExecutionEnvironment();
    if (null == uri || uri.length() == 0) {
        return handleEmptyFile();
    }
    // expand data parallelism by partitionBy
    DataSet<Integer> dataSetWithMaxParallelism = env.fromElements(Tuple2.of(1, 1)).partitionCustom(new Partitioner<Integer>() {

        @Override
        public int partition(Integer key, int numPartitions) {
            return key % numPartitions;
        }
    }, 0).map(new MapFunction<Tuple2<Integer, Integer>, Integer>() {

        @Override
        public Integer map(Tuple2<Integer, Integer> value) throws Exception {
            return value.f1;
        }
    });
    DataSet<Row> filePaths = dataSetWithMaxParallelism.mapPartition(new RichMapPartitionFunction<Integer, Tuple2<Integer, byte[]>>() {

        String targetFileName;

        String targetDir;

        @Override
        public void open(Configuration configuration) throws Exception {
            // Only task zero downloads, so only task zero prepares a target location
            // and removes any stale file of the same name.
            int taskId = getRuntimeContext().getIndexOfThisSubtask();
            if (taskId == 0) {
                // The file name is whatever follows the last path separator of the uri.
                targetFileName = uri.contains("\\") ? uri.substring(uri.lastIndexOf('\\') + 1) : uri.substring(uri.lastIndexOf('/') + 1);
                targetDir = PythonFileUtils.createTempDir("temp_user_files_").toString();
                Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                File file = localPath.toFile();
                if (file.exists()) {
                    file.delete();
                }
            }
        }

        @Override
        public void mapPartition(Iterable<Integer> values, Collector<Tuple2<Integer, byte[]>> out) throws Exception {
            int taskId = getRuntimeContext().getIndexOfThisSubtask();
            int numTasks = getRuntimeContext().getNumberOfParallelSubtasks();
            if (taskId == 0) {
                DownloadUtils.resumableDownloadHttpFile(uri, targetDir, targetFileName);
                // read from local disk and send to other workers
                Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
                File fileOnDisk = localPath.toFile();
                // 64KB
                final int buffSize = 64 * 1024;
                byte[] buffer = new byte[buffSize];
                // try-with-resources closes the stream even if reading or collecting fails.
                try (FileInputStream fis = new FileInputStream(fileOnDisk)) {
                    int read;
                    while ((read = fis.read(buffer, 0, buffSize)) != -1) {
                        // Copy exactly the bytes read, because the buffer is reused on the next iteration.
                        byte[] toSend = new byte[read];
                        System.arraycopy(buffer, 0, toSend, 0, read);
                        // One copy of the chunk per subtask, keyed by the destination subtask index.
                        for (int idx = 0; idx < numTasks; idx++) {
                            out.collect(Tuple2.of(idx, toSend));
                        }
                    }
                }
                LOG.info("Downloading on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
            } else {
                LOG.info("No downloading on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
            }
        }
    }).partitionCustom(new Partitioner<Integer>() {

        @Override
        public int partition(Integer key, int numPartitions) {
            // The key is the destination subtask index emitted by the download step.
            // NOTE(review): assumes this stage runs with at least as many partitions as the download stage - confirm.
            return key;
        }
    }, 0).mapPartition(new RichMapPartitionFunction<Tuple2<Integer, byte[]>, Row>() {

        String targetFileName;

        String targetDir;

        @Override
        public void open(Configuration configuration) throws Exception {
            // Every receiving subtask writes into its own temp directory, using the same file name as the uri.
            targetFileName = uri.contains("\\") ? uri.substring(uri.lastIndexOf('\\') + 1) : uri.substring(uri.lastIndexOf('/') + 1);
            targetDir = PythonFileUtils.createTempDir("temp_user_files_").toString();
            Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
            File file = localPath.toFile();
            if (file.exists()) {
                file.delete();
            }
        }

        @Override
        public void mapPartition(Iterable<Tuple2<Integer, byte[]>> values, Collector<Row> out) throws Exception {
            int taskId = getRuntimeContext().getIndexOfThisSubtask();
            // write to disk
            Path localPath = Paths.get(targetDir, targetFileName).toAbsolutePath();
            File outputFile = localPath.toFile();
            // Opened in append mode; try-with-resources closes the stream even if a write fails.
            try (FileOutputStream fos = new FileOutputStream(outputFile, true)) {
                for (Tuple2<Integer, byte[]> val : values) {
                    fos.write(val.f1, 0, val.f1.length);
                }
            }
            LOG.info("Write to disk on TM with taskId: " + taskId + " ip: " + IpHostUtil.getIpAddress());
            Row row = new Row(1);
            row.setField(0, targetDir + File.separator + targetFileName);
            out.collect(row);
        }
    });
    BatchOperator modelSource = BatchOperator.fromTable(DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, filePaths, new String[] { "targetPath" }, new TypeInformation[] { TypeInformation.of(String.class) }));
    return modelSource;
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Partitioner(org.apache.flink.api.common.functions.Partitioner) Path(java.nio.file.Path) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Tuple2(org.apache.flink.api.java.tuple.Tuple2) FileOutputStream(java.io.FileOutputStream) Row(org.apache.flink.types.Row) File(java.io.File)

Example 2 with BatchOperator

use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.

the class DataSetDiskDownloader method handleEmptyFile.

/**
 * Builds the operator returned when there is no file to download: a table with a single
 * String column {@code targetPath} and no rows.
 *
 * @return an operator over a dataset whose map-partition step consumes its input and emits nothing.
 */
private static BatchOperator handleEmptyFile() {
    ExecutionEnvironment environment = MLEnvironmentFactory.getDefault().getExecutionEnvironment();
    MapPartitionFunction<Integer, Row> discardAll = new MapPartitionFunction<Integer, Row>() {

        @Override
        public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
            for (Integer ignored : values) {
                // doing nothing, just to avoid a bug in Blink.
            }
        }
    };
    DataSet<Row> noRows = environment.fromElements(0).mapPartition(discardAll);
    String[] columnNames = new String[] { "targetPath" };
    TypeInformation[] columnTypes = new TypeInformation[] { TypeInformation.of(String.class) };
    return BatchOperator.fromTable(
        DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, noRows, columnNames, columnTypes));
}
Also used : DownloadUtils(com.alibaba.alink.common.utils.DownloadUtils) Arrays(java.util.Arrays) Tuple2(org.apache.flink.api.java.tuple.Tuple2) MLEnvironmentFactory(com.alibaba.alink.common.MLEnvironmentFactory) LoggerFactory(org.slf4j.LoggerFactory) MapFunction(org.apache.flink.api.common.functions.MapFunction) Partitioner(org.apache.flink.api.common.functions.Partitioner) DataSet(org.apache.flink.api.java.DataSet) Collector(org.apache.flink.util.Collector) Map(java.util.Map) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Path(java.nio.file.Path) Types(org.apache.flink.api.common.typeinfo.Types) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Files(java.nio.file.Files) IpHostUtil(com.alibaba.flink.ml.util.IpHostUtil) Configuration(org.apache.flink.configuration.Configuration) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) FileInputStream(java.io.FileInputStream) File(java.io.File) Serializable(java.io.Serializable) List(java.util.List) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) DataSetConversionUtil(com.alibaba.alink.common.utils.DataSetConversionUtil) Paths(java.nio.file.Paths) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) Row(org.apache.flink.types.Row) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) IOException(java.io.IOException) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Row(org.apache.flink.types.Row)

Example 3 with BatchOperator

use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.

the class Chap05 method c_2_2.

/**
 * Prints a filtered ratings table and a filtered movies table, then prints the result of joining
 * them with an inner, left outer, right outer and full outer join on the same predicate.
 */
static void c_2_2() throws Exception {
    BatchOperator<?> ratingSource = Chap24.getSourceRatings();
    BatchOperator<?> itemSource = Chap24.getSourceItems();

    BatchOperator<?> leftRatings = ratingSource
        .filter("user_id<3 AND item_id<4")
        .select("user_id, item_id, rating");
    BatchOperator<?> rightMovies = itemSource
        .select("item_id AS movie_id, title")
        .filter("movie_id < 6 AND MOD(movie_id, 2) = 1");

    System.out.println("# left_ratings #");
    leftRatings.print();
    System.out.println("\n# right_movies #");
    rightMovies.print();

    // Every join below uses the same predicate and output columns.
    final String joinOn = "item_id = movie_id";
    final String outputCols = "user_id, item_id, title, rating";

    System.out.println("# JOIN #");
    new JoinBatchOp()
        .setJoinPredicate(joinOn)
        .setSelectClause(outputCols)
        .linkFrom(leftRatings, rightMovies)
        .print();

    System.out.println("\n# LEFT OUTER JOIN #");
    new LeftOuterJoinBatchOp()
        .setJoinPredicate(joinOn)
        .setSelectClause(outputCols)
        .linkFrom(leftRatings, rightMovies)
        .print();

    System.out.println("\n# RIGHT OUTER JOIN #");
    new RightOuterJoinBatchOp()
        .setJoinPredicate(joinOn)
        .setSelectClause(outputCols)
        .linkFrom(leftRatings, rightMovies)
        .print();

    System.out.println("\n# FULL OUTER JOIN #");
    new FullOuterJoinBatchOp()
        .setJoinPredicate(joinOn)
        .setSelectClause(outputCols)
        .linkFrom(leftRatings, rightMovies)
        .print();
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) FullOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.FullOuterJoinBatchOp) LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) JoinBatchOp(com.alibaba.alink.operator.batch.sql.JoinBatchOp) FullOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.FullOuterJoinBatchOp) RightOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.RightOuterJoinBatchOp) RightOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.RightOuterJoinBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 4 with BatchOperator

use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.

the class Chap05 method c_2_3.

/**
 * Prints two overlapping slices of the users table, then prints the results of the set operators
 * UNION ALL, UNION, INTERSECT, INTERSECT ALL, MINUS and MINUS ALL applied to them.
 */
static void c_2_3() throws Exception {
    BatchOperator<?> allUsers = Chap24.getSourceUsers();

    BatchOperator<?> lowIds = allUsers.filter("user_id<5");
    System.out.println("# users_1_4 #");
    lowIds.print();

    BatchOperator<?> midIds = allUsers.filter("user_id>2 AND user_id<7");
    System.out.println("\n# users_3_6 #");
    midIds.print();

    new UnionAllBatchOp().linkFrom(lowIds, midIds).print();
    new UnionBatchOp().linkFrom(lowIds, midIds).print();
    new IntersectBatchOp().linkFrom(lowIds, midIds).print();

    // The *ALL variants are fed inputs that contain duplicate rows, built with UNION ALL.
    IntersectAllBatchOp intersectAll = new IntersectAllBatchOp();
    UnionAllBatchOp doubledForIntersect = new UnionAllBatchOp().linkFrom(lowIds, lowIds);
    UnionAllBatchOp mixedForIntersect = new UnionAllBatchOp().linkFrom(lowIds, midIds);
    intersectAll.linkFrom(doubledForIntersect, mixedForIntersect).print();

    new MinusBatchOp().linkFrom(lowIds, midIds).print();

    MinusAllBatchOp minusAll = new MinusAllBatchOp();
    UnionAllBatchOp doubledForMinus = new UnionAllBatchOp().linkFrom(lowIds, lowIds);
    UnionAllBatchOp mixedForMinus = new UnionAllBatchOp().linkFrom(lowIds, midIds);
    minusAll.linkFrom(doubledForMinus, mixedForMinus).print();
}
Also used : MinusBatchOp(com.alibaba.alink.operator.batch.sql.MinusBatchOp) IntersectAllBatchOp(com.alibaba.alink.operator.batch.sql.IntersectAllBatchOp) IntersectBatchOp(com.alibaba.alink.operator.batch.sql.IntersectBatchOp) UnionBatchOp(com.alibaba.alink.operator.batch.sql.UnionBatchOp) MinusAllBatchOp(com.alibaba.alink.operator.batch.sql.MinusAllBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) UnionAllBatchOp(com.alibaba.alink.operator.batch.sql.UnionAllBatchOp)

Example 5 with BatchOperator

use of com.alibaba.alink.operator.batch.BatchOperator in project Alink by alibaba.

the class Chap09 method c_5.

/**
 * For each of the GINI, INFOGAIN and INFOGAINRATIO tree types, wires up a decision tree training
 * operator, a predictor over the test data and a binary-classification evaluation, then runs all of
 * them with a single {@code BatchOperator.execute()} call.
 *
 * <p>Each trainer lazily prints its model info and saves the tree as a JPG image under
 * {@code DATA_DIR}; an {@link IOException} while saving is printed and otherwise ignored.
 */
static void c_5() throws Exception {
    BatchOperator<?> trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    BatchOperator<?> testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    TreeType[] treeTypes = { TreeType.GINI, TreeType.INFOGAIN, TreeType.INFOGAINRATIO };
    for (TreeType treeType : treeTypes) {
        // Title shared by the model-info printout and the metrics printout.
        String title = "< " + treeType.toString() + " >";

        DecisionTreeTrainBatchOp trainer = new DecisionTreeTrainBatchOp()
            .setTreeType(treeType)
            .setFeatureCols(FEATURE_COL_NAMES)
            .setCategoricalCols(FEATURE_COL_NAMES)
            .setLabelCol(LABEL_COL_NAME)
            .lazyPrintModelInfo(title)
            .lazyCollectModelInfo((DecisionTreeModelInfo modelInfo) -> {
                try {
                    modelInfo.saveTreeAsImage(DATA_DIR + "tree_" + treeType.toString() + ".jpg", true);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
        BatchOperator<?> model = trainSet.link(trainer);

        DecisionTreePredictBatchOp predictor = new DecisionTreePredictBatchOp()
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME);
        predictor.linkFrom(model, testSet);

        predictor.link(
            new EvalBinaryClassBatchOp()
                .setPositiveLabelValueString("p")
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
                .lazyPrintMetrics(title));
    }
    BatchOperator.execute();
}
Also used : TreeType(com.alibaba.alink.params.shared.tree.HasIndividualTreeType.TreeType) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) Consumer(java.util.function.Consumer) DecisionTreeModelInfo(com.alibaba.alink.operator.common.tree.TreeModelInfo.DecisionTreeModelInfo) IOException(java.io.IOException) DecisionTreePredictBatchOp(com.alibaba.alink.operator.batch.classification.DecisionTreePredictBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) DecisionTreeTrainBatchOp(com.alibaba.alink.operator.batch.classification.DecisionTreeTrainBatchOp) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Aggregations

BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)234 Test (org.junit.Test)168 Row (org.apache.flink.types.Row)141 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)89 HashMap (java.util.HashMap)64 StreamOperator (com.alibaba.alink.operator.stream.StreamOperator)35 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)27 List (java.util.List)24 ArrayList (java.util.ArrayList)23 TableSourceBatchOp (com.alibaba.alink.operator.batch.source.TableSourceBatchOp)22 Params (org.apache.flink.ml.api.misc.param.Params)22 CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp)20 MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp)18 Vector (com.alibaba.alink.common.linalg.Vector)17 PipelineModel (com.alibaba.alink.pipeline.PipelineModel)17 TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation)17 Pipeline (com.alibaba.alink.pipeline.Pipeline)16 DataSet (org.apache.flink.api.java.DataSet)16 TableSchema (org.apache.flink.table.api.TableSchema)16 DenseVector (com.alibaba.alink.common.linalg.DenseVector)14