Search in sources :

Example 11 with FilePath

use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.

the class ModelStreamFileSinkStreamOp method sinkFrom.

/**
 * Wires the streaming pipeline that writes the incoming model stream to the configured file path.
 *
 * <p>The timestamp column and the count column are located in the input schema through
 * {@code ModelStreamUtils}; all remaining columns form the data schema handed to the sink.
 * The pipeline built here consists of three cooperating stages:
 * <ol>
 *   <li><b>count</b>: counts the rows seen per (timestamp, subtask, announced count) and, once the
 *       per-subtask counts of a (timestamp, announced count) group add up to the announced count,
 *       emits the latest per-subtask counts.</li>
 *   <li><b>writer</b>: writes every row through a {@code FileModelStreamSink} opened per
 *       (timestamp, subtask); the sink is closed and the timestamp emitted as soon as the number of
 *       written rows matches the per-subtask count delivered by the count stage.</li>
 *   <li><b>finalizer</b>: keyed by timestamp; calls {@code finalizeGlobal} once the number of
 *       "file closed" notifications equals the number of files announced for that timestamp.</li>
 * </ol>
 *
 * @param in the upstream operator supplying the model stream
 * @return this operator
 * @throws IllegalStateException if {@code initializeGlobal()} of the sink fails with an {@link IOException}
 */
@Override
public ModelStreamFileSinkStreamOp sinkFrom(StreamOperator<?> in) {
    TableSchema schema = in.getSchema();
    final int timestampColIndex = ModelStreamUtils.findTimestampColIndexWithAssertAndHint(schema);
    final int countColIndex = ModelStreamUtils.findCountColIndexWithAssertAndHint(schema);
    // The data schema is the input schema without the timestamp and count columns.
    final TableSchema dataSchema = new TableSchema(ArrayUtils.removeAll(schema.getFieldNames(), timestampColIndex, countColIndex), ArrayUtils.removeAll(schema.getFieldTypes(), timestampColIndex, countColIndex));
    final String dataSchemaStr = CsvUtil.schema2SchemaStr(dataSchema);
    final FilePath path = getFilePath();
    final int numKeepModel = getNumKeepModel();
    final FileModelStreamSink fileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
    try {
        // Runs while the job graph is being built, i.e. before any of the functions below execute.
        fileModelStreamSink.initializeGlobal();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    DataStream<Row> inputStream = in.getDataStream();
    // Stage "count". Tuple4 layout: (timestamp, subtask index, announced count column value, rows seen so far).
    DataStream<Tuple4<Timestamp, Integer, Long, Long>> count = inputStream.map(new RichMapFunction<Row, Tuple4<Timestamp, Integer, Long, Long>>() {

        @Override
        public Tuple4<Timestamp, Integer, Long, Long> map(Row value) {
            // Every row contributes 1 to the running count of the subtask that saw it.
            return Tuple4.of((Timestamp) value.getField(timestampColIndex), getRuntimeContext().getIndexOfThisSubtask(), (Long) value.getField(countColIndex), 1L);
        }
    }).keyBy(0, 1, 2).reduce(new ReduceFunction<Tuple4<Timestamp, Integer, Long, Long>>() {

        @Override
        public Tuple4<Timestamp, Integer, Long, Long> reduce(Tuple4<Timestamp, Integer, Long, Long> value1, Tuple4<Timestamp, Integer, Long, Long> value2) {
            // Running sum of rows for one (timestamp, subtask, announced count) key.
            return Tuple4.of(value1.f0, value1.f1, value1.f2, value1.f3 + value2.f3);
        }
    }).keyBy(0, 2).flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Long, Long>>() {

        // Latest running count per subtask index for the current (timestamp, announced count) key.
        private transient MapState<Integer, Tuple4<Timestamp, Integer, Long, Long>> latest;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            latest = getRuntimeContext().getMapState(new MapStateDescriptor<>("latest", Types.INT, new TupleTypeInfo<>(Types.SQL_TIMESTAMP, Types.INT, Types.LONG, Types.LONG)));
        }

        @Override
        public void flatMap(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple4<Timestamp, Integer, Long, Long>> out) throws Exception {
            latest.put(value.f1, value);
            long sum = 0;
            long total = -1;
            for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                total = entry.getValue().f2;
                sum += entry.getValue().f3;
            }
            // Only when all subtasks together have seen the announced number of rows are the
            // per-subtask counts forwarded downstream.
            if (total == sum) {
                for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                    out.collect(entry.getValue());
                }
            }
        }
    });
    // Stage "writer". Input 1: (timestamp, subtask index, data row); input 2: completed per-subtask counts.
    inputStream.map(new RichMapFunction<Row, Tuple3<Timestamp, Integer, Row>>() {

        @Override
        public Tuple3<Timestamp, Integer, Row> map(Row value) {
            return Tuple3.of((Timestamp) value.getField(timestampColIndex), getRuntimeContext().getIndexOfThisSubtask(), ModelStreamUtils.genRowWithoutIdentifier(value, timestampColIndex, countColIndex));
        }
    }).keyBy(0, 1).connect(count.keyBy(0, 1)).flatMap(new RichCoFlatMapFunction<Tuple3<Timestamp, Integer, Row>, Tuple4<Timestamp, Integer, Long, Long>, Tuple1<Timestamp>>() {

        // (timestamp, subtask) -> (open sink, rows written so far, expected rows or null if not yet known).
        // This is a plain instance field, not Flink managed state.
        private final Map<Tuple2<Timestamp, Integer>, Tuple3<FileModelStreamSink, Long, Long>> writerContainer = new HashMap<>();

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
        }

        @Override
        public void flatMap1(Tuple3<Timestamp, Integer, Row> value, Collector<Tuple1<Timestamp>> out) {
            writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                if (oldValue == null) {
                    // First row for this key and no expected count received yet: open a sink and write.
                    FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                    try {
                        localFileModelStreamSink.open(value.f0, value.f1);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    localFileModelStreamSink.collect(value.f2);
                    return Tuple3.of(localFileModelStreamSink, 1L, null);
                } else if (oldValue.f0 == null) {
                    // The expected count arrived before the first row: open the sink now.
                    FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                    try {
                        localFileModelStreamSink.open(value.f0, value.f1);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    localFileModelStreamSink.collect(value.f2);
                    if (oldValue.f2 != null && oldValue.f2.equals(1L)) {
                        // Exactly one row was expected, so the file is already complete.
                        localFileModelStreamSink.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(localFileModelStreamSink, 1L, null);
                    }
                } else {
                    // Sink already open: append the row and close once the expected count is reached.
                    oldValue.f0.collect(value.f2);
                    ++oldValue.f1;
                    if (oldValue.f2 != null && oldValue.f2.equals(oldValue.f1)) {
                        oldValue.f0.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(oldValue.f0, oldValue.f1, oldValue.f2);
                    }
                }
            });
        }

        @Override
        public void flatMap2(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple1<Timestamp>> out) {
            writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                if (oldValue == null) {
                    // No row written yet: remember only the expected row count.
                    return Tuple3.of(null, null, value.f3);
                } else {
                    if (value.f3.equals(oldValue.f1)) {
                        // All expected rows have already been written: close and report the timestamp.
                        oldValue.f0.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(oldValue.f0, oldValue.f1, value.f3);
                    }
                }
            });
        }
    }).keyBy(0).connect(count.keyBy(0).flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Integer, Long>>() {

        // (subtask index, rows in that subtask) records received so far for the current timestamp.
        private transient ListState<Tuple2<Integer, Long>> filesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            filesCounter = getRuntimeContext().getListState(new ListStateDescriptor<>("filesCounter", new TupleTypeInfo<>(Types.INT, Types.LONG)));
        }

        @Override
        public void flatMap(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple4<Timestamp, Integer, Integer, Long>> out) throws Exception {
            Long sum = value.f3;
            Integer fileCount = 1;
            List<Tuple2<Integer, Long>> local = new ArrayList<>();
            local.add(Tuple2.of(value.f1, value.f3));
            for (Tuple2<Integer, Long> count : filesCounter.get()) {
                sum += count.f1;
                fileCount++;
                local.add(count);
            }
            // Once the per-file row counts add up to the announced total, emit one record per file:
            // (timestamp, file id, number of files, total rows).
            if (value.f2.equals(sum)) {
                for (Tuple2<Integer, Long> count : local) {
                    out.collect(Tuple4.of(value.f0, count.f0, fileCount, value.f2));
                }
            }
            filesCounter.add(Tuple2.of(value.f1, value.f3));
        }
    }).keyBy(0)).flatMap(new RichCoFlatMapFunction<Tuple1<Timestamp>, Tuple4<Timestamp, Integer, Integer, Long>, byte[]>() {

        // Number of "file closed" notifications received for the current timestamp.
        private transient ValueState<Integer> filesCounter;

        // (file id, number of files, total rows) records received for the current timestamp.
        private transient ListState<Tuple3<Integer, Integer, Long>> total;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            filesCounter = getRuntimeContext().getState(new ValueStateDescriptor<>("filesCounter", Types.INT));
            // The type information must describe all three tuple fields (INT, INT, LONG) so that it
            // matches the Tuple3 elements stored in this state.
            total = getRuntimeContext().getListState(new ListStateDescriptor<>("total", new TupleTypeInfo<>(Types.INT, Types.INT, Types.LONG)));
        }

        @Override
        public void flatMap1(Tuple1<Timestamp> value, Collector<byte[]> out) throws Exception {
            Integer count = filesCounter.value();
            if (count == null) {
                count = 1;
            } else {
                ++count;
            }
            List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
            for (Tuple3<Integer, Integer, Long> t : total.get()) {
                local.add(t);
            }
            // Finalize only when every announced file has both been described and been closed.
            if (!local.isEmpty() && local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(count)) {
                List<Integer> filesId = new ArrayList<>();
                for (Tuple3<Integer, Integer, Long> t : local) {
                    filesId.add(t.f0);
                }
                new FileModelStreamSink(path, dataSchemaStr).finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
            }
            filesCounter.update(count);
        }

        @Override
        public void flatMap2(Tuple4<Timestamp, Integer, Integer, Long> value, Collector<byte[]> out) throws Exception {
            List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
            local.add(Tuple3.of(value.f1, value.f2, value.f3));
            for (Tuple3<Integer, Integer, Long> t : total.get()) {
                local.add(t);
            }
            // Same completeness check as in flatMap1, triggered from the file-description side.
            if (local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(filesCounter.value())) {
                List<Integer> filesId = new ArrayList<>();
                for (Tuple3<Integer, Integer, Long> t : local) {
                    filesId.add(t.f0);
                }
                new FileModelStreamSink(path, dataSchemaStr).finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
            }
            total.add(Tuple3.of(value.f1, value.f2, value.f3));
        }
    }).writeUsingOutputFormat(new DummyOutputFormat<>());
    return this;
}
Also used : RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) Tuple1(org.apache.flink.api.java.tuple.Tuple1) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple4(org.apache.flink.api.java.tuple.Tuple4) TupleTypeInfo(org.apache.flink.api.java.typeutils.TupleTypeInfo) ArrayUtils(org.apache.commons.lang3.ArrayUtils) HashMap(java.util.HashMap) MapStateDescriptor(org.apache.flink.api.common.state.MapStateDescriptor) RichCoFlatMapFunction(org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction) ArrayList(java.util.ArrayList) FileModelStreamSink(com.alibaba.alink.operator.common.stream.model.FileModelStreamSink) ListState(org.apache.flink.api.common.state.ListState) StreamOperator(com.alibaba.alink.operator.stream.StreamOperator) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Collector(org.apache.flink.util.Collector) DummyOutputFormat(com.alibaba.alink.operator.common.io.dummy.DummyOutputFormat) Map(java.util.Map) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) CsvUtil(com.alibaba.alink.operator.common.io.csv.CsvUtil) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) Types(org.apache.flink.api.common.typeinfo.Types) IOType(com.alibaba.alink.common.io.annotations.IOType) AnnotationUtils(com.alibaba.alink.common.io.annotations.AnnotationUtils) ValueStateDescriptor(org.apache.flink.api.common.state.ValueStateDescriptor) Timestamp(java.sql.Timestamp) Configuration(org.apache.flink.configuration.Configuration) TableSchema(org.apache.flink.table.api.TableSchema) IOException(java.io.IOException) DataStream(org.apache.flink.streaming.api.datastream.DataStream) List(java.util.List) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) ModelStreamUtils(com.alibaba.alink.operator.common.stream.model.ModelStreamUtils) MapState(org.apache.flink.api.common.state.MapState) 
ValueState(org.apache.flink.api.common.state.ValueState) Row(org.apache.flink.types.Row) Params(org.apache.flink.ml.api.misc.param.Params) IoOpAnnotation(com.alibaba.alink.common.io.annotations.IoOpAnnotation) ModelStreamFileSinkParams(com.alibaba.alink.params.io.ModelStreamFileSinkParams) Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ValueStateDescriptor(org.apache.flink.api.common.state.ValueStateDescriptor) ArrayList(java.util.ArrayList) List(java.util.List) ListState(org.apache.flink.api.common.state.ListState) FileModelStreamSink(com.alibaba.alink.operator.common.stream.model.FileModelStreamSink) ValueState(org.apache.flink.api.common.state.ValueState) Row(org.apache.flink.types.Row) TableSchema(org.apache.flink.table.api.TableSchema) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) Timestamp(java.sql.Timestamp) Collector(org.apache.flink.util.Collector) RichCoFlatMapFunction(org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) IOException(java.io.IOException) IOException(java.io.IOException) TupleTypeInfo(org.apache.flink.api.java.typeutils.TupleTypeInfo) Tuple4(org.apache.flink.api.java.tuple.Tuple4) Tuple1(org.apache.flink.api.java.tuple.Tuple1) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3)

Example 12 with FilePath

use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.

the class DLEnvConfig method getDefaultPythonEnv.

/**
 * Resolves the default Python environment location for the given version.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the value stored under the version's key in the process environment, then in the JVM
 *       system properties (prefixed with {@code file://} when it does not already start with it);</li>
 *   <li>the matching directory inside the resource plugin path;</li>
 *   <li>the default path registered for the current OS type and version.</li>
 * </ol>
 *
 * @param version the version whose Python environment is requested
 * @return the location of the Python environment
 * @throws RuntimeException if none of the lookups yields a path
 */
static String getDefaultPythonEnv(Version version) {
    // Step 1: an explicitly configured value wins.
    final String envKey = PYTHON_ENV_KEY.get(version);
    if (envKey != null) {
        String configured = System.getenv(envKey);
        if (configured == null) {
            configured = System.getProperty(envKey);
        }
        if (configured != null) {
            return configured.startsWith("file://") ? configured : "file://" + configured;
        }
    }

    final OsType osType = OsUtils.getSystemType();
    final String remotePath = PYTHON_ENV_PATH_MAP.get(Pair.of(osType, version));

    // Step 2: look into the plugin directory; any failure here only triggers the fallback below.
    final RegisterKey registerKey = getRegisterKey(version);
    FilePath pluginDir;
    try {
        pluginDir = ResourcePluginFactory.getResourcePluginPath(registerKey);
    } catch (Exception e) {
        pluginDir = null;
        String message = String.format("Cannot prepare plugin for %s-%s, fallback to direct downloading from %s.", registerKey.getName(), registerKey.getVersion(), remotePath);
        LOG.info(message, e);
        if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
            System.out.println(message + ":" + e);
        }
    }
    if (pluginDir != null) {
        String archiveName = PythonFileUtils.getCompressedFileName(remotePath);
        String pluginDirStr = pluginDir.getPath().toString();
        File envDir = new File(pluginDirStr, archiveName);
        Preconditions.checkArgument(envDir.exists(), String.format("There should be a directory named %s in plugin directory %s, but cannot be found.", archiveName, pluginDirStr));
        return "file://" + envDir.getAbsolutePath();
    }

    // Step 3: fall back to the registered default path.
    if (remotePath != null) {
        return remotePath;
    }
    throw new RuntimeException(String.format("Default python env for %s not specified.", version.name()));
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) OsType(com.alibaba.alink.common.io.plugin.OsType) RegisterKey(com.alibaba.alink.common.io.plugin.RegisterKey) File(java.io.File)

Example 13 with FilePath

use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.

the class FileModelStreamSink method cleanUp.

/**
 * Deletes the oldest models under {@code filePath} so that at most {@code numKeepModel} remain.
 *
 * <p>For every removed model both the model entry and its {@code <model>.log} file in the
 * {@code MODEL_CONF} folder are deleted.
 *
 * @param filePath     root location of the model stream
 * @param numKeepModel number of most recent models to keep; a negative value disables clean-up
 * @throws IOException if listing or deleting fails
 */
private static void cleanUp(FilePath filePath, int numKeepModel) throws IOException {
    if (numKeepModel >= 0) {
        List<Timestamp> sortedModels = ModelStreamUtils.listModels(filePath);
        // Ascending order puts the oldest models first.
        sortedModels.sort((left, right) -> left.compareTo(right));
        BaseFileSystem<?> fs = filePath.getFileSystem();
        Path logFolder = new Path(filePath.getPath(), MODEL_CONF);
        int expiredCount = sortedModels.size() - numKeepModel;
        int index = 0;
        while (index < expiredCount) {
            Timestamp expired = sortedModels.get(index);
            // Remove the model itself (recursively) ...
            Path modelEntry = new Path(filePath.getPath(), ModelStreamUtils.toStringPresentation(expired));
            fs.delete(modelEntry, true);
            // ... and then its log file.
            Path logEntry = new Path(logFolder, String.format("%s.log", ModelStreamUtils.toStringPresentation(expired)));
            fs.delete(logEntry, false);
            index++;
        }
    }
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) Timestamp(java.sql.Timestamp)

Example 14 with FilePath

use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.

the class FileModelStreamSink method finalizeGlobal.

/**
 * Commits a completely written model to its final location.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>create the folder {@code <MODEL_CONF>/<modelId>} (fails if it already exists);</li>
 *   <li>rename every in-progress file {@code <MODEL_CONF>/<modelId>_<fileId>} into that folder,
 *       named by its position in the sorted id list;</li>
 *   <li>write the log file {@code <MODEL_CONF>/<modelId>.log} holding a JSON
 *       {@code ModelStreamMeta(numRows, number of files)};</li>
 *   <li>rename the folder to {@code <root>/<modelId>};</li>
 *   <li>run {@code cleanUp} with {@code numKeepModel}.</li>
 * </ol>
 *
 * @param modelId      timestamp identifying the model
 * @param numRows      total number of rows of the model, recorded in the log file
 * @param filesId      ids of the in-progress files; NOTE: this list is sorted in place
 * @param numKeepModel number of models to keep, forwarded to {@code cleanUp}
 * @throws IOException if the model folder already exists, a rename fails, or any file system
 *                     operation fails
 */
public void finalizeGlobal(Timestamp modelId, long numRows, List<Integer> filesId, int numKeepModel) throws IOException {
    BaseFileSystem<?> fileSystem = filePath.getFileSystem();
    // construct model folder
    Path confDirPath = new Path(filePath.getPath(), MODEL_CONF);
    Path modelPath = new Path(confDirPath, ModelStreamUtils.toStringPresentation(modelId));
    if (fileSystem.exists(modelPath)) {
        throw new IOException(String.format("ModelPath: %s has existed.", modelPath));
    } else {
        fileSystem.mkdirs(modelPath);
    }
    // Sorting makes the target file names (0, 1, 2, ...) follow ascending file id order.
    filesId.sort(Integer::compareTo);
    for (int i = 0; i < filesId.size(); ++i) {
        Path subInProgressModelFilePath = new Path(confDirPath, String.format("%s_%d", ModelStreamUtils.toStringPresentation(modelId), filesId.get(i)));
        Path subToCommitModelFilePath = new Path(modelPath, String.valueOf(i));
        if (!fileSystem.rename(subInProgressModelFilePath, subToCommitModelFilePath)) {
            throw new IOException(String.format("Submit sub-model %s to %s failed. Maybe folder %s exists.", subInProgressModelFilePath, subToCommitModelFilePath, subToCommitModelFilePath));
        }
    }
    // if done, write redo log.
    Path logPath = new Path(confDirPath, String.format("%s.log", ModelStreamUtils.toStringPresentation(modelId)));
    try (FSDataOutputStream outputStream = fileSystem.create(logPath, WriteMode.OVERWRITE)) {
        // Encode explicitly as UTF-8 so the log content does not depend on the platform default charset.
        outputStream.write(JsonConverter.toJson(new ModelStreamMeta(numRows, filesId.size())).getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (Exception ex) {
        // if write fail, delete the redo log to make the model invalid.
        try {
            fileSystem.delete(logPath, false);
        } catch (Exception deleteEx) {
            // Keep the original write failure as the primary error; do not let the clean-up mask it.
            ex.addSuppressed(deleteEx);
        }
        throw ex;
    }
    // if done, do commit.
    Path finalModelPath = new Path(filePath.getPath(), ModelStreamUtils.toStringPresentation(modelId));
    if (!fileSystem.rename(modelPath, finalModelPath)) {
        throw new IOException(String.format("Submit model %s to %s failed. Maybe folder %s exists.", modelPath, finalModelPath, finalModelPath));
    }
    // if done, do clean up
    cleanUp(filePath, numKeepModel);
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) IOException(java.io.IOException) FSDataOutputStream(org.apache.flink.core.fs.FSDataOutputStream) IOException(java.io.IOException)

Example 15 with FilePath

use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.

the class ModelStreamUtils method getSchemaFromFolder.

/**
 * Reads the table schema of a model stream folder from the first model returned by
 * {@code listModels}.
 *
 * @param filePath root location of the model stream
 * @return the schema parsed from the schema string stored in that model's meta
 * @throws IllegalArgumentException if the folder contains no model
 * @throws IOException              if the folder or the model meta cannot be read
 */
public static TableSchema getSchemaFromFolder(FilePath filePath) throws IOException {
    List<Timestamp> availableModels = listModels(filePath);
    if (availableModels.isEmpty()) {
        throw new IllegalArgumentException("Stream model is empty. path: " + filePath.getPath().toString());
    }
    // Any model carries the schema; the first listed one is used.
    Timestamp firstModelId = availableModels.get(0);
    Path firstModelPath = new Path(filePath.getPath(), toStringPresentation(firstModelId));
    FilePath firstModel = new FilePath(firstModelPath, filePath.getFileSystem());
    AkMeta modelMeta = AkUtils.getMetaFromPath(firstModel);
    return CsvUtil.schemaStr2Schema(modelMeta.schemaStr);
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) AkMeta(com.alibaba.alink.common.io.filesystem.AkUtils.AkMeta) Timestamp(java.sql.Timestamp)

Aggregations

FilePath (com.alibaba.alink.common.io.filesystem.FilePath)36 Path (org.apache.flink.core.fs.Path)22 IOException (java.io.IOException)9 Test (org.junit.Test)9 Row (org.apache.flink.types.Row)8 File (java.io.File)7 TableSchema (org.apache.flink.table.api.TableSchema)6 AkSinkStreamOp (com.alibaba.alink.operator.stream.sink.AkSinkStreamOp)5 AppendIdBatchOp (com.alibaba.alink.operator.batch.dataproc.AppendIdBatchOp)4 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)4 RandomTableSourceBatchOp (com.alibaba.alink.operator.batch.source.RandomTableSourceBatchOp)4 SelectBatchOp (com.alibaba.alink.operator.batch.sql.SelectBatchOp)4 HttpFileSplitReader (com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader)4 AppendIdStreamOp (com.alibaba.alink.operator.stream.dataproc.AppendIdStreamOp)4 ArrayList (java.util.ArrayList)4 List (java.util.List)4 HadoopFileSystem (com.alibaba.alink.common.io.filesystem.HadoopFileSystem)3 OssFileSystem (com.alibaba.alink.common.io.filesystem.OssFileSystem)3 RandomTableSourceStreamOp (com.alibaba.alink.operator.stream.source.RandomTableSourceStreamOp)3 SelectStreamOp (com.alibaba.alink.operator.stream.sql.SelectStreamOp)3