Search in sources:

Example 1 with FileModelStreamSink

Use of com.alibaba.alink.operator.common.stream.model.FileModelStreamSink in the project Alink by alibaba.

The method sinkFrom of the class ModelStreamFileSinkStreamOp.

/**
 * Sinks an incoming model stream to the file system as versioned model files.
 *
 * <p>The input schema is expected to carry two bookkeeping columns (located via
 * {@link ModelStreamUtils}): a model-version timestamp and the expected total row
 * count of that model version. The pipeline below coordinates the parallel writers:
 * <ol>
 *   <li>count rows per (timestamp, subtask, expectedTotal);</li>
 *   <li>release the per-subtask counts only once a model version is complete;</li>
 *   <li>write each subtask's rows to its own file and close it when its count arrives;</li>
 *   <li>finalize the model version once every subtask's file is closed.</li>
 * </ol>
 *
 * @param in the stream whose rows form successive model versions
 * @return this operator, for chaining
 */
@Override
public ModelStreamFileSinkStreamOp sinkFrom(StreamOperator<?> in) {
    TableSchema schema = in.getSchema();
    // Locate the two bookkeeping columns; these helpers assert (with a hint) if absent.
    final int timestampColIndex = ModelStreamUtils.findTimestampColIndexWithAssertAndHint(schema);
    final int countColIndex = ModelStreamUtils.findCountColIndexWithAssertAndHint(schema);
    // The on-disk data schema is the input schema minus the two bookkeeping columns.
    final TableSchema dataSchema = new TableSchema(ArrayUtils.removeAll(schema.getFieldNames(), timestampColIndex, countColIndex), ArrayUtils.removeAll(schema.getFieldTypes(), timestampColIndex, countColIndex));
    final String dataSchemaStr = CsvUtil.schema2SchemaStr(dataSchema);
    final FilePath path = getFilePath();
    final int numKeepModel = getNumKeepModel();
    // One-time global initialization of the sink directory before any parallel writer opens.
    final FileModelStreamSink fileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
    try {
        fileModelStreamSink.initializeGlobal();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    DataStream<Row> inputStream = in.getDataStream();
    // Stage 1: per-key row counting.
    // Tuple4 layout: f0 = model timestamp, f1 = subtask index, f2 = expected total rows, f3 = rows counted.
    DataStream<Tuple4<Timestamp, Integer, Long, Long>> count = inputStream.map(new RichMapFunction<Row, Tuple4<Timestamp, Integer, Long, Long>>() {

        @Override
        public Tuple4<Timestamp, Integer, Long, Long> map(Row value) {
            // Seed each row with count 1; the reduce below sums per (timestamp, subtask, expectedTotal).
            return Tuple4.of((Timestamp) value.getField(timestampColIndex), getRuntimeContext().getIndexOfThisSubtask(), (Long) value.getField(countColIndex), 1L);
        }
    }).keyBy(0, 1, 2).reduce(new ReduceFunction<Tuple4<Timestamp, Integer, Long, Long>>() {

        @Override
        public Tuple4<Timestamp, Integer, Long, Long> reduce(Tuple4<Timestamp, Integer, Long, Long> value1, Tuple4<Timestamp, Integer, Long, Long> value2) {
            // Running sum of rows seen by this subtask for this model version.
            return Tuple4.of(value1.f0, value1.f1, value1.f2, value1.f3 + value2.f3);
        }
    }).keyBy(0, 2).flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Long, Long>>() {

        // Latest per-subtask running count, keyed by subtask index, for the current (timestamp, expectedTotal) key.
        private transient MapState<Integer, Tuple4<Timestamp, Integer, Long, Long>> latest;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            latest = getRuntimeContext().getMapState(new MapStateDescriptor<>("latest", Types.INT, new TupleTypeInfo<>(Types.SQL_TIMESTAMP, Types.INT, Types.LONG, Types.LONG)));
        }

        @Override
        public void flatMap(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple4<Timestamp, Integer, Long, Long>> out) throws Exception {
            // Stage 2 (barrier): hold counts until the sum across subtasks reaches the expected total,
            // then release the final per-subtask counts downstream in one burst.
            latest.put(value.f1, value);
            long sum = 0;
            long total = -1;
            for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                // f2 (expected total) is identical for every entry under this key, so any entry's value works.
                total = entry.getValue().f2;
                sum += entry.getValue().f3;
            }
            if (total == sum) {
                for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                    out.collect(entry.getValue());
                }
            }
        }
    });
    // Stage 3: write the actual model rows, one file per (timestamp, subtask),
    // closing each file once its final row count (from "count") has been matched.
    inputStream.map(new RichMapFunction<Row, Tuple3<Timestamp, Integer, Row>>() {

        @Override
        public Tuple3<Timestamp, Integer, Row> map(Row value) {
            // Strip the bookkeeping columns; keep (timestamp, subtask) for routing.
            return Tuple3.of((Timestamp) value.getField(timestampColIndex), getRuntimeContext().getIndexOfThisSubtask(), ModelStreamUtils.genRowWithoutIdentifier(value, timestampColIndex, countColIndex));
        }
    }).keyBy(0, 1).connect(count.keyBy(0, 1)).flatMap(new RichCoFlatMapFunction<Tuple3<Timestamp, Integer, Row>, Tuple4<Timestamp, Integer, Long, Long>, Tuple1<Timestamp>>() {

        // Per (timestamp, subtask) writer state: f0 = open sink (null if only the count arrived first),
        // f1 = rows written so far (null until the first row), f2 = expected rows (null until the count arrives).
        // NOTE(review): plain HashMap, not Flink state — presumably acceptable here; confirm checkpointing expectations.
        private final Map<Tuple2<Timestamp, Integer>, Tuple3<FileModelStreamSink, Long, Long>> writerContainer = new HashMap<>();

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
        }

        @Override
        public void flatMap1(Tuple3<Timestamp, Integer, Row> value, Collector<Tuple1<Timestamp>> out) {
            // compute(...) returning null removes the entry, i.e. the file for this key is finished.
            writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                if (oldValue == null) {
                    // First row for this key and no count yet: open a fresh file and write the row.
                    FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                    try {
                        localFileModelStreamSink.open(value.f0, value.f1);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    localFileModelStreamSink.collect(value.f2);
                    return Tuple3.of(localFileModelStreamSink, 1L, null);
                } else if (oldValue.f0 == null) {
                    // The expected count arrived before any row: open the file now, then write.
                    FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                    try {
                        localFileModelStreamSink.open(value.f0, value.f1);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    localFileModelStreamSink.collect(value.f2);
                    if (oldValue.f2 != null && oldValue.f2.equals(1L)) {
                        // This single row was all that was expected: close and signal completion.
                        localFileModelStreamSink.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(localFileModelStreamSink, 1L, null);
                    }
                } else {
                    // Subsequent row on an already-open file.
                    oldValue.f0.collect(value.f2);
                    ++oldValue.f1;
                    if (oldValue.f2 != null && oldValue.f2.equals(oldValue.f1)) {
                        // Reached the expected count: close and signal completion.
                        oldValue.f0.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(oldValue.f0, oldValue.f1, oldValue.f2);
                    }
                }
            });
        }

        @Override
        public void flatMap2(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple1<Timestamp>> out) {
            // The final per-subtask count arrives; either record it or, if all rows are
            // already written, close the file immediately.
            writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                if (oldValue == null) {
                    // Count arrived before any row: remember the expected total only.
                    return Tuple3.of(null, null, value.f3);
                } else {
                    if (value.f3.equals(oldValue.f1)) {
                        oldValue.f0.close();
                        out.collect(Tuple1.of(key.f0));
                        return null;
                    } else {
                        return Tuple3.of(oldValue.f0, oldValue.f1, value.f3);
                    }
                }
            });
        }
    }).keyBy(0).connect(count.keyBy(0).flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Integer, Long>>() {

        // Per-timestamp list of (subtask, rows) contributions seen so far.
        private transient ListState<Tuple2<Integer, Long>> filesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            filesCounter = getRuntimeContext().getListState(new ListStateDescriptor<>("filesCounter", new TupleTypeInfo<>(Types.INT, Types.LONG)));
        }

        @Override
        public void flatMap(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple4<Timestamp, Integer, Integer, Long>> out) throws Exception {
            // Once the row counts of all subtasks sum to the expected total, emit
            // (timestamp, subtask, fileCount, expectedTotal) for every participating subtask.
            Long sum = value.f3;
            Integer fileCount = 1;
            List<Tuple2<Integer, Long>> local = new ArrayList<>();
            local.add(Tuple2.of(value.f1, value.f3));
            for (Tuple2<Integer, Long> count : filesCounter.get()) {
                sum += count.f1;
                fileCount++;
                local.add(count);
            }
            if (value.f2.equals(sum)) {
                for (Tuple2<Integer, Long> count : local) {
                    out.collect(Tuple4.of(value.f0, count.f0, fileCount, value.f2));
                }
            }
            filesCounter.add(Tuple2.of(value.f1, value.f3));
        }
    }).keyBy(0)).flatMap(new RichCoFlatMapFunction<Tuple1<Timestamp>, Tuple4<Timestamp, Integer, Integer, Long>, byte[]>() {

        // Stage 4: finalize a model version once the number of closed files (flatMap1 signals)
        // matches the number of expected files (flatMap2 metadata), whichever side completes last.
        private transient ValueState<Integer> filesCounter;

        private transient ListState<Tuple3<Integer, Integer, Long>> total;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            filesCounter = getRuntimeContext().getState(new ValueStateDescriptor<>("filesCounter", Types.INT));
            total = getRuntimeContext().getListState(new ListStateDescriptor<>("total", new TupleTypeInfo<>(Types.INT, Types.LONG)));
        }

        @Override
        public void flatMap1(Tuple1<Timestamp> value, Collector<byte[]> out) throws Exception {
            // A file for this timestamp was closed: bump the closed-file counter.
            Integer count = filesCounter.value();
            if (count == null) {
                count = 1;
            } else {
                ++count;
            }
            List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
            for (Tuple3<Integer, Integer, Long> t : total.get()) {
                local.add(t);
            }
            // Finalize only when the expected file count equals both the metadata entries seen
            // and the files actually closed.
            if (!local.isEmpty() && local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(count)) {
                List<Integer> filesId = new ArrayList<>();
                for (Tuple3<Integer, Integer, Long> t : local) {
                    filesId.add(t.f0);
                }
                new FileModelStreamSink(path, dataSchemaStr).finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
            }
            filesCounter.update(count);
        }

        @Override
        public void flatMap2(Tuple4<Timestamp, Integer, Integer, Long> value, Collector<byte[]> out) throws Exception {
            // Metadata (subtask, expectedFileCount, totalRows) for this timestamp arrived.
            List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
            local.add(Tuple3.of(value.f1, value.f2, value.f3));
            for (Tuple3<Integer, Integer, Long> t : total.get()) {
                local.add(t);
            }
            // Same finalization condition as flatMap1, evaluated from the metadata side.
            if (local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(filesCounter.value())) {
                List<Integer> filesId = new ArrayList<>();
                for (Tuple3<Integer, Integer, Long> t : local) {
                    filesId.add(t.f0);
                }
                new FileModelStreamSink(path, dataSchemaStr).finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
            }
            total.add(Tuple3.of(value.f1, value.f2, value.f3));
        }
    }).writeUsingOutputFormat(new DummyOutputFormat<>());
    return this;
}
Also used : RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) Tuple1(org.apache.flink.api.java.tuple.Tuple1) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple4(org.apache.flink.api.java.tuple.Tuple4) TupleTypeInfo(org.apache.flink.api.java.typeutils.TupleTypeInfo) ArrayUtils(org.apache.commons.lang3.ArrayUtils) HashMap(java.util.HashMap) MapStateDescriptor(org.apache.flink.api.common.state.MapStateDescriptor) RichCoFlatMapFunction(org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction) ArrayList(java.util.ArrayList) FileModelStreamSink(com.alibaba.alink.operator.common.stream.model.FileModelStreamSink) ListState(org.apache.flink.api.common.state.ListState) StreamOperator(com.alibaba.alink.operator.stream.StreamOperator) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Collector(org.apache.flink.util.Collector) DummyOutputFormat(com.alibaba.alink.operator.common.io.dummy.DummyOutputFormat) Map(java.util.Map) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) CsvUtil(com.alibaba.alink.operator.common.io.csv.CsvUtil) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) Types(org.apache.flink.api.common.typeinfo.Types) IOType(com.alibaba.alink.common.io.annotations.IOType) AnnotationUtils(com.alibaba.alink.common.io.annotations.AnnotationUtils) ValueStateDescriptor(org.apache.flink.api.common.state.ValueStateDescriptor) Timestamp(java.sql.Timestamp) Configuration(org.apache.flink.configuration.Configuration) TableSchema(org.apache.flink.table.api.TableSchema) IOException(java.io.IOException) DataStream(org.apache.flink.streaming.api.datastream.DataStream) List(java.util.List) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) ModelStreamUtils(com.alibaba.alink.operator.common.stream.model.ModelStreamUtils) MapState(org.apache.flink.api.common.state.MapState) 
ValueState(org.apache.flink.api.common.state.ValueState) Row(org.apache.flink.types.Row) Params(org.apache.flink.ml.api.misc.param.Params) IoOpAnnotation(com.alibaba.alink.common.io.annotations.IoOpAnnotation) ModelStreamFileSinkParams(com.alibaba.alink.params.io.ModelStreamFileSinkParams) Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ValueStateDescriptor(org.apache.flink.api.common.state.ValueStateDescriptor) ArrayList(java.util.ArrayList) List(java.util.List) ListState(org.apache.flink.api.common.state.ListState) FileModelStreamSink(com.alibaba.alink.operator.common.stream.model.FileModelStreamSink) ValueState(org.apache.flink.api.common.state.ValueState) Row(org.apache.flink.types.Row) TableSchema(org.apache.flink.table.api.TableSchema) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) Timestamp(java.sql.Timestamp) Collector(org.apache.flink.util.Collector) RichCoFlatMapFunction(org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) IOException(java.io.IOException) IOException(java.io.IOException) TupleTypeInfo(org.apache.flink.api.java.typeutils.TupleTypeInfo) Tuple4(org.apache.flink.api.java.tuple.Tuple4) Tuple1(org.apache.flink.api.java.tuple.Tuple1) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3)

Example 2 with FileModelStreamSink

Use of com.alibaba.alink.operator.common.stream.model.FileModelStreamSink in the project Alink by alibaba.

The method sinkFrom of the class AppendModelStreamFileSinkBatchOp.

/**
 * Appends the entire input batch to the file system as one new model version.
 *
 * <p>All {@code numFiles} parallel writers share a single version timestamp. After the
 * data is written, the total row count is summed across partitions and a single task
 * finalizes the model version, keeping at most {@code numKeepModel} historical versions.
 *
 * @param in the batch whose rows form the model to append
 * @return this operator, for chaining
 */
@Override
protected AppendModelStreamFileSinkBatchOp sinkFrom(BatchOperator<?> in) {
    final FilePath filePath = getFilePath();
    // One timestamp identifies this model version across all parallel writers.
    final Timestamp timestamp = ModelStreamUtils.createStartTime(getModelTime());
    final int numFiles = getNumFiles();
    final int numKeepModel = getNumKeepModel();
    final TableSchema schema = in.getSchema();
    final FileModelStreamSink sink = new FileModelStreamSink(filePath, CsvUtil.schema2SchemaStr(schema));
    try {
        // One-time global initialization of the sink directory before any writer opens.
        sink.initializeGlobal();
    } catch (IOException e) {
        // IllegalStateException for consistency with ModelStreamFileSinkStreamOp#sinkFrom,
        // which wraps the same initializeGlobal() failure the same way.
        throw new IllegalStateException(e);
    }
    // Each of the numFiles subtasks writes its partition to its own file; the map is an
    // identity pass-through so the row count can be measured downstream.
    DataSet<Row> writtenModel = in.getDataSet().map(new RichMapFunction<Row, Row>() {

        @Override
        public void open(Configuration parameters) throws Exception {
            sink.open(timestamp, getRuntimeContext().getIndexOfThisSubtask());
        }

        @Override
        public void close() throws Exception {
            sink.close();
        }

        @Override
        public Row map(Row value) throws Exception {
            sink.collect(value);
            return value;
        }
    }).setParallelism(numFiles);
    // Sum the per-partition counts, then finalize the model version exactly once (parallelism 1).
    DataSetUtils.countElementsPerPartition(writtenModel).sum(1).output(new OutputFormat<Tuple2<Integer, Long>>() {

        @Override
        public void configure(Configuration parameters) {
        // pass
        }

        @Override
        public void open(int taskNumber, int numTasks) throws IOException {
        // pass
        }

        @Override
        public void writeRecord(Tuple2<Integer, Long> record) throws IOException {
            // record.f1 is the total row count across all partitions.
            sink.finalizeGlobal(timestamp, record.f1, numFiles, numKeepModel);
        }

        @Override
        public void close() throws IOException {
        // pass
        }
    }).setParallelism(1);
    return this;
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) TableSchema(org.apache.flink.table.api.TableSchema) Configuration(org.apache.flink.configuration.Configuration) FileModelStreamSink(com.alibaba.alink.operator.common.stream.model.FileModelStreamSink) OutputFormat(org.apache.flink.api.common.io.OutputFormat) IOException(java.io.IOException) Timestamp(java.sql.Timestamp) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Row(org.apache.flink.types.Row)

Aggregations

FilePath (com.alibaba.alink.common.io.filesystem.FilePath)2 FileModelStreamSink (com.alibaba.alink.operator.common.stream.model.FileModelStreamSink)2 IOException (java.io.IOException)2 Timestamp (java.sql.Timestamp)2 RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction)2 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)2 Configuration (org.apache.flink.configuration.Configuration)2 TableSchema (org.apache.flink.table.api.TableSchema)2 Row (org.apache.flink.types.Row)2 AnnotationUtils (com.alibaba.alink.common.io.annotations.AnnotationUtils)1 IOType (com.alibaba.alink.common.io.annotations.IOType)1 IoOpAnnotation (com.alibaba.alink.common.io.annotations.IoOpAnnotation)1 CsvUtil (com.alibaba.alink.operator.common.io.csv.CsvUtil)1 DummyOutputFormat (com.alibaba.alink.operator.common.io.dummy.DummyOutputFormat)1 ModelStreamUtils (com.alibaba.alink.operator.common.stream.model.ModelStreamUtils)1 StreamOperator (com.alibaba.alink.operator.stream.StreamOperator)1 ModelStreamFileSinkParams (com.alibaba.alink.params.io.ModelStreamFileSinkParams)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1