use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.
the class ModelStreamFileSinkStreamOp method sinkFrom.
/**
 * Writes the incoming model stream to the configured file path.
 *
 * <p>The input schema must contain a timestamp column (the model identifier) and a count column
 * (the declared number of rows of that model); both are located through {@link ModelStreamUtils}
 * and stripped from the rows that are written.
 *
 * <p>Pipeline overview:
 * <ol>
 *   <li>{@code count}: for every (timestamp, subtask) it tracks how many rows were seen and emits
 *       the per-subtask row counts once their sum equals the declared total of the model.</li>
 *   <li>Writer stage: every (timestamp, subtask) pair writes its rows through its own
 *       {@link FileModelStreamSink}; the sink is closed and the timestamp is emitted as soon as
 *       the number of written rows matches the count received from {@code count}.</li>
 *   <li>Commit stage: per timestamp, once as many files have been closed as are expected,
 *       {@code finalizeGlobal} is invoked to publish the model.</li>
 * </ol>
 *
 * @param in the stream operator providing the model stream.
 * @return this operator.
 * @throws IllegalStateException if the global initialization of the sink fails with an I/O error.
 */
@Override
public ModelStreamFileSinkStreamOp sinkFrom(StreamOperator<?> in) {
    TableSchema schema = in.getSchema();
    final int timestampColIndex = ModelStreamUtils.findTimestampColIndexWithAssertAndHint(schema);
    final int countColIndex = ModelStreamUtils.findCountColIndexWithAssertAndHint(schema);
    // Schema of the rows that are actually written: the input schema without the two identifier columns.
    final TableSchema dataSchema = new TableSchema(
        ArrayUtils.removeAll(schema.getFieldNames(), timestampColIndex, countColIndex),
        ArrayUtils.removeAll(schema.getFieldTypes(), timestampColIndex, countColIndex));
    final String dataSchemaStr = CsvUtil.schema2SchemaStr(dataSchema);
    final FilePath path = getFilePath();
    final int numKeepModel = getNumKeepModel();
    final FileModelStreamSink fileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
    try {
        fileModelStreamSink.initializeGlobal();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    DataStream<Row> inputStream = in.getDataStream();

    // count: (timestamp, subtask index, declared total rows of the model, rows seen by this subtask).
    DataStream<Tuple4<Timestamp, Integer, Long, Long>> count = inputStream
        .map(new RichMapFunction<Row, Tuple4<Timestamp, Integer, Long, Long>>() {
            @Override
            public Tuple4<Timestamp, Integer, Long, Long> map(Row value) {
                return Tuple4.of(
                    (Timestamp) value.getField(timestampColIndex),
                    getRuntimeContext().getIndexOfThisSubtask(),
                    (Long) value.getField(countColIndex),
                    1L);
            }
        })
        .keyBy(0, 1, 2)
        // Running number of rows per (timestamp, subtask, declared total).
        .reduce(new ReduceFunction<Tuple4<Timestamp, Integer, Long, Long>>() {
            @Override
            public Tuple4<Timestamp, Integer, Long, Long> reduce(
                Tuple4<Timestamp, Integer, Long, Long> value1,
                Tuple4<Timestamp, Integer, Long, Long> value2) {
                return Tuple4.of(value1.f0, value1.f1, value1.f2, value1.f3 + value2.f3);
            }
        })
        .keyBy(0, 2)
        .flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Long, Long>>() {
            // Latest running count per subtask index for the current key.
            private transient MapState<Integer, Tuple4<Timestamp, Integer, Long, Long>> latest;

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                latest = getRuntimeContext().getMapState(new MapStateDescriptor<>(
                    "latest",
                    Types.INT,
                    new TupleTypeInfo<>(Types.SQL_TIMESTAMP, Types.INT, Types.LONG, Types.LONG)));
            }

            @Override
            public void flatMap(
                Tuple4<Timestamp, Integer, Long, Long> value,
                Collector<Tuple4<Timestamp, Integer, Long, Long>> out) throws Exception {
                latest.put(value.f1, value);
                long sum = 0;
                long total = -1;
                for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                    total = entry.getValue().f2;
                    sum += entry.getValue().f3;
                }
                // Emit the per-subtask counts only when the rows seen so far add up to the declared total.
                if (total == sum) {
                    for (Map.Entry<Integer, Tuple4<Timestamp, Integer, Long, Long>> entry : latest.entries()) {
                        out.collect(entry.getValue());
                    }
                }
            }
        });

    inputStream
        .map(new RichMapFunction<Row, Tuple3<Timestamp, Integer, Row>>() {
            @Override
            public Tuple3<Timestamp, Integer, Row> map(Row value) {
                return Tuple3.of(
                    (Timestamp) value.getField(timestampColIndex),
                    getRuntimeContext().getIndexOfThisSubtask(),
                    ModelStreamUtils.genRowWithoutIdentifier(value, timestampColIndex, countColIndex));
            }
        })
        .keyBy(0, 1)
        .connect(count.keyBy(0, 1))
        .flatMap(new RichCoFlatMapFunction<Tuple3<Timestamp, Integer, Row>, Tuple4<Timestamp, Integer, Long, Long>, Tuple1<Timestamp>>() {
            // (timestamp, subtask) -> (open sink or null, rows written or null, expected rows or null).
            private final Map<Tuple2<Timestamp, Integer>, Tuple3<FileModelStreamSink, Long, Long>> writerContainer = new HashMap<>();

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
            }

            @Override
            public void flatMap1(Tuple3<Timestamp, Integer, Row> value, Collector<Tuple1<Timestamp>> out) {
                writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                    if (oldValue == null) {
                        // First row of this file and no expected count known yet.
                        FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                        try {
                            localFileModelStreamSink.open(value.f0, value.f1);
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                        localFileModelStreamSink.collect(value.f2);
                        return Tuple3.of(localFileModelStreamSink, 1L, null);
                    } else if (oldValue.f0 == null) {
                        // The expected count arrived before the first row.
                        FileModelStreamSink localFileModelStreamSink = new FileModelStreamSink(path, dataSchemaStr);
                        try {
                            localFileModelStreamSink.open(value.f0, value.f1);
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                        localFileModelStreamSink.collect(value.f2);
                        if (oldValue.f2 != null && oldValue.f2.equals(1L)) {
                            // Single-row file: it is complete right away.
                            localFileModelStreamSink.close();
                            out.collect(Tuple1.of(key.f0));
                            return null;
                        } else {
                            return Tuple3.of(localFileModelStreamSink, 1L, null);
                        }
                    } else {
                        oldValue.f0.collect(value.f2);
                        ++oldValue.f1;
                        if (oldValue.f2 != null && oldValue.f2.equals(oldValue.f1)) {
                            // All expected rows are written: close the file and report it.
                            oldValue.f0.close();
                            out.collect(Tuple1.of(key.f0));
                            return null;
                        } else {
                            return Tuple3.of(oldValue.f0, oldValue.f1, oldValue.f2);
                        }
                    }
                });
            }

            @Override
            public void flatMap2(Tuple4<Timestamp, Integer, Long, Long> value, Collector<Tuple1<Timestamp>> out) {
                writerContainer.compute(Tuple2.of(value.f0, value.f1), (key, oldValue) -> {
                    if (oldValue == null) {
                        // No row seen yet: remember only the expected number of rows.
                        return Tuple3.of(null, null, value.f3);
                    } else {
                        if (value.f3.equals(oldValue.f1)) {
                            oldValue.f0.close();
                            out.collect(Tuple1.of(key.f0));
                            return null;
                        } else {
                            return Tuple3.of(oldValue.f0, oldValue.f1, value.f3);
                        }
                    }
                });
            }
        })
        .keyBy(0)
        .connect(count
            .keyBy(0)
            // Per timestamp: emits (timestamp, subtask, number of files, declared total) for every
            // file once the per-file row counts add up to the declared total.
            .flatMap(new RichFlatMapFunction<Tuple4<Timestamp, Integer, Long, Long>, Tuple4<Timestamp, Integer, Integer, Long>>() {
                // (subtask index, rows written by that subtask) seen so far for the current timestamp.
                private transient ListState<Tuple2<Integer, Long>> filesCounter;

                @Override
                public void open(Configuration parameters) throws Exception {
                    super.open(parameters);
                    filesCounter = getRuntimeContext().getListState(
                        new ListStateDescriptor<>("filesCounter", new TupleTypeInfo<>(Types.INT, Types.LONG)));
                }

                @Override
                public void flatMap(
                    Tuple4<Timestamp, Integer, Long, Long> value,
                    Collector<Tuple4<Timestamp, Integer, Integer, Long>> out) throws Exception {
                    Long sum = value.f3;
                    Integer fileCount = 1;
                    List<Tuple2<Integer, Long>> local = new ArrayList<>();
                    local.add(Tuple2.of(value.f1, value.f3));
                    for (Tuple2<Integer, Long> count : filesCounter.get()) {
                        sum += count.f1;
                        fileCount++;
                        local.add(count);
                    }
                    if (value.f2.equals(sum)) {
                        for (Tuple2<Integer, Long> count : local) {
                            out.collect(Tuple4.of(value.f0, count.f0, fileCount, value.f2));
                        }
                    }
                    filesCounter.add(Tuple2.of(value.f1, value.f3));
                }
            })
            .keyBy(0))
        .flatMap(new RichCoFlatMapFunction<Tuple1<Timestamp>, Tuple4<Timestamp, Integer, Integer, Long>, byte[]>() {
            // Number of files already closed for the current timestamp.
            private transient ValueState<Integer> filesCounter;
            // (subtask index, expected number of files, declared total rows) per expected file.
            private transient ListState<Tuple3<Integer, Integer, Long>> total;

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                filesCounter = getRuntimeContext().getState(new ValueStateDescriptor<>("filesCounter", Types.INT));
                // The type information must describe all three fields of the stored Tuple3;
                // a two-field descriptor would create a serializer that does not match the elements.
                total = getRuntimeContext().getListState(
                    new ListStateDescriptor<>("total", new TupleTypeInfo<>(Types.INT, Types.INT, Types.LONG)));
            }

            @Override
            public void flatMap1(Tuple1<Timestamp> value, Collector<byte[]> out) throws Exception {
                Integer count = filesCounter.value();
                if (count == null) {
                    count = 1;
                } else {
                    ++count;
                }
                List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
                for (Tuple3<Integer, Integer, Long> t : total.get()) {
                    local.add(t);
                }
                // Commit when every expected file is both described and closed.
                if (!local.isEmpty() && local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(count)) {
                    List<Integer> filesId = new ArrayList<>();
                    for (Tuple3<Integer, Integer, Long> t : local) {
                        filesId.add(t.f0);
                    }
                    new FileModelStreamSink(path, dataSchemaStr)
                        .finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
                }
                filesCounter.update(count);
            }

            @Override
            public void flatMap2(Tuple4<Timestamp, Integer, Integer, Long> value, Collector<byte[]> out) throws Exception {
                List<Tuple3<Integer, Integer, Long>> local = new ArrayList<>();
                local.add(Tuple3.of(value.f1, value.f2, value.f3));
                for (Tuple3<Integer, Integer, Long> t : total.get()) {
                    local.add(t);
                }
                // Commit when every expected file is both described and closed.
                if (local.get(0).f1.equals(local.size()) && local.get(0).f1.equals(filesCounter.value())) {
                    List<Integer> filesId = new ArrayList<>();
                    for (Tuple3<Integer, Integer, Long> t : local) {
                        filesId.add(t.f0);
                    }
                    new FileModelStreamSink(path, dataSchemaStr)
                        .finalizeGlobal(value.f0, local.get(0).f2, filesId, numKeepModel);
                }
                total.add(Tuple3.of(value.f1, value.f2, value.f3));
            }
        })
        // The commit stage emits nothing; the dummy output only terminates the pipeline.
        .writeUsingOutputFormat(new DummyOutputFormat<>());
    return this;
}
use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.
the class DLEnvConfig method getDefaultPythonEnv.
/**
 * Resolves the default Python environment location for the given version.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the environment variable, then the JVM system property, named by {@code PYTHON_ENV_KEY}
 *       for this version; the value is prefixed with {@code file://} when it lacks that prefix;</li>
 *   <li>the resource plugin directory, which must contain the compressed file derived from the
 *       remote path;</li>
 *   <li>the remote path registered in {@code PYTHON_ENV_PATH_MAP} for the current OS and version.</li>
 * </ol>
 *
 * @param version the version whose Python environment is requested.
 * @return the location of the Python environment.
 * @throws RuntimeException if none of the lookups yields a location.
 */
static String getDefaultPythonEnv(Version version) {
    // 1. An explicit setting (environment variable first, system property second) always wins.
    final String envKey = PYTHON_ENV_KEY.get(version);
    if (envKey != null) {
        String configured = System.getenv(envKey);
        if (configured == null) {
            configured = System.getProperty(envKey);
        }
        if (configured != null) {
            return configured.startsWith("file://") ? configured : "file://" + configured;
        }
    }

    final String remotePath = PYTHON_ENV_PATH_MAP.get(Pair.of(OsUtils.getSystemType(), version));

    // 2. Look into the plugin directory; a failure here only triggers the fallback below.
    final RegisterKey registerKey = getRegisterKey(version);
    FilePath pluginFilePath = null;
    try {
        pluginFilePath = ResourcePluginFactory.getResourcePluginPath(registerKey);
    } catch (Exception e) {
        final String info = String.format(
            "Cannot prepare plugin for %s-%s, fallback to direct downloading from %s.",
            registerKey.getName(), registerKey.getVersion(), remotePath);
        LOG.info(info, e);
        if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
            System.out.println(info + ":" + e);
        }
    }
    if (pluginFilePath != null) {
        final String compressedFileName = PythonFileUtils.getCompressedFileName(remotePath);
        final String pluginDir = pluginFilePath.getPath().toString();
        final File directoryFile = new File(pluginDir, compressedFileName);
        Preconditions.checkArgument(
            directoryFile.exists(),
            String.format(
                "There should be a directory named %s in plugin directory %s, but cannot be found.",
                compressedFileName, pluginDir));
        return "file://" + directoryFile.getAbsolutePath();
    }

    // 3. Fall back to the registered remote path.
    if (remotePath != null) {
        return remotePath;
    }
    throw new RuntimeException(String.format("Default python env for %s not specified.", version.name()));
}
use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.
the class FileModelStreamSink method cleanUp.
/**
 * Deletes the oldest models under {@code filePath} so that at most {@code numKeepModel} remain.
 *
 * <p>For every removed model both its folder (recursively) and its {@code <model>.log} file in
 * the {@code MODEL_CONF} folder are deleted. A negative {@code numKeepModel} disables the clean-up.
 *
 * @param filePath     root folder of the model stream.
 * @param numKeepModel number of most recent models to keep; negative means keep everything.
 * @throws IOException if listing or deleting fails.
 */
private static void cleanUp(FilePath filePath, int numKeepModel) throws IOException {
    if (numKeepModel >= 0) {
        List<Timestamp> sortedModels = ModelStreamUtils.listModels(filePath);
        sortedModels.sort(Timestamp::compareTo);

        BaseFileSystem<?> fs = filePath.getFileSystem();
        Path logFolder = new Path(filePath.getPath(), MODEL_CONF);

        // Models are sorted ascending, so the expired ones form the head of the list.
        int expiredCount = Math.max(sortedModels.size() - numKeepModel, 0);
        for (Timestamp expired : sortedModels.subList(0, expiredCount)) {
            String modelName = ModelStreamUtils.toStringPresentation(expired);
            // Remove the model folder first, then its log.
            fs.delete(new Path(filePath.getPath(), modelName), true);
            fs.delete(new Path(logFolder, String.format("%s.log", modelName)), false);
        }
    }
}
use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.
the class FileModelStreamSink method finalizeGlobal.
/**
 * Commits a completely written model so that it becomes visible under the model stream folder.
 *
 * <p>Steps:
 * <ol>
 *   <li>create the staging folder {@code <MODEL_CONF>/<model>}; it must not exist yet;</li>
 *   <li>move every in-progress file {@code <MODEL_CONF>/<model>_<fileId>} into the staging folder,
 *       renamed to its position in the sorted id list;</li>
 *   <li>write the log {@code <MODEL_CONF>/<model>.log} containing the JSON of the model meta;</li>
 *   <li>rename the staging folder to {@code <root>/<model>};</li>
 *   <li>remove outdated models according to {@code numKeepModel}.</li>
 * </ol>
 *
 * @param modelId      timestamp identifying the model.
 * @param numRows      number of rows of the model, stored in the meta.
 * @param filesId      ids of the in-progress files of the model; sorted in place by this method.
 * @param numKeepModel number of models to keep after the commit, passed to the clean-up.
 * @throws IOException if the staging folder already exists, a rename fails, or any file system
 *                     operation fails.
 */
public void finalizeGlobal(Timestamp modelId, long numRows, List<Integer> filesId, int numKeepModel) throws IOException {
    BaseFileSystem<?> fileSystem = filePath.getFileSystem();
    final String modelName = ModelStreamUtils.toStringPresentation(modelId);

    // construct model folder
    Path confDirPath = new Path(filePath.getPath(), MODEL_CONF);
    Path modelPath = new Path(confDirPath, modelName);
    if (fileSystem.exists(modelPath)) {
        throw new IOException(String.format("ModelPath: %s has existed.", modelPath));
    } else {
        fileSystem.mkdirs(modelPath);
    }

    filesId.sort(Integer::compareTo);
    for (int i = 0; i < filesId.size(); ++i) {
        Path subInProgressModelFilePath = new Path(confDirPath, String.format("%s_%d", modelName, filesId.get(i)));
        Path subToCommitModelFilePath = new Path(modelPath, String.valueOf(i));
        if (!fileSystem.rename(subInProgressModelFilePath, subToCommitModelFilePath)) {
            throw new IOException(String.format(
                "Submit sub-model %s to %s failed. Maybe folder %s exists.",
                subInProgressModelFilePath, subToCommitModelFilePath, subToCommitModelFilePath));
        }
    }

    // if done, write redo log.
    Path logPath = new Path(confDirPath, String.format("%s.log", modelName));
    try (FSDataOutputStream outputStream = fileSystem.create(logPath, WriteMode.OVERWRITE)) {
        // Encode explicitly as UTF-8 so the log does not depend on the platform default charset.
        outputStream.write(
            JsonConverter.toJson(new ModelStreamMeta(numRows, filesId.size()))
                .getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (Exception ex) {
        // if write fail, delete the redo log to make the model invalid.
        try {
            fileSystem.delete(logPath, false);
        } catch (Exception deleteFailure) {
            // Keep the write failure as the primary error; the delete failure must not hide it.
            ex.addSuppressed(deleteFailure);
        }
        throw ex;
    }

    // if done, do commit.
    Path finalModelPath = new Path(filePath.getPath(), modelName);
    if (!fileSystem.rename(modelPath, finalModelPath)) {
        throw new IOException(String.format(
            "Submit model %s to %s failed. Maybe folder %s exists.",
            modelPath, finalModelPath, finalModelPath));
    }

    // if done, do clean up
    cleanUp(filePath, numKeepModel);
}
use of com.alibaba.alink.common.io.filesystem.FilePath in project Alink by alibaba.
the class ModelStreamUtils method getSchemaFromFolder.
/**
 * Reads the table schema of a model stream folder from the meta of the first listed model.
 *
 * @param filePath root folder of the model stream.
 * @return the schema parsed from the schema string stored in the model's meta.
 * @throws IOException              if listing the models or reading the meta fails.
 * @throws IllegalArgumentException if the folder contains no model.
 */
public static TableSchema getSchemaFromFolder(FilePath filePath) throws IOException {
    final List<Timestamp> available = listModels(filePath);
    if (!available.isEmpty()) {
        // The first model returned by the listing serves as the schema source.
        Path firstModelPath = new Path(filePath.getPath(), toStringPresentation(available.get(0)));
        FilePath firstModel = new FilePath(firstModelPath, filePath.getFileSystem());
        return CsvUtil.schemaStr2Schema(AkUtils.getMetaFromPath(firstModel).schemaStr);
    }
    throw new IllegalArgumentException("Stream model is empty. path: " + filePath.getPath().toString());
}
Aggregations