Use of com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat in project Alink by alibaba.
The class InternalCsvSourceBatchOp, method initializeDataSource:
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath().getPathStr();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();
    final boolean lenient = getLenient();
    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation<?>[] colTypes = CsvUtil.getColTypes(schemaStr);
    boolean ignoreFirstLine = getIgnoreFirstLine();

    // Sniff the protocol to decide between the HTTP reader and the file-system reader.
    String protocol = "";
    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
        // Not a URL; fall through to the file-system input format.
    }

    DataSet<Row> rows;
    ExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment();

    // A single STRING column: each physical line is read whole and parsed in a second stage.
    TableSchema dummySchema = new TableSchema(
        new String[] {"f1"}, new TypeInformation[] {Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(
                // rowDelim is passed as both delimiters so the whole line stays one field.
                new GenericCsvInputFormat(
                    reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim,
            new int[] {0}, true, getFilePath().getFileSystem());
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    // Second stage: split each raw line into typed columns.
    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine, lenient));
    return DataSetConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
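The pattern worth noting: RowCsvInputFormat is given a dummy one-column STRING schema and the row delimiter for both delimiter arguments, so every line survives as a single field; quoting and field splitting are deferred to CsvUtil.ParseCsvFunc. Below is a minimal sketch of using the input format directly the same way, assuming a hypothetical local file and Alink's FilePath helper to resolve the file system; it is an illustration under those assumptions, not Alink's own code.

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.types.Row;

import com.alibaba.alink.common.io.filesystem.FilePath;
import com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat;

public class RowCsvInputFormatSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Hypothetical local file; FilePath resolves the matching file system.
        FilePath filePath = new FilePath("/tmp/data.csv");

        // One STRING field per line, same as the dummySchema trick above.
        TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] {Types.STRING};
        RowCsvInputFormat format = new RowCsvInputFormat(
            filePath.getPath(), fieldTypes,
            "\n", "\n",                     // row delimiter reused as field delimiter
            new int[] {0}, true,
            filePath.getFileSystem());
        format.setSkipFirstLineAsHeader(true);

        DataSet<Row> lines = env.createInput(format).name("csv_source");
        lines.print();                      // each Row holds the raw line in field 0
    }
}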
Use of com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat in project Alink by alibaba.
The class InternalCsvSourceStreamOp, method initializeDataSource:
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath().getPathStr();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();
    final boolean lenient = getLenient();
    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation<?>[] colTypes = CsvUtil.getColTypes(schemaStr);
    boolean ignoreFirstLine = getIgnoreFirstLine();

    // Same protocol sniffing as the batch variant.
    String protocol = "";
    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
        // Not a URL; fall through to the file-system input format.
    }

    DataStream<Row> rows;
    StreamExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();

    // A single STRING column: each physical line is read whole and parsed in a second stage.
    TableSchema dummySchema = new TableSchema(
        new String[] {"f1"}, new TypeInformation[] {Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(
                // rowDelim is passed as both delimiters so the whole line stays one field.
                new GenericCsvInputFormat(
                    reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim,
            new int[] {0}, true, getFilePath().getFileSystem());
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    // Second stage: split each raw line into typed columns.
    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine, lenient));
    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
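In practice these internal ops are not called directly; the public CSV sources expose the same parameters (file path, schema string, delimiters, ignoreFirstLine) and are presumably backed by the internal ops shown here. A short usage sketch with placeholder path and schema:

import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.stream.StreamOperator;
import com.alibaba.alink.operator.stream.source.CsvSourceStreamOp;

public class CsvSourceUsage {

    public static void main(String[] args) throws Exception {
        // Batch: schemaStr supplies the colNames/colTypes that CsvUtil extracts above.
        new CsvSourceBatchOp()
            .setFilePath("/tmp/iris.csv")   // placeholder path
            .setSchemaStr("sepal_length double, sepal_width double, category string")
            .setIgnoreFirstLine(true)
            .print();

        // Stream: same parameters; print() attaches a sink, execute() runs the job.
        new CsvSourceStreamOp()
            .setFilePath("/tmp/iris.csv")
            .setSchemaStr("sepal_length double, sepal_width double, category string")
            .setIgnoreFirstLine(true)
            .print();
        StreamOperator.execute();
    }
}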