Search in sources :

Example 1 with RowCsvInputFormat

Use of com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat in the project Alink by Alibaba.

From the class InternalCsvSourceBatchOp, method initializeDataSource.

/**
 * Builds the batch {@link Table} that backs this CSV source.
 *
 * <p>The file is first read as raw rows with a single string column
 * ({@code "f1"}); those rows are then passed through
 * {@code CsvUtil.ParseCsvFunc}, which is given the column types, field
 * delimiter, quote character, blank-line flag and lenient flag. The result is
 * converted to a table using the column names and types derived from the
 * schema string.
 *
 * <p>The reader is chosen from the path's protocol: {@code http}/{@code https}
 * paths go through an {@code HttpFileSplitReader}; every other path (including
 * one that does not parse as a URL at all) is read with a
 * {@code RowCsvInputFormat} on the file system returned by
 * {@code getFilePath().getFileSystem()}.
 *
 * @return the table produced from the parsed CSV rows
 */
@Override
public Table initializeDataSource() {
    // Operator parameters, read once up front.
    final String path = getFilePath().getPathStr();
    final String schema = getSchemaStr();
    final String fieldSeparator = getFieldDelimiter();
    final String lineSeparator = getRowDelimiter();
    final Character quote = getQuoteChar();
    final boolean dropBlankLines = getSkipBlankLine();
    final boolean tolerant = getLenient();
    final String[] columnNames = CsvUtil.getColNames(schema);
    final TypeInformation<?>[] columnTypes = CsvUtil.getColTypes(schema);
    final boolean skipHeader = getIgnoreFirstLine();

    boolean overHttp = false;
    try {
        final String scheme = new URL(path).getProtocol();
        overHttp = "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);
    } catch (MalformedURLException ignored) {
        // The path is not a URL; leave overHttp false so the file-system reader is used.
    }

    final ExecutionEnvironment env = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment();
    // Schema of the raw read: one string column named "f1".
    final TableSchema lineSchema = new TableSchema(new String[] { "f1" }, new TypeInformation[] { Types.STRING });

    // NOTE(review): the row delimiter is passed in both delimiter positions of the
    // input formats below - presumably so a line is not split into fields at this
    // stage (field splitting happens in ParseCsvFunc); confirm against the
    // input format constructors.
    final DataSet<Row> rawLines;
    if (!overHttp) {
        final RowCsvInputFormat fileFormat = new RowCsvInputFormat(
            new Path(path),
            lineSchema.getFieldTypes(),
            lineSeparator,
            lineSeparator,
            new int[] { 0 },
            true,
            getFilePath().getFileSystem());
        fileFormat.setSkipFirstLineAsHeader(skipHeader);
        rawLines = env.createInput(fileFormat).name("csv_source");
    } else {
        final HttpFileSplitReader httpReader = new HttpFileSplitReader(path);
        final RowTypeInfo lineTypeInfo = new RowTypeInfo(lineSchema.getFieldTypes(), lineSchema.getFieldNames());
        rawLines = env
            .createInput(
                new GenericCsvInputFormat(httpReader, lineSchema.getFieldTypes(), lineSeparator, lineSeparator, skipHeader),
                lineTypeInfo)
            .name("http_csv_source");
    }

    final DataSet<Row> parsedRows = rawLines.flatMap(
        new CsvUtil.ParseCsvFunc(columnTypes, fieldSeparator, quote, dropBlankLines, tolerant));
    return DataSetConversionUtil.toTable(getMLEnvironmentId(), parsedRows, columnNames, columnTypes);
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) MalformedURLException(java.net.MalformedURLException) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) TableSchema(org.apache.flink.table.api.TableSchema) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) URL(java.net.URL) Row(org.apache.flink.types.Row) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader) RowCsvInputFormat(com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat)

Example 2 with RowCsvInputFormat

Use of com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat in the project Alink by Alibaba.

From the class InternalCsvSourceStreamOp, method initializeDataSource.

/**
 * Builds the streaming {@link Table} that backs this CSV source.
 *
 * <p>The file is first read as raw rows with a single string column
 * ({@code "f1"}); those rows are then passed through
 * {@code CsvUtil.ParseCsvFunc}, which is given the column types, field
 * delimiter, quote character, blank-line flag and lenient flag. The result is
 * converted to a table using the column names and types derived from the
 * schema string.
 *
 * <p>The reader is chosen from the path's protocol: {@code http}/{@code https}
 * paths go through an {@code HttpFileSplitReader}; every other path (including
 * one that does not parse as a URL at all) is read with a
 * {@code RowCsvInputFormat} on the file system returned by
 * {@code getFilePath().getFileSystem()}.
 *
 * @return the table produced from the parsed CSV rows
 */
@Override
public Table initializeDataSource() {
    // Operator parameters, read once up front.
    final String path = getFilePath().getPathStr();
    final String schema = getSchemaStr();
    final String fieldSeparator = getFieldDelimiter();
    final String lineSeparator = getRowDelimiter();
    final Character quote = getQuoteChar();
    final boolean dropBlankLines = getSkipBlankLine();
    final boolean tolerant = getLenient();
    final String[] columnNames = CsvUtil.getColNames(schema);
    final TypeInformation<?>[] columnTypes = CsvUtil.getColTypes(schema);
    final boolean skipHeader = getIgnoreFirstLine();

    boolean overHttp = false;
    try {
        final String scheme = new URL(path).getProtocol();
        overHttp = "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);
    } catch (MalformedURLException ignored) {
        // The path is not a URL; leave overHttp false so the file-system reader is used.
    }

    final StreamExecutionEnvironment env = MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();
    // Schema of the raw read: one string column named "f1".
    final TableSchema lineSchema = new TableSchema(new String[] { "f1" }, new TypeInformation[] { Types.STRING });

    // NOTE(review): the row delimiter is passed in both delimiter positions of the
    // input formats below - presumably so a line is not split into fields at this
    // stage (field splitting happens in ParseCsvFunc); confirm against the
    // input format constructors.
    final DataStream<Row> rawLines;
    if (!overHttp) {
        final RowCsvInputFormat fileFormat = new RowCsvInputFormat(
            new Path(path),
            lineSchema.getFieldTypes(),
            lineSeparator,
            lineSeparator,
            new int[] { 0 },
            true,
            getFilePath().getFileSystem());
        fileFormat.setSkipFirstLineAsHeader(skipHeader);
        rawLines = env.createInput(fileFormat).name("csv_source");
    } else {
        final HttpFileSplitReader httpReader = new HttpFileSplitReader(path);
        final RowTypeInfo lineTypeInfo = new RowTypeInfo(lineSchema.getFieldTypes(), lineSchema.getFieldNames());
        rawLines = env
            .createInput(
                new GenericCsvInputFormat(httpReader, lineSchema.getFieldTypes(), lineSeparator, lineSeparator, skipHeader),
                lineTypeInfo)
            .name("http_csv_source");
    }

    final DataStream<Row> parsedRows = rawLines.flatMap(
        new CsvUtil.ParseCsvFunc(columnTypes, fieldSeparator, quote, dropBlankLines, tolerant));
    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), parsedRows, columnNames, columnTypes);
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) MalformedURLException(java.net.MalformedURLException) TableSchema(org.apache.flink.table.api.TableSchema) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) URL(java.net.URL) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Row(org.apache.flink.types.Row) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader) RowCsvInputFormat(com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat)

Aggregations

FilePath (com.alibaba.alink.common.io.filesystem.FilePath)2 RowCsvInputFormat (com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat)2 HttpFileSplitReader (com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader)2 MalformedURLException (java.net.MalformedURLException)2 URL (java.net.URL)2 TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation)2 RowTypeInfo (org.apache.flink.api.java.typeutils.RowTypeInfo)2 Path (org.apache.flink.core.fs.Path)2 TableSchema (org.apache.flink.table.api.TableSchema)2 Row (org.apache.flink.types.Row)2 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)1 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)1