Search in sources:

Example 1 with HttpFileSplitReader

use of com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader in project Alink by alibaba.

the class FilePath method download.

/**
 * Copies a single file out of {@code folder} into a newly created directory under
 * {@code java.io.tmpdir} and returns that directory's absolute path.
 *
 * <p>If the folder's file system is a {@code LocalFileSystem}, nothing is copied and the
 * folder's own path string is returned instead. For {@code http}/{@code https} folders the
 * file is streamed through an {@code HttpFileSplitReader}; for every other scheme it is read
 * through the folder's file system.
 *
 * @param folder   folder that contains the file
 * @param fileName name of the file, resolved relative to {@code folder}
 * @return the folder's path string for local folders, otherwise the absolute path of the
 *         temporary directory that now holds the copy
 * @throws IOException      if reading or writing fails, or if the HTTP stream ends before
 *                          the length reported by the reader has been received
 * @throws RuntimeException if the temporary directory cannot be created
 */
public static String download(FilePath folder, String fileName) throws IOException {
    // local
    if (folder.getFileSystem() instanceof LocalFileSystem) {
        return folder.getPathStr();
    }
    File localConfDir = new File(System.getProperty("java.io.tmpdir"), FileUtils.getRandomFilename(""));
    String scheme = folder.getPath().toUri().getScheme();
    if (!localConfDir.mkdir()) {
        throw new RuntimeException("Could not create the dir " + localConfDir.getAbsolutePath());
    }
    try (FileOutputStream outputStream = new FileOutputStream(Paths.get(localConfDir.getPath(), fileName).toFile())) {
        // http
        if (scheme != null && (scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https"))) {
            try (HttpFileSplitReader reader = new HttpFileSplitReader(new Path(folder.getPath(), fileName).toString())) {
                long fileLen = reader.getFileLength();
                reader.open(null, 0, fileLen);
                byte[] buffer = new byte[1024];
                // Progress through the file is tracked separately from the position inside
                // the buffer: every chunk is read into, and written from, buffer index 0.
                // Using the file offset as the buffer index overruns the 1024-byte array
                // as soon as the file is larger than one chunk.
                long copied = 0;
                while (copied < fileLen) {
                    int len = reader.read(buffer, 0, buffer.length);
                    if (len < 0) {
                        throw new IOException("Unexpected end of stream after " + copied + " of " + fileLen
                            + " bytes while downloading " + fileName);
                    }
                    outputStream.write(buffer, 0, len);
                    copied += len;
                }
            }
        } else {
            // file system
            try (FSDataInputStream inputStream = folder.getFileSystem().open(new Path(folder.getPath(), fileName))) {
                IOUtils.copy(inputStream, outputStream);
            }
        }
        return localConfDir.getAbsolutePath();
    }
}
Also used : Path(org.apache.flink.core.fs.Path) FileOutputStream(java.io.FileOutputStream) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader) File(java.io.File)

Example 2 with HttpFileSplitReader

use of com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader in project Alink by alibaba.

the class HiveCatalog method fileExists.

/**
 * Tells whether {@code file} exists inside {@code folder}.
 *
 * <p>For {@code http}/{@code https} folders that are not backed by a {@code LocalFileSystem},
 * existence is probed by opening the file with an {@code HttpFileSplitReader}; a
 * {@code FileNotFoundException} raised while doing so means the file is absent. In every
 * other case the answer comes from the folder's file system.
 *
 * @param folder folder to look in
 * @param file   name of the file, relative to {@code folder}
 * @return {@code true} if the file exists, {@code false} otherwise
 * @throws IOException if the check fails for a reason other than the file being missing
 */
public static boolean fileExists(FilePath folder, String file) throws IOException {
    boolean onLocalFileSystem = folder.getFileSystem() instanceof LocalFileSystem;
    if (!onLocalFileSystem) {
        final String scheme = folder.getPath().toUri().getScheme();
        // equalsIgnoreCase(null) is false, so a missing scheme falls through to the file system.
        if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
            try (HttpFileSplitReader probe = new HttpFileSplitReader(folder.getPathStr() + "/" + file)) {
                long length = probe.getFileLength();
                probe.open(null, 0, length);
            } catch (FileNotFoundException missing) {
                return false;
            }
            return true;
        }
    }
    // local file systems and all non-HTTP schemes
    return folder.getFileSystem().exists(new Path(folder.getPath(), file));
}
Also used : Path(org.apache.flink.core.fs.Path) ObjectPath(org.apache.flink.table.catalog.ObjectPath) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) LocalFileSystem(com.alibaba.alink.common.io.filesystem.LocalFileSystem) FileNotFoundException(java.io.FileNotFoundException) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader)

Example 3 with HttpFileSplitReader

use of com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader in project Alink by alibaba.

the class HiveCatalog method readFile.

/**
 * Reads the whole file at {@code filePath} and decodes it as UTF-8.
 *
 * <p>{@code http}/{@code https} paths are read through an {@code HttpFileSplitReader};
 * every other scheme is read through the path's file system.
 *
 * @param filePath location of the file to read
 * @return the file content decoded as UTF-8
 * @throws IOException if the file cannot be read, or if the length reported for an HTTP
 *                     file is negative or too large to hold in a single byte array
 */
public static String readFile(FilePath filePath) throws IOException {
    String scheme = filePath.getPath().toUri().getScheme();
    if (scheme != null && (scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https"))) {
        try (HttpFileSplitReader reader = new HttpFileSplitReader(filePath.toString())) {
            long fileLen = reader.getFileLength();
            reader.open(null, 0, fileLen);
            if (fileLen < 0 || fileLen > Integer.MAX_VALUE) {
                throw new IOException("Cannot read " + filePath + " into memory: reported length is " + fileLen);
            }
            int len = (int) fileLen;
            byte[] buffer = new byte[len];
            // A single read may deliver fewer bytes than requested, so keep reading until
            // the buffer is full or the stream ends.
            int filled = 0;
            while (filled < len) {
                int n = reader.read(buffer, filled, len - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            // Decode only what was actually received so a short stream does not leave
            // trailing NUL characters in the result.
            return new String(buffer, 0, filled, StandardCharsets.UTF_8);
        }
    } else {
        try (FSDataInputStream inputStream = filePath.getFileSystem().open(filePath.getPath())) {
            return IOUtils.toString(inputStream, StandardCharsets.UTF_8);
        }
    }
}
Also used : FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader)

Example 4 with HttpFileSplitReader

use of com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader in project Alink by alibaba.

the class HiveCatalog method downloadFolder.

/**
 * Copies the named files out of {@code folder} into a newly created directory under
 * {@code java.io.tmpdir} and returns that directory's absolute path.
 *
 * <p>If the folder's file system is a {@code LocalFileSystem}, nothing is copied and the
 * folder's own path string is returned instead. Files that do not exist in the folder are
 * skipped rather than reported as errors.
 *
 * @param folder folder that contains the files
 * @param files  names of the files to copy, each resolved relative to {@code folder}
 * @return the folder's path string for local folders, otherwise the absolute path of the
 *         temporary directory holding the copies
 * @throws IOException      if reading or writing fails, or if an HTTP stream ends before
 *                          the length reported by the reader has been received
 * @throws RuntimeException if the temporary directory cannot be created
 */
public static String downloadFolder(FilePath folder, String... files) throws IOException {
    // local
    if (folder.getFileSystem() instanceof LocalFileSystem) {
        return folder.getPathStr();
    }
    File localConfDir = new File(System.getProperty("java.io.tmpdir"), FileUtils.getRandomFilename(""));
    String scheme = folder.getPath().toUri().getScheme();
    if (!localConfDir.mkdir()) {
        throw new RuntimeException("Could not create the dir " + localConfDir.getAbsolutePath());
    }
    if (scheme != null && (scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https"))) {
        for (String path : files) {
            try (HttpFileSplitReader reader = new HttpFileSplitReader(folder.getPathStr() + "/" + path)) {
                long fileLen = reader.getFileLength();
                reader.open(null, 0, fileLen);
                byte[] buffer = new byte[1024];
                try (FileOutputStream outputStream = new FileOutputStream(Paths.get(localConfDir.getPath(), path).toFile())) {
                    // Progress through the file is tracked separately from the position
                    // inside the buffer: every chunk is read into, and written from,
                    // buffer index 0. Using the file offset as the buffer index overruns
                    // the 1024-byte array once the file is larger than one chunk.
                    long copied = 0;
                    while (copied < fileLen) {
                        int len = reader.read(buffer, 0, buffer.length);
                        if (len < 0) {
                            throw new IOException("Unexpected end of stream after " + copied + " of " + fileLen
                                + " bytes while downloading " + path);
                        }
                        outputStream.write(buffer, 0, len);
                        copied += len;
                    }
                }
            } catch (FileNotFoundException ignored) {
                // Deliberate: a file that is missing is skipped, mirroring the exists()
                // check in the file-system branch below.
                // NOTE(review): this also hides a FileNotFoundException thrown when the
                // local output file cannot be created - confirm that is intended.
            }
        }
    } else {
        for (String path : files) {
            // file system
            if (!folder.getFileSystem().exists(new Path(folder.getPath(), path))) {
                continue;
            }
            try (FSDataInputStream inputStream = folder.getFileSystem().open(new Path(folder.getPath(), path));
                FileOutputStream outputStream = new FileOutputStream(Paths.get(localConfDir.getPath(), path).toFile())) {
                IOUtils.copy(inputStream, outputStream);
            }
        }
    }
    return localConfDir.getAbsolutePath();
}
Also used : Path(org.apache.flink.core.fs.Path) ObjectPath(org.apache.flink.table.catalog.ObjectPath) FilePath(com.alibaba.alink.common.io.filesystem.FilePath) LocalFileSystem(com.alibaba.alink.common.io.filesystem.LocalFileSystem) FileOutputStream(java.io.FileOutputStream) FileNotFoundException(java.io.FileNotFoundException) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader) File(java.io.File)

Example 5 with HttpFileSplitReader

use of com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader in project Alink by alibaba.

the class InternalCsvSourceBatchOp method initializeDataSource.

/**
 * Builds the source table for the configured CSV file.
 *
 * <p>The file is first read as a data set of single-column string rows ({@code f1}), using
 * an {@code HttpFileSplitReader}-backed input format when the path parses as an
 * {@code http}/{@code https} URL and a {@code RowCsvInputFormat} otherwise. Each row is then
 * passed through {@code CsvUtil.ParseCsvFunc} and the result is converted to a table with
 * the column names and types taken from the schema string.
 *
 * @return the table produced from the parsed rows
 */
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath().getPathStr();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();
    final boolean lenient = getLenient();
    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation<?>[] colTypes = CsvUtil.getColTypes(schemaStr);
    final boolean skipHeader = getIgnoreFirstLine();

    // A path that does not parse as a URL is treated as a non-HTTP source.
    boolean overHttp;
    try {
        String protocol = new URL(filePath).getProtocol();
        overHttp = protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https");
    } catch (MalformedURLException ignored) {
        overHttp = false;
    }

    ExecutionEnvironment env = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment();
    TableSchema lineSchema = new TableSchema(new String[] { "f1" }, new TypeInformation[] { Types.STRING });

    DataSet<Row> lines;
    if (overHttp) {
        HttpFileSplitReader httpReader = new HttpFileSplitReader(filePath);
        GenericCsvInputFormat httpFormat = new GenericCsvInputFormat(httpReader, lineSchema.getFieldTypes(), rowDelim, rowDelim, skipHeader);
        RowTypeInfo lineType = new RowTypeInfo(lineSchema.getFieldTypes(), lineSchema.getFieldNames());
        lines = execEnv(env, httpFormat, lineType);
    } else {
        RowCsvInputFormat fileFormat = new RowCsvInputFormat(new Path(filePath), lineSchema.getFieldTypes(), rowDelim, rowDelim, new int[] { 0 }, true, getFilePath().getFileSystem());
        fileFormat.setSkipFirstLineAsHeader(skipHeader);
        lines = env.createInput(fileFormat).name("csv_source");
    }

    DataSet<Row> parsed = lines.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine, lenient));
    return DataSetConversionUtil.toTable(getMLEnvironmentId(), parsed, colNames, colTypes);
}

/** Creates the named HTTP CSV source from the given input format and row type. */
private static DataSet<Row> execEnv(ExecutionEnvironment env, GenericCsvInputFormat format, RowTypeInfo rowType) {
    return env.createInput(format, rowType).name("http_csv_source");
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) Path(org.apache.flink.core.fs.Path) MalformedURLException(java.net.MalformedURLException) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) TableSchema(org.apache.flink.table.api.TableSchema) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) URL(java.net.URL) Row(org.apache.flink.types.Row) HttpFileSplitReader(com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader) RowCsvInputFormat(com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat)

Aggregations

HttpFileSplitReader (com.alibaba.alink.operator.common.io.reader.HttpFileSplitReader): 6, Path (org.apache.flink.core.fs.Path): 5, FilePath (com.alibaba.alink.common.io.filesystem.FilePath): 4, FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream): 3, LocalFileSystem (com.alibaba.alink.common.io.filesystem.LocalFileSystem): 2, RowCsvInputFormat (com.alibaba.alink.common.io.filesystem.copy.csv.RowCsvInputFormat): 2, File (java.io.File): 2, FileNotFoundException (java.io.FileNotFoundException): 2, FileOutputStream (java.io.FileOutputStream): 2, MalformedURLException (java.net.MalformedURLException): 2, URL (java.net.URL): 2, TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 2, RowTypeInfo (org.apache.flink.api.java.typeutils.RowTypeInfo): 2, TableSchema (org.apache.flink.table.api.TableSchema): 2, ObjectPath (org.apache.flink.table.catalog.ObjectPath): 2, Row (org.apache.flink.types.Row): 2, ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 1, StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 1