Example 31 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in the Apache Flink project.

From the class FileInputFormat, method createInputSplits:

/**
	 * Computes the input splits for the file. By default, one file block is one split. If more splits
	 * are requested than blocks are available, then a split may be a fraction of a block and splits may cross
	 * block boundaries.
	 * 
	 * @param minNumSplits The minimum desired number of file splits.
	 * @return The computed file splits.
	 * 
	 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
	 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        totalLength += addFilesInDir(path, files, true);
    } else {
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // if the format is unsplittable, create one split per whole file and return
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case for a file of zero length
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used: Path (org.apache.flink.core.fs.Path), FileStatus (org.apache.flink.core.fs.FileStatus), ArrayList (java.util.ArrayList), BlockLocation (org.apache.flink.core.fs.BlockLocation), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), FileSystem (org.apache.flink.core.fs.FileSystem), HashSet (java.util.HashSet)
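To see these splits in action, a minimal driver can be sketched against a concrete subclass such as TextInputFormat. The file path and requested split count below are assumptions for illustration, not part of the original example:

import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.Path;

public class SplitDemo {
    public static void main(String[] args) throws Exception {
        // TextInputFormat is a concrete FileInputFormat; the path is hypothetical
        TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input.txt"));
        // request at least 4 splits; blocks may be subdivided to honor this
        FileInputSplit[] splits = format.createInputSplits(4);
        for (FileInputSplit split : splits) {
            System.out.println(split.getSplitNumber() + ": " + split.getPath()
                    + " start=" + split.getStart() + " length=" + split.getLength());
        }
    }
}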

Example 32 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in the Apache Flink project.

From the class Plan, method registerCachedFile:

/**
	 * Registers a cached file at the program level.
	 *
	 * @param name user-defined name of the file
	 * @param entry contains all relevant information about the file
	 * @throws java.io.IOException if the path is invalid or the file does not exist
	 */
public void registerCachedFile(String name, DistributedCacheEntry entry) throws IOException {
    if (!this.cacheFile.containsKey(name)) {
        try {
            URI u = new URI(entry.filePath);
            if (!u.getPath().startsWith("/")) {
                u = new File(entry.filePath).toURI();
            }
            FileSystem fs = FileSystem.get(u);
            if (fs.exists(new Path(u.getPath()))) {
                this.cacheFile.put(name, new DistributedCacheEntry(u.toString(), entry.isExecutable));
            } else {
                throw new IOException("File " + u.toString() + " doesn't exist.");
            }
        } catch (URISyntaxException ex) {
            throw new IOException("Invalid path: " + entry.filePath, ex);
        }
    } else {
        throw new IOException("cache file " + name + "already exists!");
    }
}
Also used: Path (org.apache.flink.core.fs.Path), DistributedCacheEntry (org.apache.flink.api.common.cache.DistributedCache.DistributedCacheEntry), FileSystem (org.apache.flink.core.fs.FileSystem), IOException (java.io.IOException), URISyntaxException (java.net.URISyntaxException), URI (java.net.URI), File (java.io.File)
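In user programs this path is normally reached through ExecutionEnvironment.registerCachedFile rather than by calling the Plan method directly. A minimal sketch, assuming a hypothetical HDFS path and name:

import org.apache.flink.api.java.ExecutionEnvironment;

public class CacheFileDemo {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // registers the file under the given name; the Plan-level method above
        // validates the path when the plan is created
        env.registerCachedFile("hdfs:///shared/lookup.txt", "lookupFile");
        // inside a RichFunction the file is retrieved via:
        // File f = getRuntimeContext().getDistributedCache().getFile("lookupFile");
    }
}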

Example 33 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in the Apache Flink project.

From the class BinaryInputFormat, method getFiles:

protected List<FileStatus> getFiles() throws IOException {
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    final FileSystem fs = this.filePath.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(this.filePath);
    if (pathFile.isDir()) {
        // input is directory. list all contained files
        final FileStatus[] partials = fs.listStatus(this.filePath);
        for (FileStatus partial : partials) {
            if (!partial.isDir()) {
                files.add(partial);
            }
        }
    } else {
        files.add(pathFile);
    }
    return files;
}
Also used: FileStatus (org.apache.flink.core.fs.FileStatus), FileSystem (org.apache.flink.core.fs.FileSystem), ArrayList (java.util.ArrayList)
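Since getFiles() is protected, the same non-recursive listing can be reproduced directly against the FileSystem API. The directory path below is an assumption:

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class ListFilesDemo {
    public static void main(String[] args) throws Exception {
        Path dir = new Path("file:///tmp/data");
        FileSystem fs = dir.getFileSystem();
        List<FileStatus> files = new ArrayList<>();
        for (FileStatus status : fs.listStatus(dir)) {
            // like getFiles(), skip nested directories instead of recursing
            if (!status.isDir()) {
                files.add(status);
            }
        }
        for (FileStatus f : files) {
            System.out.println(f.getPath() + " (" + f.getLen() + " bytes)");
        }
    }
}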

Example 34 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in the Apache Flink project.

From the class BinaryInputFormat, method getStatistics:

@Override
public SequentialStatistics getStatistics(BaseStatistics cachedStats) {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // check whether the file stats are still sequential stats (in that case they are still valid)
        if (stats instanceof SequentialStatistics) {
            return (SequentialStatistics) stats;
        }
        return createStatistics(allFiles, stats);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn(String.format("Could not determine complete statistics for file '%s' due to an I/O error", this.filePath), ioex);
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error(String.format("Unexpected problem while getting the file statistics for file '%s'", this.filePath), t);
        }
    }
    // no stats available
    return null;
}
Also used: Path (org.apache.flink.core.fs.Path), FileStatus (org.apache.flink.core.fs.FileStatus), FileSystem (org.apache.flink.core.fs.FileSystem), ArrayList (java.util.ArrayList), IOException (java.io.IOException)
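These statistics are normally queried by the optimizer, but they can also be requested directly from a concrete BinaryInputFormat such as SerializedInputFormat. A sketch under that assumption; the file path is a placeholder:

import org.apache.flink.api.common.io.SerializedInputFormat;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.IntValue;

public class StatsDemo {
    public static void main(String[] args) {
        SerializedInputFormat<IntValue> format = new SerializedInputFormat<>();
        format.setFilePath(new Path("file:///tmp/records.bin"));
        // passing null means there are no cached statistics to validate;
        // getStatistics returns null if the stats cannot be determined
        BaseStatistics stats = format.getStatistics(null);
        if (stats != null) {
            System.out.println("total input size: " + stats.getTotalInputSize());
            System.out.println("record count:     " + stats.getNumberOfRecords());
        }
    }
}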

Example 35 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in the Apache Flink project.

From the class DistCp, method main:

public static void main(String[] args) throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    ParameterTool params = ParameterTool.fromArgs(args);
    if (!params.has("input") || !params.has("output")) {
        System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
        return;
    }
    final Path sourcePath = new Path(params.get("input"));
    final Path targetPath = new Path(params.get("output"));
    if (!isLocal(env) && !(isOnDistributedFS(sourcePath) && isOnDistributedFS(targetPath))) {
        System.out.println("In a distributed mode only HDFS input/output paths are supported");
        return;
    }
    final int parallelism = params.getInt("parallelism", 10);
    if (parallelism <= 0) {
        System.err.println("Parallelism should be greater than 0");
        return;
    }
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(parallelism);
    long startTime = System.currentTimeMillis();
    LOGGER.info("Initializing copy tasks");
    List<FileCopyTask> tasks = getCopyTasks(sourcePath);
    LOGGER.info("Copy task initialization took " + (System.currentTimeMillis() - startTime) + "ms");
    DataSet<FileCopyTask> inputTasks = new DataSource<>(env, new FileCopyTaskInputFormat(tasks), new GenericTypeInfo<>(FileCopyTask.class), "fileCopyTasks");
    FlatMapOperator<FileCopyTask, Object> res = inputTasks.flatMap(new RichFlatMapFunction<FileCopyTask, Object>() {

        private static final long serialVersionUID = 1109254230243989929L;

        private LongCounter fileCounter;

        private LongCounter bytesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            bytesCounter = getRuntimeContext().getLongCounter(BYTES_COPIED_CNT_NAME);
            fileCounter = getRuntimeContext().getLongCounter(FILES_COPIED_CNT_NAME);
        }

        @Override
        public void flatMap(FileCopyTask task, Collector<Object> out) throws Exception {
            LOGGER.info("Processing task: " + task);
            Path outPath = new Path(targetPath, task.getRelativePath());
            FileSystem targetFs = targetPath.getFileSystem();
            // creating parent folders in case of a local FS
            if (!targetFs.isDistributedFS()) {
                // dealing with cases like file:///tmp or just /tmp
                File outFile = outPath.toUri().isAbsolute() ? new File(outPath.toUri()) : new File(outPath.toString());
                File parentFile = outFile.getParentFile();
                if (!parentFile.mkdirs() && !parentFile.exists()) {
                    throw new RuntimeException("Cannot create local file system directories: " + parentFile);
                }
            }
            FSDataOutputStream outputStream = null;
            FSDataInputStream inputStream = null;
            try {
                outputStream = targetFs.create(outPath, true);
                inputStream = task.getPath().getFileSystem().open(task.getPath());
                int bytes = IOUtils.copy(inputStream, outputStream);
                bytesCounter.add(bytes);
            } finally {
                IOUtils.closeQuietly(inputStream);
                IOUtils.closeQuietly(outputStream);
            }
            fileCounter.add(1L);
        }
    });
    // no data sinks are needed, therefore just printing an empty result
    res.print();
    Map<String, Object> accumulators = env.getLastJobExecutionResult().getAllAccumulatorResults();
    LOGGER.info("== COUNTERS ==");
    for (Map.Entry<String, Object> e : accumulators.entrySet()) {
        LOGGER.info(e.getKey() + ": " + e.getValue());
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Configuration (org.apache.flink.configuration.Configuration), LongCounter (org.apache.flink.api.common.accumulators.LongCounter), FileSystem (org.apache.flink.core.fs.FileSystem), FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream), Path (org.apache.flink.core.fs.Path), IOException (java.io.IOException), DataSource (org.apache.flink.api.java.operators.DataSource), FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream), File (java.io.File), Map (java.util.Map)
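The job is driven entirely by the flags main() parses above. A hypothetical invocation, assuming the example lives in its usual flink-examples package; the paths and parallelism are placeholders:

public class DistCpDemo {
    public static void main(String[] args) throws Exception {
        // forwards the expected --input/--output/--parallelism flags
        org.apache.flink.examples.java.distcp.DistCp.main(new String[] {
                "--input", "hdfs:///data/source",
                "--output", "hdfs:///data/target",
                "--parallelism", "8"
        });
    }
}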

Aggregations

FileSystem (org.apache.flink.core.fs.FileSystem): 41 usages
Path (org.apache.flink.core.fs.Path): 34 usages
IOException (java.io.IOException): 18 usages
FileStatus (org.apache.flink.core.fs.FileStatus): 13 usages
ArrayList (java.util.ArrayList): 8 usages
Test (org.junit.Test): 8 usages
FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream): 6 usages
FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream): 6 usages
File (java.io.File): 5 usages
URI (java.net.URI): 5 usages
URISyntaxException (java.net.URISyntaxException): 4 usages
FileNotFoundException (java.io.FileNotFoundException): 3 usages
FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 3 usages
DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper): 3 usages
FileStateHandle (org.apache.flink.runtime.state.filesystem.FileStateHandle): 3 usages
DataOutputStream (java.io.DataOutputStream): 2 usages
InputStream (java.io.InputStream): 2 usages
Field (java.lang.reflect.Field): 2 usages
Map (java.util.Map): 2 usages
FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics): 2 usages