Search in sources :

Example 1 with BlockLocation

Usage of org.apache.flink.core.fs.BlockLocation in the Apache Flink project:

method createInputSplits of class BinaryInputFormat.

/**
 * Computes the input splits for the configured file(s), creating one split per
 * block-sized chunk. If fewer splits than {@code minNumSplits} are produced, the
 * result is padded with empty splits anchored at the end of the last file.
 *
 * @param minNumSplits the minimum desired number of file splits
 * @return the computed file splits
 * @throws IOException if the file system or block locations cannot be accessed
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    List<FileStatus> files = this.getFiles();
    final FileSystem fs = this.filePath.getFileSystem();
    // Fall back to the file system's default block size unless an explicit size was configured.
    final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    for (FileStatus file : files) {
        // One split per block-sized chunk; the last chunk may be shorter than a block.
        for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) {
            long remainingLength = Math.min(pos + blockSize, length) - pos;
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength);
            Arrays.sort(blocks);
            // Guard against file systems that report no block locations for the range;
            // previously blocks[0] would throw ArrayIndexOutOfBoundsException.
            final String[] hosts = blocks.length > 0 ? blocks[0].getHosts() : new String[0];
            inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, hosts));
        }
    }
    if (inputSplits.size() < minNumSplits) {
        LOG.warn(String.format("With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, this.filePath, minNumSplits));
        // NOTE(review): assumes getFiles() returned at least one file — files.get(-1)
        // would throw otherwise; confirm getFiles() never returns an empty list.
        FileStatus last = files.get(files.size() - 1);
        final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen());
        final String[] hosts = blocks.length > 0 ? blocks[0].getHosts() : new String[0];
        // BUG FIX: start numbering at inputSplits.size(), not files.size(). A file that
        // spans several blocks yields more splits than files, so files.size() would
        // reuse split numbers already assigned above and under-fill the padding.
        for (int index = inputSplits.size(); index < minNumSplits; index++) {
            // Empty padding split: zero length, positioned at the end of the last file.
            inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, hosts));
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation)

Example 2 with BlockLocation

Usage of org.apache.flink.core.fs.BlockLocation in the Apache Flink project:

method createInputSplits of class FileInputFormat.

/**
	 * Computes the input splits for the file. By default, one file block is one split. If more splits
	 * are requested than blocks are available, then a split may be a fraction of a block and splits may cross
	 * block boundaries.
	 * 
	 * @param minNumSplits The minimum desired number of file splits.
	 * @return The computed file splits.
	 * 
	 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
	 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // recursively collect all files under the directory, accumulating their total length
        totalLength += addFilesInDir(path, files, true);
    } else {
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // returns if unsplittable: emit exactly one split per file, covering the whole file
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            // collect the union of all hosts holding any block of this file
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                // sentinel length: instructs the reader to consume the entire file
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    // ceiling division of totalLength by minNumSplits; NOTE(review): the
    // (minNumSplits < 1) branch is unreachable — minNumSplits >= 1 was enforced above
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        // a configured minimum split size larger than the block size is capped at the block size
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        // split size: the block size, clamped into [minSplitSize, maxSplitSize]
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        // the last split may grow up to this size instead of producing a tiny trailing split
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            // emit full-size splits until only a "last split"-sized remainder is left
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileSystem(org.apache.flink.core.fs.FileSystem) HashSet(java.util.HashSet)

Example 3 with BlockLocation

Usage of org.apache.flink.core.fs.BlockLocation in the Apache Flink project:

method getFileBlockLocations of class MapRFileSystem.

/**
 * Returns the block locations for the given file range by delegating to the
 * underlying Hadoop file system and wrapping each Hadoop-specific result in a
 * Flink {@link BlockLocation} implementation.
 *
 * @param file the file whose block locations are queried; must be a {@code HadoopFileStatus}
 * @param start the start offset of the range within the file
 * @param len the length of the range
 * @return the wrapped block locations covering the requested range
 * @throws IOException if {@code file} is not a {@code HadoopFileStatus} or the
 *         underlying file system query fails
 */
@Override
public BlockLocation[] getFileBlockLocations(final FileStatus file, final long start, final long len) throws IOException {
    if (!(file instanceof HadoopFileStatus)) {
        // BUG FIX: the message previously named "DistributedFileStatus", which is not
        // the type actually checked; report the real expected type.
        throw new IOException("file is not an instance of HadoopFileStatus");
    }
    final HadoopFileStatus f = (HadoopFileStatus) file;
    final org.apache.hadoop.fs.BlockLocation[] blkLocations = fs.getFileBlockLocations(f.getInternalFileStatus(), start, len);
    // Wrap up HDFS specific block location objects
    final HadoopBlockLocation[] distBlkLocations = new HadoopBlockLocation[blkLocations.length];
    for (int i = 0; i < distBlkLocations.length; i++) {
        distBlkLocations[i] = new HadoopBlockLocation(blkLocations[i]);
    }
    return distBlkLocations;
}
Also used : HadoopBlockLocation(org.apache.flink.runtime.fs.hdfs.HadoopBlockLocation) IOException(java.io.IOException) HadoopBlockLocation(org.apache.flink.runtime.fs.hdfs.HadoopBlockLocation) BlockLocation(org.apache.flink.core.fs.BlockLocation) HadoopFileStatus(org.apache.flink.runtime.fs.hdfs.HadoopFileStatus)

Aggregations

BlockLocation (org.apache.flink.core.fs.BlockLocation)3 ArrayList (java.util.ArrayList)2 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)2 FileStatus (org.apache.flink.core.fs.FileStatus)2 FileSystem (org.apache.flink.core.fs.FileSystem)2 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Path (org.apache.flink.core.fs.Path)1 HadoopBlockLocation (org.apache.flink.runtime.fs.hdfs.HadoopBlockLocation)1 HadoopFileStatus (org.apache.flink.runtime.fs.hdfs.HadoopFileStatus)1