Search in sources :

Example 1 with BinaryInputSplit

Use of edu.iu.dsc.tws.data.api.splits.BinaryInputSplit in the twister2 project by DSC-SPIDAL.

The createInputSplits method of the class BinaryInputPartitioner.

/**
 * Computes the input splits for a binary input made up of fixed-length records. The input is
 * either a single file or, when the configured path is a directory, the files gathered by
 * {@code sumFilesInDir}.
 *
 * <p>The total number of records is divided by {@code minNumSplits}. Every split is sized to hold
 * the quotient, and the remainder is handed out one extra record at a time to the first splits
 * that are created. Splits never span files: when the bytes left in a file are fewer than the
 * current target size, they are emitted as one final, shorter split so that no data is dropped.
 * As a consequence the number of returned splits can differ from {@code minNumSplits} (fewer when
 * there are fewer records than requested splits, more when file boundaries force short splits).
 *
 * @param minNumSplits The desired number of file splits; must be at least 1.
 * @return The computed file splits.
 * @throws IOException Propagated from the underlying file system calls.
 * @throws IllegalArgumentException If {@code minNumSplits} is smaller than 1.
 * @throws IllegalStateException If the total input length is not a multiple of the record
 *     length, or if one of the input files is empty.
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // NOTE(review): this value only pre-sizes the result list; the split-size arithmetic below
    // uses minNumSplits, exactly as before. Confirm whether this.numSplits should also drive
    // the record distribution.
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(curminNumSplits);
    final List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // presumably fills 'files' with the directory's files and returns their summed length
        totalLength += sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    if (totalLength % this.recordLength != 0) {
        throw new IllegalStateException("The Binary file has a incomplete record");
    }
    final long numberOfRecords = totalLength / this.recordLength;
    final long minRecordsForSplit = Math.floorDiv(numberOfRecords, minNumSplits);
    final long minSplitSize = minRecordsForSplit * this.recordLength;
    // Records left over after the even division; each one enlarges exactly one split.
    long oddRecords = numberOfRecords % minNumSplits;
    // Generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        if (len <= 0) {
            throw new IllegalStateException("The binary file " + file.getPath() + " is Empty");
        }
        // get the block locations and make sure they are in order with respect to their offset
        final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
        Arrays.sort(blocks);
        long bytesUnassigned = len;
        long position = 0;
        int blockIndex = 0;
        while (bytesUnassigned > 0) {
            // Size this split would have if it fits: the base size plus one spare record, if any.
            final long targetSize =
                oddRecords > 0 ? minSplitSize + this.recordLength : minSplitSize;
            final long splitSize;
            if (targetSize > 0 && targetSize <= bytesUnassigned) {
                splitSize = targetSize;
                if (oddRecords > 0) {
                    // the spare record is consumed only when the enlarged split is really created
                    oddRecords--;
                }
            } else {
                // Tail of the file (or a zero target size): take everything that is left so the
                // loop always advances and no bytes are silently skipped.
                splitSize = bytesUnassigned;
            }
            // get the block containing the majority of the data of this split
            blockIndex = getBlockIndexForPosition(blocks, position, splitSize >>> 1, blockIndex);
            // create a new split
            FileInputSplit fis = new BinaryInputSplit(splitNum++, file.getPath(), position,
                splitSize, blocks[blockIndex].getHosts());
            inputSplits.add(fis);
            // adjust the positions
            position += splitSize;
            bytesUnassigned -= splitSize;
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) BinaryInputSplit(edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)

Aggregations

BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)1 FileStatus (edu.iu.dsc.tws.api.data.FileStatus)1 FileSystem (edu.iu.dsc.tws.api.data.FileSystem)1 Path (edu.iu.dsc.tws.api.data.Path)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)1 ArrayList (java.util.ArrayList)1