Search in sources :

Example 1 with FileInputSplit

use of edu.iu.dsc.tws.data.api.splits.FileInputSplit in project twister2 by DSC-SPIDAL.

the class BinaryInputPartitioner method createInputSplits.

/**
 * Computes the input splits for the binary file(s) found under {@code filePath}.
 *
 * <p>The combined byte length of all files must be a whole multiple of {@code recordLength}.
 * Records are distributed as evenly as possible over {@code minNumSplits} splits: every split is
 * planned with {@code numberOfRecords / minNumSplits} records, and the first
 * {@code numberOfRecords % minNumSplits} splits receive one additional record. Split sizes are
 * decided per split, so the bytes of a file are always fully covered; a file whose remaining
 * bytes are fewer than the planned split size gets a final, shorter split instead of having that
 * tail dropped.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @throws IllegalArgumentException if {@code minNumSplits} is less than 1
 * @throws IllegalStateException if the total length is not a multiple of the record length, or
 *         if one of the files has no content
 * @throws IOException if one of the file system calls made here fails
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // Only used to pre-size the result list; the record distribution below is driven by
    // minNumSplits itself.
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(curminNumSplits);
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        totalLength += sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    if (totalLength % this.recordLength != 0) {
        throw new IllegalStateException("The Binary file has a incomplete record");
    }
    long numberOfRecords = totalLength / this.recordLength;
    long minRecordsForSplit = Math.floorDiv(numberOfRecords, minNumSplits);
    // Records left over after the even distribution; handed out one per split.
    long oddRecords = numberOfRecords % minNumSplits;
    final long minSplitSize = minRecordsForSplit * this.recordLength;
    // Generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        if (len <= 0) {
            throw new IllegalStateException("The binary file " + file.getPath() + " is Empty");
        }
        // get the block locations and make sure they are in order with respect to their offset
        final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
        Arrays.sort(blocks);
        long bytesUnassigned = len;
        long position = 0;
        int blockIndex = 0;
        while (bytesUnassigned > 0) {
            // Decide the size of THIS split: the even share plus one leftover record, if any.
            long currentSplitSize = minSplitSize;
            if (oddRecords > 0) {
                currentSplitSize += this.recordLength;
                oddRecords--;
            }
            // Never emit an empty split (which would loop forever) and never run past the end
            // of the file: whatever remains becomes the final split of this file.
            if (currentSplitSize <= 0 || currentSplitSize > bytesUnassigned) {
                currentSplitSize = bytesUnassigned;
            }
            // get the block containing the majority of the data
            final long halfSplit = currentSplitSize >>> 1;
            blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
            // create a new split
            FileInputSplit fis = new BinaryInputSplit(splitNum++, file.getPath(), position, currentSplitSize, blocks[blockIndex].getHosts());
            inputSplits.add(fis);
            // adjust the positions
            position += currentSplitSize;
            bytesUnassigned -= currentSplitSize;
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) BinaryInputSplit(edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)

Example 2 with FileInputSplit

use of edu.iu.dsc.tws.data.api.splits.FileInputSplit in project twister2 by DSC-SPIDAL.

the class FixedInputPartitioner method createInputSplits.

/**
 * Creates the input splits for a single input file. The number of splits is the larger of
 * {@code minNumSplits} and the configured {@code numSplits}; the byte length of each split is
 * obtained from {@code getSplitSizes}, which is given the per-split line count derived from
 * {@code dataSize}.
 *
 * <p>Splits are laid out back to back starting at offset 0. A zero-length file yields a single
 * empty split.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return The computed file splits.
 * @throws IllegalStateException if the configured path resolves to more than one file
 * @throws IOException if one of the file system calls made here fails
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    // take the desired number of splits into account
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
    }
    if (files.size() > 1) {
        throw new IllegalStateException("FixedInputPartitioner does not support multiple files currently");
    }
    for (final FileStatus file : files) {
        // dataSize is used as the total line count of the input (set outside this method).
        final long lineCount = dataSize;
        int splSize = (int) (lineCount / curminNumSplits);
        final long len = file.getLen();
        long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
        // Offsets must be long: split sizes are long, and an int accumulator would be silently
        // narrowed by "+=" and overflow for files larger than 2 GiB.
        long position = 0;
        if (len > 0) {
            for (int i = 0; i < splitSizes.length; i++) {
                String[] hosts = new String[0];
                final FileInputSplit fis = createSplit(i, file.getPath(), position, splitSizes[i], hosts);
                position += splitSizes[i];
                inputSplits.add(fis);
            }
        } else {
            // TODO need to check this section of the code for correctness
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = createSplit(0, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 3 with FileInputSplit

use of edu.iu.dsc.tws.data.api.splits.FileInputSplit in project twister2 by DSC-SPIDAL.

the class CSVInputPartitioner method createInputSplits.

/**
 * Creates the CSV input splits for a single input file. The number of splits is the larger of
 * {@code minNumSplits} and the configured {@code numSplits}; the byte length of each split is
 * obtained from {@code getSplitSizes}, which is given the per-split line count derived from
 * {@code dataSize}.
 *
 * <p>Splits are laid out back to back starting at offset 0. A zero-length file yields a single
 * empty split.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return The computed file splits.
 * @throws IllegalStateException if the configured path resolves to more than one file
 * @throws IOException if one of the file system calls made here fails
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    // take the desired number of splits into account
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();
    final FileSystem fs = FileSystemUtils.get(path, config);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
    }
    if (files.size() > 1) {
        throw new IllegalStateException("CSVInputPartitioner does not support multiple files currently");
    }
    for (final FileStatus file : files) {
        // dataSize is used as the total line count of the input (set outside this method).
        final long lineCount = dataSize;
        int splSize = (int) (lineCount / curminNumSplits);
        final long len = file.getLen();
        long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
        // Offsets must be long: split sizes are long, and an int accumulator would be silently
        // narrowed by "+=" and overflow for files larger than 2 GiB.
        long position = 0;
        if (len > 0) {
            for (int i = 0; i < splitSizes.length; i++) {
                String[] hosts = new String[0];
                final FileInputSplit fis = new CSVInputSplit(i, file.getPath(), position, splitSizes[i], hosts);
                position += splitSizes[i];
                inputSplits.add(fis);
            }
        } else {
            // zero-length file: emit one empty split, using the first block's hosts if any
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new CSVInputSplit(0, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 4 with FileInputSplit

use of edu.iu.dsc.tws.data.api.splits.FileInputSplit in project twister2 by DSC-SPIDAL.

the class CSVInputFormatTest method testUniqueSchedules.

/**
 * Smoke test for the CSV input format: writes an input file under {@code /tmp/dinput/}, asks a
 * {@link LocalCSVInputPartitioner} for its splits, takes the first split assigned to
 * {@code "localhost"} and reads records from it until the split reports that its end is reached.
 * The test makes no assertions; it passes as long as nothing throws.
 */
@Test
public void testUniqueSchedules() throws IOException {
    final Config cfg = getConfig();
    final Path inputDir = new Path("/tmp/dinput/");
    createOutputFile(inputDir, cfg);

    final LocalCSVInputPartitioner partitioner = new LocalCSVInputPartitioner(inputDir, 4, cfg);
    partitioner.configure(cfg);

    final FileInputSplit[] splits = partitioner.createInputSplits(2);
    LOG.info("input split values are:" + Arrays.toString(splits));

    final InputSplitAssigner assigner = partitioner.getInputSplitAssigner(splits);
    final InputSplit split = assigner.getNextInputSplit("localhost", 0);
    split.open(cfg);

    // Read at least one record, then keep reading until the split is exhausted.
    split.nextRecord(null);
    while (!split.reachedEnd()) {
        split.nextRecord(null);
    }
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) InputSplitAssigner(edu.iu.dsc.tws.data.fs.io.InputSplitAssigner) Config(edu.iu.dsc.tws.api.config.Config) LocalCSVInputPartitioner(edu.iu.dsc.tws.data.api.formatters.LocalCSVInputPartitioner) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) InputSplit(edu.iu.dsc.tws.data.fs.io.InputSplit) Test(org.junit.Test)

Example 5 with FileInputSplit

use of edu.iu.dsc.tws.data.api.splits.FileInputSplit in project twister2 by DSC-SPIDAL.

the class CompleteCSVInputPartitioner method createInputSplits.

/**
 * Builds the splits for the "complete file" CSV partitioner. For every input file,
 * {@code max(minNumSplits, numSplits)} splits are produced, and each of them starts at offset 0
 * with the same length, so the splits of one file are identical apart from their split number
 * (presumably so that every consumer reads the same data - confirm against the callers).
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return The computed file splits.
 * @throws IllegalArgumentException if {@code minNumSplits} is less than 1
 * @throws IOException if one of the file system calls made here fails
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    final int splitCount = Math.max(minNumSplits, this.numSplits);
    final Path inputPath = this.filePath;
    final List<FileInputSplit> result = new ArrayList<>(splitCount);
    final List<FileStatus> inputFiles = new ArrayList<>();
    long totalBytes = 0;
    final FileSystem fileSystem = FileSystemUtils.get(inputPath, config);
    final FileStatus rootStatus = fileSystem.getFileStatus(inputPath);
    if (!rootStatus.isDir()) {
        inputFiles.add(rootStatus);
        totalBytes += rootStatus.getLen();
    } else {
        totalBytes += sumFilesInDir(inputPath, inputFiles, true);
    }

    int nextSplitId = 0;
    for (final FileStatus current : inputFiles) {
        final long fileLength = current.getLen();
        final long fileBlockSize = current.getBlockSize();

        // The minimal split size may not exceed the block size of the file.
        final long effectiveMinSplit;
        if (this.minSplitSize > fileBlockSize) {
            LOG.log(Level.WARNING, "Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSizeText(fileBlockSize) + ". Decreasing minimal split size to block size.");
            effectiveMinSplit = fileBlockSize;
        } else {
            effectiveMinSplit = this.minSplitSize;
        }

        // Split length: total input length capped at the block size, but at least the minimum.
        final long cappedLength = Math.min(totalBytes, fileBlockSize);
        final long splitLength = Math.max(effectiveMinSplit, cappedLength);

        if (fileLength <= 0) {
            // special case with a file of zero bytes size
            final BlockLocation[] emptyFileBlocks = fileSystem.getFileBlockLocations(current, 0, 0);
            final String[] hosts = emptyFileBlocks.length > 0 ? emptyFileBlocks[0].getHosts() : new String[0];
            for (int n = 0; n < splitCount; n++) {
                result.add(new CSVInputSplit(nextSplitId++, current.getPath(), 0, 0, hosts));
            }
            continue;
        }

        // block locations, ordered by offset
        final BlockLocation[] locations = fileSystem.getFileBlockLocations(current, 0, fileLength);
        Arrays.sort(locations);
        final long startOffset = 0;
        int blockIdx = 0;
        for (int n = 0; n < splitCount; n++) {
            blockIdx = getBlockIndexForPosition(locations, startOffset, splitLength, blockIdx);
            result.add(new CSVInputSplit(nextSplitId++, current.getPath(), startOffset, splitLength, locations[blockIdx].getHosts()));
        }
    }
    return result.toArray(new FileInputSplit[result.size()]);
}

/** Renders a block size exactly as string concatenation of the {@code long} value would. */
private static String blockSizeText(long blockSize) {
    return String.valueOf(blockSize);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Aggregations

Path (edu.iu.dsc.tws.api.data.Path)8 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)8 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileStatus (edu.iu.dsc.tws.api.data.FileStatus)7 FileSystem (edu.iu.dsc.tws.api.data.FileSystem)7 ArrayList (java.util.ArrayList)7 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 Config (edu.iu.dsc.tws.api.config.Config)1 LocalCSVInputPartitioner (edu.iu.dsc.tws.data.api.formatters.LocalCSVInputPartitioner)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 InputSplit (edu.iu.dsc.tws.data.fs.io.InputSplit)1 InputSplitAssigner (edu.iu.dsc.tws.data.fs.io.InputSplitAssigner)1 Test (org.junit.Test)1