Search in sources :

Example 1 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class DataLocalityBatchTaskScheduler method getInputFilesList.

/**
 * Collects the input file locations found under the configured input directory.
 *
 * <p>The directory is read from {@code DataObjectConstants.DINPUT_DIRECTORY}. When the configured
 * file system is {@code DataContext.TWISTER2_HDFS_FILESYSTEM}, the status path of the directory
 * itself is returned as the only entry; when it is
 * {@code DataContext.TWISTER2_LOCAL_FILESYSTEM}, the path of every entry listed under the
 * directory is returned. Any other file system value yields an empty list.
 *
 * @return the input file paths rendered as strings, possibly empty
 * @throws RuntimeException if the file system cannot be accessed; the underlying
 *     {@link IOException} is attached as the cause
 */
private List<String> getInputFilesList() {
    List<String> inputDataList = new ArrayList<>();
    String directory = null;
    if (config.get(DataObjectConstants.DINPUT_DIRECTORY) != null) {
        directory = String.valueOf(config.get(DataObjectConstants.DINPUT_DIRECTORY));
    }
    // NOTE(review): directory is still null here when the input directory is not configured;
    // how Path handles a null string is not visible from this method - confirm.
    final Path path = new Path(directory);
    try {
        final FileSystem fileSystem = FileSystemUtils.get(path);
        if (config.get(DataObjectConstants.FILE_SYSTEM).equals(DataContext.TWISTER2_HDFS_FILESYSTEM)) {
            final FileStatus pathFile = fileSystem.getFileStatus(path);
            inputDataList.add(String.valueOf(pathFile.getPath()));
        } else if (config.get(DataObjectConstants.FILE_SYSTEM).equals(DataContext.TWISTER2_LOCAL_FILESYSTEM)) {
            for (FileStatus file : fileSystem.listFiles(path)) {
                // String.valueOf never returns null, so no null check is needed before adding.
                inputDataList.add(String.valueOf(file.getPath()));
            }
        }
    } catch (IOException e) {
        // Keep the original exception as the cause so the failing operation stays diagnosable.
        throw new RuntimeException("IOException occurred while reading the input files from " + path, e);
    }
    return inputDataList;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 2 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class DataLocalityStreamingTaskScheduler method getInputFilesList.

/**
 * Resolves the input file locations for the configured input directory.
 *
 * <p>For {@code DataContext.TWISTER2_HDFS_FILESYSTEM} the result holds only the status path of
 * the directory itself; for {@code DataContext.TWISTER2_LOCAL_FILESYSTEM} it holds the path of
 * each entry listed under the directory. For any other file system value the result is empty.
 *
 * @return the input file paths rendered as strings, possibly empty
 * @throws TaskSchedulerException if the file system access fails with an {@link IOException}
 */
private List<String> getInputFilesList() {
    final String directory = config.get(DataObjectConstants.DINPUT_DIRECTORY) == null
        ? null
        : String.valueOf(config.get(DataObjectConstants.DINPUT_DIRECTORY));
    final List<String> fileNames = new ArrayList<>();
    final Path inputPath = new Path(directory);
    try {
        final FileSystem fs = FileSystemUtils.get(inputPath);
        final Object fileSystemType = config.get(DataObjectConstants.FILE_SYSTEM);

        // HDFS: report the configured location itself.
        if (fileSystemType.equals(DataContext.TWISTER2_HDFS_FILESYSTEM)) {
            fileNames.add(String.valueOf(fs.getFileStatus(inputPath).getPath()));
            return fileNames;
        }

        // Anything that is neither HDFS nor local contributes no files.
        if (!fileSystemType.equals(DataContext.TWISTER2_LOCAL_FILESYSTEM)) {
            return fileNames;
        }

        // Local: report every entry listed under the directory.
        for (FileStatus entry : fs.listFiles(inputPath)) {
            final String name = String.valueOf(entry.getPath());
            if (name == null) {
                continue;
            }
            fileNames.add(name);
        }
    } catch (IOException e) {
        throw new TaskSchedulerException("Not able to get the input files", e);
    }
    return fileNames;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TaskSchedulerException(edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)

Example 3 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class BinaryInputPartitioner method createInputSplits.

/**
 * Computes the input splits for a binary file (or a directory of binary files) made of
 * fixed-length records. Split sizes are whole multiples of {@code recordLength}: the total
 * number of records is divided by {@code minNumSplits}, and the remainder records are handed
 * out one extra record at a time.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @throws IOException if the file system, file status or block locations cannot be read
 * @throws IllegalArgumentException if {@code minNumSplits} is less than 1
 * @throws IllegalStateException if the total length is not a multiple of the record length,
 *     or if one of the files is empty
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // Only used below to presize the result list.
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(curminNumSplits);
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    // path.getFileSystem();
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus pathFile = fs.getFileStatus(path);
    // Gather the files to split and their combined length in bytes.
    if (pathFile.isDir()) {
        totalLength += sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // Only the combined length is validated here, not the length of each individual file.
    if (totalLength % this.recordLength != 0) {
        throw new IllegalStateException("The Binary file has a incomplete record");
    }
    long numberOfRecords = totalLength / this.recordLength;
    // NOTE(review): the per-split record count is derived from minNumSplits, not from
    // curminNumSplits, so this.numSplits does not influence the split sizes - confirm intent.
    long minRecordsForSplit = Math.floorDiv(numberOfRecords, minNumSplits);
    // Records left over after the even division.
    long oddRecords = numberOfRecords % minNumSplits;
    // Generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        // blockSize is read but not used in the rest of this method.
        final long blockSize = file.getBlockSize();
        final long minSplitSize = minRecordsForSplit * this.recordLength;
        long currentSplitSize = minSplitSize;
        // Half of the unadjusted split size; computed before the extra-record adjustment below.
        long halfSplit = currentSplitSize >>> 1;
        // NOTE(review): one leftover record is consumed per file, and the enlarged size then
        // applies to every split of that file rather than to a single split - confirm intent.
        if (oddRecords > 0) {
            currentSplitSize = currentSplitSize + this.recordLength;
            oddRecords--;
        }
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            // NOTE(review): if currentSplitSize is 0 here (fewer records than minNumSplits and no
            // leftover record remaining for this file), neither position nor bytesUnassigned
            // changes and the loop cannot terminate - confirm whether that case can occur.
            while (bytesUnassigned >= currentSplitSize) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new BinaryInputSplit(splitNum++, file.getPath(), position, currentSplitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += currentSplitSize;
                bytesUnassigned -= currentSplitSize;
            }
            // NOTE(review): bytes still unassigned at this point (fewer than currentSplitSize)
            // are not placed into any split - confirm this is intended.
        } else {
            throw new IllegalStateException("The binary file " + file.getPath() + " is Empty");
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) BinaryInputSplit(edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)

Example 4 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class CompleteFileInputPartitioner method sumFilesInDir.

/**
 * Enumerates the files in the given directory and sums their lengths, descending into nested
 * directories when {@code enumerateNestedFiles} is true.
 *
 * @param path the directory to enumerate
 * @param files output list; every accepted file is appended to it
 * @param logExcludedFiles whether to log an INFO message for each entry that is skipped
 * @return the total length of accepted files.
 * @throws IOException if the file system cannot be obtained or the directory cannot be listed
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    long length = 0;
    for (FileStatus file : fs.listFiles(path)) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles) {
                // Also reached when the directory passes the filter but nested enumeration is off.
                LOG.log(Level.INFO, "Directory " + file.getPath() + " did not pass the file-filter and is excluded.");
            }
        } else if (acceptFile(file)) {
            files.add(file);
            length += file.getLen();
        } else if (logExcludedFiles) {
            // This entry is a regular file, so the message must not call it a directory.
            LOG.log(Level.INFO, "File " + file.getPath() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 5 with FileSystem

use of edu.iu.dsc.tws.api.data.FileSystem in project twister2 by DSC-SPIDAL.

the class FixedInputPartitioner method createInputSplits.

/**
 * Creates the input splits for a single input file. The number of splits is the larger of
 * {@code minNumSplits} and the configured {@code numSplits}; the byte size of each split is
 * obtained from {@code getSplitSizes}, and the splits are laid out back to back starting at
 * offset 0. An empty file yields one zero-length split.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return the created input splits
 * @throws IOException if the file system, the file status or the block locations cannot be read
 * @throws IllegalArgumentException if the effective number of splits is less than 1
 * @throws IllegalStateException if the path resolves to more than one file
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    // take the desired number of splits into account
    final int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    if (curminNumSplits < 1) {
        // Fail with a clear message instead of a division by zero further down.
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<>(curminNumSplits);
    // get all the files that are involved in the splits
    final List<FileStatus> files = new ArrayList<>();
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        sumFilesInDir(path, files, true);
    } else {
        files.add(pathFile);
    }
    if (files.size() > 1) {
        throw new IllegalStateException("FixedInputPartitioner does not support multiple files currently");
    }
    for (final FileStatus file : files) {
        // dataSize stands in for the number of lines in the file (it replaced an explicit
        // line count of the file) - presumably supplied by the caller; verify.
        final long lineCount = dataSize;
        final int splSize = (int) (lineCount / curminNumSplits);
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        // Diagnostic only: the splitting below does not consume a minimal split size.
        if (this.minSplitSize > blockSize) {
            LOG.log(Level.WARNING, "Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
        }
        final long[] splitSizes = getSplitSizes(fs, file.getPath(), curminNumSplits, splSize);
        if (len > 0) {
            // Byte offset of the next split; kept as a long so that adding the long split sizes
            // cannot silently truncate for files larger than Integer.MAX_VALUE bytes.
            long position = 0;
            for (int i = 0; i < splitSizes.length; i++) {
                final String[] hosts = new String[0];
                final FileInputSplit fis = createSplit(i, file.getPath(), position, splitSizes[i], hosts);
                position += splitSizes[i];
                inputSplits.add(fis);
            }
        } else {
            // TODO need to check this section of the code for correctness
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            final String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = createSplit(0, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Aggregations

FileSystem (edu.iu.dsc.tws.api.data.FileSystem)26 FileStatus (edu.iu.dsc.tws.api.data.FileStatus)19 Path (edu.iu.dsc.tws.api.data.Path)18 ArrayList (java.util.ArrayList)11 IOException (java.io.IOException)10 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)7 FSDataOutputStream (edu.iu.dsc.tws.api.data.FSDataOutputStream)4 PrintWriter (java.io.PrintWriter)4 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 LocalFileSystem (edu.iu.dsc.tws.data.fs.local.LocalFileSystem)3 HadoopFileSystem (edu.iu.dsc.tws.data.hdfs.HadoopFileSystem)3 File (java.io.File)2 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 Random (java.util.Random)2 TaskSchedulerException (edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)1 Config (edu.iu.dsc.tws.api.config.Config)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 DataFileReader (edu.iu.dsc.tws.data.utils.DataFileReader)1