Search in sources :

Example 16 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class FileInputPartitioner method createInputSplits.

/**
 * Computes the input splits for the file. By default, one file block is one split. If more splits
 * are requested than blocks are available, then a split may be a fraction of a
 * block and splits may cross block boundaries.
 *
 * <p>The number of splits aimed for is the larger of {@code minNumSplits} and the configured
 * {@code numSplits}. A file of zero length still yields exactly one (empty) split.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @throws IllegalArgumentException if {@code minNumSplits} is smaller than 1.
 * @throws IOException declared for the file system calls made while collecting file metadata.
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    int curminNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(curminNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = FileSystemUtils.get(path, config);
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        totalLength += sumFilesInDir(path, files, true);
    } else {
        // TODO L3: implement test for unsplittable
        // testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // TODO L3: Handle if unsplittable
    // TODO L1: check if we can add the i j method when making splits so that the last split is not
    // larger than the other splits
    // ceil(totalLength / curminNumSplits): upper bound for the size of a single split
    final long maxSplitSize = totalLength / curminNumSplits + (totalLength % curminNumSplits == 0 ? 0 : 1);
    // Generate the splits; split numbers run consecutively across all files
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        // the minimal split size is capped at the block size of the current file
        final long localminSplitSize;
        if (this.minSplitSize <= blockSize) {
            localminSplitSize = this.minSplitSize;
        } else {
            LOG.log(Level.WARNING, "Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            localminSplitSize = blockSize;
        }
        // a split is at most one block (or maxSplitSize), but never below the minimal split size
        final long splitSize = Math.max(localminSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        // once no more than this many bytes remain, they all go into one final split
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = createSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // whatever is left over becomes the last split of this file
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = createSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = createSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    // only render the (potentially long) split listing when FINE logging is actually enabled
    if (LOG.isLoggable(Level.FINE)) {
        LOG.fine("input splits value:" + inputSplits.size() + "\t" + Arrays.toString(inputSplits.toArray()));
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 17 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class BufferedCollectionPartition method loadFromFS.

/**
 * This method loads existing frames on disk.
 *
 * <p>Lists {@code rootPath}, keeps the entries whose name contains {@code EXTENSION}, orders them
 * by the numeric value left after removing {@code EXTENSION} from the name, and stores the result
 * in {@code filesList}. {@code fileCounter} is set to the total number of listed entries. A
 * {@code null} listing is treated as "no frames on disk".
 *
 * @throws Twister2RuntimeException if listing the root path fails with an {@link IOException}.
 */
private void loadFromFS() {
    try {
        FileStatus[] fileStatuses = this.fileSystem.listFiles(this.rootPath);
        if (fileStatuses == null) {
            // a file system may report a missing or unreadable directory as null rather than as
            // an empty array; handle it as an empty listing instead of failing in Arrays.stream
            fileStatuses = new FileStatus[0];
        }
        this.filesList = Arrays.stream(fileStatuses).map(FileStatus::getPath).filter(p -> p.getName().contains(EXTENSION)).sorted(Comparator.comparingLong(path -> Long.parseLong(path.getName().replace(EXTENSION, "")))).collect(Collectors.toList());
        this.fileCounter = fileStatuses.length;
    } catch (IOException e) {
        throw new Twister2RuntimeException("Failed to load frames from file system", e);
    }
}
Also used : DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) Iterator(java.util.Iterator) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) DataPartitionConsumer(edu.iu.dsc.tws.api.dataset.DataPartitionConsumer) MessageType(edu.iu.dsc.tws.api.comms.messaging.types.MessageType) Collection(java.util.Collection) IOException(java.io.IOException) Config(edu.iu.dsc.tws.api.config.Config) UUID(java.util.UUID) MessageTypes(edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes) Logger(java.util.logging.Logger) Collectors(java.util.stream.Collectors) Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) ArrayList(java.util.ArrayList) FileSystem(edu.iu.dsc.tws.api.data.FileSystem) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Closeable(java.io.Closeable) Path(edu.iu.dsc.tws.api.data.Path) Queue(java.util.Queue) Comparator(java.util.Comparator) LinkedList(java.util.LinkedList) Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) IOException(java.io.IOException)

Example 18 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class LocalFileSystem method listFiles.

/**
 * Lists the status entries found at the given path.
 *
 * <p>A regular file yields a single-element array describing that file. Otherwise one status per
 * name reported by {@link File#list()} is returned.
 *
 * @param f the path to list.
 * @return the collected statuses, or {@code null} when the path does not exist or its entries
 *         could not be listed.
 * @throws IOException declared by the signature for the per-entry status lookups.
 */
@Override
public FileStatus[] listFiles(Path f) throws IOException {
    final File target = pathToFile(f);
    if (!target.exists()) {
        return null;
    }
    if (target.isFile()) {
        return new FileStatus[] { new LocalFileStatus(target, this) };
    }
    final String[] childNames = target.list();
    if (childNames == null) {
        return null;
    }
    // resolve every child name against the requested path, preserving the listing order
    final FileStatus[] statuses = new FileStatus[childNames.length];
    int slot = 0;
    for (final String childName : childNames) {
        statuses[slot++] = getFileStatus(new Path(f, childName));
    }
    return statuses;
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) File(java.io.File)

Example 19 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CompleteArrowInputPartitioner method createInputSplits.

/**
 * Creates the splits for the complete file: for every input file the same number of splits is
 * emitted, and each of them starts at offset zero of that file.
 *
 * <p>The number of splits per file is the larger of {@code minNumSplits} and the configured
 * {@code numSplits}.
 *
 * @param minNumSplits Number of minimal input splits, as a hint.
 * @return the generated splits, numbered consecutively across all files.
 * @throws IllegalArgumentException if {@code minNumSplits} is smaller than 1.
 * @throws IOException declared for the file system calls made while collecting file metadata.
 */
@Override
public FileInputSplit<OT>[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    final int splitsPerFile = Math.max(minNumSplits, this.numSplits);
    final Path inputPath = this.filePath;
    final List<FileInputSplit> collected = new ArrayList<>(splitsPerFile);
    final List<FileStatus> inputFiles = new ArrayList<>();
    final FileSystem fileSystem = FileSystemUtils.get(inputPath, config);
    final FileStatus rootStatus = fileSystem.getFileStatus(inputPath);

    // total number of bytes over all files taking part in the splits
    final long totalBytes;
    if (rootStatus.isDir()) {
        totalBytes = sumFilesInDir(inputPath, inputFiles, true);
    } else {
        inputFiles.add(rootStatus);
        totalBytes = rootStatus.getLen();
    }

    int nextSplitId = 0;
    for (final FileStatus current : inputFiles) {
        final long fileLength = current.getLen();
        final long blockSize = current.getBlockSize();

        // the minimal split size is capped at the block size of the current file
        long minSize = this.minSplitSize;
        if (minSize > blockSize) {
            LOG.log(Level.WARNING, "Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            minSize = blockSize;
        }
        final long splitSize = Math.max(minSize, Math.min(totalBytes, blockSize));

        if (fileLength <= 0) {
            // empty file: use the hosts of the first reported block, if there is one
            final BlockLocation[] emptyFileBlocks = fileSystem.getFileBlockLocations(current, 0, 0);
            final String[] hosts = emptyFileBlocks.length > 0 ? emptyFileBlocks[0].getHosts() : new String[0];
            int emitted = 0;
            while (emitted < splitsPerFile) {
                collected.add(new CSVInputSplit(nextSplitId++, current.getPath(), 0, 0, hosts));
                emitted++;
            }
            continue;
        }

        final BlockLocation[] blocks = fileSystem.getFileBlockLocations(current, 0, fileLength);
        Arrays.sort(blocks);
        // every split of this file begins at the start of the file
        final long startOffset = 0;
        int blockIndex = 0;
        int emitted = 0;
        while (emitted < splitsPerFile) {
            blockIndex = getBlockIndexForPosition(blocks, startOffset, splitSize, blockIndex);
            collected.add(new CSVInputSplit(nextSplitId++, current.getPath(), startOffset, splitSize, blocks[blockIndex].getHosts()));
            emitted++;
        }
    }
    return collected.toArray(new FileInputSplit[collected.size()]);
}
Also used : Path(edu.iu.dsc.tws.api.data.Path) FileStatus(edu.iu.dsc.tws.api.data.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(edu.iu.dsc.tws.api.data.BlockLocation) FileInputSplit(edu.iu.dsc.tws.data.api.splits.FileInputSplit) CSVInputSplit(edu.iu.dsc.tws.data.api.splits.CSVInputSplit) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Example 20 with FileStatus

use of edu.iu.dsc.tws.api.data.FileStatus in project twister2 by DSC-SPIDAL.

the class CompleteArrowInputPartitioner method sumFilesInDir.

/**
 * Enumerates all files in the directory, descending into nested directories when
 * {@code enumerateNestedFiles} is true, and appends every accepted file to {@code files}.
 *
 * @param path the directory whose entries are enumerated.
 * @param files output list that receives the status of every accepted file.
 * @param logExcludedFiles whether entries that are skipped should be reported at INFO level.
 * @return the total length of accepted files.
 * @throws IOException if the directory entries cannot be listed.
 */
long sumFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = FileSystemUtils.get(path);
    final FileStatus[] entries = fs.listFiles(path);
    if (entries == null) {
        // a null listing would otherwise surface as a NullPointerException in the loop below
        throw new IOException("Unable to list the contents of " + path);
    }
    long length = 0;
    for (FileStatus file : entries) {
        if (file.isDir()) {
            if (acceptFile(file) && enumerateNestedFiles) {
                length += sumFilesInDir(file.getPath(), files, logExcludedFiles);
            } else {
                // reached for rejected directories and also when nested enumeration is disabled
                if (logExcludedFiles) {
                    LOG.log(Level.INFO, "Directory " + file.getPath().toString() + " did not pass the " + "file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(file)) {
                files.add(file);
                length += file.getLen();
            } else {
                if (logExcludedFiles) {
                    LOG.log(Level.INFO, "File " + file.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
Also used : FileStatus(edu.iu.dsc.tws.api.data.FileStatus) FileSystem(edu.iu.dsc.tws.api.data.FileSystem)

Aggregations

FileStatus (edu.iu.dsc.tws.api.data.FileStatus)22 FileSystem (edu.iu.dsc.tws.api.data.FileSystem)20 Path (edu.iu.dsc.tws.api.data.Path)14 ArrayList (java.util.ArrayList)13 BlockLocation (edu.iu.dsc.tws.api.data.BlockLocation)7 FileInputSplit (edu.iu.dsc.tws.data.api.splits.FileInputSplit)7 IOException (java.io.IOException)6 CSVInputSplit (edu.iu.dsc.tws.data.api.splits.CSVInputSplit)3 Config (edu.iu.dsc.tws.api.config.Config)2 MessageType (edu.iu.dsc.tws.api.comms.messaging.types.MessageType)1 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)1 TaskSchedulerException (edu.iu.dsc.tws.api.compute.exceptions.TaskSchedulerException)1 DataPartitionConsumer (edu.iu.dsc.tws.api.dataset.DataPartitionConsumer)1 Twister2RuntimeException (edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException)1 BinaryInputSplit (edu.iu.dsc.tws.data.api.splits.BinaryInputSplit)1 DataFileReader (edu.iu.dsc.tws.data.utils.DataFileReader)1 DataFileReplicatedReadSource (edu.iu.dsc.tws.task.dataobjects.DataFileReplicatedReadSource)1 ComputeGraphBuilder (edu.iu.dsc.tws.task.impl.ComputeGraphBuilder)1 BufferedReader (java.io.BufferedReader)1 Closeable (java.io.Closeable)1