Search in sources :

Example 6 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class BinaryInputFormat method createInputSplits.

/**
 * Creates the input splits for the files of this format. Every file is cut into chunks of at
 * most one block size, and each chunk becomes one split. If that yields fewer than
 * {@code minNumSplits} splits, empty splits positioned at the end of the last file are appended
 * until the requested number is reached.
 *
 * @param minNumSplits
 *        The minimum number of splits to create.
 * @return The created splits, numbered consecutively starting at 0.
 * @throws IOException
 *         If the file system or the block locations cannot be accessed.
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    final List<FileStatus> files = this.getFiles();
    final FileSystem fs = this.filePath.getFileSystem();
    final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    for (FileStatus file : files) {
        for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) {
            // the final chunk of a file may be shorter than a full block
            final long remainingLength = Math.min(pos + blockSize, length) - pos;
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength);
            Arrays.sort(blocks);
            inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts()));
        }
    }
    // without any file there is nothing the empty filler splits could point to
    if (inputSplits.size() < minNumSplits && !files.isEmpty()) {
        LOG.warn(String.format("With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, this.filePath, minNumSplits));
        final FileStatus last = files.get(files.size() - 1);
        final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen());
        // continue numbering after the splits created so far; the number of files is unrelated
        // to the number of splits and would lead to duplicate split numbers and a wrong total
        for (int index = inputSplits.size(); index < minNumSplits; index++) {
            inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts()));
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation)

Example 7 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class BinaryInputFormat method createStatistics.

/**
 * Completes the statistics by adding up the accumulated record counts that are read from the
 * block info located at the end of each file. The last modification time and the total input
 * size are taken over from the pre-filled statistics.
 *
 * @param files
 *        The files that are associated with this block input format.
 * @param stats
 *        The pre-filled statistics.
 * @return The statistics with record count and average record width, or {@code null} if the
 *         list of files is empty.
 * @throws IOException
 *         If accessing one of the files fails.
 */
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException {
    if (files.isEmpty()) {
        return null;
    }
    final BlockInfo trailer = new BlockInfo();
    long recordCount = 0;
    for (FileStatus candidate : files) {
        // files too short to contain a block info are invalid and contribute nothing
        if (candidate.getLen() >= trailer.getInfoSize()) {
            final FileSystem fileSystem = candidate.getPath().getFileSystem();
            try (FSDataInputStream in = fileSystem.open(candidate.getPath(), trailer.getInfoSize())) {
                // position the stream on the block info stored in the last bytes of the file
                in.seek(candidate.getLen() - trailer.getInfoSize());
                trailer.read(new DataInputViewStreamWrapper(in));
                recordCount += trailer.getAccumulatedRecordCount();
            }
        }
    }
    // without any record there is no meaningful width, report zero instead of dividing by zero
    final float averageWidth;
    if (recordCount == 0) {
        averageWidth = 0;
    } else {
        averageWidth = (float) stats.getTotalInputSize() / recordCount;
    }
    return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), averageWidth, recordCount);
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) DataInputViewStreamWrapper(org.apache.flink.core.memory.DataInputViewStreamWrapper)

Example 8 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class DelimitedInputFormat method getStatistics.

/**
 * Determines the statistics of the input, including an estimate of the average record width
 * that is obtained by reading sample lines spread across the input files.
 *
 * <p>The open timeout, buffer size and line length limit of this format are changed
 * temporarily for the sampling and restored before the method returns.
 *
 * @param cachedStats
 *        Previously computed statistics, may be {@code null}. They are only considered if they
 *        are of type {@code FileBaseStatistics}.
 * @return The statistics; the basic file statistics without a sampled width if sampling is
 *         disabled, not possible or yields no line; or {@code null} if the statistics could not
 *         be determined.
 * @throws IOException
 *         Declared by the signature; I/O errors occurring inside this method are logged and
 *         result in {@code null}.
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // instanceof is false for null, so no separate null check is needed
    final FileBaseStatistics cachedFileStats = cachedStats instanceof FileBaseStatistics ? (FileBaseStatistics) cachedStats : null;
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // if the record width is already known or the total size is unknown, there is nothing
        // to sample; in both cases, we return the stats as they are
        if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN || stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
            return stats;
        }
        // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
        if (unsplittable) {
            return stats;
        }
        // compute how many samples to take, depending on the defined upper and lower bound
        final int numSamples;
        if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
            numSamples = this.numLineSamples;
        } else {
            // make the samples small for very small files
            final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
            numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
        }
        // check if sampling is disabled.
        if (numSamples == 0) {
            return stats;
        }
        if (numSamples < 0) {
            throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
        }
        // make sure that the sampling times out after a while if the file system does not answer in time
        this.openTimeout = 10000;
        // set a small read buffer size
        this.bufferSize = 4 * 1024;
        // prevent overly large records, for example if we have an incorrectly configured delimiter
        this.lineLengthLimit = MAX_SAMPLE_LEN;
        long offset = 0;
        long totalNumBytes = 0;
        // distance between two sample positions, measured over the concatenation of all files
        long stepSize = stats.getTotalInputSize() / numSamples;
        int fileNum = 0;
        int samplesTaken = 0;
        // take the samples
        while (samplesTaken < numSamples && fileNum < allFiles.size()) {
            // make a split for the sample and use it to read a record
            FileStatus file = allFiles.get(fileNum);
            FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
            // we open the split, read one line, and take its length
            try {
                open(split);
                if (readLine()) {
                    totalNumBytes += this.currLen + this.delimiter.length;
                    samplesTaken++;
                }
            } finally {
                // close the file stream, do not release the buffers
                super.close();
            }
            offset += stepSize;
            // skip to the next file, if necessary
            while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
                offset -= file.getLen();
                fileNum++;
            }
        }
        // no line could be sampled: dividing by zero samples would report NaN as the width,
        // so the width stays unknown and the basic statistics are returned instead
        if (samplesTaken == 0) {
            return stats;
        }
        // we have the width, store it
        return new FileBaseStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problen while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    } finally {
        // restore properties (even on return)
        this.openTimeout = oldTimeout;
        this.bufferSize = oldBufferSize;
        this.lineLengthLimit = oldLineLengthLimit;
    }
    // no statistics possible
    return null;
}
Also used : Path(org.apache.flink.core.fs.Path) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 9 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class FileInputFormat method getFileStats.

/**
 * Collects the files found at the given path into {@code files} and derives the basic
 * statistics (latest modification time and total size) from them.
 *
 * @param cachedStats
 *        Previously computed statistics, may be {@code null}.
 * @param filePath
 *        The path of the file or directory to examine.
 * @param fs
 *        The file system used to look up the status of {@code filePath}.
 * @param files
 *        The list that receives the status of every enumerated file.
 * @return The cached statistics if no enumerated file has been modified after they were
 *         created, otherwise newly created statistics with an unknown average record width.
 * @throws IOException
 *         If the file system cannot be accessed.
 */
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path filePath, FileSystem fs, ArrayList<FileStatus> files) throws IOException {
    final FileStatus root = fs.getFileStatus(filePath);
    // enumerate all files and sum up their sizes
    long accumulatedSize = 0;
    if (!root.isDir()) {
        files.add(root);
        testForUnsplittable(root);
        accumulatedSize += root.getLen();
    } else {
        accumulatedSize += addFilesInDir(root.getPath(), files, false);
    }
    // find the most recent modification among the enumerated files
    long newestModification = 0;
    for (int i = 0; i < files.size(); i++) {
        final long modified = files.get(i).getModificationTime();
        if (modified > newestModification) {
            newestModification = modified;
        }
    }
    // cached statistics stay valid as long as nothing has been modified since they were created
    final boolean cacheIsCurrent = cachedStats != null && newestModification <= cachedStats.getLastModificationTime();
    if (cacheIsCurrent) {
        return cachedStats;
    }
    // sanity check: a non-positive size is reported as unknown
    final long reportedSize = accumulatedSize <= 0 ? BaseStatistics.SIZE_UNKNOWN : accumulatedSize;
    return new FileBaseStatistics(newestModification, reportedSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus)

Example 10 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class FileInputFormat method addFilesInDir.

/**
 * Enumerates all files in the given directory and adds every accepted file to {@code files}.
 * Accepted sub-directories are enumerated recursively if {@code enumerateNestedFiles} is true.
 *
 * @param path
 *        The directory whose entries are listed.
 * @param files
 *        The list that receives the status of every accepted file.
 * @param logExcludedFiles
 *        Whether excluded entries are reported on debug level.
 * @return the total length of accepted files.
 * @throws IOException
 *         If the file system cannot be accessed.
 */
private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = path.getFileSystem();
    long length = 0;
    for (FileStatus entry : fs.listStatus(path)) {
        if (entry.isDir()) {
            if (acceptFile(entry) && enumerateNestedFiles) {
                length += addFilesInDir(entry.getPath(), files, logExcludedFiles);
            } else if (logExcludedFiles && LOG.isDebugEnabled()) {
                LOG.debug("Directory " + entry.getPath().toString() + " did not pass the file-filter and is excluded.");
            }
        } else if (acceptFile(entry)) {
            files.add(entry);
            length += entry.getLen();
            testForUnsplittable(entry);
        } else if (logExcludedFiles && LOG.isDebugEnabled()) {
            // this branch handles regular files, so the message must not call them a directory
            LOG.debug("File " + entry.getPath().toString() + " did not pass the file-filter and is excluded.");
        }
    }
    return length;
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem)

Aggregations

FileStatus (org.apache.flink.core.fs.FileStatus)24 Path (org.apache.flink.core.fs.Path)16 FileSystem (org.apache.flink.core.fs.FileSystem)13 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)8 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)4 File (java.io.File)3 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)3 FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream)3 HashMap (java.util.HashMap)2 List (java.util.List)2 TreeMap (java.util.TreeMap)2 BlockLocation (org.apache.flink.core.fs.BlockLocation)2 FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream)2 DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper)2 DataInputStream (java.io.DataInputStream)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1