Search in sources :

Example 6 with FileSystem

use of org.apache.flink.core.fs.FileSystem in project flink by apache.

the class HadoopInputFormatBase method getFileStats.

// --------------------------------------------------------------------------------------------
//  Helper methods
// --------------------------------------------------------------------------------------------
/**
 * Gathers the status of every input file and derives basic file statistics from them.
 *
 * <p>Each Hadoop path is converted to a Flink path and looked up in its file system. A plain
 * file is appended to {@code files} directly; for a directory, its direct non-directory
 * children are appended (nested directories are not descended into). The given list is
 * extended even when the cached statistics end up being returned.
 *
 * @param cachedStats previously computed statistics, or {@code null} if there are none
 * @param hadoopFilePaths the input paths to inspect
 * @param files list that receives the status of every file found; the reported size is the
 *        sum over all entries of this list, including any it already contained
 * @return {@code cachedStats} if no inspected path or file was modified after the cached
 *         modification time; otherwise new statistics with the latest modification time and
 *         the summed length ({@code BaseStatistics.SIZE_UNKNOWN} if that sum is not positive)
 * @throws IOException propagated from the file system calls
 */
private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths, ArrayList<FileStatus> files) throws IOException {
    long newestTimestamp = 0L;

    for (org.apache.hadoop.fs.Path hadoopPath : hadoopFilePaths) {
        final Path flinkPath = new Path(hadoopPath.toUri());
        final FileSystem fileSystem = FileSystem.get(flinkPath.toUri());
        final FileStatus rootStatus = fileSystem.getFileStatus(flinkPath);
        newestTimestamp = Math.max(newestTimestamp, rootStatus.getModificationTime());

        // a plain file is taken as is
        if (!rootStatus.isDir()) {
            files.add(rootStatus);
            continue;
        }

        // a directory contributes its direct children that are files
        final FileStatus[] children = fileSystem.listStatus(flinkPath);
        files.ensureCapacity(files.size() + children.length);
        for (FileStatus child : children) {
            if (child.isDir()) {
                continue;
            }
            files.add(child);
            newestTimestamp = Math.max(newestTimestamp, child.getModificationTime());
        }
    }

    // nothing changed since the cached statistics were computed: reuse them
    final boolean cacheStillValid = cachedStats != null && newestTimestamp <= cachedStats.getLastModificationTime();
    if (cacheStillValid) {
        return cachedStats;
    }

    // add up the length of all collected files
    long totalLength = 0;
    for (FileStatus status : files) {
        totalLength += status.getLen();
    }

    // a non-positive total is reported as "unknown"
    final long reportedLength = totalLength > 0 ? totalLength : BaseStatistics.SIZE_UNKNOWN;
    return new FileBaseStatistics(newestTimestamp, reportedLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) FileSystem(org.apache.flink.core.fs.FileSystem)

Example 7 with FileSystem

use of org.apache.flink.core.fs.FileSystem in project flink by apache.

the class BinaryInputFormat method createInputSplits.

/**
 * Creates the input splits for all input files, one split per block-sized range of a file.
 *
 * <p>If fewer than {@code minNumSplits} splits result, the list is filled up with empty
 * (zero-length) splits positioned at the end of the last file, so that exactly
 * {@code minNumSplits} splits are returned. If there are no input files at all, no fill-up
 * is possible and an empty array is returned.
 *
 * @param minNumSplits the minimum number of splits to create
 * @return the created splits, numbered consecutively starting at 0
 * @throws IOException propagated from the file system calls
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    List<FileStatus> files = this.getFiles();
    final FileSystem fs = this.filePath.getFileSystem();
    final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    for (FileStatus file : files) {
        for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) {
            long remainingLength = Math.min(pos + blockSize, length) - pos;
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength);
            Arrays.sort(blocks);
            inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts()));
        }
    }
    // Without any file there is nothing the empty splits could point to, so skip the fill-up
    // instead of failing on files.get(-1).
    if (inputSplits.size() < minNumSplits && !files.isEmpty()) {
        LOG.warn(String.format("With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, this.filePath, minNumSplits));
        final FileStatus last = files.get(files.size() - 1);
        final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen());
        // the file system may report no block locations (e.g. for an empty file)
        final String[] hosts = blocks.length > 0 ? blocks[0].getHosts() : new String[0];
        // Continue numbering after the splits created above. Starting at files.size() would
        // duplicate split numbers and over-fill whenever a file yielded more than one split.
        for (int index = inputSplits.size(); index < minNumSplits; index++) {
            inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, hosts));
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation)

Example 8 with FileSystem

use of org.apache.flink.core.fs.FileSystem in project flink by apache.

the class BinaryInputFormat method createStatistics.

/**
 * Completes the pre-filled statistics with the record count and the average record width.
 *
 * <p>For every file that is at least as long as one block info, the block info stored in the
 * last bytes of the file is read and its accumulated record count is added to the total.
 * Shorter files are considered invalid and skipped.
 *
 * @param files
 *        The files that are associated with this block input format.
 * @param stats
 *        The pre-filled statistics, providing last modification time and total input size.
 * @return the sequential statistics, or {@code null} if {@code files} is empty
 * @throws IOException propagated from opening or reading a file
 */
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException {
    if (files.isEmpty()) {
        return null;
    }

    final BlockInfo blockInfo = new BlockInfo();
    long recordCount = 0;

    for (FileStatus status : files) {
        // only files large enough to hold a block info are valid
        if (status.getLen() >= blockInfo.getInfoSize()) {
            final FileSystem fileSystem = status.getPath().getFileSystem();
            try (FSDataInputStream in = fileSystem.open(status.getPath(), blockInfo.getInfoSize())) {
                // the block info is read from the very end of the file
                in.seek(status.getLen() - blockInfo.getInfoSize());
                blockInfo.read(new DataInputViewStreamWrapper(in));
                recordCount += blockInfo.getAccumulatedRecordCount();
            }
        }
    }

    final float averageWidth;
    if (recordCount == 0) {
        averageWidth = 0;
    } else {
        averageWidth = (float) stats.getTotalInputSize() / recordCount;
    }
    return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), averageWidth, recordCount);
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) DataInputViewStreamWrapper(org.apache.flink.core.memory.DataInputViewStreamWrapper)

Example 9 with FileSystem

use of org.apache.flink.core.fs.FileSystem in project flink by apache.

the class DelimitedInputFormat method getStatistics.

/**
 * Computes file statistics, estimating the average record width by sampling lines.
 *
 * <p>The basic statistics (modification time, total size) come from {@code getFileStats}. If
 * those already carry an average record width, the total size is unknown, the input is
 * unsplittable, or sampling is disabled (zero samples), they are returned unchanged.
 * Otherwise single lines are read at evenly spaced offsets across all files and their average
 * length (including the delimiter) becomes the average record width.
 *
 * <p>While sampling, {@code openTimeout}, {@code bufferSize} and {@code lineLengthLimit} are
 * temporarily overridden; the previous values are restored before the method returns.
 *
 * @param cachedStats previously computed statistics; ignored unless they are
 *        {@code FileBaseStatistics}
 * @return the statistics, or {@code null} if they could not be determined (the basic
 *         statistics were {@code null}, or an error occurred, which is logged and not rethrown)
 * @throws IOException declared by the signature; I/O errors raised inside this method are
 *         caught and logged
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // instanceof is false for null, so no separate null check is needed
    final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // if the average width is already known or the total size is unknown,
        // we return the stats as they are
        if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN || stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
            return stats;
        }
        // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
        if (unsplittable) {
            return stats;
        }
        // compute how many samples to take, depending on the defined upper and lower bound
        final int numSamples;
        if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
            numSamples = this.numLineSamples;
        } else {
            // make the samples small for very small files
            final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
            numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
        }
        // check if sampling is disabled.
        if (numSamples == 0) {
            return stats;
        }
        if (numSamples < 0) {
            // note: this is caught by the Throwable handler below, logged, and turned into null
            throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
        }
        // make sure that the sampling times out after a while if the file system does not answer in time
        this.openTimeout = 10000;
        // set a small read buffer size
        this.bufferSize = 4 * 1024;
        // prevent overly large records, for example if we have an incorrectly configured delimiter
        this.lineLengthLimit = MAX_SAMPLE_LEN;
        long offset = 0;
        long totalNumBytes = 0;
        long stepSize = stats.getTotalInputSize() / numSamples;
        int fileNum = 0;
        int samplesTaken = 0;
        // take the samples
        while (samplesTaken < numSamples && fileNum < allFiles.size()) {
            // make a split for the sample and use it to read a record
            FileStatus file = allFiles.get(fileNum);
            FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
            // we open the split, read one line, and take its length
            try {
                open(split);
                if (readLine()) {
                    totalNumBytes += this.currLen + this.delimiter.length;
                    samplesTaken++;
                }
            } finally {
                // close the file stream, do not release the buffers
                super.close();
            }
            offset += stepSize;
            // skip to the next file, if necessary
            while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
                offset -= file.getLen();
                fileNum++;
            }
        }
        // If not a single line could be read, the average below would be 0 / 0 = NaN.
        // Report the basic statistics (average width unknown) instead.
        if (samplesTaken == 0) {
            return stats;
        }
        // we have the width, store it
        return new FileBaseStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problen while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    } finally {
        // restore properties (even on return)
        this.openTimeout = oldTimeout;
        this.bufferSize = oldBufferSize;
        this.lineLengthLimit = oldLineLengthLimit;
    }
    // no statistics possible
    return null;
}
Also used : Path(org.apache.flink.core.fs.Path) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 10 with FileSystem

use of org.apache.flink.core.fs.FileSystem in project flink by apache.

the class FileInputFormat method addFilesInDir.

/**
 * Enumerates the entries of a directory and collects all accepted files. Accepted
 * sub-directories are descended into recursively if {@code enumerateNestedFiles} is true.
 *
 * <p>Every collected file is also passed to {@code testForUnsplittable}.
 *
 * @param path the directory to enumerate
 * @param files list that receives the status of every accepted file
 * @param logExcludedFiles whether skipped entries are reported on debug level
 * @return the total length of accepted files.
 * @throws IOException propagated from the file system calls
 */
private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = path.getFileSystem();
    long length = 0;
    for (FileStatus dir : fs.listStatus(path)) {
        if (dir.isDir()) {
            if (acceptFile(dir) && enumerateNestedFiles) {
                length += addFilesInDir(dir.getPath(), files, logExcludedFiles);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(dir)) {
                files.add(dir);
                length += dir.getLen();
                testForUnsplittable(dir);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    // this entry is a regular file, so do not report it as a directory
                    LOG.debug("File " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem)

Aggregations

FileSystem (org.apache.flink.core.fs.FileSystem)41 Path (org.apache.flink.core.fs.Path)34 IOException (java.io.IOException)18 FileStatus (org.apache.flink.core.fs.FileStatus)13 ArrayList (java.util.ArrayList)8 Test (org.junit.Test)8 FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream)6 FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream)6 File (java.io.File)5 URI (java.net.URI)5 URISyntaxException (java.net.URISyntaxException)4 FileNotFoundException (java.io.FileNotFoundException)3 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)3 DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper)3 FileStateHandle (org.apache.flink.runtime.state.filesystem.FileStateHandle)3 DataOutputStream (java.io.DataOutputStream)2 InputStream (java.io.InputStream)2 Field (java.lang.reflect.Field)2 Map (java.util.Map)2 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)2