Use of org.apache.flink.core.fs.FileSystem in project flink by apache.
The class HadoopInputFormatBase, method getFileStats.
// --------------------------------------------------------------------------------------------
// Helper methods
// --------------------------------------------------------------------------------------------
private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths, ArrayList<FileStatus> files) throws IOException {
    long latestModTime = 0L;
    // get the file info and check whether the cached statistics are still valid.
    for (org.apache.hadoop.fs.Path hadoopPath : hadoopFilePaths) {
        final Path filePath = new Path(hadoopPath.toUri());
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final FileStatus file = fs.getFileStatus(filePath);
        latestModTime = Math.max(latestModTime, file.getModificationTime());
        // enumerate all files and check their modification time stamp.
        if (file.isDir()) {
            FileStatus[] fss = fs.listStatus(filePath);
            files.ensureCapacity(files.size() + fss.length);
            for (FileStatus s : fss) {
                if (!s.isDir()) {
                    files.add(s);
                    latestModTime = Math.max(s.getModificationTime(), latestModTime);
                }
            }
        } else {
            files.add(file);
        }
    }
    // check whether the cached statistics are still valid, if we have any
    if (cachedStats != null && latestModTime <= cachedStats.getLastModificationTime()) {
        return cachedStats;
    }
    // calculate the whole length
    long len = 0;
    for (FileStatus s : files) {
        len += s.getLen();
    }
    // sanity check
    if (len <= 0) {
        len = BaseStatistics.SIZE_UNKNOWN;
    }
    return new FileBaseStatistics(latestModTime, len, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
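For context, a minimal sketch of how a helper like this is typically invoked from an input format's getStatistics implementation. This is not the verbatim Flink call site: the jobConf field is an assumption for illustration, though the Hadoop getInputPaths call itself is standard.

// Hypothetical call site (not verbatim Flink code): refresh or reuse cached statistics.
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats =
        cachedStats instanceof FileBaseStatistics ? (FileBaseStatistics) cachedStats : null;
    // resolve the configured input paths via the wrapped Hadoop input format
    org.apache.hadoop.fs.Path[] paths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(this.jobConf);
    ArrayList<FileStatus> files = new ArrayList<FileStatus>();
    // returns cachedStats unchanged when no file was modified since they were computed
    return getFileStats(cachedFileStats, paths, files);
}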
Use of org.apache.flink.core.fs.FileSystem in project flink by apache.
The class BinaryInputFormat, method createInputSplits.
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    List<FileStatus> files = this.getFiles();
    final FileSystem fs = this.filePath.getFileSystem();
    final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    for (FileStatus file : files) {
        for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) {
            long remainingLength = Math.min(pos + blockSize, length) - pos;
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength);
            Arrays.sort(blocks);
            inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts()));
        }
    }
    if (inputSplits.size() < minNumSplits) {
        LOG.warn(String.format("With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, this.filePath, minNumSplits));
        FileStatus last = files.get(files.size() - 1);
        final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen());
        for (int index = files.size(); index < minNumSplits; index++) {
            inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts()));
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
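The split arithmetic above cuts each file at block boundaries, with the last split carrying the remainder. A standalone sketch with assumed sizes (no Flink API involved):

// Standalone illustration of the inner loop above, with assumed sizes.
long length = 700L * 1024 * 1024;    // 700 MiB file
long blockSize = 128L * 1024 * 1024; // 128 MiB blocks
int splits = 0;
for (long pos = 0; pos < length; pos += blockSize) {
    long splitLength = Math.min(pos + blockSize, length) - pos;
    splits++; // five full 128 MiB splits, then one 60 MiB remainder
}
// splits == 6, i.e. ceil(length / blockSize)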
Use of org.apache.flink.core.fs.FileSystem in project flink by apache.
The class BinaryInputFormat, method createStatistics.
/**
* Fill in the statistics. The last modification time and the total input size are prefilled.
*
* @param files
* The files that are associated with this block input format.
* @param stats
* The pre-filled statistics.
*/
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException {
    if (files.isEmpty()) {
        return null;
    }
    BlockInfo blockInfo = new BlockInfo();
    long totalCount = 0;
    for (FileStatus file : files) {
        // invalid file
        if (file.getLen() < blockInfo.getInfoSize()) {
            continue;
        }
        FileSystem fs = file.getPath().getFileSystem();
        try (FSDataInputStream fdis = fs.open(file.getPath(), blockInfo.getInfoSize())) {
            fdis.seek(file.getLen() - blockInfo.getInfoSize());
            blockInfo.read(new DataInputViewStreamWrapper(fdis));
            totalCount += blockInfo.getAccumulatedRecordCount();
        }
    }
    final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount);
    return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount);
}
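createStatistics relies on BinaryInputFormat's file layout, in which each file ends with a fixed-size BlockInfo trailer. The following is a minimal, Flink-free sketch of the same seek-to-trailer pattern, assuming a hypothetical trailer whose final eight bytes hold the accumulated record count:

import java.io.IOException;
import java.io.RandomAccessFile;

// Hypothetical trailer reader: mirrors the fdis.seek(len - infoSize) pattern without Flink types.
static long readRecordCount(String path, int trailerSize) throws IOException {
    try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
        if (raf.length() < trailerSize) {
            return 0L; // too short to hold a trailer; skipped, like the "invalid file" case above
        }
        raf.seek(raf.length() - 8); // assumed: the record count is the trailer's final long
        return raf.readLong();
    }
}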
Use of org.apache.flink.core.fs.FileSystem in project flink by apache.
The class DelimitedInputFormat, method getStatistics.
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // in both cases, we return the stats as they are
        if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN || stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
            return stats;
        }
        // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
        if (unsplittable) {
            return stats;
        }
        // compute how many samples to take, depending on the defined upper and lower bound
        final int numSamples;
        if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
            numSamples = this.numLineSamples;
        } else {
            // make the samples small for very small files
            final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
            numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
        }
        // check if sampling is disabled.
        if (numSamples == 0) {
            return stats;
        }
        if (numSamples < 0) {
            throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
        }
        // make sure that the sampling times out after a while if the file system does not answer in time
        this.openTimeout = 10000;
        // set a small read buffer size
        this.bufferSize = 4 * 1024;
        // prevent overly large records, for example if we have an incorrectly configured delimiter
        this.lineLengthLimit = MAX_SAMPLE_LEN;
        long offset = 0;
        long totalNumBytes = 0;
        long stepSize = stats.getTotalInputSize() / numSamples;
        int fileNum = 0;
        int samplesTaken = 0;
        // take the samples
        while (samplesTaken < numSamples && fileNum < allFiles.size()) {
            // make a split for the sample and use it to read a record
            FileStatus file = allFiles.get(fileNum);
            FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
            // we open the split, read one line, and take its length
            try {
                open(split);
                if (readLine()) {
                    totalNumBytes += this.currLen + this.delimiter.length;
                    samplesTaken++;
                }
            } finally {
                // close the file stream, do not release the buffers
                super.close();
            }
            offset += stepSize;
            // skip to the next file, if necessary
            while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
                offset -= file.getLen();
                fileNum++;
            }
        }
        // we have the width, store it
        return new FileBaseStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    } finally {
        // restore properties (even on return)
        this.openTimeout = oldTimeout;
        this.bufferSize = oldBufferSize;
        this.lineLengthLimit = oldLineLengthLimit;
    }
    // no statistics possible
    return null;
}
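To make the sampling bounds concrete, here is a standalone sketch of the numSamples computation; the bound constants are placeholders for illustration, not necessarily Flink's real defaults:

// Assumed bounds for illustration only; see DelimitedInputFormat for the real constants.
static final int MIN_SAMPLES = 2;
static final int MAX_SAMPLES = 10;

static int numSamples(long totalInputSize) {
    int calcSamples = (int) (totalInputSize / 1024); // roughly one sample per KiB of input
    return Math.min(MAX_SAMPLES, Math.max(MIN_SAMPLES, calcSamples));
}
// numSamples(512) == 2 (lower bound for tiny files), numSamples(1 << 20) == 10 (capped)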
Use of org.apache.flink.core.fs.FileSystem in project flink by apache.
The class FileInputFormat, method addFilesInDir.
/**
 * Enumerates all files in the directory, recursing into subdirectories if enumerateNestedFiles is true.
 * @return the total length of accepted files.
 */
private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = path.getFileSystem();
    long length = 0;
    for (FileStatus dir : fs.listStatus(path)) {
        if (dir.isDir()) {
            if (acceptFile(dir) && enumerateNestedFiles) {
                length += addFilesInDir(dir.getPath(), files, logExcludedFiles);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(dir)) {
                files.add(dir);
                length += dir.getLen();
                testForUnsplittable(dir);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("File " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
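A hedged sketch of a typical call site for this helper, roughly following the shape of FileInputFormat.createInputSplits; the surrounding fields and the literal logExcludedFiles flag are assumptions for illustration:

// Hypothetical driver (not verbatim Flink code): enumerate all accepted input files.
final Path path = this.filePath;
final List<FileStatus> files = new ArrayList<FileStatus>();
final FileStatus pathFile = path.getFileSystem().getFileStatus(path);
long totalLength = 0;
if (pathFile.isDir()) {
    // recurses into subdirectories only while enumerateNestedFiles is true
    totalLength += addFilesInDir(path, files, true);
} else {
    testForUnsplittable(pathFile);
    files.add(pathFile);
    totalLength += pathFile.getLen();
}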