Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class BinaryInputFormat, method createStatistics.
/**
 * Fill in the statistics. The last modification time and the total input size are pre-filled.
 *
 * @param files
 *        The files that are associated with this block input format.
 * @param stats
 *        The pre-filled statistics.
 */
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException {
    if (files.isEmpty()) {
        return null;
    }
    BlockInfo blockInfo = new BlockInfo();
    long totalCount = 0;
    for (FileStatus file : files) {
        // skip invalid files that are too short to contain the block info footer
        if (file.getLen() < blockInfo.getInfoSize()) {
            continue;
        }
        FileSystem fs = file.getPath().getFileSystem();
        try (FSDataInputStream fdis = fs.open(file.getPath(), blockInfo.getInfoSize())) {
            fdis.seek(file.getLen() - blockInfo.getInfoSize());
            blockInfo.read(new DataInputViewStreamWrapper(fdis));
            totalCount += blockInfo.getAccumulatedRecordCount();
        }
    }
    final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount);
    return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount);
}
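The FileSystem access pattern here — resolve the file system from the path, open a stream, and seek to the file length minus the footer size — generalizes to reading any fixed-size trailer. Below is a minimal self-contained sketch of that pattern; the FooterReader class, the readFooter helper, and the 32-byte footer size are hypothetical illustrations, not part of Flink:

import java.io.EOFException;
import java.io.IOException;

import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FooterReader {

    // hypothetical fixed footer size for this sketch;
    // BinaryInputFormat uses blockInfo.getInfoSize() instead
    private static final int FOOTER_SIZE = 32;

    public static byte[] readFooter(Path path) throws IOException {
        final FileSystem fs = path.getFileSystem();
        final FileStatus status = fs.getFileStatus(path);
        if (status.getLen() < FOOTER_SIZE) {
            throw new IOException("File too short to contain a footer: " + path);
        }
        final byte[] footer = new byte[FOOTER_SIZE];
        try (FSDataInputStream in = fs.open(path)) {
            // position the stream at the start of the trailing footer
            in.seek(status.getLen() - FOOTER_SIZE);
            int off = 0;
            while (off < FOOTER_SIZE) {
                final int n = in.read(footer, off, FOOTER_SIZE - off);
                if (n < 0) {
                    throw new EOFException("Unexpected end of stream while reading footer of " + path);
                }
                off += n;
            }
        }
        return footer;
    }
}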
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class DelimitedInputFormat, method getStatistics.
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // if the average record width is already known, or the total input size is unknown, return the stats as they are
        if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN || stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
            return stats;
        }
        // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
        if (unsplittable) {
            return stats;
        }
        // compute how many samples to take, depending on the defined upper and lower bound
        final int numSamples;
        if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
            numSamples = this.numLineSamples;
        } else {
            // make the samples small for very small files
            final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
            numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
        }
        // check if sampling is disabled
        if (numSamples == 0) {
            return stats;
        }
        if (numSamples < 0) {
            throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
        }
        // make sure that the sampling times out after a while if the file system does not answer in time
        this.openTimeout = 10000;
        // set a small read buffer size
        this.bufferSize = 4 * 1024;
        // prevent overly large records, for example if we have an incorrectly configured delimiter
        this.lineLengthLimit = MAX_SAMPLE_LEN;
        long offset = 0;
        long totalNumBytes = 0;
        long stepSize = stats.getTotalInputSize() / numSamples;
        int fileNum = 0;
        int samplesTaken = 0;
        // take the samples
        while (samplesTaken < numSamples && fileNum < allFiles.size()) {
            // make a split for the sample and use it to read a record
            FileStatus file = allFiles.get(fileNum);
            FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
            // we open the split, read one line, and take its length
            try {
                open(split);
                if (readLine()) {
                    totalNumBytes += this.currLen + this.delimiter.length;
                    samplesTaken++;
                }
            } finally {
                // close the file stream, do not release the buffers
                super.close();
            }
            offset += stepSize;
            // skip to the next file, if necessary
            while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
                offset -= file.getLen();
                fileNum++;
            }
        }
        // we have the width, store it
        return new FileBaseStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an I/O error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    } finally {
        // restore properties (even on return)
        this.openTimeout = oldTimeout;
        this.bufferSize = oldBufferSize;
        this.lineLengthLimit = oldLineLengthLimit;
    }
    // no statistics possible
    return null;
}
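The sample-count heuristic above aims at roughly one sample per KiB of input, clamped between the configured minimum and maximum. A small sketch of just that arithmetic; the bound values 2 and 10 are assumed stand-ins for DEFAULT_MIN_NUM_SAMPLES and DEFAULT_MAX_NUM_SAMPLES, whose actual values come from the format's configuration:

public class SampleCount {

    // assumed stand-ins for DelimitedInputFormat's configured sample bounds
    private static final int MIN_SAMPLES = 2;
    private static final int MAX_SAMPLES = 10;

    static int numSamples(long totalInputSize) {
        // one sample per KiB of input, clamped to [MIN_SAMPLES, MAX_SAMPLES]
        final int calcSamples = (int) (totalInputSize / 1024);
        return Math.min(MAX_SAMPLES, Math.max(MIN_SAMPLES, calcSamples));
    }

    public static void main(String[] args) {
        System.out.println(numSamples(512));       // tiny file -> clamped up to MIN_SAMPLES
        System.out.println(numSamples(4 * 1024));  // 4 KiB -> 4 samples
        System.out.println(numSamples(1 << 20));   // 1 MiB -> clamped down to MAX_SAMPLES
    }
}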
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileInputFormat, method addFilesInDir.
/**
 * Enumerates all files in the directory, recursing into nested directories if enumerateNestedFiles is true.
 *
 * @return the total length of accepted files.
 */
private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = path.getFileSystem();
    long length = 0;
    for (FileStatus dir : fs.listStatus(path)) {
        if (dir.isDir()) {
            if (acceptFile(dir) && enumerateNestedFiles) {
                length += addFilesInDir(dir.getPath(), files, logExcludedFiles);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(dir)) {
                files.add(dir);
                length += dir.getLen();
                testForUnsplittable(dir);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("File " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
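Stripped of the filtering and splittability checks, the same listStatus-based recursion looks as follows. This is a hedged sketch; the FileLister class and listFiles method are illustrative names, not Flink API:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FileLister {

    /** Recursively collects all regular files under the given path. */
    public static List<FileStatus> listFiles(Path path) throws IOException {
        final List<FileStatus> result = new ArrayList<>();
        final FileSystem fs = path.getFileSystem();
        for (FileStatus status : fs.listStatus(path)) {
            if (status.isDir()) {
                // descend into the nested directory
                result.addAll(listFiles(status.getPath()));
            } else {
                result.add(status);
            }
        }
        return result;
    }
}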
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileInputFormat, method getStatistics.
/**
 * Obtains basic file statistics containing only file size. If the input is a directory, then the size is the sum of all contained files.
 *
 * @see org.apache.flink.api.common.io.InputFormat#getStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics)
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    try {
        final Path path = this.filePath;
        final FileSystem fs = FileSystem.get(path.toUri());
        return getFileStats(cachedFileStats, path, fs, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an I/O error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    }
    // no statistics available
    return null;
}
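Note that this method resolves the file system with FileSystem.get(path.toUri()), while addFilesInDir calls path.getFileSystem(); both look up the FileSystem implementation registered for the URI's scheme. A quick sketch of the lookup, using a hypothetical local path (an hdfs:// or s3:// URI would resolve to the file system registered for that scheme instead):

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FsResolution {

    public static void main(String[] args) throws Exception {
        // hypothetical local path for illustration
        final Path path = new Path("file:///tmp/input");
        final FileSystem viaStatic = FileSystem.get(path.toUri());
        final FileSystem viaPath = path.getFileSystem();
        // both resolve to the implementation registered for the "file" scheme
        System.out.println(viaStatic.getClass().getName());
        System.out.println(viaPath.getClass().getName());
    }
}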
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileOutputFormat, method open.
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    if (taskNumber < 0 || numTasks < 1) {
        throw new IllegalArgumentException("TaskNumber: " + taskNumber + ", numTasks: " + numTasks);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Opening stream for output (" + (taskNumber + 1) + "/" + numTasks + "). WriteMode=" + writeMode + ", OutputDirectoryMode=" + outputDirectoryMode);
    }
    Path p = this.outputFilePath;
    if (p == null) {
        throw new IOException("The file path is null.");
    }
    final FileSystem fs = p.getFileSystem();
    // if this is a local file system, we need to initialize the local output directory here
    if (!fs.isDistributedFS()) {
        if (numTasks == 1 && outputDirectoryMode == OutputDirectoryMode.PARONLY) {
            // prepare local output path. checks for write mode and removes existing files in case of OVERWRITE mode
            if (!fs.initOutPathLocalFS(p, writeMode, false)) {
                // output preparation failed! Cancel task.
                throw new IOException("Output path '" + p.toString() + "' could not be initialized. Canceling task...");
            }
        } else {
            if (!fs.initOutPathLocalFS(p, writeMode, true)) {
                // output preparation failed! Cancel task.
                throw new IOException("Output directory '" + p.toString() + "' could not be created. Canceling task...");
            }
        }
    }
    // suffix the path with the parallel instance index, if needed
    this.actualFilePath = (numTasks > 1 || outputDirectoryMode == OutputDirectoryMode.ALWAYS) ? p.suffix("/" + getDirectoryFileName(taskNumber)) : p;
    // create output file
    this.stream = fs.create(this.actualFilePath, writeMode);
    // at this point, the file creation must have succeeded, or an exception has been thrown
    this.fileCreated = true;
}
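The actualFilePath assignment is the key decision here: with more than one parallel task, or with OutputDirectoryMode.ALWAYS, the configured path becomes a directory and each task writes its own file inside it. A minimal sketch of that naming scheme, assuming getDirectoryFileName returns the 1-based task index (an assumption about the default; subclasses may override it):

import org.apache.flink.core.fs.Path;

public class OutputPathNaming {

    // mirrors the decision in FileOutputFormat.open; class and method names are illustrative
    static Path actualPath(Path base, int taskNumber, int numTasks, boolean alwaysDirectory) {
        // assumption: the per-task file name is the 1-based task index
        return (numTasks > 1 || alwaysDirectory)
                ? base.suffix("/" + (taskNumber + 1))
                : base;
    }

    public static void main(String[] args) {
        final Path base = new Path("file:///tmp/out");
        System.out.println(actualPath(base, 0, 4, false)); // base path plus "/1"
        System.out.println(actualPath(base, 3, 4, false)); // base path plus "/4"
        System.out.println(actualPath(base, 0, 1, false)); // base path itself, single output file
    }
}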