Use of org.apache.flink.core.fs.FileStatus in project Flink by Apache.
From the class FileInputFormat, method createInputSplits:
/**
* Computes the input splits for the file. By default, one file block is one split. If more splits
* are requested than blocks are available, then a split may be a fraction of a block and splits may cross
* block boundaries.
*
* @param minNumSplits The minimum desired number of file splits.
* @return The computed file splits.
*
* @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
*/
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        totalLength += addFilesInDir(path, files, true);
    } else {
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // returns if unsplittable
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
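In short, each split covers splitSize = max(minSplitSize, min(maxSplitSize, blockSize)) bytes, where maxSplitSize is derived from the requested minimum number of splits. A minimal sketch of driving this method directly, assuming a TextInputFormat (a FileInputFormat subclass) over a hypothetical local file:

import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.Path;

public class SplitSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical input file; adjust the path for your environment
        final TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input.txt"));
        format.configure(new Configuration());
        // ask for at least 4 splits; the actual count depends on file size and block size
        final FileInputSplit[] splits = format.createInputSplits(4);
        for (FileInputSplit split : splits) {
            System.out.println(split.getSplitNumber() + ": start=" + split.getStart() + ", length=" + split.getLength());
        }
    }
}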
Use of org.apache.flink.core.fs.FileStatus in project Flink by Apache.
From the class BinaryInputFormat, method getFiles:
protected List<FileStatus> getFiles() throws IOException {
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    final FileSystem fs = this.filePath.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(this.filePath);
    if (pathFile.isDir()) {
        // input is directory. list all contained files
        final FileStatus[] partials = fs.listStatus(this.filePath);
        for (FileStatus partial : partials) {
            if (!partial.isDir()) {
                files.add(partial);
            }
        }
    } else {
        files.add(pathFile);
    }
    return files;
}
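Note that getFiles lists only the first directory level and skips nested directories. The same pattern, as a self-contained sketch against a hypothetical directory:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class ListFilesSketch {
    // collect the regular files directly under 'dir', ignoring subdirectories
    static List<FileStatus> filesUnder(Path dir) throws IOException {
        final FileSystem fs = dir.getFileSystem();
        final List<FileStatus> files = new ArrayList<FileStatus>();
        for (FileStatus status : fs.listStatus(dir)) {
            if (!status.isDir()) {
                files.add(status);
            }
        }
        return files;
    }

    public static void main(String[] args) throws IOException {
        // hypothetical directory path
        for (FileStatus f : filesUnder(new Path("file:///tmp/data"))) {
            System.out.println(f.getPath() + " (" + f.getLen() + " bytes)");
        }
    }
}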
Use of org.apache.flink.core.fs.FileStatus in project Flink by Apache.
From the class BinaryInputFormat, method getStatistics:
@Override
public SequentialStatistics getStatistics(BaseStatistics cachedStats) {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // check whether the file stats are still sequential stats (in that case they are still valid)
        if (stats instanceof SequentialStatistics) {
            return (SequentialStatistics) stats;
        }
        return createStatistics(allFiles, stats);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn(String.format("Could not determine complete statistics for file '%s' due to an I/O error", this.filePath), ioex);
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error(String.format("Unexpected problem while getting the file statistics for file '%s'", this.filePath), t);
        }
    }
    // no stats available
    return null;
}
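The cachedStats argument lets a caller hand back statistics from an earlier invocation; getFileStats revalidates them against the files' modification times and recomputes only when needed. A rough sketch of that call pattern, assuming a SerializedInputFormat (a concrete BinaryInputFormat subclass) and a hypothetical input path:

import org.apache.flink.api.common.io.SerializedInputFormat;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.IntValue;

public class StatsSketch {
    public static void main(String[] args) {
        final SerializedInputFormat<IntValue> format = new SerializedInputFormat<IntValue>();
        format.setFilePath(new Path("file:///tmp/records.bin")); // hypothetical path

        // first call: no cached statistics available yet
        final BaseStatistics first = format.getStatistics(null);
        // later call: the previous result is reused if the files are unchanged
        final BaseStatistics refreshed = format.getStatistics(first);
        if (refreshed != null) {
            System.out.println("total input size: " + refreshed.getTotalInputSize() + " bytes");
        }
    }
}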
Use of org.apache.flink.core.fs.FileStatus in project Flink by Apache.
From the class LocalFileSystemTest, method testLocalFilesystem:
/**
* This test checks the functionality of the {@link LocalFileSystem} class.
*/
@Test
public void testLocalFilesystem() {
    final File tempdir = new File(CommonTestUtils.getTempDir(), UUID.randomUUID().toString());
    final File testfile1 = new File(tempdir, UUID.randomUUID().toString());
    final File testfile2 = new File(tempdir, UUID.randomUUID().toString());
    final Path pathtotestfile1 = new Path(testfile1.toURI().getPath());
    final Path pathtotestfile2 = new Path(testfile2.toURI().getPath());
    try {
        final LocalFileSystem lfs = new LocalFileSystem();
        final Path pathtotmpdir = new Path(tempdir.toURI().getPath());
        /*
         * check that lfs can see/create/delete/read directories
         */
        // check that dir is not existent yet
        assertFalse(lfs.exists(pathtotmpdir));
        assertTrue(tempdir.mkdirs());
        // check that local file system recognizes file..
        assertTrue(lfs.exists(pathtotmpdir));
        final FileStatus localstatus1 = lfs.getFileStatus(pathtotmpdir);
        // check that lfs recognizes directory..
        assertTrue(localstatus1.isDir());
        // get status for files in this (empty) directory..
        final FileStatus[] statusforfiles = lfs.listStatus(pathtotmpdir);
        // no files in there.. hence, must be zero
        assertTrue(statusforfiles.length == 0);
        // check that lfs can delete directory..
        lfs.delete(pathtotmpdir, true);
        // double check that directory is not existent anymore..
        assertFalse(lfs.exists(pathtotmpdir));
        assertFalse(tempdir.exists());
        // re-create directory..
        lfs.mkdirs(pathtotmpdir);
        // creation successful?
        assertTrue(tempdir.exists());
        /*
         * check that lfs can create/read/write from/to files properly and read meta information..
         */
        // create files.. one "natively", one using lfs
        final FSDataOutputStream lfsoutput1 = lfs.create(pathtotestfile1, false);
        assertTrue(testfile2.createNewFile());
        // does lfs create files? does lfs recognize created files?
        assertTrue(testfile1.exists());
        assertTrue(lfs.exists(pathtotestfile2));
        // test that lfs can write to files properly
        final byte[] testbytes = { 1, 2, 3, 4, 5 };
        lfsoutput1.write(testbytes);
        lfsoutput1.close();
        assertEquals(testfile1.length(), 5L);
        byte[] testbytestest = new byte[5];
        try (FileInputStream fisfile1 = new FileInputStream(testfile1)) {
            assertEquals(testbytestest.length, fisfile1.read(testbytestest));
        }
        assertArrayEquals(testbytes, testbytestest);
        // does lfs see the correct file length?
        assertEquals(lfs.getFileStatus(pathtotestfile1).getLen(), testfile1.length());
        // as well, when we call the listStatus (that is intended for directories?)
        assertEquals(lfs.listStatus(pathtotestfile1)[0].getLen(), testfile1.length());
        // test that lfs can read files properly
        final FileOutputStream fosfile2 = new FileOutputStream(testfile2);
        fosfile2.write(testbytes);
        fosfile2.close();
        testbytestest = new byte[5];
        final FSDataInputStream lfsinput2 = lfs.open(pathtotestfile2);
        assertEquals(lfsinput2.read(testbytestest), 5);
        lfsinput2.close();
        assertTrue(Arrays.equals(testbytes, testbytestest));
        // does lfs see two files?
        assertEquals(lfs.listStatus(pathtotmpdir).length, 2);
        // do we get exactly one blocklocation per file? no matter what start and len we provide
        assertEquals(lfs.getFileBlockLocations(lfs.getFileStatus(pathtotestfile1), 0, 0).length, 1);
        /*
         * can lfs delete files / directories?
         */
        assertTrue(lfs.delete(pathtotestfile1, false));
        // and can lfs also delete directories recursively?
        assertTrue(lfs.delete(pathtotmpdir, true));
        assertTrue(!tempdir.exists());
    } catch (IOException e) {
        fail(e.getMessage());
    } finally {
        // clean up!
        testfile1.delete();
        testfile2.delete();
        tempdir.delete();
    }
}
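Distilled from the test, a minimal sketch of reading FileStatus metadata through a LocalFileSystem (the path below is hypothetical and must already exist):

import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.fs.local.LocalFileSystem;

public class StatusSketch {
    public static void main(String[] args) throws Exception {
        final LocalFileSystem lfs = new LocalFileSystem();
        final Path path = new Path("/tmp/some-file"); // hypothetical
        if (lfs.exists(path)) {
            final FileStatus status = lfs.getFileStatus(path);
            System.out.println("dir=" + status.isDir()
                    + ", len=" + status.getLen()
                    + ", modified=" + status.getModificationTime());
        }
    }
}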
Use of org.apache.flink.core.fs.FileStatus in project Flink by Apache.
From the class HadoopInputFormatBase, method getStatistics:
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // only gather base statistics for FileInputFormats
    if (!(mapreduceInputFormat instanceof FileInputFormat)) {
        return null;
    }
    JobContext jobContext;
    try {
        jobContext = HadoopUtils.instantiateJobContext(configuration, null);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    try {
        final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(jobContext);
        return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
        }
    }
    // no statistics available
    return null;
}
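For context, HadoopInputFormatBase underlies the HadoopInputFormat wrapper, so these statistics are gathered transparently when a Hadoop format is used as a Flink source. A rough usage sketch, assuming the flink-hadoop-compatibility dependency and a hypothetical input path:

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class HadoopWrapSketch {
    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        final Job job = Job.getInstance();
        FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path("hdfs:///tmp/input")); // hypothetical

        // wrap the Hadoop format; Flink calls getStatistics on the wrapper as shown above
        final HadoopInputFormat<LongWritable, Text> format =
                new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
        env.createInput(format).first(10).print();
    }
}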