use of org.apache.flink.core.fs.FileSystem in project flink by apache.
the class FileInputFormatTest method testGetStatisticsMultipleOneFileWithCachedVersion.
@Test
public void testGetStatisticsMultipleOneFileWithCachedVersion() throws IOException {
    FileSystem fs = FileSystem.getLocalFileSystem();

    final long size1 = 50873;
    final long fakeSize = 10065;
    String tempFile1 = TestFileUtils.createTempFile(size1);
    final long lastModTime1 = fs.getFileStatus(new Path(tempFile1)).getModificationTime();

    final long size2 = 52573;
    String tempFile2 = TestFileUtils.createTempFile(size2);
    final long lastModTime2 = fs.getFileStatus(new Path(tempFile2)).getModificationTime();

    final long sizeTotal = size1 + size2;

    MultiDummyFileInputFormat format = new MultiDummyFileInputFormat();
    format.setFilePaths(tempFile1, tempFile2);
    format.configure(new Configuration());

    FileBaseStatistics stats = format.getStatistics(null);
    Assert.assertEquals("The file size from the statistics is wrong.", sizeTotal, stats.getTotalInputSize());

    format = new MultiDummyFileInputFormat();
    format.setFilePath(tempFile1);
    format.configure(new Configuration());

    FileBaseStatistics newStats = format.getStatistics(stats);
    Assert.assertTrue("Statistics object was changed", newStats == stats);

    // insert fake stats with the correct modification time. the call should return the fake stats
    format = new MultiDummyFileInputFormat();
    format.setFilePath(tempFile1);
    format.configure(new Configuration());

    FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics latest = format.getStatistics(fakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", fakeSize, latest.getTotalInputSize());

    // insert fake stats with an expired modification time. the call should return new, accurate stats
    format = new MultiDummyFileInputFormat();
    format.setFilePaths(tempFile1, tempFile2);
    format.configure(new Configuration());

    FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(Math.min(lastModTime1, lastModTime2) - 1, fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", sizeTotal, reGathered.getTotalInputSize());
}
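The MultiDummyFileInputFormat helper is defined inside FileInputFormatTest and is not shown above. As a rough sketch (only the class name comes from the test; the body below is an assumption), such a helper is a FileInputFormat subclass that opts into multiple file paths and stubs out the record-reading methods:

// Hypothetical sketch of the MultiDummyFileInputFormat test helper; the real
// implementation lives in FileInputFormatTest. It only needs to satisfy the
// abstract methods of FileInputFormat and declare multi-path support.
private static class MultiDummyFileInputFormat extends FileInputFormat<IntValue> {

    private static final long serialVersionUID = 1L;

    @Override
    public boolean supportsMultiPaths() {
        return true;
    }

    @Override
    public boolean reachedEnd() throws IOException {
        return true;
    }

    @Override
    public IntValue nextRecord(IntValue reuse) throws IOException {
        return null;
    }
}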
use of org.apache.flink.core.fs.FileSystem in project flink by apache.
the class DistCp method main.
public static void main(String[] args) throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    ParameterTool params = ParameterTool.fromArgs(args);
    if (!params.has("input") || !params.has("output")) {
        System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
        return;
    }

    final Path sourcePath = new Path(params.get("input"));
    final Path targetPath = new Path(params.get("output"));
    if (!isLocal(env) && !(isOnDistributedFS(sourcePath) && isOnDistributedFS(targetPath))) {
        System.out.println("In a distributed mode only HDFS input/output paths are supported");
        return;
    }

    final int parallelism = params.getInt("parallelism", 10);
    if (parallelism <= 0) {
        System.err.println("Parallelism should be greater than 0");
        return;
    }

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(parallelism);

    long startTime = System.currentTimeMillis();
    LOGGER.info("Initializing copy tasks");
    List<FileCopyTask> tasks = getCopyTasks(sourcePath);
    LOGGER.info("Copy task initialization took " + (System.currentTimeMillis() - startTime) + "ms");

    DataSet<FileCopyTask> inputTasks = new DataSource<>(env, new FileCopyTaskInputFormat(tasks), new GenericTypeInfo<>(FileCopyTask.class), "fileCopyTasks");

    FlatMapOperator<FileCopyTask, Object> res = inputTasks.flatMap(new RichFlatMapFunction<FileCopyTask, Object>() {

        private static final long serialVersionUID = 1109254230243989929L;

        private LongCounter fileCounter;

        private LongCounter bytesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            bytesCounter = getRuntimeContext().getLongCounter(BYTES_COPIED_CNT_NAME);
            fileCounter = getRuntimeContext().getLongCounter(FILES_COPIED_CNT_NAME);
        }

        @Override
        public void flatMap(FileCopyTask task, Collector<Object> out) throws Exception {
            LOGGER.info("Processing task: " + task);
            Path outPath = new Path(targetPath, task.getRelativePath());
            FileSystem targetFs = targetPath.getFileSystem();
            // creating parent folders in case of a local FS
            if (!targetFs.isDistributedFS()) {
                // dealing with cases like file:///tmp or just /tmp
                File outFile = outPath.toUri().isAbsolute() ? new File(outPath.toUri()) : new File(outPath.toString());
                File parentFile = outFile.getParentFile();
                if (!parentFile.mkdirs() && !parentFile.exists()) {
                    throw new RuntimeException("Cannot create local file system directories: " + parentFile);
                }
            }
            FSDataOutputStream outputStream = null;
            FSDataInputStream inputStream = null;
            try {
                outputStream = targetFs.create(outPath, FileSystem.WriteMode.OVERWRITE);
                inputStream = task.getPath().getFileSystem().open(task.getPath());
                int bytes = IOUtils.copy(inputStream, outputStream);
                bytesCounter.add(bytes);
            } finally {
                IOUtils.closeQuietly(inputStream);
                IOUtils.closeQuietly(outputStream);
            }
            fileCounter.add(1L);
        }
    });

    // no data sinks are needed, therefore just printing an empty result
    res.print();

    Map<String, Object> accumulators = env.getLastJobExecutionResult().getAllAccumulatorResults();
    LOGGER.info("== COUNTERS ==");
    for (Map.Entry<String, Object> e : accumulators.entrySet()) {
        LOGGER.info(e.getKey() + ": " + e.getValue());
    }
}
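The constants and predicates referenced in main() (BYTES_COPIED_CNT_NAME, FILES_COPIED_CNT_NAME, isLocal, isOnDistributedFS) are defined elsewhere in the DistCp class. A plausible sketch of what they look like, where the constant values are assumptions and the predicate logic simply mirrors the checks made in main():

// Hedged sketch of the helpers used in main(); constant values are assumptions.
public static final String BYTES_COPIED_CNT_NAME = "BYTES_COPIED";
public static final String FILES_COPIED_CNT_NAME = "FILES_COPIED";

// A job is "local" when it runs in an in-JVM LocalEnvironment.
private static boolean isLocal(final ExecutionEnvironment env) {
    return env instanceof LocalEnvironment;
}

// A path is distributed when its FileSystem reports itself as such (e.g. HDFS).
private static boolean isOnDistributedFS(final Path path) throws IOException {
    return path.getFileSystem().isDistributedFS();
}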
use of org.apache.flink.core.fs.FileSystem in project flink by apache.
the class FileInputFormat method createInputSplits.
/**
 * Computes the input splits for the file. By default, one file block is one split. If more
 * splits are requested than blocks are available, then a split may be a fraction of a block and
 * splits may cross block boundaries.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }

    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);

    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);

    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();
    long totalLength = 0;

    for (Path path : getFilePaths()) {
        final FileSystem fs = path.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(path);

        if (pathFile.isDir()) {
            totalLength += addFilesInDir(path, files, true);
        } else {
            testForUnsplittable(pathFile);
            files.add(pathFile);
            totalLength += pathFile.getLen();
        }
    }

    // if the format is unsplittable, return one whole-file split per file
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final FileSystem fs = file.getPath().getFileSystem();
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }

    final long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1);

    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {

        final FileSystem fs = file.getPath().getFileSystem();
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();

        final long minSplitSize;
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }

        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;

        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);

        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);

            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;

            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);

                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }

            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }

    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
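To make the sizing arithmetic concrete, here is a small worked example with made-up numbers (300 MB of total input, 4 requested splits, 128 MB blocks):

// Worked example of the split sizing above; all numbers are made up.
long totalLength = 300L << 20;   // 300 MB across all files
int minNumSplits = 4;            // desired minimum number of splits
long blockSize = 128L << 20;     // 128 MB file system blocks
long minSplitSize = 0;           // no minimum configured

// maxSplitSize is the ceiling of totalLength / minNumSplits: here exactly 75 MB.
long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1);

// splitSize is clamped between minSplitSize and the block size: here 75 MB, so
// the 300 MB of input yields the requested 4 splits even though it spans only
// about 3 blocks. With MAX_SPLIT_SIZE_DISCREPANCY (1.1 in Flink's source, if
// memory serves), a trailing remainder of up to 1.1 * splitSize is folded into
// the last split instead of producing a tiny extra one.
long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));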
use of org.apache.flink.core.fs.FileSystem in project flink by apache.
the class BinaryInputFormat method getFiles.
protected List<FileStatus> getFiles() throws IOException {
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();

    for (Path filePath : getFilePaths()) {
        final FileSystem fs = filePath.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(filePath);

        if (pathFile.isDir()) {
            // input is directory. list all contained files
            final FileStatus[] partials = fs.listStatus(filePath);
            for (FileStatus partial : partials) {
                if (!partial.isDir()) {
                    files.add(partial);
                }
            }
        } else {
            files.add(pathFile);
        }
    }

    return files;
}
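Note that this enumeration lists only the first directory level, so files in nested subdirectories are skipped (createInputSplits above delegates directory handling to addFilesInDir instead, which can descend into nested directories). If recursive enumeration were wanted, a helper along these lines (not part of BinaryInputFormat; shown only as a sketch) would do it:

// Hypothetical recursive variant: collects files from nested subdirectories
// as well, instead of skipping them.
private static void addFilesRecursively(Path dir, List<FileStatus> files) throws IOException {
    final FileSystem fs = dir.getFileSystem();
    for (FileStatus status : fs.listStatus(dir)) {
        if (status.isDir()) {
            addFilesRecursively(status.getPath(), files);
        } else {
            files.add(status);
        }
    }
}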
use of org.apache.flink.core.fs.FileSystem in project flink by apache.
the class FileUtilsTest method generateTestFile.
/**
 * Generates a file with random content.
 *
 * @param outputFile the path of the output file
 * @param length the size of the content to generate
 * @return the MD5 digest of the output file
 * @throws IOException if the file cannot be written
 * @throws NoSuchAlgorithmException if the MD5 digest is unavailable
 */
private static String generateTestFile(String outputFile, int length) throws IOException, NoSuchAlgorithmException {
    Path outputFilePath = new Path(outputFile);

    final FileSystem fileSystem = outputFilePath.getFileSystem();
    try (final FSDataOutputStream fsDataOutputStream = fileSystem.create(outputFilePath, FileSystem.WriteMode.OVERWRITE)) {
        return writeRandomContent(fsDataOutputStream, length);
    }
}
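The writeRandomContent helper lives elsewhere in FileUtilsTest. A plausible, self-contained sketch of what it does (the buffer size and exact implementation are assumptions): stream the requested number of random bytes into the output while feeding an MD5 digest, then return the digest as hex:

// Hedged sketch of writeRandomContent: writes `length` random bytes to the
// stream and returns their MD5 digest as a lowercase hex string.
// Requires java.io.OutputStream, java.security.MessageDigest, java.util.Random.
private static String writeRandomContent(OutputStream out, int length)
        throws IOException, NoSuchAlgorithmException {
    final MessageDigest digest = MessageDigest.getInstance("MD5");
    final Random random = new Random();
    final byte[] buffer = new byte[4096];

    int remaining = length;
    while (remaining > 0) {
        final int chunk = Math.min(remaining, buffer.length);
        random.nextBytes(buffer);
        out.write(buffer, 0, chunk);
        digest.update(buffer, 0, chunk);
        remaining -= chunk;
    }

    final StringBuilder hex = new StringBuilder();
    for (byte b : digest.digest()) {
        hex.append(String.format("%02x", b));
    }
    return hex.toString();
}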