Example 66 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in project flink by apache.

From the class FileInputFormatTest, method testGetStatisticsMultipleOneFileWithCachedVersion.

@Test
public void testGetStatisticsMultipleOneFileWithCachedVersion() throws IOException {
    FileSystem fs = FileSystem.getLocalFileSystem();
    final long size1 = 50873;
    final long fakeSize = 10065;
    String tempFile1 = TestFileUtils.createTempFile(size1);
    final long lastModTime1 = fs.getFileStatus(new Path(tempFile1)).getModificationTime();
    final long size2 = 52573;
    String tempFile2 = TestFileUtils.createTempFile(size2);
    final long lastModTime2 = fs.getFileStatus(new Path(tempFile2)).getModificationTime();
    final long sizeTotal = size1 + size2;
    MultiDummyFileInputFormat format = new MultiDummyFileInputFormat();
    format.setFilePaths(tempFile1, tempFile2);
    format.configure(new Configuration());
    FileBaseStatistics stats = format.getStatistics(null);
    Assert.assertEquals("The file size from the statistics is wrong.", sizeTotal, stats.getTotalInputSize());
    format = new MultiDummyFileInputFormat();
    format.setFilePath(tempFile1);
    format.configure(new Configuration());
    FileBaseStatistics newStats = format.getStatistics(stats);
    Assert.assertTrue("Statistics object was changed", newStats == stats);
    // Insert fake stats with the correct modification time; the call should return the fake stats.
    format = new MultiDummyFileInputFormat();
    format.setFilePath(tempFile1);
    format.configure(new Configuration());
    FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics latest = format.getStatistics(fakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", fakeSize, latest.getTotalInputSize());
    // Insert fake stats with an expired modification time; the call should return fresh, accurate stats.
    format = new MultiDummyFileInputFormat();
    format.setFilePaths(tempFile1, tempFile2);
    format.configure(new Configuration());
    FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(Math.min(lastModTime1, lastModTime2) - 1, fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", sizeTotal, reGathered.getTotalInputSize());
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.flink.configuration.Configuration), FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics), FileSystem (org.apache.flink.core.fs.FileSystem), BaseStatistics (org.apache.flink.api.common.io.statistics.BaseStatistics), Test (org.junit.Test)
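
For reference, here is a minimal sketch, with hypothetical values, of the FileBaseStatistics API the assertions above rely on (the types match the "Also used" list):

// Build a cached-statistics object: newest modification time, total size in
// bytes, and an unknown average record width.
long lastModTime = System.currentTimeMillis();
FileBaseStatistics cached = new FileBaseStatistics(
        lastModTime, 50873L + 52573L, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
long totalSize = cached.getTotalInputSize();     // 103446
long modTime = cached.getLastModificationTime(); // lastModTime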

Example 67 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in project flink by apache.

From the class DistCp, method main.

public static void main(String[] args) throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    ParameterTool params = ParameterTool.fromArgs(args);
    if (!params.has("input") || !params.has("output")) {
        System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
        return;
    }
    final Path sourcePath = new Path(params.get("input"));
    final Path targetPath = new Path(params.get("output"));
    if (!isLocal(env) && !(isOnDistributedFS(sourcePath) && isOnDistributedFS(targetPath))) {
        System.out.println("In a distributed mode only HDFS input/output paths are supported");
        return;
    }
    final int parallelism = params.getInt("parallelism", 10);
    if (parallelism <= 0) {
        System.err.println("Parallelism should be greater than 0");
        return;
    }
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(parallelism);
    long startTime = System.currentTimeMillis();
    LOGGER.info("Initializing copy tasks");
    List<FileCopyTask> tasks = getCopyTasks(sourcePath);
    LOGGER.info("Copy task initialization took " + (System.currentTimeMillis() - startTime) + "ms");
    DataSet<FileCopyTask> inputTasks = new DataSource<>(env, new FileCopyTaskInputFormat(tasks), new GenericTypeInfo<>(FileCopyTask.class), "fileCopyTasks");
    FlatMapOperator<FileCopyTask, Object> res = inputTasks.flatMap(new RichFlatMapFunction<FileCopyTask, Object>() {

        private static final long serialVersionUID = 1109254230243989929L;

        private LongCounter fileCounter;

        private LongCounter bytesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            bytesCounter = getRuntimeContext().getLongCounter(BYTES_COPIED_CNT_NAME);
            fileCounter = getRuntimeContext().getLongCounter(FILES_COPIED_CNT_NAME);
        }

        @Override
        public void flatMap(FileCopyTask task, Collector<Object> out) throws Exception {
            LOGGER.info("Processing task: " + task);
            Path outPath = new Path(targetPath, task.getRelativePath());
            FileSystem targetFs = targetPath.getFileSystem();
            // creating parent folders in case of a local FS
            if (!targetFs.isDistributedFS()) {
                // dealing with cases like file:///tmp or just /tmp
                File outFile = outPath.toUri().isAbsolute() ? new File(outPath.toUri()) : new File(outPath.toString());
                File parentFile = outFile.getParentFile();
                if (!parentFile.mkdirs() && !parentFile.exists()) {
                    throw new RuntimeException("Cannot create local file system directories: " + parentFile);
                }
            }
            FSDataOutputStream outputStream = null;
            FSDataInputStream inputStream = null;
            try {
                outputStream = targetFs.create(outPath, FileSystem.WriteMode.OVERWRITE);
                inputStream = task.getPath().getFileSystem().open(task.getPath());
                int bytes = IOUtils.copy(inputStream, outputStream);
                bytesCounter.add(bytes);
            } finally {
                IOUtils.closeQuietly(inputStream);
                IOUtils.closeQuietly(outputStream);
            }
            fileCounter.add(1L);
        }
    });
    // no data sinks are needed, therefore just printing an empty result
    res.print();
    Map<String, Object> accumulators = env.getLastJobExecutionResult().getAllAccumulatorResults();
    LOGGER.info("== COUNTERS ==");
    for (Map.Entry<String, Object> e : accumulators.entrySet()) {
        LOGGER.info(e.getKey() + ": " + e.getValue());
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Configuration (org.apache.flink.configuration.Configuration), LongCounter (org.apache.flink.api.common.accumulators.LongCounter), FileSystem (org.apache.flink.core.fs.FileSystem), FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream), Path (org.apache.flink.core.fs.Path), IOException (java.io.IOException), DataSource (org.apache.flink.api.java.operators.DataSource), FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream), File (java.io.File), Map (java.util.Map)
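
Since the job reads its settings with ParameterTool.fromArgs, it can also be launched programmatically. A minimal sketch, with hypothetical local paths:

// Copy a local directory tree with parallelism 4; both paths are made up for
// illustration and must be readable/creatable on the local file system.
DistCp.main(new String[] {
    "--input", "file:///tmp/distcp-src",
    "--output", "file:///tmp/distcp-dst",
    "--parallelism", "4"
});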

Example 68 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in project flink by apache.

From the class FileInputFormat, method createInputSplits.

/**
 * Computes the input splits for the file. By default, one file block is one split. If more
 * splits are requested than blocks are available, then a split may be a fraction of a block and
 * splits may cross block boundaries.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the desired number of splits into account
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();
    long totalLength = 0;
    for (Path path : getFilePaths()) {
        final FileSystem fs = path.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(path);
        if (pathFile.isDir()) {
            totalLength += addFilesInDir(path, files, true);
        } else {
            testForUnsplittable(pathFile);
            files.add(pathFile);
            totalLength += pathFile.getLen();
        }
    }
    // if the input is unsplittable, create one whole-file split per file and return
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            final FileSystem fs = file.getPath().getFileSystem();
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    final long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1);
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final FileSystem fs = file.getPath().getFileSystem();
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are ordered by their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case for a zero-length file
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used: Path (org.apache.flink.core.fs.Path), FileStatus (org.apache.flink.core.fs.FileStatus), ArrayList (java.util.ArrayList), BlockLocation (org.apache.flink.core.fs.BlockLocation), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), FileSystem (org.apache.flink.core.fs.FileSystem), HashSet (java.util.HashSet)
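
A worked sketch, with hypothetical numbers, of the split-size arithmetic above: maxSplitSize is a ceiling division of the total input length by the requested split count, and the effective split size is the smaller of maxSplitSize and the block size, but never below minSplitSize:

// 1 MB of input, requested as at least 3 splits.
long totalLength = 1_000_000L;
int minNumSplits = 3;
// Ceiling division: 333_334 bytes per split.
long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1);
// With a 128 MB block size and no minimum split size, the block size does not cap the split.
long blockSize = 128L * 1024 * 1024;
long minSplitSize = 0L;
long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize)); // 333_334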

Example 69 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in project flink by apache.

From the class BinaryInputFormat, method getFiles.

protected List<FileStatus> getFiles() throws IOException {
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();
    for (Path filePath : getFilePaths()) {
        final FileSystem fs = filePath.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(filePath);
        if (pathFile.isDir()) {
            // input is directory. list all contained files
            final FileStatus[] partials = fs.listStatus(filePath);
            for (FileStatus partial : partials) {
                if (!partial.isDir()) {
                    files.add(partial);
                }
            }
        } else {
            files.add(pathFile);
        }
    }
    return files;
}
Also used: Path (org.apache.flink.core.fs.Path), FileStatus (org.apache.flink.core.fs.FileStatus), FileSystem (org.apache.flink.core.fs.FileSystem), ArrayList (java.util.ArrayList)
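
The same non-recursive listing pattern can be written directly against the core FileSystem API. A minimal standalone sketch; the directory path is hypothetical:

Path dir = new Path("file:///tmp/input-data");
FileSystem fs = dir.getFileSystem();
List<FileStatus> files = new ArrayList<>();
for (FileStatus status : fs.listStatus(dir)) {
    // only immediate regular files are collected; subdirectories are skipped
    if (!status.isDir()) {
        files.add(status);
    }
}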

Example 70 with FileSystem

Use of org.apache.flink.core.fs.FileSystem in project flink by apache.

From the class FileUtilsTest, method generateTestFile.

/**
 * Generates a file with random content.
 *
 * @param outputFile the path of the output file
 * @param length the size, in bytes, of the content to generate
 * @return the MD5 hash of the output file's content
 * @throws IOException if the file cannot be written
 * @throws NoSuchAlgorithmException if the MD5 algorithm is not available
 */
private static String generateTestFile(String outputFile, int length) throws IOException, NoSuchAlgorithmException {
    Path outputFilePath = new Path(outputFile);
    final FileSystem fileSystem = outputFilePath.getFileSystem();
    try (final FSDataOutputStream fsDataOutputStream = fileSystem.create(outputFilePath, FileSystem.WriteMode.OVERWRITE)) {
        return writeRandomContent(fsDataOutputStream, length);
    }
}
Also used: Path (org.apache.flink.core.fs.Path), FileSystem (org.apache.flink.core.fs.FileSystem), FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream)
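
The create-and-write pattern above works against any Flink FileSystem implementation. A minimal sketch; the path and payload are hypothetical:

Path out = new Path("file:///tmp/generated.bin");
FileSystem fs = out.getFileSystem();
// WriteMode.OVERWRITE replaces any existing file at the target path.
try (FSDataOutputStream stream = fs.create(out, FileSystem.WriteMode.OVERWRITE)) {
    stream.write(new byte[] { 0x01, 0x02, 0x03 });
}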

Aggregations

FileSystem (org.apache.flink.core.fs.FileSystem): 102
Path (org.apache.flink.core.fs.Path): 80
Test (org.junit.Test): 49
IOException (java.io.IOException): 28
File (java.io.File): 24
FileStatus (org.apache.flink.core.fs.FileStatus): 20
FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream): 18
FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream): 14
URI (java.net.URI): 13
LocalFileSystem (org.apache.flink.core.fs.local.LocalFileSystem): 13
ArrayList (java.util.ArrayList): 10
Random (java.util.Random): 8
Configuration (org.apache.flink.configuration.Configuration): 8
JobID (org.apache.flink.api.common.JobID): 7
FileNotFoundException (java.io.FileNotFoundException): 5
StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle): 5
InputStream (java.io.InputStream): 4
URISyntaxException (java.net.URISyntaxException): 4
FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics): 4
FsCheckpointStateOutputStream (org.apache.flink.runtime.state.filesystem.FsCheckpointStreamFactory.FsCheckpointStateOutputStream): 4