Search in sources :

Example 16 with FileStatus

Use of org.apache.flink.core.fs.FileStatus in the Apache Flink project (flink by apache).

From the class FileInputFormat, method createInputSplits:

/**
 * Computes the input splits for the file. By default, one file block is one split. If more splits
 * are requested than blocks are available, then a split may be a fraction of a block and splits may cross
 * block boundaries.
 *
 * <p>Unsplittable files always produce exactly one split per file, covering the whole file.
 *
 * @param minNumSplits The minimum desired number of file splits.
 * @return The computed file splits.
 * @throws IOException Thrown if the file system could not be accessed while listing files or blocks.
 *
 * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    if (minNumSplits < 1) {
        throw new IllegalArgumentException("Number of input splits has to be at least 1.");
    }
    // take the larger of the requested minimum and the configured number of splits
    minNumSplits = Math.max(minNumSplits, this.numSplits);
    final Path path = this.filePath;
    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);
    if (pathFile.isDir()) {
        // directory input: enumerate the contained files and sum up their total length
        // (the boolean presumably enables recursion into nested dirs — confirm addFilesInDir)
        totalLength += addFilesInDir(path, files, true);
    } else {
        testForUnsplittable(pathFile);
        files.add(pathFile);
        totalLength += pathFile.getLen();
    }
    // unsplittable input: emit exactly one split per file, spanning the whole file
    if (unsplittable) {
        int splitNum = 0;
        for (final FileStatus file : files) {
            // collect the hosts of ALL blocks, since the single split covers the entire file
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
            Set<String> hosts = new HashSet<String>();
            for (BlockLocation block : blocks) {
                hosts.addAll(Arrays.asList(block.getHosts()));
            }
            long len = file.getLen();
            if (testForUnsplittable(file)) {
                // flag value instructing the reader to consume the file as a whole
                len = READ_WHOLE_SPLIT_FLAG;
            }
            FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()]));
            inputSplits.add(fis);
        }
        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }
    // target split size: total length divided by the desired split count, rounded up
    // (minNumSplits is guaranteed >= 1 at this point, so the Long.MAX_VALUE branch is purely defensive)
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));
    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {
        final long len = file.getLen();
        final long blockSize = file.getBlockSize();
        final long minSplitSize;
        // cap the configured minimal split size at the file's block size
        if (this.minSplitSize <= blockSize) {
            minSplitSize = this.minSplitSize;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + blockSize + ". Decreasing minimal split size to block size.");
            }
            minSplitSize = blockSize;
        }
        // actual split size: the desired size, clamped between minSplitSize and blockSize
        final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
        final long halfSplit = splitSize >>> 1;
        // the last split may hold up to (splitSize * MAX_SPLIT_SIZE_DISCREPANCY) bytes,
        // which avoids generating a tiny trailing split
        final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);
        if (len > 0) {
            // get the block locations and make sure they are in order with respect to their offset
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
            Arrays.sort(blocks);
            long bytesUnassigned = len;
            long position = 0;
            int blockIndex = 0;
            while (bytesUnassigned > maxBytesForLastSplit) {
                // get the block containing the majority of the data
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                // create a new split
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
                // adjust the positions
                position += splitSize;
                bytesUnassigned -= splitSize;
            }
            // assign the last split (may be larger than splitSize, see maxBytesForLastSplit above)
            if (bytesUnassigned > 0) {
                blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, bytesUnassigned, blocks[blockIndex].getHosts());
                inputSplits.add(fis);
            }
        } else {
            // special case with a file of zero bytes size: still emit one (empty) split
            final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
            String[] hosts;
            if (blocks.length > 0) {
                hosts = blocks[0].getHosts();
            } else {
                hosts = new String[0];
            }
            final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
            inputSplits.add(fis);
        }
    }
    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) ArrayList(java.util.ArrayList) BlockLocation(org.apache.flink.core.fs.BlockLocation) FileInputSplit(org.apache.flink.core.fs.FileInputSplit) FileSystem(org.apache.flink.core.fs.FileSystem) HashSet(java.util.HashSet)

Example 17 with FileStatus

Use of org.apache.flink.core.fs.FileStatus in the Apache Flink project (flink by apache).

From the class BinaryInputFormat, method getFiles:

/**
 * Collects the regular files covered by {@code filePath}: the single file itself, or —
 * when the path is a directory — its directly contained non-directory entries.
 *
 * @return the file statuses of all files involved in the splits
 * @throws IOException if the file system or the path cannot be accessed
 */
protected List<FileStatus> getFiles() throws IOException {
    final List<FileStatus> result = new ArrayList<FileStatus>();
    final FileSystem fileSystem = this.filePath.getFileSystem();
    final FileStatus pathStatus = fileSystem.getFileStatus(this.filePath);
    if (pathStatus.isDir()) {
        // input is a directory: keep every directly contained regular file,
        // skipping nested directories
        for (FileStatus candidate : fileSystem.listStatus(this.filePath)) {
            if (!candidate.isDir()) {
                result.add(candidate);
            }
        }
    } else {
        // input is a single file
        result.add(pathStatus);
    }
    return result;
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList)

Example 18 with FileStatus

Use of org.apache.flink.core.fs.FileStatus in the Apache Flink project (flink by apache).

From the class BinaryInputFormat, method getStatistics:

/**
 * Obtains basic statistics for the input file(s), reusing previously cached statistics
 * when they are still valid. Returns {@code null} when no statistics can be determined;
 * I/O and unexpected errors are logged rather than propagated.
 *
 * @param cachedStats previously cached statistics, or {@code null}
 * @return sequential statistics for the input, or {@code null} if unavailable
 */
@Override
public SequentialStatistics getStatistics(BaseStatistics cachedStats) {
    // instanceof is null-safe, so a null cachedStats simply yields null here
    FileBaseStatistics cachedFileStats = null;
    if (cachedStats instanceof FileBaseStatistics) {
        cachedFileStats = (FileBaseStatistics) cachedStats;
    }
    try {
        final Path filePath = this.filePath;
        // resolve the file system for the input path
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // sequential stats are still valid as-is; otherwise compute them from the file list
        if (stats instanceof SequentialStatistics) {
            return (SequentialStatistics) stats;
        }
        return createStatistics(allFiles, stats);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn(String.format("Could not determine complete statistics for file '%s' due to an I/O error", this.filePath), ioex);
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error(String.format("Unexpected problem while getting the file statistics for file '%s'", this.filePath), t);
        }
    }
    // no stats available
    return null;
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileSystem(org.apache.flink.core.fs.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 19 with FileStatus

Use of org.apache.flink.core.fs.FileStatus in the Apache Flink project (flink by apache).

From the class LocalFileSystemTest, method testLocalFilesystem:

/**
 * This test checks the functionality of the {@link LocalFileSystem} class:
 * directory create/list/delete, file creation and write/read round-trips,
 * file meta data (length, block locations), and recursive deletion.
 *
 * <p>Fixes over the previous version: all streams are opened via try-with-resources
 * so they are closed even when an assertion fails mid-test; JUnit assertEquals
 * arguments follow the (expected, actual) convention; weaker assertTrue idioms are
 * replaced with assertEquals/assertArrayEquals/assertFalse for better failure messages.
 */
@Test
public void testLocalFilesystem() {
    final File tempdir = new File(CommonTestUtils.getTempDir(), UUID.randomUUID().toString());
    final File testfile1 = new File(tempdir, UUID.randomUUID().toString());
    final File testfile2 = new File(tempdir, UUID.randomUUID().toString());
    final Path pathtotestfile1 = new Path(testfile1.toURI().getPath());
    final Path pathtotestfile2 = new Path(testfile2.toURI().getPath());
    try {
        final LocalFileSystem lfs = new LocalFileSystem();
        final Path pathtotmpdir = new Path(tempdir.toURI().getPath());
        /*
         * check that lfs can see/create/delete/read directories
         */
        // check that dir is not existent yet
        assertFalse(lfs.exists(pathtotmpdir));
        assertTrue(tempdir.mkdirs());
        // check that local file system recognizes file..
        assertTrue(lfs.exists(pathtotmpdir));
        final FileStatus localstatus1 = lfs.getFileStatus(pathtotmpdir);
        // check that lfs recognizes directory..
        assertTrue(localstatus1.isDir());
        // get status for files in this (empty) directory.. must be zero entries
        final FileStatus[] statusforfiles = lfs.listStatus(pathtotmpdir);
        assertEquals(0, statusforfiles.length);
        // check that lfs can delete directory..
        lfs.delete(pathtotmpdir, true);
        // double check that directory is not existent anymore..
        assertFalse(lfs.exists(pathtotmpdir));
        assertFalse(tempdir.exists());
        // re-create directory..
        lfs.mkdirs(pathtotmpdir);
        // creation successful?
        assertTrue(tempdir.exists());
        /*
         * check that lfs can create/read/write from/to files properly and read meta information..
         */
        final byte[] testbytes = { 1, 2, 3, 4, 5 };
        // create files.. one "natively", one using lfs; close the lfs stream even on failure
        try (FSDataOutputStream lfsoutput1 = lfs.create(pathtotestfile1, false)) {
            assertTrue(testfile2.createNewFile());
            // does lfs create files? does lfs recognize created files?
            assertTrue(testfile1.exists());
            assertTrue(lfs.exists(pathtotestfile2));
            // test that lfs can write to files properly
            lfsoutput1.write(testbytes);
        }
        assertEquals(5L, testfile1.length());
        byte[] testbytestest = new byte[5];
        try (FileInputStream fisfile1 = new FileInputStream(testfile1)) {
            assertEquals(testbytestest.length, fisfile1.read(testbytestest));
        }
        assertArrayEquals(testbytes, testbytestest);
        // does lfs see the correct file length?
        assertEquals(testfile1.length(), lfs.getFileStatus(pathtotestfile1).getLen());
        // as well, when we call the listStatus (that is intended for directories?)
        assertEquals(testfile1.length(), lfs.listStatus(pathtotestfile1)[0].getLen());
        // test that lfs can read files properly
        try (FileOutputStream fosfile2 = new FileOutputStream(testfile2)) {
            fosfile2.write(testbytes);
        }
        testbytestest = new byte[5];
        try (FSDataInputStream lfsinput2 = lfs.open(pathtotestfile2)) {
            assertEquals(5, lfsinput2.read(testbytestest));
        }
        assertArrayEquals(testbytes, testbytestest);
        // does lfs see two files?
        assertEquals(2, lfs.listStatus(pathtotmpdir).length);
        // do we get exactly one blocklocation per file? no matter what start and len we provide
        assertEquals(1, lfs.getFileBlockLocations(lfs.getFileStatus(pathtotestfile1), 0, 0).length);
        /*
         * can lfs delete files / directories?
         */
        assertTrue(lfs.delete(pathtotestfile1, false));
        // and can lfs also delete directories recursively?
        assertTrue(lfs.delete(pathtotmpdir, true));
        assertFalse(tempdir.exists());
    } catch (IOException e) {
        fail(e.getMessage());
    } finally {
        // clean up, even if an assertion failed half-way through
        testfile1.delete();
        testfile2.delete();
        tempdir.delete();
    }
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileOutputStream(java.io.FileOutputStream) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) FSDataOutputStream(org.apache.flink.core.fs.FSDataOutputStream) IOException(java.io.IOException) File(java.io.File) FileInputStream(java.io.FileInputStream) Test(org.junit.Test)

Example 20 with FileStatus

Use of org.apache.flink.core.fs.FileStatus in the Apache Flink project (flink by apache).

From the class HadoopInputFormatBase, method getStatistics:

/**
 * Gathers basic statistics for the wrapped Hadoop input format. Statistics are only
 * available when the wrapped format is a {@code FileInputFormat}; in every other case,
 * and when an error occurs while reading them, {@code null} is returned.
 *
 * @param cachedStats previously cached statistics, or {@code null}
 * @return the file base statistics, or {@code null} if unavailable
 * @throws IOException declared for interface compatibility
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // only gather base statistics for FileInputFormats
    if (!(mapreduceInputFormat instanceof FileInputFormat)) {
        return null;
    }
    final JobContext jobContext;
    try {
        jobContext = HadoopUtils.instantiateJobContext(configuration, null);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    // instanceof is null-safe, so a null cachedStats simply yields null here
    FileBaseStatistics cachedFileStats = null;
    if (cachedStats instanceof FileBaseStatistics) {
        cachedFileStats = (FileBaseStatistics) cachedStats;
    }
    try {
        final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(jobContext);
        return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
        }
    }
    // no statistics available
    return null;
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) IOException(java.io.IOException) JobContext(org.apache.hadoop.mapreduce.JobContext)

Aggregations

FileStatus (org.apache.flink.core.fs.FileStatus)24 Path (org.apache.flink.core.fs.Path)16 FileSystem (org.apache.flink.core.fs.FileSystem)13 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)8 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)4 File (java.io.File)3 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)3 FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream)3 HashMap (java.util.HashMap)2 List (java.util.List)2 TreeMap (java.util.TreeMap)2 BlockLocation (org.apache.flink.core.fs.BlockLocation)2 FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream)2 DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper)2 DataInputStream (java.io.DataInputStream)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1