Search in sources:

Example 1 with FileBaseStatistics

use of org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics in project flink by apache.

the class FileInputFormatTest method testGetStatisticsOneFileWithCachedVersion.

/**
 * Checks how {@code getStatistics} handles previously cached statistics when the input
 * path is a single file.
 *
 * <p>The test walks through four steps, each with a freshly configured format:
 * <ol>
 *   <li>no cached statistics: the reported total input size equals the size passed to
 *       {@code TestFileUtils.createTempFile};</li>
 *   <li>the statistics from step 1 as cache: the very same object is handed back;</li>
 *   <li>fake statistics carrying the same modification time: the fake size is reported;</li>
 *   <li>fake statistics whose modification time is one less: the real size is reported again.</li>
 * </ol>
 *
 * <p>Any exception thrown along the way fails the test.
 */
@Test
public void testGetStatisticsOneFileWithCachedVersion() {
    try {
        final long SIZE = 50873;
        final long FAKE_SIZE = 10065;
        String tempFile = TestFileUtils.createTempFile(SIZE);
        DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        FileBaseStatistics stats = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", SIZE, stats.getTotalInputSize());
        // hand the freshly gathered stats back in: they are still valid, so the same instance must be returned
        format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        FileBaseStatistics newStats = format.getStatistics(stats);
        // assertSame checks reference identity and reports both objects on failure
        Assert.assertSame("Statistics object was changed", stats, newStats);
        // insert fake stats with the correct modification time. the call should return the fake stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics latest = format.getStatistics(fakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());
        // insert fake stats with the expired modification time. the call should return new accurate stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempFile);
        format.configure(new Configuration());
        FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", SIZE, reGathered.getTotalInputSize());
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 2 with FileBaseStatistics

use of org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics in project flink by apache.

the class FileInputFormatTest method testGetStatisticsMultipleFilesWithCachedVersion.

/**
 * Checks how {@code getStatistics} handles previously cached statistics when the input
 * path is a directory holding several files.
 *
 * <p>The test walks through four steps, each with a freshly configured format:
 * <ol>
 *   <li>no cached statistics: the reported total input size equals the sum of the three
 *       sizes passed to {@code TestFileUtils.createTempFileDir};</li>
 *   <li>the statistics from step 1 as cache: the very same object is handed back;</li>
 *   <li>fake statistics carrying the same modification time: the fake size is reported;</li>
 *   <li>fake statistics whose modification time is one less: the real total is reported again.</li>
 * </ol>
 *
 * <p>Any exception thrown along the way fails the test.
 */
@Test
public void testGetStatisticsMultipleFilesWithCachedVersion() {
    try {
        final long SIZE1 = 2077;
        final long SIZE2 = 31909;
        final long SIZE3 = 10;
        final long TOTAL = SIZE1 + SIZE2 + SIZE3;
        final long FAKE_SIZE = 10065;
        String tempDir = TestFileUtils.createTempFileDir(SIZE1, SIZE2, SIZE3);
        DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics stats = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, stats.getTotalInputSize());
        // hand the freshly gathered stats back in: they are still valid, so the same instance must be returned
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics newStats = format.getStatistics(stats);
        // assertSame checks reference identity and reports both objects on failure
        Assert.assertSame("Statistics object was changed", stats, newStats);
        // insert fake stats with the correct modification time. the call should return the fake stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics latest = format.getStatistics(fakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());
        // insert fake stats with the expired modification time. the call should return new accurate stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, reGathered.getTotalInputSize());
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 3 with FileBaseStatistics

use of org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics in project flink by apache.

the class HadoopInputFormatBase method getFileStats.

// --------------------------------------------------------------------------------------------
//  Helper methods
// --------------------------------------------------------------------------------------------
/**
 * Gathers file-based statistics for the given Hadoop input paths, reusing the cached
 * statistics when nothing has been modified since they were taken.
 *
 * <p>Each path is resolved through the Flink {@link FileSystem}. A plain file is added to
 * {@code files} directly; for a directory, its direct non-directory children are added
 * (nested directories are not descended into). The newest modification time is tracked
 * over every input path itself and over every collected child file.
 *
 * @param cachedStats previously gathered statistics, or {@code null} if there are none
 * @param hadoopFilePaths the Hadoop input paths to inspect
 * @param files list that receives the status of every file found; entries already present
 *     also count towards the total length
 * @return {@code cachedStats} if it is non-null and its modification time is not older than
 *     the newest one observed; otherwise new statistics with the summed file length, or
 *     {@link BaseStatistics#SIZE_UNKNOWN} when that sum is not positive
 * @throws IOException declared for the file system lookups performed here
 */
private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths, ArrayList<FileStatus> files) throws IOException {
    long newestModification = 0L;
    // collect the status of every input file and track the newest modification time
    for (int i = 0; i < hadoopFilePaths.length; i++) {
        final Path path = new Path(hadoopFilePaths[i].toUri());
        final FileSystem fileSystem = FileSystem.get(path.toUri());
        final FileStatus status = fileSystem.getFileStatus(path);
        newestModification = Math.max(newestModification, status.getModificationTime());
        if (!status.isDir()) {
            // a plain file is taken as is
            files.add(status);
            continue;
        }
        // a directory contributes its direct children that are files
        final FileStatus[] children = fileSystem.listStatus(path);
        files.ensureCapacity(files.size() + children.length);
        for (FileStatus child : children) {
            if (child.isDir()) {
                continue;
            }
            files.add(child);
            newestModification = Math.max(child.getModificationTime(), newestModification);
        }
    }
    // the cache stays valid as long as nothing was modified after it was taken
    final boolean cacheStillValid = cachedStats != null && newestModification <= cachedStats.getLastModificationTime();
    if (cacheStillValid) {
        return cachedStats;
    }
    // sum up the length of all collected files
    long totalLength = 0;
    for (FileStatus status : files) {
        totalLength += status.getLen();
    }
    // a non-positive total is reported as unknown
    final long reportedLength = totalLength > 0 ? totalLength : BaseStatistics.SIZE_UNKNOWN;
    return new FileBaseStatistics(newestModification, reportedLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) FileSystem(org.apache.flink.core.fs.FileSystem)

Example 4 with FileBaseStatistics

use of org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics in project flink by apache.

the class HadoopInputFormatBase method getStatistics.

/**
 * Returns base statistics for the wrapped Hadoop input format, if it is file based.
 *
 * <p>Statistics are only gathered when the wrapped format is a Hadoop
 * {@link FileInputFormat}. I/O errors and any other problem raised while gathering are
 * logged and result in {@code null} rather than being propagated.
 *
 * @param cachedStats statistics from an earlier call, or {@code null}; used as cache only
 *     when they are {@link FileBaseStatistics}
 * @return the (possibly cached) file statistics, or {@code null} if the wrapped format is
 *     not file based or the statistics could not be determined
 * @throws IOException declared by the overridden method; I/O errors raised while gathering
 *     are caught and logged here
 * @throws RuntimeException if the Hadoop job context cannot be instantiated
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    // statistics can only be derived for file based input formats
    final boolean fileBased = mapreduceInputFormat instanceof FileInputFormat;
    if (!fileBased) {
        return null;
    }
    JobContext context;
    try {
        context = HadoopUtils.instantiateJobContext(configuration, null);
    } catch (Exception cause) {
        throw new RuntimeException(cause);
    }
    // instanceof is false for null, so no separate null check is needed
    FileBaseStatistics previous = null;
    if (cachedStats instanceof FileBaseStatistics) {
        previous = (FileBaseStatistics) cachedStats;
    }
    try {
        final org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(context);
        final ArrayList<FileStatus> collected = new ArrayList<FileStatus>(1);
        return getFileStats(previous, inputPaths, collected);
    } catch (IOException ioFailure) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics due to an io error: " + ioFailure.getMessage());
        }
    } catch (Throwable unexpected) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics: " + unexpected.getMessage(), unexpected);
        }
    }
    // gathering failed, so there is nothing to report
    return null;
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) IOException(java.io.IOException) JobContext(org.apache.hadoop.mapreduce.JobContext)

Example 5 with FileBaseStatistics

use of org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics in project flink by apache.

the class HadoopInputFormatBase method getFileStats.

// --------------------------------------------------------------------------------------------
//  Helper methods
// --------------------------------------------------------------------------------------------
/**
 * Gathers file-based statistics for the given Hadoop input paths, reusing the cached
 * statistics when nothing has been modified since they were taken.
 *
 * <p>Each path is resolved through the Flink {@link FileSystem}. A plain file is added to
 * {@code files} directly; for a directory, its direct non-directory children are added
 * (nested directories are not descended into). The newest modification time is tracked
 * over every input path itself and over every collected child file.
 *
 * @param cachedStats previously gathered statistics, or {@code null} if there are none
 * @param hadoopFilePaths the Hadoop input paths to inspect
 * @param files list that receives the status of every file found; entries already present
 *     also count towards the total length
 * @return {@code cachedStats} if it is non-null and its modification time is not older than
 *     the newest one observed; otherwise new statistics with the summed file length, or
 *     {@link BaseStatistics#SIZE_UNKNOWN} when that sum is not positive
 * @throws IOException declared for the file system lookups performed here
 */
private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths, ArrayList<FileStatus> files) throws IOException {
    long newestModification = 0L;
    // collect the status of every input file and track the newest modification time
    for (int i = 0; i < hadoopFilePaths.length; i++) {
        final Path path = new Path(hadoopFilePaths[i].toUri());
        final FileSystem fileSystem = FileSystem.get(path.toUri());
        final FileStatus status = fileSystem.getFileStatus(path);
        newestModification = Math.max(newestModification, status.getModificationTime());
        if (!status.isDir()) {
            // a plain file is taken as is
            files.add(status);
            continue;
        }
        // a directory contributes its direct children that are files
        final FileStatus[] children = fileSystem.listStatus(path);
        files.ensureCapacity(files.size() + children.length);
        for (FileStatus child : children) {
            if (child.isDir()) {
                continue;
            }
            files.add(child);
            newestModification = Math.max(child.getModificationTime(), newestModification);
        }
    }
    // the cache stays valid as long as nothing was modified after it was taken
    final boolean cacheStillValid = cachedStats != null && newestModification <= cachedStats.getLastModificationTime();
    if (cacheStillValid) {
        return cachedStats;
    }
    // sum up the length of all collected files
    long totalLength = 0;
    for (FileStatus status : files) {
        totalLength += status.getLen();
    }
    // a non-positive total is reported as unknown
    final long reportedLength = totalLength > 0 ? totalLength : BaseStatistics.SIZE_UNKNOWN;
    return new FileBaseStatistics(newestModification, reportedLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) FileSystem(org.apache.flink.core.fs.FileSystem)

Aggregations

FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)5 IOException (java.io.IOException)3 FileStatus (org.apache.flink.core.fs.FileStatus)3 Path (org.apache.flink.core.fs.Path)3 BaseStatistics (org.apache.flink.api.common.io.statistics.BaseStatistics)2 Configuration (org.apache.flink.configuration.Configuration)2 FileSystem (org.apache.flink.core.fs.FileSystem)2 Test (org.junit.Test)2 JobContext (org.apache.hadoop.mapreduce.JobContext)1 FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat)1