Search in sources :

Example 6 with BaseStatistics

use of org.apache.flink.api.common.io.statistics.BaseStatistics in project flink by apache.

the class DelimitedInputFormatSamplingTest method testSamplingOneFile.

/**
 * Checks the statistics a {@code TestDelimitedInputFormat} reports for a single temp file
 * created from {@code TEST_DATA1}.
 *
 * <p>The record count must lie strictly between {@code TEST_DATA_1_LINES - 1} and
 * {@code TEST_DATA_1_LINES + 1}; the average record width must lie strictly within one of
 * {@code TEST_DATA1.length() / TEST_DATA_1_LINES}.
 */
@Test
public void testSamplingOneFile() {
    try {
        final String tempFile = TestFileUtils.createTempFile(TEST_DATA1);
        final Configuration conf = new Configuration();
        final TestDelimitedInputFormat format = new TestDelimitedInputFormat(CONFIG);
        format.setFilePath(tempFile);
        format.configure(conf);
        BaseStatistics stats = format.getStatistics(null);
        final int numLines = TEST_DATA_1_LINES;
        // cast before dividing so the expected width is not truncated by integer division
        final float avgWidth = ((float) TEST_DATA1.length()) / TEST_DATA_1_LINES;
        // both bounds are combined with the logical (short-circuit) AND rather than the bitwise '&'
        Assert.assertTrue("Wrong record count.", stats.getNumberOfRecords() < numLines + 1 && stats.getNumberOfRecords() > numLines - 1);
        Assert.assertTrue("Wrong avg record size.", stats.getAverageRecordWidth() < avgWidth + 1 && stats.getAverageRecordWidth() > avgWidth - 1);
    } catch (Exception e) {
        // any exception is reported as a test failure carrying the exception's message
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) Test(org.junit.Test)

Example 7 with BaseStatistics

use of org.apache.flink.api.common.io.statistics.BaseStatistics in project flink by apache.

the class EnumerateNestedFilesTest method testGetStatisticsMultipleNestedFiles.

/**
 * Builds a directory with two sub-directories under {@code tempPath}, places four files of
 * known sizes in them, and checks that the statistics report the summed size when
 * {@code "recursive.file.enumeration"} is enabled.
 *
 * <p>A fifth file is then added and the statistics are requested again, passing in the
 * previous result; the new result must differ from the old one and include the added bytes.
 */
@Test
public void testGetStatisticsMultipleNestedFiles() {
    try {
        final long topLevelFileSize = 2077;
        final long firstNestedFileSize = 31909;
        final long secondNestedFileSize = 10;
        final long otherDirFileSize = 71;
        final long expectedTotal = topLevelFileSize + firstNestedFileSize + secondNestedFileSize + otherDirFileSize;
        final long addedFileSize = 42L;

        String topDirName = TestFileUtils.randomFileName();
        String subDirNameA = TestFileUtils.randomFileName();
        String subDirNameB = TestFileUtils.randomFileName();

        final String sep = System.getProperty("file.separator");
        final String topDirPath = tempPath + sep + topDirName;

        File topDir = new File(topDirPath);
        topDir.mkdirs();
        topDir.deleteOnExit();

        File subDirA = new File(topDirPath + sep + subDirNameA);
        subDirA.mkdirs();
        subDirA.deleteOnExit();

        File subDirB = new File(topDirPath + sep + subDirNameB);
        subDirB.mkdirs();
        subDirB.deleteOnExit();

        // one file directly in the top directory, two in the first sub-directory, one in the second
        TestFileUtils.createTempFileInDirectory(topDir.getAbsolutePath(), topLevelFileSize);
        TestFileUtils.createTempFileInDirectory(subDirA.getAbsolutePath(), firstNestedFileSize);
        TestFileUtils.createTempFileInDirectory(subDirA.getAbsolutePath(), secondNestedFileSize);
        TestFileUtils.createTempFileInDirectory(subDirB.getAbsolutePath(), otherDirFileSize);

        this.format.setFilePath(new Path(topDir.toURI().toString()));
        this.config.setBoolean("recursive.file.enumeration", true);
        format.configure(this.config);

        BaseStatistics initialStats = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", expectedTotal, initialStats.getTotalInputSize());

        // Second pass: add a file so the earlier statistics no longer match, then ask again.
        // File modification times have low accuracy, hence the pause before the new file is written.
        Thread.sleep(1000);
        TestFileUtils.createTempFileInDirectory(subDirA.getAbsolutePath(), addedFileSize);

        BaseStatistics refreshedStats = format.getStatistics(initialStats);
        Assert.assertNotEquals(refreshedStats, initialStats);
        Assert.assertEquals("The file size from the statistics is wrong.", expectedTotal + addedFileSize, refreshedStats.getTotalInputSize());
    } catch (Exception failure) {
        failure.printStackTrace();
        Assert.fail(failure.getMessage());
    }
}
Also used : Path(org.apache.flink.core.fs.Path) File(java.io.File) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 8 with BaseStatistics

use of org.apache.flink.api.common.io.statistics.BaseStatistics in project flink by apache.

the class FileInputFormatTest method testGetStatisticsMultipleFilesWithCachedVersion.

/**
 * Exercises {@code getStatistics} on a directory of three files when previously obtained
 * statistics are passed back in.
 *
 * <p>Four cases are checked, each with a freshly configured {@code DummyFileInputFormat}:
 * <ol>
 *   <li>no previous statistics: the total size is the sum of the three files;</li>
 *   <li>the statistics from step 1 are passed in: the very same object is returned;</li>
 *   <li>fake statistics with the same modification time but a different size are passed in:
 *       the fake size is reported;</li>
 *   <li>fake statistics with a modification time one less than the real one are passed in:
 *       the real total size is reported again.</li>
 * </ol>
 */
@Test
public void testGetStatisticsMultipleFilesWithCachedVersion() {
    try {
        final long SIZE1 = 2077;
        final long SIZE2 = 31909;
        final long SIZE3 = 10;
        final long TOTAL = SIZE1 + SIZE2 + SIZE3;
        final long FAKE_SIZE = 10065;
        String tempDir = TestFileUtils.createTempFileDir(SIZE1, SIZE2, SIZE3);
        DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics stats = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, stats.getTotalInputSize());
        // hand the statistics just obtained to a new format instance; the same object must come back
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics newStats = format.getStatistics(stats);
        Assert.assertSame("Statistics object was changed", stats, newStats);
        // insert fake stats with the correct modification time. the call should return the fake stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics latest = format.getStatistics(fakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());
        // insert fake stats with an outdated modification time (one less than the real one).
        // the call should not return the fake size but report the real total size again
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, reGathered.getTotalInputSize());
    } catch (Exception ex) {
        // any exception is reported as a test failure carrying the exception's message
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 9 with BaseStatistics

use of org.apache.flink.api.common.io.statistics.BaseStatistics in project flink by apache.

the class FileInputFormatTest method testGetStatsIgnoredUnderscoreFiles.

/**
 * Creates four equally sized files in the temporary folder, two of which have names starting
 * with an underscore, and checks that the reported total input size equals the size of two
 * files only.
 */
@Test
public void testGetStatsIgnoredUnderscoreFiles() {
    try {
        final int bytesPerFile = 2048;
        final long expectedTotal = 2 * bytesPerFile;

        // two files expected to be counted ...
        File textData = temporaryFolder.newFile("dataFile1.txt");
        File binaryData = temporaryFolder.newFile("another_file.bin");
        // ... and two expected to be left out of the statistics
        File luigiMarker = temporaryFolder.newFile("_luigi");
        File successMarker = temporaryFolder.newFile("_SUCCESS");
        createTempFiles(new byte[bytesPerFile], textData, binaryData, luigiMarker, successMarker);

        final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
        final String rootUri = temporaryFolder.getRoot().toURI().toString();
        inputFormat.setFilePath(rootUri);
        inputFormat.configure(new Configuration());

        // the total must reflect only the two non-underscore files
        BaseStatistics gathered = inputFormat.getStatistics(null);
        Assert.assertEquals(expectedTotal, gathered.getTotalInputSize());
    } catch (Exception failure) {
        System.err.println(failure.getMessage());
        failure.printStackTrace();
        Assert.fail(failure.getMessage());
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) File(java.io.File) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 10 with BaseStatistics

use of org.apache.flink.api.common.io.statistics.BaseStatistics in project flink by apache.

the class EnumerateNestedFilesTest method testGetStatisticsOneFileInNestedDir.

/**
 * Places a single file of known size in a sub-directory of a fresh directory under
 * {@code tempPath} and checks that, with {@code "recursive.file.enumeration"} enabled, the
 * statistics for the outer directory report exactly that size.
 */
@Test
public void testGetStatisticsOneFileInNestedDir() {
    try {
        final long fileSize = 1024 * 500;

        String outerDirName = TestFileUtils.randomFileName();
        String innerDirName = TestFileUtils.randomFileName();

        final String sep = System.getProperty("file.separator");
        final String outerDirPath = tempPath + sep + outerDirName;

        File outerDir = new File(outerDirPath);
        outerDir.mkdirs();
        outerDir.deleteOnExit();

        File innerDir = new File(outerDirPath + sep + innerDirName);
        innerDir.mkdirs();
        innerDir.deleteOnExit();

        // the only file lives in the inner directory
        TestFileUtils.createTempFileInDirectory(innerDir.getAbsolutePath(), fileSize);

        this.format.setFilePath(new Path(outerDir.toURI().toString()));
        this.config.setBoolean("recursive.file.enumeration", true);
        format.configure(this.config);

        BaseStatistics gathered = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", fileSize, gathered.getTotalInputSize());
    } catch (Exception failure) {
        failure.printStackTrace();
        Assert.fail(failure.getMessage());
    }
}
Also used : Path(org.apache.flink.core.fs.Path) File(java.io.File) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Aggregations

BaseStatistics (org.apache.flink.api.common.io.statistics.BaseStatistics)14 Test (org.junit.Test)13 Configuration (org.apache.flink.configuration.Configuration)11 IOException (java.io.IOException)8 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)6 File (java.io.File)3 Path (org.apache.flink.core.fs.Path)2