Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.
Class DelimitedInputFormatSamplingTest, method testSamplingOneFile.
/**
 * Checks the statistics that the delimited input format reports for a single file.
 *
 * <p>A temp file is created from {@code TEST_DATA1}, statistics are requested without a cached
 * value ({@code null}), and the reported record count and average record width must each lie
 * strictly within one unit of the values computed from the test data.
 */
@Test
public void testSamplingOneFile() {
    try {
        final String tempFile = TestFileUtils.createTempFile(TEST_DATA1);
        final Configuration conf = new Configuration();

        final TestDelimitedInputFormat format = new TestDelimitedInputFormat(CONFIG);
        format.setFilePath(tempFile);
        format.configure(conf);

        // no previously cached statistics are supplied
        BaseStatistics stats = format.getStatistics(null);

        // expected values derived directly from the test data
        final int numLines = TEST_DATA_1_LINES;
        final float avgWidth = ((float) TEST_DATA1.length()) / TEST_DATA_1_LINES;

        // logical && instead of the non-short-circuit & on boolean operands
        Assert.assertTrue(
                "Wrong record count.",
                stats.getNumberOfRecords() < numLines + 1
                        && stats.getNumberOfRecords() > numLines - 1);
        Assert.assertTrue(
                "Wrong avg record size.",
                stats.getAverageRecordWidth() < avgWidth + 1
                        && stats.getAverageRecordWidth() > avgWidth - 1);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.
Class EnumerateNestedFilesTest, method testGetStatisticsMultipleNestedFiles.
/**
 * Verifies the total input size reported for files spread over a nested directory tree.
 *
 * <p>One file is placed in the first-level directory, two in one second-level directory and one
 * in another. With {@code recursive.file.enumeration} enabled, the reported total must equal the
 * sum of all four sizes. A further file is then added and the statistics are requested again
 * with the earlier result passed in; the new result must differ from the earlier one and include
 * the added bytes.
 */
@Test
public void testGetStatisticsMultipleNestedFiles() {
    try {
        final long firstLevelFileSize = 2077;
        final long nestedFileSizeA = 31909;
        final long nestedFileSizeB = 10;
        final long otherNestedFileSize = 71;
        final long expectedTotal =
                firstLevelFileSize + nestedFileSizeA + nestedFileSizeB + otherNestedFileSize;
        final long addedFileSize = 42L;

        final String separator = System.getProperty("file.separator");

        String firstLevelDir = TestFileUtils.randomFileName();
        String secondLevelDir = TestFileUtils.randomFileName();
        String secondLevelDir2 = TestFileUtils.randomFileName();

        // first-level directory
        final String firstLevelPath = tempPath + separator + firstLevelDir;
        File nestedDir = new File(firstLevelPath);
        nestedDir.mkdirs();
        nestedDir.deleteOnExit();

        // two sibling second-level directories
        File insideNestedDir = new File(firstLevelPath + separator + secondLevelDir);
        insideNestedDir.mkdirs();
        insideNestedDir.deleteOnExit();

        File insideNestedDir2 = new File(firstLevelPath + separator + secondLevelDir2);
        insideNestedDir2.mkdirs();
        insideNestedDir2.deleteOnExit();

        // one file on the first level, two in the first nested dir, one in the second nested dir
        final String firstNestedPath = insideNestedDir.getAbsolutePath();
        TestFileUtils.createTempFileInDirectory(nestedDir.getAbsolutePath(), firstLevelFileSize);
        TestFileUtils.createTempFileInDirectory(firstNestedPath, nestedFileSizeA);
        TestFileUtils.createTempFileInDirectory(firstNestedPath, nestedFileSizeB);
        TestFileUtils.createTempFileInDirectory(
                insideNestedDir2.getAbsolutePath(), otherNestedFileSize);

        final String rootUri = nestedDir.toURI().toString();
        format.setFilePath(new Path(rootUri));
        config.setBoolean("recursive.file.enumeration", true);
        format.configure(config);

        BaseStatistics initialStats = format.getStatistics(null);
        Assert.assertEquals(
                "The file size from the statistics is wrong.",
                expectedTotal,
                initialStats.getTotalInputSize());

        // Invalidate the cached statistics and check again. File modification times have
        // low resolution, so wait before adding the extra file.
        Thread.sleep(1000);
        TestFileUtils.createTempFileInDirectory(insideNestedDir.getAbsolutePath(), addedFileSize);

        BaseStatistics refreshedStats = format.getStatistics(initialStats);
        Assert.assertNotEquals(refreshedStats, initialStats);
        Assert.assertEquals(
                "The file size from the statistics is wrong.",
                expectedTotal + addedFileSize,
                refreshedStats.getTotalInputSize());
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.
Class FileInputFormatTest, method testGetStatisticsMultipleFilesWithCachedVersion.
/**
 * Exercises how {@code getStatistics} treats a previously cached statistics object for a
 * directory of three files.
 *
 * <p>Four cases are checked in sequence, each with a freshly configured format:
 *
 * <ol>
 *   <li>no cached statistics: the total size equals the sum of the three file sizes;
 *   <li>the statistics from step 1 are passed back in: the very same object is returned;
 *   <li>fake statistics carrying the same modification time but a fake size: the fake size is
 *       reported;
 *   <li>fake statistics whose modification time is one less than the real one: the real total
 *       size is reported again.
 * </ol>
 */
@Test
public void testGetStatisticsMultipleFilesWithCachedVersion() {
    try {
        final long SIZE1 = 2077;
        final long SIZE2 = 31909;
        final long SIZE3 = 10;
        final long TOTAL = SIZE1 + SIZE2 + SIZE3;
        final long FAKE_SIZE = 10065;

        String tempDir = TestFileUtils.createTempFileDir(SIZE1, SIZE2, SIZE3);

        // gather statistics from scratch
        DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics stats = format.getStatistics(null);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, stats.getTotalInputSize());

        // pass the gathered statistics back in: the identical instance must be returned
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics newStats = format.getStatistics(stats);
        Assert.assertSame("Statistics object was changed", stats, newStats);

        // insert fake stats with the correct modification time. the call should return the fake stats
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics latest = format.getStatistics(fakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());

        // insert fake stats with an outdated modification time. the call should discard them
        // and report the real total size again
        format = new DummyFileInputFormat();
        format.setFilePath(tempDir);
        format.configure(new Configuration());
        FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
        BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
        Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, reGathered.getTotalInputSize());
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.
Class FileInputFormatTest, method testGetStatsIgnoredUnderscoreFiles.
/**
 * Verifies that files whose names start with an underscore do not count towards the total
 * input size.
 *
 * <p>Four files of equal size are created in the temporary folder, two of them with a leading
 * underscore in the name. The reported total must equal the size of exactly two files.
 */
@Test
public void testGetStatsIgnoredUnderscoreFiles() {
    try {
        final int fileSize = 2048;
        final long expectedTotal = 2 * fileSize;

        // two regular data files ...
        File dataFile = temporaryFolder.newFile("dataFile1.txt");
        File binaryFile = temporaryFolder.newFile("another_file.bin");
        // ... and two files with a leading underscore
        File luigiMarker = temporaryFolder.newFile("_luigi");
        File successMarker = temporaryFolder.newFile("_SUCCESS");
        createTempFiles(new byte[fileSize], dataFile, binaryFile, luigiMarker, successMarker);

        final String rootUri = temporaryFolder.getRoot().toURI().toString();
        final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
        inputFormat.setFilePath(rootUri);
        inputFormat.configure(new Configuration());

        // only the two regular files may contribute to the total
        BaseStatistics statistics = inputFormat.getStatistics(null);
        Assert.assertEquals(expectedTotal, statistics.getTotalInputSize());
    } catch (Exception ex) {
        System.err.println(ex.getMessage());
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.
Class EnumerateNestedFilesTest, method testGetStatisticsOneFileInNestedDir.
/**
 * Verifies the total input size reported when the only file sits in a second-level directory.
 *
 * <p>The format is pointed at the first-level directory with
 * {@code recursive.file.enumeration} enabled; the reported total must equal the size of the
 * single nested file.
 */
@Test
public void testGetStatisticsOneFileInNestedDir() {
    try {
        final long nestedFileSize = 1024 * 500;
        final String separator = System.getProperty("file.separator");

        String firstLevelDir = TestFileUtils.randomFileName();
        String secondLevelDir = TestFileUtils.randomFileName();

        // first-level directory
        final String firstLevelPath = tempPath + separator + firstLevelDir;
        File nestedDir = new File(firstLevelPath);
        nestedDir.mkdirs();
        nestedDir.deleteOnExit();

        // second-level directory that receives the only file
        File insideNestedDir = new File(firstLevelPath + separator + secondLevelDir);
        insideNestedDir.mkdirs();
        insideNestedDir.deleteOnExit();

        TestFileUtils.createTempFileInDirectory(insideNestedDir.getAbsolutePath(), nestedFileSize);

        final String rootUri = nestedDir.toURI().toString();
        format.setFilePath(new Path(rootUri));
        config.setBoolean("recursive.file.enumeration", true);
        format.configure(config);

        BaseStatistics statistics = format.getStatistics(null);
        Assert.assertEquals(
                "The file size from the statistics is wrong.",
                nestedFileSize,
                statistics.getTotalInputSize());
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Aggregations