Search in sources :

Example 11 with BaseStatistics

Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.

Class FileInputFormatTest, method testGetStatisticsNonExistingFile.

// ------------------------------------------------------------------------
//  Statistics
// ------------------------------------------------------------------------
/**
 * Requests statistics from a format whose file path points at a location that is
 * expected not to exist, and asserts that the returned statistics are {@code null}.
 *
 * <p>Any exception raised while configuring the format or fetching the statistics
 * fails the test.
 */
@Test
public void testGetStatisticsNonExistingFile() {
    try {
        final DummyFileInputFormat format = new DummyFileInputFormat();
        format.setFilePath("file:///some/none/existing/directory/");
        format.configure(new Configuration());
        BaseStatistics stats = format.getStatistics(null);
        Assert.assertNull("The file statistics should be null.", stats);
    } catch (Exception ex) {
        // Keep the original exception as the cause so the test report shows the full
        // stack trace; failing with only ex.getMessage() loses it and may even be null.
        throw new AssertionError("Unexpected exception while obtaining statistics: " + ex, ex);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) IOException(java.io.IOException) Test(org.junit.Test)

Example 12 with BaseStatistics

Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.

Class DelimitedInputFormatSamplingTest, method testSamplingDirectory.

/**
 * Obtains statistics for a temporary directory created from the two test data sets and
 * checks that the reported record count and average record width lie within the bounds
 * derived from the total size and the two line widths.
 *
 * <p>Any exception raised along the way fails the test.
 */
@Test
public void testSamplingDirectory() {
    try {
        final String tempFile = TestFileUtils.createTempFileDir(TEST_DATA1, TEST_DATA2);
        final Configuration conf = new Configuration();
        final TestDelimitedInputFormat format = new TestDelimitedInputFormat(CONFIG);
        format.setFilePath(tempFile);
        format.configure(conf);
        final BaseStatistics stats = format.getStatistics(null);
        // getStatistics may legitimately return null; report that clearly instead of
        // letting the dereference below fail with a message-less NullPointerException.
        Assert.assertNotNull("The statistics should not be null.", stats);

        // The narrowest line width yields the most lines, the widest the fewest.
        final int maxNumLines = (int) Math.ceil(TOTAL_SIZE / ((double) Math.min(TEST_DATA_1_LINEWIDTH, TEST_DATA_2_LINEWIDTH)));
        final int minNumLines = (int) (TOTAL_SIZE / ((double) Math.max(TEST_DATA_1_LINEWIDTH, TEST_DATA_2_LINEWIDTH)));
        final float maxAvgWidth = ((float) (TOTAL_SIZE)) / minNumLines;
        final float minAvgWidth = ((float) (TOTAL_SIZE)) / maxNumLines;

        final long numRecords = stats.getNumberOfRecords();
        Assert.assertTrue(
                "Wrong record count. Records: " + numRecords + " out of (" + minNumLines + ", " + maxNumLines + ").",
                numRecords >= minNumLines && numRecords <= maxNumLines);

        Assert.assertTrue(
                "Wrong avg record size. Width: " + stats.getAverageRecordWidth() + " out of (" + minAvgWidth + ", " + maxAvgWidth + ").",
                stats.getAverageRecordWidth() >= minAvgWidth && stats.getAverageRecordWidth() <= maxAvgWidth);
    } catch (Exception e) {
        // Keep the original exception as the cause so the test report shows the full stack trace.
        throw new AssertionError("Unexpected exception while sampling statistics: " + e, e);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) Test(org.junit.Test)

Example 13 with BaseStatistics

Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.

Class SequentialFormatTestBase, method checkStatistics.

/**
 * Creates the input format under test and asserts that the number of records reported
 * by its statistics equals {@code numberOfTuples}.
 */
@Test
public void checkStatistics() {
    BinaryInputFormat<T> formatUnderTest = createInputFormat();
    // no cached statistics are supplied, hence the null argument
    BaseStatistics reported = formatUnderTest.getStatistics(null);
    Assert.assertEquals(this.numberOfTuples, reported.getNumberOfRecords());
}
Also used : BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics) Test(org.junit.Test)

Example 14 with BaseStatistics

Use of org.apache.flink.api.common.io.statistics.BaseStatistics in the Apache Flink project.

Class DataSourceNode, method computeOperatorSpecificDefaultEstimates.

/**
 * Fills in the estimated output size and record count of this node from the statistics
 * reported by the operator's input format.
 *
 * <p>The method does nothing when no statistics object is given or when the input format
 * cannot be obtained and configured. Failures while asking the format for its statistics
 * are logged as warnings and leave the estimates untouched.
 *
 * @param statistics the statistics object used to look up cached base statistics; may be
 *                   {@code null}, in which case no estimates are computed
 */
@Override
protected void computeOperatorSpecificDefaultEstimates(DataStatistics statistics) {
    // without a statistics object there is nothing to consult
    if (statistics == null) {
        return;
    }

    // the input format has to be instantiated and configured before it can be asked for statistics
    InputFormat<?, ?> inputFormat;
    try {
        inputFormat = getOperator().getFormatWrapper().getUserCodeObject();
        Configuration parameters = getOperator().getParameters();
        inputFormat.configure(parameters);
    } catch (Throwable t) {
        if (Optimizer.LOG.isWarnEnabled()) {
            Optimizer.LOG.warn("Could not instantiate InputFormat to obtain statistics." + " Limited statistics will be available.", t);
        }
        return;
    }

    // the description is only used in a log message, so a failing toString() is tolerated
    String description = "<unknown>";
    try {
        description = inputFormat.toString();
    } catch (Throwable t) {
        // deliberately ignored: only the cosmetic description string is affected
    }

    // hand the cached statistics (if any) to the format and let it produce its statistics
    final String cacheKey = getOperator().getStatisticsKey();
    final BaseStatistics cached = statistics.getBaseStatistics(cacheKey);
    BaseStatistics formatStatistics = null;
    try {
        formatStatistics = inputFormat.getStatistics(cached);
    } catch (Throwable t) {
        if (Optimizer.LOG.isWarnEnabled()) {
            Optimizer.LOG.warn("Error obtaining statistics from input format: " + t.getMessage(), t);
        }
    }

    if (formatStatistics == null) {
        return;
    }

    // take over the total input size unless it is unknown or negative
    final long totalSize = formatStatistics.getTotalInputSize();
    if (totalSize == BaseStatistics.SIZE_UNKNOWN) {
        if (Optimizer.LOG.isInfoEnabled()) {
            Optimizer.LOG.info("Compiler could not determine the size of input '" + description + "'. Using default estimates.");
        }
    } else if (totalSize >= 0) {
        this.estimatedOutputSize = totalSize;
    }

    // take over the record count unless it is unknown
    final long recordCount = formatStatistics.getNumberOfRecords();
    if (recordCount != BaseStatistics.NUM_RECORDS_UNKNOWN) {
        this.estimatedNumRecords = recordCount;
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) BaseStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics)

Aggregations

BaseStatistics (org.apache.flink.api.common.io.statistics.BaseStatistics)14 Test (org.junit.Test)13 Configuration (org.apache.flink.configuration.Configuration)11 IOException (java.io.IOException)8 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)6 File (java.io.File)3 Path (org.apache.flink.core.fs.Path)2