Search in sources :

Example 6 with CatalogColumnStatisticsDataString

use of org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString in project flink by apache.

the class CatalogStatisticsTest method createColumnStats.

private CatalogColumnStatistics createColumnStats() {
    CatalogColumnStatisticsDataBoolean booleanColStats = new CatalogColumnStatisticsDataBoolean(55L, 45L, 5L);
    CatalogColumnStatisticsDataLong longColStats = new CatalogColumnStatisticsDataLong(-123L, 763322L, 23L, 77L);
    CatalogColumnStatisticsDataString stringColStats = new CatalogColumnStatisticsDataString(152L, 43.5D, 20L, 0L);
    CatalogColumnStatisticsDataDate dateColStats = new CatalogColumnStatisticsDataDate(new Date(71L), new Date(17923L), 100L, 0L);
    CatalogColumnStatisticsDataDouble doubleColStats = new CatalogColumnStatisticsDataDouble(-123.35D, 7633.22D, 73L, 27L);
    Map<String, CatalogColumnStatisticsDataBase> colStatsMap = new HashMap<>(6);
    colStatsMap.put("b1", booleanColStats);
    colStatsMap.put("l2", longColStats);
    colStatsMap.put("s3", stringColStats);
    colStatsMap.put("d4", dateColStats);
    colStatsMap.put("dd5", doubleColStats);
    return new CatalogColumnStatistics(colStatsMap);
}
Also used : CatalogColumnStatisticsDataDate(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble) CatalogColumnStatisticsDataBase(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CatalogColumnStatisticsDataBoolean(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) Date(org.apache.flink.table.catalog.stats.Date) CatalogColumnStatisticsDataDate(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate) CatalogColumnStatistics(org.apache.flink.table.catalog.stats.CatalogColumnStatistics)

Example 7 with CatalogColumnStatisticsDataString

use of org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString in project flink by apache.

the class HiveStatsUtil method createTableColumnStats.

/**
 * Create Flink ColumnStats from Hive ColumnStatisticsData.
 */
private static CatalogColumnStatisticsDataBase createTableColumnStats(DataType colType, ColumnStatisticsData stats, String hiveVersion) {
    HiveShim hiveShim = HiveShimLoader.loadHiveShim(hiveVersion);
    if (stats.isSetBinaryStats()) {
        BinaryColumnStatsData binaryStats = stats.getBinaryStats();
        return new CatalogColumnStatisticsDataBinary(binaryStats.isSetMaxColLen() ? binaryStats.getMaxColLen() : null, binaryStats.isSetAvgColLen() ? binaryStats.getAvgColLen() : null, binaryStats.isSetNumNulls() ? binaryStats.getNumNulls() : null);
    } else if (stats.isSetBooleanStats()) {
        BooleanColumnStatsData booleanStats = stats.getBooleanStats();
        return new CatalogColumnStatisticsDataBoolean(booleanStats.isSetNumTrues() ? booleanStats.getNumTrues() : null, booleanStats.isSetNumFalses() ? booleanStats.getNumFalses() : null, booleanStats.isSetNumNulls() ? booleanStats.getNumNulls() : null);
    } else if (hiveShim.isDateStats(stats)) {
        return hiveShim.toFlinkDateColStats(stats);
    } else if (stats.isSetDoubleStats()) {
        DoubleColumnStatsData doubleStats = stats.getDoubleStats();
        return new CatalogColumnStatisticsDataDouble(doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null, doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null, doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null, doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null);
    } else if (stats.isSetLongStats()) {
        LongColumnStatsData longColStats = stats.getLongStats();
        return new CatalogColumnStatisticsDataLong(longColStats.isSetLowValue() ? longColStats.getLowValue() : null, longColStats.isSetHighValue() ? longColStats.getHighValue() : null, longColStats.isSetNumDVs() ? longColStats.getNumDVs() : null, longColStats.isSetNumNulls() ? longColStats.getNumNulls() : null);
    } else if (stats.isSetStringStats()) {
        StringColumnStatsData stringStats = stats.getStringStats();
        return new CatalogColumnStatisticsDataString(stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null, stringStats.isSetAvgColLen() ? stringStats.getAvgColLen() : null, stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null, stringStats.isSetNumDVs() ? stringStats.getNumNulls() : null);
    } else if (stats.isSetDecimalStats()) {
        DecimalColumnStatsData decimalStats = stats.getDecimalStats();
        // for now, just return CatalogColumnStatisticsDataDouble for decimal columns
        Double max = null;
        if (decimalStats.isSetHighValue()) {
            max = toHiveDecimal(decimalStats.getHighValue()).doubleValue();
        }
        Double min = null;
        if (decimalStats.isSetLowValue()) {
            min = toHiveDecimal(decimalStats.getLowValue()).doubleValue();
        }
        Long ndv = decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null;
        Long nullCount = decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null;
        return new CatalogColumnStatisticsDataDouble(min, max, ndv, nullCount);
    } else {
        LOG.warn("Flink does not support converting ColumnStatisticsData '{}' for Hive column type '{}' yet.", stats, colType);
        return null;
    }
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) CatalogColumnStatisticsDataBinary(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataBoolean(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim)

Example 8 with CatalogColumnStatisticsDataString

use of org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString in project flink by apache.

the class GenericInMemoryCatalogTest method createColumnStats.

private CatalogColumnStatistics createColumnStats() {
    CatalogColumnStatisticsDataBoolean booleanColStats = new CatalogColumnStatisticsDataBoolean(55L, 45L, 5L);
    CatalogColumnStatisticsDataLong longColStats = new CatalogColumnStatisticsDataLong(-123L, 763322L, 23L, 79L);
    CatalogColumnStatisticsDataString stringColStats = new CatalogColumnStatisticsDataString(152L, 43.5D, 20L, 0L);
    CatalogColumnStatisticsDataDate dateColStats = new CatalogColumnStatisticsDataDate(new Date(71L), new Date(17923L), 1321L, 0L);
    CatalogColumnStatisticsDataDouble doubleColStats = new CatalogColumnStatisticsDataDouble(-123.35D, 7633.22D, 23L, 79L);
    CatalogColumnStatisticsDataBinary binaryColStats = new CatalogColumnStatisticsDataBinary(755L, 43.5D, 20L);
    Map<String, CatalogColumnStatisticsDataBase> colStatsMap = new HashMap<>(6);
    colStatsMap.put("b1", booleanColStats);
    colStatsMap.put("l2", longColStats);
    colStatsMap.put("s3", stringColStats);
    colStatsMap.put("d4", dateColStats);
    colStatsMap.put("dd5", doubleColStats);
    colStatsMap.put("bb6", binaryColStats);
    return new CatalogColumnStatistics(colStatsMap);
}
Also used : CatalogColumnStatisticsDataDate(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble) CatalogColumnStatisticsDataBase(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase) HashMap(java.util.HashMap) CatalogColumnStatisticsDataBoolean(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) CatalogColumnStatisticsDataBinary(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) Date(org.apache.flink.table.catalog.stats.Date) CatalogColumnStatisticsDataDate(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate) CatalogColumnStatistics(org.apache.flink.table.catalog.stats.CatalogColumnStatistics)

Example 9 with CatalogColumnStatisticsDataString

use of org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString in project flink by apache.

the class CatalogTableStatisticsConverter method convertToColumnStats.

private static ColumnStats convertToColumnStats(CatalogColumnStatisticsDataBase columnStatisticsData) {
    Long ndv = null;
    Long nullCount = columnStatisticsData.getNullCount();
    Double avgLen = null;
    Integer maxLen = null;
    Comparable<?> max = null;
    Comparable<?> min = null;
    if (columnStatisticsData instanceof CatalogColumnStatisticsDataBoolean) {
        CatalogColumnStatisticsDataBoolean booleanData = (CatalogColumnStatisticsDataBoolean) columnStatisticsData;
        avgLen = 1.0;
        maxLen = 1;
        if (null == booleanData.getFalseCount() || null == booleanData.getTrueCount()) {
            ndv = 2L;
        } else if ((booleanData.getFalseCount() == 0 && booleanData.getTrueCount() > 0) || (booleanData.getFalseCount() > 0 && booleanData.getTrueCount() == 0)) {
            ndv = 1L;
        } else {
            ndv = 2L;
        }
    } else if (columnStatisticsData instanceof CatalogColumnStatisticsDataLong) {
        CatalogColumnStatisticsDataLong longData = (CatalogColumnStatisticsDataLong) columnStatisticsData;
        ndv = longData.getNdv();
        avgLen = 8.0;
        maxLen = 8;
        max = longData.getMax();
        min = longData.getMin();
    } else if (columnStatisticsData instanceof CatalogColumnStatisticsDataDouble) {
        CatalogColumnStatisticsDataDouble doubleData = (CatalogColumnStatisticsDataDouble) columnStatisticsData;
        ndv = doubleData.getNdv();
        avgLen = 8.0;
        maxLen = 8;
        max = doubleData.getMax();
        min = doubleData.getMin();
    } else if (columnStatisticsData instanceof CatalogColumnStatisticsDataString) {
        CatalogColumnStatisticsDataString strData = (CatalogColumnStatisticsDataString) columnStatisticsData;
        ndv = strData.getNdv();
        avgLen = strData.getAvgLength();
        maxLen = null == strData.getMaxLength() ? null : strData.getMaxLength().intValue();
    } else if (columnStatisticsData instanceof CatalogColumnStatisticsDataBinary) {
        CatalogColumnStatisticsDataBinary binaryData = (CatalogColumnStatisticsDataBinary) columnStatisticsData;
        avgLen = binaryData.getAvgLength();
        maxLen = null == binaryData.getMaxLength() ? null : binaryData.getMaxLength().intValue();
    } else if (columnStatisticsData instanceof CatalogColumnStatisticsDataDate) {
        CatalogColumnStatisticsDataDate dateData = (CatalogColumnStatisticsDataDate) columnStatisticsData;
        ndv = dateData.getNdv();
        if (dateData.getMax() != null) {
            max = Date.valueOf(DateTimeUtils.unixDateToString((int) dateData.getMax().getDaysSinceEpoch()));
        }
        if (dateData.getMin() != null) {
            min = Date.valueOf(DateTimeUtils.unixDateToString((int) dateData.getMin().getDaysSinceEpoch()));
        }
    } else {
        throw new TableException("Unsupported CatalogColumnStatisticsDataBase: " + columnStatisticsData.getClass().getCanonicalName());
    }
    return ColumnStats.Builder.builder().setNdv(ndv).setNullCount(nullCount).setAvgLen(avgLen).setMaxLen(maxLen).setMax(max).setMin(min).build();
}
Also used : TableException(org.apache.flink.table.api.TableException) CatalogColumnStatisticsDataDate(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble) CatalogColumnStatisticsDataLong(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong) CatalogColumnStatisticsDataBoolean(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) CatalogColumnStatisticsDataBinary(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary) CatalogColumnStatisticsDataDouble(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble)

Example 10 with CatalogColumnStatisticsDataString

use of org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString in project flink by apache.

the class CatalogTableStatisticsConverterTest method testConvertToColumnStatsMapWithNullColumnStatisticsData.

@Test
public void testConvertToColumnStatsMapWithNullColumnStatisticsData() {
    Map<String, CatalogColumnStatisticsDataBase> columnStatisticsDataBaseMap = new HashMap<>();
    columnStatisticsDataBaseMap.put("first", new CatalogColumnStatisticsDataString(10L, 5.2, 3L, 100L));
    columnStatisticsDataBaseMap.put("second", null);
    Map<String, ColumnStats> columnStatsMap = CatalogTableStatisticsConverter.convertToColumnStatsMap(columnStatisticsDataBaseMap);
    assertNotNull(columnStatsMap);
    assertEquals(columnStatisticsDataBaseMap.size() - 1, columnStatsMap.size());
    assertTrue(columnStatsMap.containsKey("first"));
    assertFalse(columnStatsMap.containsKey("second"));
}
Also used : CatalogColumnStatisticsDataBase(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase) HashMap(java.util.HashMap) ColumnStats(org.apache.flink.table.plan.stats.ColumnStats) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) CatalogColumnStatisticsDataString(org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString) Test(org.junit.Test)

Aggregations

CatalogColumnStatisticsDataString (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString)10 CatalogColumnStatisticsDataLong (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong)8 HashMap (java.util.HashMap)7 CatalogColumnStatisticsDataBase (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase)7 CatalogColumnStatisticsDataDouble (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble)7 CatalogColumnStatistics (org.apache.flink.table.catalog.stats.CatalogColumnStatistics)6 CatalogColumnStatisticsDataBoolean (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean)6 CatalogColumnStatisticsDataDate (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate)6 CatalogColumnStatisticsDataBinary (org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary)5 Date (org.apache.flink.table.catalog.stats.Date)3 Test (org.junit.Test)3 LinkedHashMap (java.util.LinkedHashMap)2 CatalogPartitionSpec (org.apache.flink.table.catalog.CatalogPartitionSpec)2 CatalogTable (org.apache.flink.table.catalog.CatalogTable)2 HiveShim (org.apache.flink.table.catalog.hive.client.HiveShim)2 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)2 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)2 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)2 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)2 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)2