Search in sources :

Example 6 with ColumnStatisticsInfo

use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.

the class GlueUtils method toProto.

/**
 * Builds the Alluxio protobuf form of a Glue column statistic.
 *
 * <p>A {@code null} argument yields an empty {@code ColumnStatisticsInfo}. Otherwise the column
 * name and column type are copied, and a statistics payload is attached only when the Glue
 * statistics data reports one of the types BOOLEAN, DATE, DECIMAL, DOUBLE, LONG, STRING or
 * BINARY and the matching typed statistics object is present.
 *
 * @param glueColumnStatistic glue column statistic info
 * @return Alluxio ColumnStatisticsInfo
 */
public static ColumnStatisticsInfo toProto(ColumnStatistics glueColumnStatistic) {
    ColumnStatisticsInfo.Builder result = ColumnStatisticsInfo.newBuilder();
    if (glueColumnStatistic == null) {
        return result.build();
    }
    result.setColName(glueColumnStatistic.getColumnName());
    result.setColType(glueColumnStatistic.getColumnType());

    com.amazonaws.services.glue.model.ColumnStatisticsData glueData = glueColumnStatistic.getStatisticsData();
    String statsType = glueData == null ? null : glueData.getType();
    if (statsType == null) {
        // Without statistics data, or without a declared type, only name and type are reported.
        return result.build();
    }

    switch (statsType) {
        case "BOOLEAN": {
            com.amazonaws.services.glue.model.BooleanColumnStatisticsData stats = glueData.getBooleanColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setBooleanStats(toProto(stats)).build());
            }
            break;
        }
        case "DATE": {
            com.amazonaws.services.glue.model.DateColumnStatisticsData stats = glueData.getDateColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setDateStats(toProto(stats)).build());
            }
            break;
        }
        case "DECIMAL": {
            com.amazonaws.services.glue.model.DecimalColumnStatisticsData stats = glueData.getDecimalColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setDecimalStats(toProto(stats)).build());
            }
            break;
        }
        case "DOUBLE": {
            com.amazonaws.services.glue.model.DoubleColumnStatisticsData stats = glueData.getDoubleColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setDoubleStats(toProto(stats)).build());
            }
            break;
        }
        case "LONG": {
            com.amazonaws.services.glue.model.LongColumnStatisticsData stats = glueData.getLongColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setLongStats(toProto(stats)).build());
            }
            break;
        }
        case "STRING": {
            com.amazonaws.services.glue.model.StringColumnStatisticsData stats = glueData.getStringColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setStringStats(toProto(stats)).build());
            }
            break;
        }
        case "BINARY": {
            com.amazonaws.services.glue.model.BinaryColumnStatisticsData stats = glueData.getBinaryColumnStatisticsData();
            if (stats != null) {
                result.setData(ColumnStatisticsData.newBuilder().setBinaryStats(toProto(stats)).build());
            }
            break;
        }
        default:
            // Any other type string leaves the statistics payload unset.
            break;
    }
    return result.build();
}
Also used : ByteString(com.google.protobuf.ByteString) ColumnStatisticsInfo(alluxio.grpc.table.ColumnStatisticsInfo)

Example 7 with ColumnStatisticsInfo

use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.

the class HiveUtilsTest method verifyColumnStats.

/**
 * Converts the given Hive column statistics with {@code HiveUtils.toProto} and asserts that the
 * converted object mirrors the Hive source, field by field.
 *
 * @param hiveColStats the Hive column statistics to convert and check
 */
private void verifyColumnStats(ColumnStatisticsObj hiveColStats) {
    ColumnStatisticsInfo converted = HiveUtils.toProto(hiveColStats);
    assertEquals(hiveColStats.getColName(), converted.getColName());
    assertEquals(hiveColStats.getColType(), converted.getColType());
    assertEquals(hiveColStats.isSetStatsData(), converted.hasData());
    if (!hiveColStats.isSetStatsData()) {
        return;
    }
    ColumnStatisticsData expected = hiveColStats.getStatsData();
    alluxio.grpc.table.ColumnStatisticsData actual = converted.getData();
    assertBinaryStatsMatch(expected, actual);
    assertBooleanStatsMatch(expected, actual);
    assertDateStatsMatch(expected, actual);
    assertDecimalStatsMatch(expected, actual);
    assertDoubleStatsMatch(expected, actual);
    assertLongStatsMatch(expected, actual);
    assertStringStatsMatch(expected, actual);
}

/** Checks presence and content of the binary statistics. */
private static void assertBinaryStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetBinaryStats(), actual.hasBinaryStats());
    if (!expected.isSetBinaryStats()) {
        return;
    }
    BinaryColumnStatsData hiveStats = expected.getBinaryStats();
    alluxio.grpc.table.BinaryColumnStatsData protoStats = actual.getBinaryStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getAvgColLen(), protoStats.getAvgColLen(), 0.01);
    assertEquals(hiveStats.getMaxColLen(), protoStats.getMaxColLen());
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
}

/** Checks presence and content of the boolean statistics. */
private static void assertBooleanStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetBooleanStats(), actual.hasBooleanStats());
    if (!expected.isSetBooleanStats()) {
        return;
    }
    BooleanColumnStatsData hiveStats = expected.getBooleanStats();
    alluxio.grpc.table.BooleanColumnStatsData protoStats = actual.getBooleanStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getNumFalses(), protoStats.getNumFalses());
    assertEquals(hiveStats.getNumTrues(), protoStats.getNumTrues());
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
}

/** Checks presence and content of the date statistics. */
private static void assertDateStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetDateStats(), actual.hasDateStats());
    if (!expected.isSetDateStats()) {
        return;
    }
    DateColumnStatsData hiveStats = expected.getDateStats();
    alluxio.grpc.table.DateColumnStatsData protoStats = actual.getDateStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
    assertEquals(hiveStats.getNumDVs(), protoStats.getNumDistincts());
    assertEquals(hiveStats.isSetHighValue(), protoStats.hasHighValue());
    if (hiveStats.isSetHighValue()) {
        assertEquals(hiveStats.getHighValue().getDaysSinceEpoch(), protoStats.getHighValue().getDaysSinceEpoch());
    }
    assertEquals(hiveStats.isSetLowValue(), protoStats.hasLowValue());
    if (hiveStats.isSetLowValue()) {
        assertEquals(hiveStats.getLowValue().getDaysSinceEpoch(), protoStats.getLowValue().getDaysSinceEpoch());
    }
}

/** Checks presence and content of the decimal statistics. */
private static void assertDecimalStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetDecimalStats(), actual.hasDecimalStats());
    if (!expected.isSetDecimalStats()) {
        return;
    }
    DecimalColumnStatsData hiveStats = expected.getDecimalStats();
    alluxio.grpc.table.DecimalColumnStatsData protoStats = actual.getDecimalStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
    assertEquals(hiveStats.getNumDVs(), protoStats.getNumDistincts());
    assertEquals(hiveStats.isSetHighValue(), protoStats.hasHighValue());
    if (hiveStats.isSetHighValue()) {
        assertEquals(hiveStats.getHighValue().getScale(), protoStats.getHighValue().getScale());
        assertArrayEquals(hiveStats.getHighValue().getUnscaled(), protoStats.getHighValue().getUnscaled().toByteArray());
    }
    assertEquals(hiveStats.isSetLowValue(), protoStats.hasLowValue());
    if (hiveStats.isSetLowValue()) {
        assertEquals(hiveStats.getLowValue().getScale(), protoStats.getLowValue().getScale());
        assertArrayEquals(hiveStats.getLowValue().getUnscaled(), protoStats.getLowValue().getUnscaled().toByteArray());
    }
}

/** Checks presence and content of the double statistics. */
private static void assertDoubleStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetDoubleStats(), actual.hasDoubleStats());
    if (!expected.isSetDoubleStats()) {
        return;
    }
    DoubleColumnStatsData hiveStats = expected.getDoubleStats();
    alluxio.grpc.table.DoubleColumnStatsData protoStats = actual.getDoubleStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
    assertEquals(hiveStats.getNumDVs(), protoStats.getNumDistincts());
    assertEquals(hiveStats.isSetHighValue(), protoStats.hasHighValue());
    if (hiveStats.isSetHighValue()) {
        assertEquals(hiveStats.getHighValue(), protoStats.getHighValue(), 0.01);
    }
    assertEquals(hiveStats.isSetLowValue(), protoStats.hasLowValue());
    if (hiveStats.isSetLowValue()) {
        assertEquals(hiveStats.getLowValue(), protoStats.getLowValue(), 0.01);
    }
}

/** Checks presence and content of the long statistics. */
private static void assertLongStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetLongStats(), actual.hasLongStats());
    if (!expected.isSetLongStats()) {
        return;
    }
    LongColumnStatsData hiveStats = expected.getLongStats();
    alluxio.grpc.table.LongColumnStatsData protoStats = actual.getLongStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
    assertEquals(hiveStats.getNumDVs(), protoStats.getNumDistincts());
    assertEquals(hiveStats.isSetHighValue(), protoStats.hasHighValue());
    if (hiveStats.isSetHighValue()) {
        assertEquals(hiveStats.getHighValue(), protoStats.getHighValue());
    }
    assertEquals(hiveStats.isSetLowValue(), protoStats.hasLowValue());
    if (hiveStats.isSetLowValue()) {
        assertEquals(hiveStats.getLowValue(), protoStats.getLowValue());
    }
}

/** Checks presence and content of the string statistics. */
private static void assertStringStatsMatch(ColumnStatisticsData expected, alluxio.grpc.table.ColumnStatisticsData actual) {
    assertEquals(expected.isSetStringStats(), actual.hasStringStats());
    if (!expected.isSetStringStats()) {
        return;
    }
    StringColumnStatsData hiveStats = expected.getStringStats();
    alluxio.grpc.table.StringColumnStatsData protoStats = actual.getStringStats();
    assertEquals(hiveStats.isSetBitVectors(), protoStats.hasBitVectors());
    if (hiveStats.isSetBitVectors()) {
        assertEquals(hiveStats.getBitVectors(), protoStats.getBitVectors());
    }
    assertEquals(hiveStats.getAvgColLen(), protoStats.getAvgColLen(), 0.01);
    assertEquals(hiveStats.getMaxColLen(), protoStats.getMaxColLen());
    assertEquals(hiveStats.getNumNulls(), protoStats.getNumNulls());
    assertEquals(hiveStats.getNumDVs(), protoStats.getNumDistincts());
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsInfo(alluxio.grpc.table.ColumnStatisticsInfo) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 8 with ColumnStatisticsInfo

use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.

the class HiveDatabase method getTable.

/**
 * Loads a table from the Hive metastore and wraps it as a {@code HiveTable}.
 *
 * <p>The table, its partitions, the table-level column statistics and the per-partition column
 * statistics are all fetched while a pooled metastore client is held; the client is released
 * before paths are mounted and the layout and partition objects are assembled.
 *
 * @param tableName the name of the table to load
 * @param bypassSpec passed through to {@code mountAlluxioPaths}
 * @return the table together with its column statistics, partitions and layout
 * @throws IOException if the metastore call fails with a {@code TException}; a
 *         {@code NoSuchObjectException} is reported as a {@code NotFoundException}
 */
@Override
public UdbTable getTable(String tableName, UdbBypassSpec bypassSpec) throws IOException {
    try {
        Table table;
        List<Partition> partitions;
        List<ColumnStatisticsObj> columnStats;
        List<String> partitionColumns;
        Map<String, List<ColumnStatisticsInfo>> statsMap = new HashMap<>();
        // Do every metastore round trip inside this scope so the client goes back to the pool early.
        try (CloseableResource<IMetaStoreClient> clientResource = mClientPool.acquireClientResource()) {
            IMetaStoreClient client = clientResource.get();
            table = client.getTable(mHiveDbName, tableName);
            // Potentially expensive call
            partitions = client.listPartitions(mHiveDbName, table.getTableName(), (short) -1);
            List<String> statsColumnNames = table.getSd().getCols().stream()
                .map(FieldSchema::getName)
                .collect(Collectors.toList());
            columnStats = client.getTableColumnStatistics(mHiveDbName, tableName, statsColumnNames);

            // Gather what is needed to request the per-partition statistics.
            List<String> dataColumns = table.getSd().getCols().stream()
                .map(FieldSchema::getName)
                .collect(Collectors.toList());
            partitionColumns = table.getPartitionKeys().stream()
                .map(FieldSchema::getName)
                .collect(Collectors.toList());
            List<String> partitionNames = partitions.stream()
                .map(p -> FileUtils.makePartName(partitionColumns, p.getValues()))
                .collect(Collectors.toList());

            // Request the statistics in batches of at most MAX_PARTITION_COLUMN_STATISTICS names.
            for (List<String> batch : Lists.partition(partitionNames, MAX_PARTITION_COLUMN_STATISTICS)) {
                Map<String, List<ColumnStatisticsObj>> batchStats =
                    client.getPartitionColumnStatistics(mHiveDbName, tableName, batch, dataColumns);
                statsMap.putAll(batchStats.entrySet().stream().collect(Collectors.toMap(
                    Map.Entry::getKey,
                    entry -> entry.getValue().stream().map(HiveUtils::toProto).collect(Collectors.toList()),
                    (first, second) -> second)));
            }
        }

        PathTranslator pathTranslator = mountAlluxioPaths(table, partitions, bypassSpec);
        List<ColumnStatisticsInfo> colStats = columnStats.stream()
            .map(HiveUtils::toProto)
            .collect(Collectors.toList());

        // Table-level layout.
        PartitionInfo tableInfo = PartitionInfo.newBuilder()
            .setDbName(getUdbContext().getDbName())
            .setTableName(tableName)
            .addAllDataCols(HiveUtils.toProto(table.getSd().getCols()))
            .setStorage(HiveUtils.toProto(table.getSd(), pathTranslator))
            .putAllParameters(table.getParameters())
            .build();
        Layout layout = Layout.newBuilder()
            .setLayoutType(HiveLayout.TYPE)
            .setLayoutData(tableInfo.toByteString())
            .build();

        // One UdbPartition per Hive partition, or a single synthetic one for unpartitioned tables.
        List<UdbPartition> udbPartitions = new ArrayList<>();
        if (partitionColumns.isEmpty()) {
            // The synthetic partition is named after the table and carries no column statistics.
            PartitionInfo.Builder infoBuilder = PartitionInfo.newBuilder()
                .setDbName(getUdbContext().getDbName())
                .setTableName(tableName)
                .addAllDataCols(HiveUtils.toProto(table.getSd().getCols()))
                .setStorage(HiveUtils.toProto(table.getSd(), pathTranslator))
                .setPartitionName(tableName)
                .putAllParameters(table.getParameters());
            udbPartitions.add(new HivePartition(new HiveLayout(infoBuilder.build(), Collections.emptyList())));
        } else {
            for (Partition hivePartition : partitions) {
                String partitionName = FileUtils.makePartName(partitionColumns, hivePartition.getValues());
                PartitionInfo.Builder infoBuilder = PartitionInfo.newBuilder()
                    .setDbName(getUdbContext().getDbName())
                    .setTableName(tableName)
                    .addAllDataCols(HiveUtils.toProto(hivePartition.getSd().getCols()))
                    .setStorage(HiveUtils.toProto(hivePartition.getSd(), pathTranslator))
                    .setPartitionName(partitionName)
                    .putAllParameters(hivePartition.getParameters());
                if (hivePartition.getValues() != null) {
                    infoBuilder.addAllValues(hivePartition.getValues());
                }
                // Partitions without fetched statistics get an empty statistics list.
                udbPartitions.add(new HivePartition(new HiveLayout(infoBuilder.build(), statsMap.getOrDefault(partitionName, Collections.emptyList()))));
            }
        }

        return new HiveTable(
            tableName,
            HiveUtils.toProtoSchema(table.getSd().getCols()),
            colStats,
            HiveUtils.toProto(table.getPartitionKeys()),
            udbPartitions,
            layout,
            table);
    } catch (NoSuchObjectException e) {
        throw new NotFoundException("Table " + tableName + " does not exist.", e);
    } catch (TException e) {
        throw new IOException("Failed to get table: " + tableName + " error: " + e.getMessage(), e);
    }
}
Also used : ColumnStatisticsInfo(alluxio.grpc.table.ColumnStatisticsInfo) UdbUtils(alluxio.table.common.udb.UdbUtils) UnderDatabase(alluxio.table.common.udb.UnderDatabase) UdbPartition(alluxio.table.common.UdbPartition) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) UdbContext(alluxio.table.common.udb.UdbContext) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Partition(org.apache.hadoop.hive.metastore.api.Partition) Warehouse(org.apache.hadoop.hive.metastore.Warehouse) ArrayList(java.util.ArrayList) DatabaseInfo(alluxio.master.table.DatabaseInfo) PathUtils(alluxio.util.io.PathUtils) HiveClientPoolCache(alluxio.table.under.hive.util.HiveClientPoolCache) Lists(com.google.common.collect.Lists) CloseableResource(alluxio.resource.CloseableResource) AbstractHiveClientPool(alluxio.table.under.hive.util.AbstractHiveClientPool) AlluxioURI(alluxio.AlluxioURI) UdbBypassSpec(alluxio.table.common.udb.UdbBypassSpec) Map(java.util.Map) UdbConfiguration(alluxio.table.common.udb.UdbConfiguration) Logger(org.slf4j.Logger) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Layout(alluxio.grpc.table.Layout) TException(org.apache.thrift.TException) AlluxioException(alluxio.exception.AlluxioException) PrincipalType(org.apache.hadoop.hive.metastore.api.PrincipalType) IOException(java.io.IOException) NotFoundException(alluxio.exception.status.NotFoundException) Collectors(java.util.stream.Collectors) Table(org.apache.hadoop.hive.metastore.api.Table) Objects(java.util.Objects) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionInfo(alluxio.grpc.table.layout.hive.PartitionInfo) List(java.util.List) PathTranslator(alluxio.table.common.udb.PathTranslator) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) UdbTable(alluxio.table.common.udb.UdbTable) FileUtils(org.apache.hadoop.hive.common.FileUtils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
HiveLayout(alluxio.table.common.layout.HiveLayout) Database(org.apache.hadoop.hive.metastore.api.Database) Collections(java.util.Collections) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) TException(org.apache.thrift.TException) HiveLayout(alluxio.table.common.layout.HiveLayout) HashMap(java.util.HashMap) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) NotFoundException(alluxio.exception.status.NotFoundException) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) UdbPartition(alluxio.table.common.UdbPartition) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList) List(java.util.List) PartitionInfo(alluxio.grpc.table.layout.hive.PartitionInfo) UdbPartition(alluxio.table.common.UdbPartition) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) UdbTable(alluxio.table.common.udb.UdbTable) IOException(java.io.IOException) PathTranslator(alluxio.table.common.udb.PathTranslator) Layout(alluxio.grpc.table.Layout) HiveLayout(alluxio.table.common.layout.HiveLayout) ColumnStatisticsInfo(alluxio.grpc.table.ColumnStatisticsInfo) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException)

Aggregations

ColumnStatisticsInfo (alluxio.grpc.table.ColumnStatisticsInfo)8 PartitionInfo (alluxio.grpc.table.layout.hive.PartitionInfo)4 AlluxioStatusException (alluxio.exception.status.AlluxioStatusException)2 NotFoundException (alluxio.exception.status.NotFoundException)2 Layout (alluxio.grpc.table.Layout)2 UdbPartition (alluxio.table.common.UdbPartition)2 HiveLayout (alluxio.table.common.layout.HiveLayout)2 PathTranslator (alluxio.table.common.udb.PathTranslator)2 UdbTable (alluxio.table.common.udb.UdbTable)2 HiveBasicStatistics (com.facebook.presto.hive.HiveBasicStatistics)2 Column (com.facebook.presto.hive.metastore.Column)2 MetastoreUtil.getHiveBasicStatistics (com.facebook.presto.hive.metastore.MetastoreUtil.getHiveBasicStatistics)2 PartitionStatistics (com.facebook.presto.hive.metastore.PartitionStatistics)2 Table (com.facebook.presto.hive.metastore.Table)2 NotFoundException (com.facebook.presto.spi.NotFoundException)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 List (java.util.List)2 AlluxioURI (alluxio.AlluxioURI)1