Use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
From the class GlueUtils, method toProto.
/**
 * Convert glue ColumnStatistics to Alluxio ColumnStatisticsInfo.
 *
 * <p>The column name and column type are always copied. The statistics data is only set when
 * the glue statistics carry a non-null type string equal to one of {@code BOOLEAN},
 * {@code DATE}, {@code DECIMAL}, {@code DOUBLE}, {@code LONG}, {@code STRING} or
 * {@code BINARY}, and the payload matching that type is non-null. Any other type string is
 * ignored and the result carries no data.
 *
 * @param glueColumnStatistic glue column statistic info, may be null
 * @return Alluxio ColumnStatisticsInfo; an empty message when the input is null
 */
public static ColumnStatisticsInfo toProto(ColumnStatistics glueColumnStatistic) {
  if (glueColumnStatistic == null) {
    return ColumnStatisticsInfo.newBuilder().build();
  }
  ColumnStatisticsInfo.Builder columnStatisticsInfoBuilder = ColumnStatisticsInfo.newBuilder();
  columnStatisticsInfoBuilder
      .setColName(glueColumnStatistic.getColumnName())
      .setColType(glueColumnStatistic.getColumnType());

  com.amazonaws.services.glue.model.ColumnStatisticsData glueColumnStatisticsData =
      glueColumnStatistic.getStatisticsData();
  if (glueColumnStatisticsData == null) {
    return columnStatisticsInfoBuilder.build();
  }
  String columnType = glueColumnStatisticsData.getType();
  if (columnType == null) {
    // switch on a null String would throw, and there is nothing to dispatch on anyway
    return columnStatisticsInfoBuilder.build();
  }

  // The type strings are mutually exclusive, so at most one branch can apply.
  switch (columnType) {
    case "BOOLEAN":
      com.amazonaws.services.glue.model.BooleanColumnStatisticsData booleanData =
          glueColumnStatisticsData.getBooleanColumnStatisticsData();
      if (booleanData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setBooleanStats(toProto(booleanData)).build());
      }
      break;
    case "DATE":
      com.amazonaws.services.glue.model.DateColumnStatisticsData dateData =
          glueColumnStatisticsData.getDateColumnStatisticsData();
      if (dateData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setDateStats(toProto(dateData)).build());
      }
      break;
    case "DECIMAL":
      com.amazonaws.services.glue.model.DecimalColumnStatisticsData decimalData =
          glueColumnStatisticsData.getDecimalColumnStatisticsData();
      if (decimalData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setDecimalStats(toProto(decimalData)).build());
      }
      break;
    case "DOUBLE":
      com.amazonaws.services.glue.model.DoubleColumnStatisticsData doubleData =
          glueColumnStatisticsData.getDoubleColumnStatisticsData();
      if (doubleData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setDoubleStats(toProto(doubleData)).build());
      }
      break;
    case "LONG":
      com.amazonaws.services.glue.model.LongColumnStatisticsData longData =
          glueColumnStatisticsData.getLongColumnStatisticsData();
      if (longData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setLongStats(toProto(longData)).build());
      }
      break;
    case "STRING":
      com.amazonaws.services.glue.model.StringColumnStatisticsData stringData =
          glueColumnStatisticsData.getStringColumnStatisticsData();
      if (stringData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setStringStats(toProto(stringData)).build());
      }
      break;
    case "BINARY":
      com.amazonaws.services.glue.model.BinaryColumnStatisticsData binaryData =
          glueColumnStatisticsData.getBinaryColumnStatisticsData();
      if (binaryData != null) {
        columnStatisticsInfoBuilder.setData(
            ColumnStatisticsData.newBuilder().setBinaryStats(toProto(binaryData)).build());
      }
      break;
    default:
      // Unrecognized statistics type: leave the data field unset.
      break;
  }
  return columnStatisticsInfoBuilder.build();
}
Use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
From the class HiveUtilsTest, method verifyColumnStats.
/**
 * Converts the given Hive column statistics with {@code HiveUtils.toProto} and asserts that
 * the resulting Alluxio proto mirrors the Hive object: column name, column type, presence of
 * statistics data, and then every per-type statistics payload.
 *
 * @param hiveColStats the Hive column statistics to convert and compare against
 */
private void verifyColumnStats(ColumnStatisticsObj hiveColStats) {
  ColumnStatisticsInfo converted = HiveUtils.toProto(hiveColStats);
  assertEquals(hiveColStats.getColName(), converted.getColName());
  assertEquals(hiveColStats.getColType(), converted.getColType());
  assertEquals(hiveColStats.isSetStatsData(), converted.hasData());
  if (!hiveColStats.isSetStatsData()) {
    return;
  }
  ColumnStatisticsData hiveData = hiveColStats.getStatsData();
  alluxio.grpc.table.ColumnStatisticsData protoData = converted.getData();
  assertBinaryStatsMatch(hiveData, protoData);
  assertBooleanStatsMatch(hiveData, protoData);
  assertDateStatsMatch(hiveData, protoData);
  assertDecimalStatsMatch(hiveData, protoData);
  assertDoubleStatsMatch(hiveData, protoData);
  assertLongStatsMatch(hiveData, protoData);
  assertStringStatsMatch(hiveData, protoData);
}

/** Asserts that binary stats are present on both sides or neither, and that their fields match. */
private void assertBinaryStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetBinaryStats(), protoData.hasBinaryStats());
  if (!hiveData.isSetBinaryStats()) {
    return;
  }
  BinaryColumnStatsData expected = hiveData.getBinaryStats();
  alluxio.grpc.table.BinaryColumnStatsData actual = protoData.getBinaryStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getAvgColLen(), actual.getAvgColLen(), 0.01);
  assertEquals(expected.getMaxColLen(), actual.getMaxColLen());
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
}

/** Asserts that boolean stats are present on both sides or neither, and that their fields match. */
private void assertBooleanStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetBooleanStats(), protoData.hasBooleanStats());
  if (!hiveData.isSetBooleanStats()) {
    return;
  }
  BooleanColumnStatsData expected = hiveData.getBooleanStats();
  alluxio.grpc.table.BooleanColumnStatsData actual = protoData.getBooleanStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getNumFalses(), actual.getNumFalses());
  assertEquals(expected.getNumTrues(), actual.getNumTrues());
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
}

/** Asserts that date stats are present on both sides or neither, and that their fields match. */
private void assertDateStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetDateStats(), protoData.hasDateStats());
  if (!hiveData.isSetDateStats()) {
    return;
  }
  DateColumnStatsData expected = hiveData.getDateStats();
  alluxio.grpc.table.DateColumnStatsData actual = protoData.getDateStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
  assertEquals(expected.getNumDVs(), actual.getNumDistincts());
  assertEquals(expected.isSetHighValue(), actual.hasHighValue());
  if (expected.isSetHighValue()) {
    assertEquals(expected.getHighValue().getDaysSinceEpoch(),
        actual.getHighValue().getDaysSinceEpoch());
  }
  assertEquals(expected.isSetLowValue(), actual.hasLowValue());
  if (expected.isSetLowValue()) {
    assertEquals(expected.getLowValue().getDaysSinceEpoch(),
        actual.getLowValue().getDaysSinceEpoch());
  }
}

/** Asserts that decimal stats are present on both sides or neither, and that their fields match. */
private void assertDecimalStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetDecimalStats(), protoData.hasDecimalStats());
  if (!hiveData.isSetDecimalStats()) {
    return;
  }
  DecimalColumnStatsData expected = hiveData.getDecimalStats();
  alluxio.grpc.table.DecimalColumnStatsData actual = protoData.getDecimalStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
  assertEquals(expected.getNumDVs(), actual.getNumDistincts());
  assertEquals(expected.isSetHighValue(), actual.hasHighValue());
  if (expected.isSetHighValue()) {
    assertEquals(expected.getHighValue().getScale(), actual.getHighValue().getScale());
    assertArrayEquals(expected.getHighValue().getUnscaled(),
        actual.getHighValue().getUnscaled().toByteArray());
  }
  assertEquals(expected.isSetLowValue(), actual.hasLowValue());
  if (expected.isSetLowValue()) {
    assertEquals(expected.getLowValue().getScale(), actual.getLowValue().getScale());
    assertArrayEquals(expected.getLowValue().getUnscaled(),
        actual.getLowValue().getUnscaled().toByteArray());
  }
}

/** Asserts that double stats are present on both sides or neither, and that their fields match. */
private void assertDoubleStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetDoubleStats(), protoData.hasDoubleStats());
  if (!hiveData.isSetDoubleStats()) {
    return;
  }
  DoubleColumnStatsData expected = hiveData.getDoubleStats();
  alluxio.grpc.table.DoubleColumnStatsData actual = protoData.getDoubleStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
  assertEquals(expected.getNumDVs(), actual.getNumDistincts());
  assertEquals(expected.isSetHighValue(), actual.hasHighValue());
  if (expected.isSetHighValue()) {
    assertEquals(expected.getHighValue(), actual.getHighValue(), 0.01);
  }
  assertEquals(expected.isSetLowValue(), actual.hasLowValue());
  if (expected.isSetLowValue()) {
    assertEquals(expected.getLowValue(), actual.getLowValue(), 0.01);
  }
}

/** Asserts that long stats are present on both sides or neither, and that their fields match. */
private void assertLongStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetLongStats(), protoData.hasLongStats());
  if (!hiveData.isSetLongStats()) {
    return;
  }
  LongColumnStatsData expected = hiveData.getLongStats();
  alluxio.grpc.table.LongColumnStatsData actual = protoData.getLongStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
  assertEquals(expected.getNumDVs(), actual.getNumDistincts());
  assertEquals(expected.isSetHighValue(), actual.hasHighValue());
  if (expected.isSetHighValue()) {
    assertEquals(expected.getHighValue(), actual.getHighValue());
  }
  assertEquals(expected.isSetLowValue(), actual.hasLowValue());
  if (expected.isSetLowValue()) {
    assertEquals(expected.getLowValue(), actual.getLowValue());
  }
}

/** Asserts that string stats are present on both sides or neither, and that their fields match. */
private void assertStringStatsMatch(ColumnStatisticsData hiveData,
    alluxio.grpc.table.ColumnStatisticsData protoData) {
  assertEquals(hiveData.isSetStringStats(), protoData.hasStringStats());
  if (!hiveData.isSetStringStats()) {
    return;
  }
  StringColumnStatsData expected = hiveData.getStringStats();
  alluxio.grpc.table.StringColumnStatsData actual = protoData.getStringStats();
  assertEquals(expected.isSetBitVectors(), actual.hasBitVectors());
  if (expected.isSetBitVectors()) {
    assertEquals(expected.getBitVectors(), actual.getBitVectors());
  }
  assertEquals(expected.getAvgColLen(), actual.getAvgColLen(), 0.01);
  assertEquals(expected.getMaxColLen(), actual.getMaxColLen());
  assertEquals(expected.getNumNulls(), actual.getNumNulls());
  assertEquals(expected.getNumDVs(), actual.getNumDistincts());
}
Use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
From the class HiveDatabase, method getTable.
/**
 * Builds the {@link UdbTable} for a Hive table.
 *
 * <p>All metastore calls (table, partitions, table-level column statistics and partition-level
 * column statistics) are made while holding a single pooled client, which is released before
 * the Alluxio paths are mounted and the result is assembled. Partition-level statistics are
 * requested in batches of at most {@code MAX_PARTITION_COLUMN_STATISTICS} partition names.
 * A table without partition keys is represented by one generated partition named after the
 * table, with no column statistics attached.
 *
 * @param tableName the name of the table to load
 * @param bypassSpec the bypass specification forwarded to {@code mountAlluxioPaths}
 * @return the table, including its schema, column statistics, partitions and layout
 * @throws IOException if the metastore reports an error; a missing table is reported as a
 *         {@code NotFoundException}
 */
@Override
public UdbTable getTable(String tableName, UdbBypassSpec bypassSpec) throws IOException {
  try {
    Table table;
    List<Partition> partitions;
    List<ColumnStatisticsObj> columnStats;
    List<String> partitionColumns;
    Map<String, List<ColumnStatisticsInfo>> statsMap = new HashMap<>();
    // perform all the hive client operations, and release the client early.
    try (CloseableResource<IMetaStoreClient> client = mClientPool.acquireClientResource()) {
      table = client.get().getTable(mHiveDbName, tableName);
      // Potentially expensive call
      partitions = client.get().listPartitions(mHiveDbName, table.getTableName(), (short) -1);
      // Data column names, shared by the table-level and partition-level statistics requests.
      List<String> dataColumns = table.getSd().getCols().stream()
          .map(FieldSchema::getName)
          .collect(Collectors.toList());
      columnStats = client.get().getTableColumnStatistics(mHiveDbName, tableName, dataColumns);
      // construct the partition statistics
      partitionColumns = table.getPartitionKeys().stream()
          .map(FieldSchema::getName)
          .collect(Collectors.toList());
      List<String> partitionNames = partitions.stream()
          .map(partition -> FileUtils.makePartName(partitionColumns, partition.getValues()))
          .collect(Collectors.toList());
      for (List<String> partialPartitionNames
          : Lists.partition(partitionNames, MAX_PARTITION_COLUMN_STATISTICS)) {
        statsMap.putAll(client.get()
            .getPartitionColumnStatistics(mHiveDbName, tableName, partialPartitionNames,
                dataColumns)
            .entrySet().stream()
            .collect(Collectors.toMap(
                Map.Entry::getKey,
                e -> e.getValue().stream().map(HiveUtils::toProto).collect(Collectors.toList()),
                // on a duplicate partition name keep the later entry
                (e1, e2) -> e2)));
      }
    }
    PathTranslator pathTranslator = mountAlluxioPaths(table, partitions, bypassSpec);
    List<ColumnStatisticsInfo> colStats =
        columnStats.stream().map(HiveUtils::toProto).collect(Collectors.toList());
    // construct table layout
    PartitionInfo partitionInfo = PartitionInfo.newBuilder()
        .setDbName(getUdbContext().getDbName())
        .setTableName(tableName)
        .addAllDataCols(HiveUtils.toProto(table.getSd().getCols()))
        .setStorage(HiveUtils.toProto(table.getSd(), pathTranslator))
        .putAllParameters(table.getParameters())
        .build();
    Layout layout = Layout.newBuilder()
        .setLayoutType(HiveLayout.TYPE)
        .setLayoutData(partitionInfo.toByteString())
        .build();
    // create udb partitions info
    List<UdbPartition> udbPartitions = new ArrayList<>();
    if (partitionColumns.isEmpty()) {
      // unpartitioned table, generate a partition
      PartitionInfo.Builder pib = PartitionInfo.newBuilder()
          .setDbName(getUdbContext().getDbName())
          .setTableName(tableName)
          .addAllDataCols(HiveUtils.toProto(table.getSd().getCols()))
          .setStorage(HiveUtils.toProto(table.getSd(), pathTranslator))
          .setPartitionName(tableName)
          .putAllParameters(table.getParameters());
      udbPartitions.add(new HivePartition(new HiveLayout(pib.build(), Collections.emptyList())));
    } else {
      for (Partition partition : partitions) {
        String partName = FileUtils.makePartName(partitionColumns, partition.getValues());
        PartitionInfo.Builder pib = PartitionInfo.newBuilder()
            .setDbName(getUdbContext().getDbName())
            .setTableName(tableName)
            .addAllDataCols(HiveUtils.toProto(partition.getSd().getCols()))
            .setStorage(HiveUtils.toProto(partition.getSd(), pathTranslator))
            .setPartitionName(partName)
            .putAllParameters(partition.getParameters());
        if (partition.getValues() != null) {
          pib.addAllValues(partition.getValues());
        }
        // partitions without fetched statistics get an empty statistics list
        udbPartitions.add(new HivePartition(new HiveLayout(pib.build(),
            statsMap.getOrDefault(partName, Collections.emptyList()))));
      }
    }
    return new HiveTable(tableName, HiveUtils.toProtoSchema(table.getSd().getCols()), colStats,
        HiveUtils.toProto(table.getPartitionKeys()), udbPartitions, layout, table);
  } catch (NoSuchObjectException e) {
    throw new NotFoundException("Table " + tableName + " does not exist.", e);
  } catch (TException e) {
    throw new IOException("Failed to get table: " + tableName + " error: " + e.getMessage(), e);
  }
}
Aggregations