use of alluxio.grpc.table.ColumnStatisticsInfo in project presto by prestodb.
the class AlluxioHiveMetastore method getPartitionStatistics.
/**
 * Collects basic and per-column statistics for the requested partitions of a table.
 *
 * <p>Partitions that {@code getPartitionsByNames} reports as absent are left out of the result.
 * A partition for which the Alluxio client returns no column statistics gets an empty
 * column-statistics map.
 *
 * @param metastoreContext context forwarded to the metastore lookups
 * @param databaseName database containing the table
 * @param tableName table whose partitions are inspected
 * @param partitionNames names of the partitions to look up
 * @return statistics keyed by the partition name built from the table's partition columns
 * @throws TableNotFoundException if the table cannot be found
 * @throws PrestoException with {@code HIVE_METASTORE_ERROR} if the Alluxio client call fails
 */
@Override
public Map<String, PartitionStatistics> getPartitionStatistics(MetastoreContext metastoreContext, String databaseName, String tableName, Set<String> partitionNames) {
    Table table = getTable(metastoreContext, databaseName, tableName)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));

    // Basic statistics per partition, keyed by the name derived from the partition values.
    Map<String, HiveBasicStatistics> basicByPartition =
            getPartitionsByNames(metastoreContext, databaseName, tableName, ImmutableList.copyOf(partitionNames))
                    .values().stream()
                    .filter(candidate -> candidate.isPresent())
                    .map(candidate -> candidate.get())
                    .collect(toImmutableMap(
                            partition -> MetastoreUtil.makePartName(table.getPartitionColumns(), partition.getValues()),
                            partition -> getHiveBasicStatistics(partition.getParameters())));

    // Row counts are needed when grouping the raw column statistics below.
    ImmutableMap.Builder<String, OptionalLong> rowCountBuilder = ImmutableMap.builder();
    for (Map.Entry<String, HiveBasicStatistics> basic : basicByPartition.entrySet()) {
        rowCountBuilder.put(basic.getKey(), basic.getValue().getRowCount());
    }
    Map<String, OptionalLong> rowCountByPartition = rowCountBuilder.build();

    List<String> dataColumnNames = table.getDataColumns().stream()
            .map(Column::getName)
            .collect(toImmutableList());

    Map<String, List<ColumnStatisticsInfo>> rawColumnStatistics;
    try {
        rawColumnStatistics = client.getPartitionColumnStatistics(
                table.getDatabaseName(),
                table.getTableName(),
                ImmutableList.copyOf(basicByPartition.keySet()),
                dataColumnNames);
    }
    catch (AlluxioStatusException e) {
        throw new PrestoException(HIVE_METASTORE_ERROR, e);
    }

    // Convert every non-empty list of raw statistics into a per-column map.
    ImmutableMap.Builder<String, Map<String, HiveColumnStatistics>> groupedBuilder = ImmutableMap.builder();
    for (Map.Entry<String, List<ColumnStatisticsInfo>> raw : rawColumnStatistics.entrySet()) {
        if (raw.getValue().isEmpty()) {
            continue;
        }
        OptionalLong rowCount = rowCountByPartition.getOrDefault(raw.getKey(), OptionalLong.empty());
        groupedBuilder.put(raw.getKey(), groupStatisticsByColumn(metastoreContext, raw.getValue(), rowCount));
    }
    Map<String, Map<String, HiveColumnStatistics>> columnStatsByPartition = groupedBuilder.build();

    return basicByPartition.entrySet().stream()
            .collect(toImmutableMap(
                    Map.Entry::getKey,
                    basic -> new PartitionStatistics(
                            basic.getValue(),
                            columnStatsByPartition.getOrDefault(basic.getKey(), ImmutableMap.of()))));
}
use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
the class GlueUtilsTest method verifyColumnStats.
/**
 * Converts the given Glue column statistics with {@code GlueUtils.toProto} and asserts that the
 * resulting proto mirrors the Glue object: column name and type, whether statistics data is
 * present, and, for each typed statistics payload that is set, its individual fields.
 *
 * @param glueColStats the Glue column statistics to convert and verify
 */
private void verifyColumnStats(ColumnStatistics glueColStats) {
  ColumnStatisticsInfo colStats = GlueUtils.toProto(glueColStats);
  assertEquals(glueColStats.getColumnName(), colStats.getColName());
  assertEquals(glueColStats.getColumnType(), colStats.getColType());

  ColumnStatisticsData glueData = glueColStats.getStatisticsData();
  // verify empty ColumnStatisticData
  if (glueData == null) {
    // Without Glue statistics data the proto must carry no data either. The expected value is
    // stated directly so that the null statistics object is never dereferenced.
    assertEquals(false, colStats.hasData());
    return;
  }

  alluxio.grpc.table.ColumnStatisticsData data = colStats.getData();

  // verify boolean
  if (glueData.getBooleanColumnStatisticsData() != null) {
    assertEquals("BOOLEAN", glueData.getType());
    BooleanColumnStatisticsData glueBoolean = glueData.getBooleanColumnStatisticsData();
    assertEquals(glueBoolean.getNumberOfFalses() != null
        && glueBoolean.getNumberOfTrues() != null
        && glueBoolean.getNumberOfNulls() != null, data.hasBooleanStats());
    if (data.hasBooleanStats()) {
      alluxio.grpc.table.BooleanColumnStatsData boolData = data.getBooleanStats();
      assertEquals(glueBoolean.getNumberOfFalses().longValue(), boolData.getNumFalses());
      assertEquals(glueBoolean.getNumberOfTrues().longValue(), boolData.getNumTrues());
      assertEquals(glueBoolean.getNumberOfNulls().longValue(), boolData.getNumNulls());
    }
  }

  // verify date
  if (glueData.getDateColumnStatisticsData() != null) {
    assertEquals("DATE", glueData.getType());
    DateColumnStatisticsData glueDate = glueData.getDateColumnStatisticsData();
    assertEquals(glueDate.getNumberOfDistinctValues() != null
        && glueDate.getNumberOfNulls() != null, data.hasDateStats());
    if (data.hasDateStats()) {
      alluxio.grpc.table.DateColumnStatsData date = data.getDateStats();
      assertEquals(glueDate.getNumberOfDistinctValues().longValue(), date.getNumDistincts());
      assertEquals(glueDate.getNumberOfNulls().longValue(), date.getNumNulls());
      assertEquals(glueDate.getMaximumValue() != null, date.hasHighValue());
      if (glueDate.getMaximumValue() != null) {
        assertEquals(glueDate.getMaximumValue().getTime(),
            date.getHighValue().getDaysSinceEpoch());
      }
      assertEquals(glueDate.getMinimumValue() != null, date.hasLowValue());
      if (glueDate.getMinimumValue() != null) {
        assertEquals(glueDate.getMinimumValue().getTime(),
            date.getLowValue().getDaysSinceEpoch());
      }
    }
  }

  // verify decimal
  if (glueData.getDecimalColumnStatisticsData() != null) {
    assertEquals("DECIMAL", glueData.getType());
    DecimalColumnStatisticsData glueDecimal = glueData.getDecimalColumnStatisticsData();
    assertEquals(glueDecimal.getNumberOfDistinctValues() != null
        && glueDecimal.getNumberOfNulls() != null, data.hasDecimalStats());
    if (data.hasDecimalStats()) {
      alluxio.grpc.table.DecimalColumnStatsData decimal = data.getDecimalStats();
      assertEquals(glueDecimal.getNumberOfDistinctValues().longValue(),
          decimal.getNumDistincts());
      assertEquals(glueDecimal.getNumberOfNulls().longValue(), decimal.getNumNulls());
      assertEquals(glueDecimal.getMaximumValue() != null, decimal.hasHighValue());
      if (glueDecimal.getMaximumValue() != null) {
        assertEquals(glueDecimal.getMaximumValue().getScale().longValue(),
            decimal.getHighValue().getScale());
        assertArrayEquals(glueDecimal.getMaximumValue().getUnscaledValue().array(),
            decimal.getHighValue().getUnscaled().toByteArray());
      }
      assertEquals(glueDecimal.getMinimumValue() != null, decimal.hasLowValue());
      if (glueDecimal.getMinimumValue() != null) {
        assertEquals(glueDecimal.getMinimumValue().getScale().longValue(),
            decimal.getLowValue().getScale());
        assertArrayEquals(glueDecimal.getMinimumValue().getUnscaledValue().array(),
            decimal.getLowValue().getUnscaled().toByteArray());
      }
    }
  }

  // verify double
  if (glueData.getDoubleColumnStatisticsData() != null) {
    assertEquals("DOUBLE", glueData.getType());
    DoubleColumnStatisticsData glueDouble = glueData.getDoubleColumnStatisticsData();
    assertEquals(glueDouble.getNumberOfDistinctValues() != null
        && glueDouble.getNumberOfNulls() != null, data.hasDoubleStats());
    if (data.hasDoubleStats()) {
      alluxio.grpc.table.DoubleColumnStatsData doubleData = data.getDoubleStats();
      assertEquals(glueDouble.getNumberOfDistinctValues().longValue(),
          doubleData.getNumDistincts());
      assertEquals(glueDouble.getNumberOfNulls().longValue(), doubleData.getNumNulls());
      assertEquals(glueDouble.getMaximumValue() != null, doubleData.hasHighValue());
      if (glueDouble.getMaximumValue() != null) {
        assertEquals(glueDouble.getMaximumValue().doubleValue(),
            doubleData.getHighValue(), 0.01);
      }
      assertEquals(glueDouble.getMinimumValue() != null, doubleData.hasLowValue());
      if (glueDouble.getMinimumValue() != null) {
        assertEquals(glueDouble.getMinimumValue().doubleValue(),
            doubleData.getLowValue(), 0.01);
      }
    }
  }

  // verify long
  if (glueData.getLongColumnStatisticsData() != null) {
    assertEquals("LONG", glueData.getType());
    LongColumnStatisticsData glueLong = glueData.getLongColumnStatisticsData();
    assertEquals(glueLong.getNumberOfDistinctValues() != null
        && glueLong.getNumberOfNulls() != null, data.hasLongStats());
    if (data.hasLongStats()) {
      alluxio.grpc.table.LongColumnStatsData longData = data.getLongStats();
      assertEquals(glueLong.getNumberOfDistinctValues().longValue(), longData.getNumDistincts());
      assertEquals(glueLong.getNumberOfNulls().longValue(), longData.getNumNulls());
      assertEquals(glueLong.getMaximumValue() != null, longData.hasHighValue());
      if (glueLong.getMaximumValue() != null) {
        assertEquals(glueLong.getMaximumValue().longValue(), longData.getHighValue());
      }
      assertEquals(glueLong.getMinimumValue() != null, longData.hasLowValue());
      if (glueLong.getMinimumValue() != null) {
        assertEquals(glueLong.getMinimumValue().longValue(), longData.getLowValue());
      }
    }
  }

  // verify string
  if (glueData.getStringColumnStatisticsData() != null) {
    assertEquals("STRING", glueData.getType());
    StringColumnStatisticsData glueString = glueData.getStringColumnStatisticsData();
    assertEquals(glueString.getNumberOfDistinctValues() != null
        && glueString.getNumberOfNulls() != null
        && glueString.getMaximumLength() != null
        && glueString.getAverageLength() != null, data.hasStringStats());
    if (data.hasStringStats()) {
      alluxio.grpc.table.StringColumnStatsData stringData = data.getStringStats();
      assertEquals(glueString.getNumberOfDistinctValues().longValue(),
          stringData.getNumDistincts());
      assertEquals(glueString.getNumberOfNulls().longValue(), stringData.getNumNulls());
      assertEquals(glueString.getMaximumLength().longValue(), stringData.getMaxColLen());
      assertEquals(glueString.getAverageLength().doubleValue(),
          stringData.getAvgColLen(), 0.01);
    }
  }

  // verify binary
  if (glueData.getBinaryColumnStatisticsData() != null) {
    assertEquals("BINARY", glueData.getType());
    BinaryColumnStatisticsData glueBinary = glueData.getBinaryColumnStatisticsData();
    assertEquals(glueBinary.getAverageLength() != null
        && glueBinary.getMaximumLength() != null
        && glueBinary.getNumberOfNulls() != null, data.hasBinaryStats());
    if (data.hasBinaryStats()) {
      alluxio.grpc.table.BinaryColumnStatsData binary = data.getBinaryStats();
      assertEquals(glueBinary.getAverageLength().doubleValue(), binary.getAvgColLen(), 0.01);
      assertEquals(glueBinary.getMaximumLength().longValue(), binary.getMaxColLen());
      assertEquals(glueBinary.getNumberOfNulls().longValue(), binary.getNumNulls());
    }
  }
}
use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
the class HiveLayout method transformLayout.
/**
 * Builds a new layout that describes this partition after transformation: the storage format
 * is switched to the Parquet serde/input/output classes, the storage location is set to the
 * transformed URI, and the existing partition column statistics are carried over.
 *
 * @param transformedUri location of the transformed data
 * @param definition transform definition whose properties may name a Parquet compression
 * @return the layout for the transformed partition
 */
private HiveLayout transformLayout(AlluxioURI transformedUri, TransformDefinition definition) {
  final Properties transformProperties = definition.getProperties();

  // TODO(cc): assumption here is the transformed data is in Parquet format.
  final StorageFormat.Builder parquetFormat =
      mPartitionInfo.getStorage().getStorageFormat().toBuilder();
  parquetFormat.setSerde(HiveConstants.PARQUET_SERDE_CLASS);
  parquetFormat.setInputFormat(HiveConstants.PARQUET_INPUT_FORMAT_CLASS);
  parquetFormat.setOutputFormat(HiveConstants.PARQUET_OUTPUT_FORMAT_CLASS);

  // Only record the compression codec when the transform definition specifies one.
  final String compressionKey = alluxio.job.plan.transform.PartitionInfo.PARQUET_COMPRESSION;
  final String compressionValue = transformProperties.getProperty(compressionKey);
  if (!StringUtils.isEmpty(compressionValue)) {
    parquetFormat.putSerdelibParameters(compressionKey, compressionValue);
  }

  PartitionInfo.Builder transformedInfo = mPartitionInfo.toBuilder();
  transformedInfo.putAllParameters(mPartitionInfo.getParametersMap());
  transformedInfo.setStorage(
      mPartitionInfo.getStorage().toBuilder()
          .setStorageFormat(parquetFormat.build())
          .setLocation(transformedUri.toString())
          .build());

  List<ColumnStatisticsInfo> carriedStats = new ArrayList<>(mPartitionStatsInfo.values());
  return new HiveLayout(transformedInfo.build(), carriedStats);
}
use of alluxio.grpc.table.ColumnStatisticsInfo in project presto by prestodb.
the class AlluxioHiveMetastore method getTableStatistics.
/**
 * Returns table-level statistics: the basic statistics derived from the table parameters plus
 * the column statistics reported by the Alluxio client.
 *
 * <p>Column statistics are requested for both the partition columns and the data columns of
 * the table, so that unpartitioned tables and regular data columns are covered as well.
 *
 * @param metastoreContext context forwarded to the table lookup and statistics grouping
 * @param databaseName database containing the table
 * @param tableName table to inspect
 * @return the table's basic statistics together with its per-column statistics
 * @throws PrestoException with {@code HIVE_METASTORE_ERROR} if the table cannot be retrieved
 *         or any step of the lookup fails
 */
@Override
public PartitionStatistics getTableStatistics(MetastoreContext metastoreContext, String databaseName, String tableName) {
    try {
        Table table = getTable(metastoreContext, databaseName, tableName)
                .orElseThrow(() -> new PrestoException(HIVE_METASTORE_ERROR, String.format("Could not retrieve table %s.%s", databaseName, tableName)));
        HiveBasicStatistics basicStatistics = getHiveBasicStatistics(table.getParameters());

        // Collect names into a fresh list: the lists returned by the table are not modified.
        ImmutableList.Builder<String> columnNames = ImmutableList.builder();
        for (Column partitionColumn : table.getPartitionColumns()) {
            columnNames.add(partitionColumn.getName());
        }
        for (Column dataColumn : table.getDataColumns()) {
            columnNames.add(dataColumn.getName());
        }

        List<ColumnStatisticsInfo> columnStatistics = client.getTableColumnStatistics(table.getDatabaseName(), table.getTableName(), columnNames.build());
        return new PartitionStatistics(basicStatistics, groupStatisticsByColumn(metastoreContext, columnStatistics, basicStatistics.getRowCount()));
    }
    catch (Exception e) {
        throw new PrestoException(HIVE_METASTORE_ERROR, e);
    }
}
use of alluxio.grpc.table.ColumnStatisticsInfo in project alluxio by Alluxio.
the class GlueDatabase method getTable.
/**
 * Loads a table from the Glue catalog and converts it into a {@link UdbTable}.
 *
 * <p>The method fetches the table and its partitions, mounts their paths through
 * {@code mountAlluxioPaths}, optionally fetches table-level and partition-level column
 * statistics (each controlled by its own configuration property), and builds one
 * {@code GluePartition} per Glue partition, or a single one named after the table when the
 * table has no partition keys.
 *
 * @param tableName name of the table in the configured Glue database
 * @param bypassSpec bypass specification forwarded to {@code mountAlluxioPaths}
 * @return the table representation
 * @throws NotFoundException if Glue reports that the entity does not exist
 * @throws IOException if Glue reports a validation or encryption error
 */
@Override
public UdbTable getTable(String tableName, UdbBypassSpec bypassSpec) throws IOException {
  try {
    GetTableRequest tableRequest = new GetTableRequest()
        .withCatalogId(mGlueConfiguration.get(Property.CATALOG_ID))
        .withDatabaseName(mGlueDbName)
        .withName(tableName);
    Table glueTable = getClient().getTable(tableRequest).getTable();
    List<Partition> gluePartitions = batchGetPartitions(getClient(), tableName);
    PathTranslator pathTranslator = mountAlluxioPaths(glueTable, gluePartitions, bypassSpec);

    List<Column> partitionKeys = glueTable.getPartitionKeys() == null
        ? Collections.emptyList() : glueTable.getPartitionKeys();

    // Get table parameters
    Map<String, String> tableParams = glueTable.getParameters();
    if (tableParams == null) {
      tableParams = Collections.emptyMap();
    }

    // Get column statistics info for table
    List<String> columnNames = new ArrayList<>();
    for (Column column : glueTable.getStorageDescriptor().getColumns()) {
      columnNames.add(column.getName());
    }
    GetColumnStatisticsForTableRequest tableStatsRequest =
        new GetColumnStatisticsForTableRequest()
            .withCatalogId(mGlueConfiguration.get(Property.CATALOG_ID))
            .withDatabaseName(mGlueDbName)
            .withTableName(tableName)
            .withColumnNames(columnNames);
    List<ColumnStatisticsInfo> tableColumnStats =
        mGlueConfiguration.getBoolean(Property.TABLE_COLUMN_STATISTICS_ENABLE)
            ? getTableColumnStatistics(mGlueDbName, tableName, tableStatsRequest)
            : new ArrayList<>();

    // Get column statistics info for partitions
    // potential expensive call
    Map<String, List<ColumnStatisticsInfo>> statsByPartition = new HashMap<>();
    if (mGlueConfiguration.getBoolean(Property.PARTITION_COLUMN_STATISTICS_ENABLE)) {
      for (Partition gluePartition : gluePartitions) {
        List<String> values = gluePartition.getValues();
        if (values == null) {
          continue;
        }
        GetColumnStatisticsForPartitionRequest partitionStatsRequest =
            new GetColumnStatisticsForPartitionRequest()
                .withCatalogId(mGlueConfiguration.get(Property.CATALOG_ID))
                .withDatabaseName(mGlueDbName)
                .withTableName(tableName)
                .withColumnNames(columnNames)
                .withPartitionValues(values);
        String name = GlueUtils.makePartitionName(partitionKeys, gluePartition.getValues());
        statsByPartition.put(name,
            getPartitionColumnStatistics(mGlueDbName, tableName, partitionStatsRequest));
      }
    }

    // Table-level layout, serialized from a PartitionInfo describing the whole table.
    PartitionInfo tableInfo = PartitionInfo.newBuilder()
        .setDbName(mGlueDbName)
        .setTableName(tableName)
        .addAllDataCols(GlueUtils.toProto(glueTable.getStorageDescriptor().getColumns()))
        .setStorage(GlueUtils.toProto(glueTable.getStorageDescriptor(), pathTranslator))
        .putAllParameters(tableParams)
        .build();
    Layout tableLayout = Layout.newBuilder()
        .setLayoutType(HiveLayout.TYPE)
        .setLayoutData(tableInfo.toByteString())
        .build();

    List<UdbPartition> udbPartitions = new ArrayList<>();
    if (partitionKeys.isEmpty()) {
      // No partition keys: a single partition named after the table stands in for it.
      PartitionInfo.Builder wholeTable = PartitionInfo.newBuilder()
          .setDbName(mGlueDbName)
          .setTableName(tableName)
          .addAllDataCols(GlueUtils.toProto(glueTable.getStorageDescriptor().getColumns()))
          .setStorage(GlueUtils.toProto(glueTable.getStorageDescriptor(), pathTranslator))
          .setPartitionName(tableName)
          .putAllParameters(tableParams);
      udbPartitions.add(
          new GluePartition(new HiveLayout(wholeTable.build(), Collections.emptyList())));
    } else {
      for (Partition gluePartition : gluePartitions) {
        String name = GlueUtils.makePartitionName(partitionKeys, gluePartition.getValues());
        Map<String, String> partitionParams = gluePartition.getParameters() == null
            ? Collections.emptyMap() : gluePartition.getParameters();
        PartitionInfo.Builder perPartition = PartitionInfo.newBuilder()
            .setDbName(mGlueDbName)
            .setTableName(tableName)
            .addAllDataCols(GlueUtils.toProto(gluePartition.getStorageDescriptor().getColumns()))
            .setStorage(GlueUtils.toProto(gluePartition.getStorageDescriptor(), pathTranslator))
            .setPartitionName(name)
            .putAllParameters(partitionParams);
        if (gluePartition.getValues() != null) {
          perPartition.addAllValues(gluePartition.getValues());
        }
        List<ColumnStatisticsInfo> partitionStats =
            statsByPartition.getOrDefault(name, Collections.emptyList());
        udbPartitions.add(
            new GluePartition(new HiveLayout(perPartition.build(), partitionStats)));
      }
    }

    return new GlueTable(
        this,
        pathTranslator,
        tableName,
        GlueUtils.toProtoSchema(glueTable.getStorageDescriptor().getColumns()),
        tableColumnStats,
        // Get FieldSchema from partition keys
        GlueUtils.toProto(glueTable.getPartitionKeys()),
        udbPartitions,
        tableLayout,
        glueTable);
  } catch (EntityNotFoundException e) {
    throw new NotFoundException("Table " + tableName + " does not exist in Database: " + mGlueDbName + "; Catalog ID: " + mGlueConfiguration.get(Property.CATALOG_ID) + ".", e);
  } catch (ValidationException e) {
    throw new IOException("Failed to get table: " + tableName + " in Database: " + mGlueDbName + "; Catalog ID: " + mGlueConfiguration.get(Property.CATALOG_ID) + " with validation error: " + e.getMessage(), e);
  } catch (GlueEncryptionException e) {
    throw new IOException("Failed to get table: " + tableName + " in Database: " + mGlueDbName + "; Catalog ID: " + mGlueConfiguration.get(Property.CATALOG_ID) + " error: " + e.getMessage(), e);
  }
}
Aggregations