Use of io.trino.plugin.hive.metastore.SortingColumn in project trino by trinodb.
The class HiveMetadata, method getTableProperties.
@Override
public ConnectorTableProperties getTableProperties(ConnectorSession session, ConnectorTableHandle table)
{
    HiveTableHandle hiveTable = (HiveTableHandle) table;

    List<ColumnHandle> partitionColumns = ImmutableList.copyOf(hiveTable.getPartitionColumns());
    TupleDomain<ColumnHandle> predicate = TupleDomain.all();
    Optional<DiscretePredicates> discretePredicates = Optional.empty();

    // If the table is filtered on partition names, the partition list is incomplete,
    // so a predicate and discretePredicates computed from it would not be valid.
    if (hiveTable.getPartitionNames().isEmpty()) {
        Optional<List<HivePartition>> partitions = hiveTable.getPartitions().or(() -> {
            // We load the partitions to compute the predicates enforced by the table.
            // Note that the computation is not persisted in the table handle, so it can be redone many times.
            // TODO: https://github.com/trinodb/trino/issues/10980.
            HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, table, new Constraint(hiveTable.getEnforcedConstraint()));
            if (partitionManager.canPartitionsBeLoaded(partitionResult)) {
                return Optional.of(partitionManager.getPartitionsAsList(partitionResult));
            }
            return Optional.empty();
        });
        if (partitions.isPresent()) {
            List<HivePartition> hivePartitions = partitions.orElseThrow();
            // Since the partitions are fully loaded now, compute the predicate they enforce.
            predicate = createPredicate(partitionColumns, hivePartitions);
            // Discrete predicates are only meaningful when the table is partitioned.
            if (!partitionColumns.isEmpty()) {
                // Do not create tuple domains for every partition at the same time!
                // There can be a huge number of partitions, so use an Iterable so that
                // all domains do not need to be in memory at the same time.
                Iterable<TupleDomain<ColumnHandle>> partitionDomains = Iterables.transform(hivePartitions, hivePartition -> TupleDomain.fromFixedValues(hivePartition.getKeys()));
                discretePredicates = Optional.of(new DiscretePredicates(partitionColumns, partitionDomains));
            }
        }
    }

    Optional<ConnectorTablePartitioning> tablePartitioning = Optional.empty();
    List<LocalProperty<ColumnHandle>> sortingProperties = ImmutableList.of();
    if (hiveTable.getBucketHandle().isPresent()) {
        if (isPropagateTableScanSortingProperties(session) && !hiveTable.getBucketHandle().get().getSortedBy().isEmpty()) {
            // Populating SortingProperty guarantees to the engine that it is reading pre-sorted input.
            // We detect compatibility between table and partition level sorted_by properties
            // and fail the query in HiveSplitManager#getPartitionMetadata if there is a mismatch.
            // This can still lead to incorrect results if a sorted_by property is defined over unsorted files.
            Map<String, ColumnHandle> columnHandles = getColumnHandles(session, table);
            sortingProperties = hiveTable.getBucketHandle().get().getSortedBy().stream()
                    .map(sortingColumn -> new SortingProperty<>(
                            columnHandles.get(sortingColumn.getColumnName()),
                            sortingColumn.getOrder().getSortOrder()))
                    .collect(toImmutableList());
        }
        if (isBucketExecutionEnabled(session)) {
            tablePartitioning = hiveTable.getBucketHandle().map(bucketing -> new ConnectorTablePartitioning(
                    new HivePartitioningHandle(
                            bucketing.getBucketingVersion(),
                            bucketing.getReadBucketCount(),
                            bucketing.getColumns().stream()
                                    .map(HiveColumnHandle::getHiveType)
                                    .collect(toImmutableList()),
                            OptionalInt.empty(),
                            false),
                    bucketing.getColumns().stream()
                            .map(ColumnHandle.class::cast)
                            .collect(toImmutableList())));
        }
    }

    return new ConnectorTableProperties(predicate, tablePartitioning, Optional.empty(), discretePredicates, sortingProperties);
}
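
As used above, sortingColumn.getOrder().getSortOrder() translates the Hive sort direction into the engine's SortOrder. Below is a minimal sketch of that mapping, consistent with the ASC_NULLS_FIRST and DESC_NULLS_LAST values asserted by the tests further down this page; the exact field names and numeric codes are assumptions, not taken from this page.

// Sketch of SortingColumn.Order as implied by this page's assertions:
// ASCENDING maps to SortOrder.ASC_NULLS_FIRST, DESCENDING to SortOrder.DESC_NULLS_LAST.
public enum Order
{
    ASCENDING(SortOrder.ASC_NULLS_FIRST, 1),
    DESCENDING(SortOrder.DESC_NULLS_LAST, 0);

    private final SortOrder sortOrder;
    private final int hiveOrder; // numeric direction as stored by the metastore (assumption)

    Order(SortOrder sortOrder, int hiveOrder)
    {
        this.sortOrder = sortOrder;
        this.hiveOrder = hiveOrder;
    }

    public SortOrder getSortOrder()
    {
        return sortOrder;
    }

    public int getHiveOrder()
    {
        return hiveOrder;
    }
}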
Use of io.trino.plugin.hive.metastore.SortingColumn in project trino by trinodb.
The class HiveBucketProperty, method fromStorageDescriptor.
public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, String> tableParameters, StorageDescriptor storageDescriptor, String tablePartitionName)
{
    boolean bucketColsSet = storageDescriptor.isSetBucketCols() && !storageDescriptor.getBucketCols().isEmpty();
    boolean numBucketsSet = storageDescriptor.isSetNumBuckets() && storageDescriptor.getNumBuckets() > 0;
    if (!numBucketsSet) {
        // In Hive, a table is considered not bucketed when its bucketCols is set but its numBuckets is not.
        return Optional.empty();
    }
    if (!bucketColsSet) {
        throw new TrinoException(HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set: " + tablePartitionName);
    }
    List<SortingColumn> sortedBy = ImmutableList.of();
    if (storageDescriptor.isSetSortCols()) {
        sortedBy = storageDescriptor.getSortCols().stream()
                .map(order -> SortingColumn.fromMetastoreApiOrder(order, tablePartitionName))
                .collect(toImmutableList());
    }
    BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
    return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
}
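
A hypothetical usage sketch, not taken from the Trino sources: building a Thrift StorageDescriptor by hand and converting it. This assumes the Hive metastore Thrift classes org.apache.hadoop.hive.metastore.api.StorageDescriptor and Order, where new Order(column, 1) denotes an ascending sort column.

// Hypothetical example: a 4-bucket layout, bucketed and sorted by "id".
StorageDescriptor storageDescriptor = new StorageDescriptor();
storageDescriptor.setBucketCols(ImmutableList.of("id"));
storageDescriptor.setNumBuckets(4);
storageDescriptor.setSortCols(ImmutableList.of(new Order("id", 1)));

// With no table parameters, HiveBucketing.getBucketingVersion falls back to its default (assumption).
Optional<HiveBucketProperty> bucketProperty =
        HiveBucketProperty.fromStorageDescriptor(ImmutableMap.of(), storageDescriptor, "test_table");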
Use of io.trino.plugin.hive.metastore.SortingColumn in project trino by trinodb.
The class AbstractTestHive, method doTestBucketSortedTables.
private void doTestBucketSortedTables(SchemaTableName table)
        throws IOException
{
    int bucketCount = 3;
    int expectedRowCount = 0;

    try (Transaction transaction = newTransaction()) {
        ConnectorSession session = newSession();
        ConnectorMetadata metadata = transaction.getMetadata();

        // begin creating the table
        ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(
                table,
                ImmutableList.<ColumnMetadata>builder()
                        .add(new ColumnMetadata("id", VARCHAR))
                        .add(new ColumnMetadata("value_asc", VARCHAR))
                        .add(new ColumnMetadata("value_desc", BIGINT))
                        .add(new ColumnMetadata("ds", VARCHAR))
                        .build(),
                ImmutableMap.<String, Object>builder()
                        .put(STORAGE_FORMAT_PROPERTY, RCBINARY)
                        .put(PARTITIONED_BY_PROPERTY, ImmutableList.of("ds"))
                        .put(BUCKETED_BY_PROPERTY, ImmutableList.of("id"))
                        .put(BUCKET_COUNT_PROPERTY, bucketCount)
                        .put(SORTED_BY_PROPERTY, ImmutableList.builder()
                                .add(new SortingColumn("value_asc", ASCENDING))
                                .add(new SortingColumn("value_desc", DESCENDING))
                                .build())
                        .buildOrThrow());
        ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);

        // write the data
        ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle);
        List<Type> types = tableMetadata.getColumns().stream()
                .map(ColumnMetadata::getType)
                .collect(toList());
        ThreadLocalRandom random = ThreadLocalRandom.current();
        for (int i = 0; i < 50; i++) {
            MaterializedResult.Builder builder = MaterializedResult.resultBuilder(session, types);
            for (int j = 0; j < 1000; j++) {
                builder.row(
                        sha256().hashLong(random.nextLong()).toString(),
                        "test" + random.nextInt(100),
                        random.nextLong(100_000),
                        "2018-04-01");
                expectedRowCount++;
            }
            sink.appendPage(builder.build().toPage());
        }

        HdfsContext context = new HdfsContext(session);
        // verify we have enough temporary files per bucket to require multiple passes
        Path stagingPathRoot;
        if (isTemporaryStagingDirectoryEnabled(session)) {
            stagingPathRoot = new Path(getTemporaryStagingDirectoryPath(session)
                    .replace("${USER}", context.getIdentity().getUser()));
        }
        else {
            stagingPathRoot = getStagingPathRoot(outputHandle);
        }
        assertThat(listAllDataFiles(context, stagingPathRoot))
                .filteredOn(file -> file.contains(".tmp-sort."))
                .size().isGreaterThan(bucketCount * getHiveConfig().getMaxOpenSortFiles() * 2);

        // finish the write
        Collection<Slice> fragments = getFutureValue(sink.finish());

        // verify there are no temporary files
        for (String file : listAllDataFiles(context, stagingPathRoot)) {
            assertThat(file).doesNotContain(".tmp-sort.");
        }

        // finish creating the table
        metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of());
        transaction.commit();
    }

    // verify that the bucket files are sorted
    try (Transaction transaction = newTransaction()) {
        ConnectorMetadata metadata = transaction.getMetadata();
        ConnectorSession session = newSession();
        metadata.beginQuery(session);
        ConnectorTableHandle tableHandle = getTableHandle(metadata, table);
        List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());

        // verify local sorting property
        ConnectorTableProperties properties = metadata.getTableProperties(
                newSession(ImmutableMap.of(
                        "propagate_table_scan_sorting_properties", true,
                        "bucket_execution_enabled", false)),
                tableHandle);
        Map<String, Integer> columnIndex = indexColumns(columnHandles);
        assertEquals(properties.getLocalProperties(), ImmutableList.of(
                new SortingProperty<>(columnHandles.get(columnIndex.get("value_asc")), ASC_NULLS_FIRST),
                new SortingProperty<>(columnHandles.get(columnIndex.get("value_desc")), DESC_NULLS_LAST)));
        assertThat(metadata.getTableProperties(newSession(), tableHandle).getLocalProperties()).isEmpty();

        List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
        assertThat(splits).hasSize(bucketCount);

        int actualRowCount = 0;
        for (ConnectorSplit split : splits) {
            try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
                String lastValueAsc = null;
                long lastValueDesc = -1;
                while (!pageSource.isFinished()) {
                    Page page = pageSource.getNextPage();
                    if (page == null) {
                        continue;
                    }
                    for (int i = 0; i < page.getPositionCount(); i++) {
                        Block blockAsc = page.getBlock(1);
                        Block blockDesc = page.getBlock(2);
                        assertFalse(blockAsc.isNull(i));
                        assertFalse(blockDesc.isNull(i));

                        String valueAsc = VARCHAR.getSlice(blockAsc, i).toStringUtf8();
                        if (lastValueAsc != null) {
                            assertGreaterThanOrEqual(valueAsc, lastValueAsc);
                            if (valueAsc.equals(lastValueAsc)) {
                                long valueDesc = BIGINT.getLong(blockDesc, i);
                                if (lastValueDesc != -1) {
                                    assertLessThanOrEqual(valueDesc, lastValueDesc);
                                }
                                lastValueDesc = valueDesc;
                            }
                            else {
                                lastValueDesc = -1;
                            }
                        }
                        lastValueAsc = valueAsc;
                        actualRowCount++;
                    }
                }
            }
        }
        assertThat(actualRowCount).isEqualTo(expectedRowCount);
    }
}
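
The indexColumns helper used above is not shown on this page. A minimal sketch of what the assertions require from it, assuming the handles are HiveColumnHandle instances exposing getName():

// Minimal sketch of indexColumns: map each column name to its position in the
// handle list. The cast to HiveColumnHandle is an assumption about the test setup.
private static Map<String, Integer> indexColumns(List<ColumnHandle> columnHandles)
{
    ImmutableMap.Builder<String, Integer> index = ImmutableMap.builder();
    int position = 0;
    for (ColumnHandle columnHandle : columnHandles) {
        index.put(((HiveColumnHandle) columnHandle).getName(), position);
        position++;
    }
    return index.buildOrThrow();
}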
Use of io.trino.plugin.hive.metastore.SortingColumn in project trino by trinodb.
The class AbstractTestHiveLocal, method doTestSparkBucketedTableValidation.
private void doTestSparkBucketedTableValidation(SchemaTableName tableName)
        throws Exception
{
    java.nio.file.Path externalLocation = copyResourceDirToTemporaryDirectory("spark_bucketed_nation");
    try {
        createExternalTable(
                tableName,
                ORC,
                ImmutableList.of(
                        new Column("nationkey", HIVE_INT, Optional.empty()),
                        new Column("name", HIVE_STRING, Optional.empty()),
                        new Column("regionkey", HIVE_INT, Optional.empty()),
                        new Column("comment", HIVE_STRING, Optional.empty())),
                ImmutableList.of(),
                Optional.of(new HiveBucketProperty(
                        ImmutableList.of("nationkey"),
                        BUCKETING_V1,
                        3,
                        ImmutableList.of(new SortingColumn("name", SortingColumn.Order.ASCENDING)))),
                new Path(URI.create("file://" + externalLocation.toString())));

        // Until the table is marked as Spark-created, bucket validation rejects the files.
        assertReadFailsWithMessageMatching(ORC, tableName, "Hive table is corrupt\\. File '.*/.*' is for bucket [0-2], but contains a row for bucket [0-2].");
        markTableAsCreatedBySpark(tableName, "orc");
        assertReadReturnsRowCount(ORC, tableName, 25);
    }
    finally {
        deleteRecursively(externalLocation, RecursiveDeleteOption.ALLOW_INSECURE);
    }
}
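
markTableAsCreatedBySpark is another helper that is not shown on this page. A plausible sketch of it, assuming Trino detects Spark-written tables via the spark.sql.sources.provider table parameter and that the test's metastore exposes replaceTable; both the parameter key and the exact calls are assumptions, not taken from this page.

// Plausible sketch: flag the table as Spark-created so that Trino applies
// Spark's bucketing semantics when validating the bucket files.
private void markTableAsCreatedBySpark(SchemaTableName tableName, String provider)
{
    try (Transaction transaction = newTransaction()) {
        ConnectorSession session = newSession();
        PrincipalPrivileges principalPrivileges = testingPrincipalPrivilege(session);
        Table oldTable = transaction.getMetastore()
                .getTable(tableName.getSchemaName(), tableName.getTableName())
                .orElseThrow();
        Table newTable = Table.builder(oldTable)
                .setParameter("spark.sql.sources.provider", provider) // key is an assumption
                .build();
        transaction.getMetastore()
                .replaceTable(tableName.getSchemaName(), tableName.getTableName(), newTable, principalPrivileges);
        transaction.commit();
    }
}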
Use of io.trino.plugin.hive.metastore.SortingColumn in project trino by trinodb.
The class AbstractTestHive, method doTestBucketedSortedTableEvolution.
private void doTestBucketedSortedTableEvolution(SchemaTableName tableName)
        throws Exception
{
    int rowCount = 100;

    // Create a table and populate it with 3 partitions with different sort orders but the same bucketing
    createEmptyTable(
            tableName,
            ORC,
            ImmutableList.of(new Column("id", HIVE_LONG, Optional.empty()), new Column("name", HIVE_STRING, Optional.empty())),
            ImmutableList.of(new Column("pk", HIVE_STRING, Optional.empty())),
            Optional.of(new HiveBucketProperty(
                    ImmutableList.of("id"),
                    BUCKETING_V1,
                    4,
                    ImmutableList.of(new SortingColumn("id", ASCENDING), new SortingColumn("name", ASCENDING)))));

    // write a 4-bucket partition sorted by id, name
    MaterializedResult.Builder sortedByIdNameBuilder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR);
    IntStream.range(0, rowCount).forEach(i -> sortedByIdNameBuilder.row((long) i, String.valueOf(i), "sorted_by_id_name"));
    insertData(tableName, sortedByIdNameBuilder.build());

    // write a 4-bucket partition sorted by name
    alterBucketProperty(tableName, Optional.of(new HiveBucketProperty(
            ImmutableList.of("id"), BUCKETING_V1, 4, ImmutableList.of(new SortingColumn("name", ASCENDING)))));
    MaterializedResult.Builder sortedByNameBuilder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR);
    IntStream.range(0, rowCount).forEach(i -> sortedByNameBuilder.row((long) i, String.valueOf(i), "sorted_by_name"));
    insertData(tableName, sortedByNameBuilder.build());

    // write a 4-bucket partition sorted by id
    alterBucketProperty(tableName, Optional.of(new HiveBucketProperty(
            ImmutableList.of("id"), BUCKETING_V1, 4, ImmutableList.of(new SortingColumn("id", ASCENDING)))));
    MaterializedResult.Builder sortedByIdBuilder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR);
    IntStream.range(0, rowCount).forEach(i -> sortedByIdBuilder.row((long) i, String.valueOf(i), "sorted_by_id"));
    insertData(tableName, sortedByIdBuilder.build());

    ConnectorTableHandle tableHandle;
    try (Transaction transaction = newTransaction()) {
        ConnectorMetadata metadata = transaction.getMetadata();
        ConnectorSession session = newSession();
        metadata.beginQuery(session);
        tableHandle = getTableHandle(metadata, tableName);

        // read the entire table
        List<ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle).values().stream().collect(toImmutableList());
        MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty());
        assertEquals(result.getRowCount(), 300);
    }

    try (Transaction transaction = newTransaction()) {
        ConnectorMetadata metadata = transaction.getMetadata();
        ConnectorSession session = newSession(ImmutableMap.of("propagate_table_scan_sorting_properties", true));
        metadata.beginQuery(session);
        Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle);

        // verify the local sorting property
        ConnectorTableProperties properties = metadata.getTableProperties(session, tableHandle);
        assertEquals(properties.getLocalProperties(), ImmutableList.of(new SortingProperty<>(columnHandles.get("id"), ASC_NULLS_FIRST)));

        // reading the entire table should fail with an exception
        assertThatThrownBy(() -> readTable(transaction, tableHandle, ImmutableList.copyOf(columnHandles.values()), session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()))
                .isInstanceOf(TrinoException.class)
                .hasMessage("Hive table (%s) sorting by [id] is not compatible with partition (pk=sorted_by_name) sorting by [name]." +
                        " This restriction can be avoided by disabling propagate_table_scan_sorting_properties.", tableName);

        // read only the partitions whose sorting is compatible with the table sorting
        MaterializedResult result = readTable(
                transaction,
                tableHandle,
                ImmutableList.copyOf(columnHandles.values()),
                session,
                TupleDomain.withColumnDomains(ImmutableMap.of(
                        columnHandles.get("pk"),
                        Domain.create(ValueSet.of(VARCHAR, utf8Slice("sorted_by_id_name"), utf8Slice("sorted_by_id")), false))),
                OptionalInt.empty(),
                Optional.empty());
        assertEquals(result.getRowCount(), 200);
    }
}
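
The compatibility rule this test exercises can be stated simply: a partition's sort order is acceptable when the table's sorted_by columns form a prefix of the partition's. A minimal sketch of that rule follows; the helper name and signature are hypothetical, and the actual check lives in HiveSplitManager#getPartitionMetadata.

// Hypothetical illustration of the rule implied by the assertions above: the
// table's sort columns must match a prefix of the partition's sort columns,
// in both order and direction.
static boolean isSortingCompatible(List<SortingColumn> tableSortedBy, List<SortingColumn> partitionSortedBy)
{
    if (tableSortedBy.size() > partitionSortedBy.size()) {
        return false;
    }
    return tableSortedBy.equals(partitionSortedBy.subList(0, tableSortedBy.size()));
}

Under this rule, the partitions sorted by [id, name] and [id] are compatible with the table's [id] sorting, while the partition sorted by [name] is not, which matches the 200 rows read at the end of the test.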