use of io.prestosql.spi.connector.ConnectorPageSource in project hetu-core by openlookeng.
the class TestOrcAcidPageSource method readFile.
private static List<Nation> readFile(Map<NationColumn, Integer> columns, TupleDomain<HiveColumnHandle> tupleDomain, Optional<DeleteDeltaLocations> deleteDeltaLocations) {
List<HiveColumnHandle> columnHandles = columns.entrySet().stream().map(column -> toHiveColumnHandle(column.getKey(), column.getValue())).collect(toImmutableList());
List<String> columnNames = columnHandles.stream().map(HiveColumnHandle::getName).collect(toImmutableList());
// This file contains the TPC-H nation table with each row repeated 1000 times
File nationFileWithReplicatedRows = new File(TestOrcAcidPageSource.class.getClassLoader().getResource("nationFile25kRowsSortedOnNationKey/bucket_00000").getPath());
ConnectorPageSource pageSource = PAGE_SOURCE_FACTORY.createPageSource(new JobConf(new Configuration(false)), HiveTestUtils.SESSION, new Path(nationFileWithReplicatedRows.getAbsoluteFile().toURI()), 0, nationFileWithReplicatedRows.length(), nationFileWithReplicatedRows.length(), createSchema(), columnHandles, tupleDomain, Optional.empty(), deleteDeltaLocations, Optional.empty(), Optional.empty(), null, false, -1L).get();
int nationKeyColumn = columnNames.indexOf("n_nationkey");
int nameColumn = columnNames.indexOf("n_name");
int regionKeyColumn = columnNames.indexOf("n_regionkey");
int commentColumn = columnNames.indexOf("n_comment");
ImmutableList.Builder<Nation> rows = ImmutableList.builder();
while (!pageSource.isFinished()) {
Page page = pageSource.getNextPage();
if (page == null) {
continue;
}
page = page.getLoadedPage();
for (int position = 0; position < page.getPositionCount(); position++) {
long nationKey = -42;
if (nationKeyColumn >= 0) {
nationKey = BIGINT.getLong(page.getBlock(nationKeyColumn), position);
}
String name = "<not read>";
if (nameColumn >= 0) {
name = VARCHAR.getSlice(page.getBlock(nameColumn), position).toStringUtf8();
}
long regionKey = -42;
if (regionKeyColumn >= 0) {
regionKey = BIGINT.getLong(page.getBlock(regionKeyColumn), position);
}
String comment = "<not read>";
if (commentColumn >= 0) {
comment = VARCHAR.getSlice(page.getBlock(commentColumn), position).toStringUtf8();
}
rows.add(new Nation(position, nationKey, name, regionKey, comment));
}
}
return rows.build();
}
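The loop above is the standard way to drain a ConnectorPageSource: poll getNextPage() until isFinished() returns true, skip null pages (the source may not have data ready yet), and call getLoadedPage() to force lazy blocks before decoding values. The helper below is a minimal, reusable sketch of that pattern; the class and method names are illustrative and not part of the test.

import io.prestosql.spi.Page;
import io.prestosql.spi.connector.ConnectorPageSource;

final class PageSourceDrainer {
    private PageSourceDrainer() {}

    // Illustrative helper mirroring the read loop in readFile: returns the
    // total number of positions produced by the page source.
    static long drainToRowCount(ConnectorPageSource pageSource) {
        long rows = 0;
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage();
            if (page == null) {
                // no page ready yet; keep polling
                continue;
            }
            // force lazy blocks to load before the values are read
            page = page.getLoadedPage();
            rows += page.getPositionCount();
        }
        return rows;
    }
}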
use of io.prestosql.spi.connector.ConnectorPageSource in project hetu-core by openlookeng.
the class HivePageSourceProvider method createPageSourceInternal.
private ConnectorPageSource createPageSourceInternal(ConnectorSession session, Optional<DynamicFilterSupplier> dynamicFilterSupplier, List<Map<ColumnHandle, DynamicFilter>> dynamicFilters, HiveTableHandle hiveTable, List<HiveColumnHandle> hiveColumns, HiveSplit hiveSplit) {
Path path = new Path(hiveSplit.getPath());
List<Set<DynamicFilter>> dynamicFilterList = new ArrayList<>();
if (dynamicFilters != null) {
for (Map<ColumnHandle, DynamicFilter> df : dynamicFilters) {
Set<DynamicFilter> values = df.values().stream().collect(Collectors.toSet());
dynamicFilterList.add(values);
}
}
// Filter out splits using partition values and dynamic filters
if (dynamicFilters != null && !dynamicFilters.isEmpty() && isPartitionFiltered(hiveSplit.getPartitionKeys(), dynamicFilterList, typeManager)) {
return new FixedPageSource(ImmutableList.of());
}
Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session, hiveSplit.getDatabase(), hiveSplit.getTable()), path);
Properties schema = hiveSplit.getSchema();
String columnNameDelimiter = schema.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? schema.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
List<String> partitionColumnNames;
if (schema.containsKey(META_PARTITION_COLUMNS)) {
partitionColumnNames = Arrays.asList(schema.getProperty(META_PARTITION_COLUMNS).split(columnNameDelimiter));
} else if (schema.containsKey(META_TABLE_COLUMNS)) {
partitionColumnNames = Arrays.asList(schema.getProperty(META_TABLE_COLUMNS).split(columnNameDelimiter));
} else {
partitionColumnNames = new ArrayList<>();
}
List<String> tableColumns = hiveColumns.stream().map(cols -> cols.getName()).collect(toList());
List<String> missingColumns = tableColumns.stream().skip(partitionColumnNames.size()).collect(toList());
List<IndexMetadata> indexes = new ArrayList<>();
if (indexCache != null && session.isHeuristicIndexFilterEnabled()) {
indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, hiveTable.getCompactEffectivePredicate(), hiveTable.getPartitionColumns()));
/* Bloom/Bitmap indices are checked for given table and added to the possible matchers for pushdown. */
if (hiveTable.getDisjunctCompactEffectivePredicate().isPresent() && hiveTable.getDisjunctCompactEffectivePredicate().get().size() > 0) {
hiveTable.getDisjunctCompactEffectivePredicate().get().forEach(orPredicate -> indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, orPredicate, hiveTable.getPartitionColumns())));
}
}
Optional<List<IndexMetadata>> indexOptional = indexes == null || indexes.isEmpty() ? Optional.empty() : Optional.of(indexes);
URI splitUri = URI.create(URIUtil.encodePath(hiveSplit.getPath()));
SplitMetadata splitMetadata = new SplitMetadata(splitUri.getRawPath(), hiveSplit.getLastModifiedTime());
TupleDomain<HiveColumnHandle> predicate = TupleDomain.all();
if (dynamicFilterSupplier.isPresent() && dynamicFilters != null && !dynamicFilters.isEmpty()) {
if (dynamicFilters.size() == 1) {
List<HiveColumnHandle> filteredHiveColumnHandles = hiveColumns.stream().filter(column -> dynamicFilters.get(0).containsKey(column)).collect(toList());
HiveColumnHandle hiveColumnHandle = filteredHiveColumnHandles.get(0);
Type type = hiveColumnHandle.getColumnMetadata(typeManager).getType();
predicate = getPredicate(dynamicFilters.get(0).get(hiveColumnHandle), type, hiveColumnHandle);
if (predicate.isNone()) {
predicate = TupleDomain.all();
}
}
}
/**
* This is the main logical division point for the filter-pushdown-enabled case (also known as the selective read flow).
* If the user configuration orc_predicate_pushdown_enabled is true and every clause of the query can be handled by the Hive
* selective read flow, then hiveTable.isSuitableToPush() returns true.
* (Refer to HiveMetadata.checkIfSuitableToPush.)
*/
if (hiveTable.isSuitableToPush()) {
return createSelectivePageSource(selectivePageSourceFactories, configuration, session, hiveSplit, assignUniqueIndicesToPartitionColumns(hiveColumns), typeManager, dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, hiveSplit.isCacheable(), hiveTable.getCompactEffectivePredicate(), hiveTable.getPredicateColumns(), hiveTable.getDisjunctCompactEffectivePredicate(), hiveSplit.getBucketConversion(), hiveSplit.getBucketNumber(), hiveSplit.getLastModifiedTime(), missingColumns);
}
Optional<ConnectorPageSource> pageSource = createHivePageSource(cursorProviders, pageSourceFactories, configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(predicate), hiveColumns, hiveSplit.getPartitionKeys(), typeManager, hiveSplit.getColumnCoercions(), hiveSplit.getBucketConversion(), hiveSplit.isS3SelectPushdownEnabled(), dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, splitMetadata, hiveSplit.isCacheable(), hiveSplit.getLastModifiedTime(), hiveSplit.getCustomSplitInfo(), missingColumns);
if (pageSource.isPresent()) {
return pageSource.get();
}
throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
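When the dynamic filters prove that a split's partition cannot produce any matching rows, the method above short-circuits with an empty FixedPageSource instead of opening the file at all. The sketch below isolates that short-circuit; the boolean parameter and the Supplier stand in for isPartitionFiltered(...) and the real reader construction, so treat the names as illustrative.

import com.google.common.collect.ImmutableList;
import io.prestosql.spi.connector.ConnectorPageSource;
import io.prestosql.spi.connector.FixedPageSource;

import java.util.function.Supplier;

final class SplitPruning {
    private SplitPruning() {}

    // Sketch of the early return used above: a pruned split yields an empty,
    // already-finished page source and the underlying file is never opened.
    static ConnectorPageSource pruneOrRead(boolean partitionFiltered, Supplier<ConnectorPageSource> realReader) {
        if (partitionFiltered) {
            return new FixedPageSource(ImmutableList.of());
        }
        return realReader.get();
    }
}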
use of io.prestosql.spi.connector.ConnectorPageSource in project hetu-core by openlookeng.
the class HivePageSourceProvider method createSelectivePageSource.
/**
* Create the selective page source, which is used for the selective reader flow.
* Unlike a normal page source, a selective page source must pass the following additional details to the reader:
* a. Pre-filled values of all constant columns.
* b. Coercion information for all columns.
* c. Columns that need to be projected.
* d. The total list of columns to be read (projection + filter).
* All of this information is used by the reader.
* @param columns list of all columns that are part of the scan
* @param effectivePredicate predicates related to the AND clause
* @param predicateColumns map of all column handles that are part of the predicate
* @param additionPredicates predicates related to the OR clause;
* the remaining parameters are the same as for createHivePageSource
* @param missingColumns
* @return
*/
private static ConnectorPageSource createSelectivePageSource(Set<HiveSelectivePageSourceFactory> selectivePageSourceFactories, Configuration configuration, ConnectorSession session, HiveSplit split, List<HiveColumnHandle> columns, TypeManager typeManager, Optional<DynamicFilterSupplier> dynamicFilterSupplier, Optional<DeleteDeltaLocations> deleteDeltaLocations, Optional<Long> startRowOffsetOfFile, Optional<List<IndexMetadata>> indexes, boolean splitCacheable, TupleDomain<HiveColumnHandle> effectivePredicate, Map<String, HiveColumnHandle> predicateColumns, Optional<List<TupleDomain<HiveColumnHandle>>> additionPredicates, Optional<HiveSplit.BucketConversion> bucketConversion, OptionalInt bucketNumber, long dataSourceLastModifiedTime, List<String> missingColumns) {
Set<HiveColumnHandle> interimColumns = ImmutableSet.<HiveColumnHandle>builder().addAll(predicateColumns.values()).addAll(bucketConversion.map(HiveSplit.BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of())).build();
Path path = new Path(split.getPath());
List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(split.getPartitionKeys(), columns, ImmutableList.copyOf(interimColumns), split.getColumnCoercions(), path, bucketNumber, true, missingColumns);
List<ColumnMapping> regularAndInterimColumnMappings = ColumnMapping.extractRegularAndInterimColumnMappings(columnMappings);
Optional<BucketAdaptation> bucketAdaptation = toBucketAdaptation(bucketConversion, regularAndInterimColumnMappings, bucketNumber);
checkArgument(!bucketAdaptation.isPresent(), "Bucket conversion is not yet supported");
// Make a list of all PREFILLED columns, which can be passed to the reader. Unlike the normal flow, the selective read
// flow requires passing this down to the reader level because blocks must be built for all column values.
Map<Integer, String> prefilledValues = columnMappings.stream().filter(mapping -> mapping.getKind() == ColumnMappingKind.PREFILLED).collect(toImmutableMap(mapping -> mapping.getHiveColumnHandle().getHiveColumnIndex(), ColumnMapping::getPrefilledValue));
// Make a map of the columns that need to be coerced. This also has to be passed down to the reader level because
// coercion must be applied before values are added to the block.
Map<Integer, HiveCoercer> coercers = columnMappings.stream().filter(mapping -> mapping.getCoercionFrom().isPresent()).collect(toImmutableMap(mapping -> mapping.getHiveColumnHandle().getHiveColumnIndex(), mapping -> createCoercer(typeManager, mapping.getCoercionFrom().get(), mapping.getHiveColumnHandle().getHiveType())));
List<Integer> outputColumns = columns.stream().map(HiveColumnHandle::getHiveColumnIndex).collect(toImmutableList());
for (HiveSelectivePageSourceFactory pageSourceFactory : selectivePageSourceFactories) {
Optional<? extends ConnectorPageSource> pageSource = pageSourceFactory.createPageSource(configuration, session, path, split.getStart(), split.getLength(), split.getFileSize(), split.getSchema(), toColumnHandles(columnMappings, true), prefilledValues, outputColumns, effectivePredicate, additionPredicates, deleteDeltaLocations, startRowOffsetOfFile, indexes, splitCacheable, columnMappings, coercers, dataSourceLastModifiedTime);
if (pageSource.isPresent()) {
return new HivePageSource(columnMappings, Optional.empty(), typeManager, pageSource.get(), dynamicFilterSupplier, session, split.getPartitionKeys());
}
}
throw new IllegalStateException("Could not find a file reader for split " + split);
}
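The selective flow never hands the factory's reader directly back to the engine; it wraps it in a HivePageSource so that prefilled partition values and coercions can be applied to each page. The class below sketches the wrap-a-delegate idea as a plain pass-through, assuming the io.prestosql ConnectorPageSource interface (getCompletedBytes, getReadTimeNanos, isFinished, getNextPage, getSystemMemoryUsage, close); the real HivePageSource rewrites the pages rather than forwarding them unchanged.

import io.prestosql.spi.Page;
import io.prestosql.spi.connector.ConnectorPageSource;

import java.io.IOException;

// Minimal delegating page source; a real wrapper such as HivePageSource would
// transform the returned pages (coercion, prefilled blocks) instead of
// passing them through untouched.
class DelegatingPageSource
        implements ConnectorPageSource {
    private final ConnectorPageSource delegate;

    DelegatingPageSource(ConnectorPageSource delegate) {
        this.delegate = delegate;
    }

    @Override
    public long getCompletedBytes() {
        return delegate.getCompletedBytes();
    }

    @Override
    public long getReadTimeNanos() {
        return delegate.getReadTimeNanos();
    }

    @Override
    public boolean isFinished() {
        return delegate.isFinished();
    }

    @Override
    public Page getNextPage() {
        // a real wrapper would rewrite the page here before returning it
        return delegate.getNextPage();
    }

    @Override
    public long getSystemMemoryUsage() {
        return delegate.getSystemMemoryUsage();
    }

    @Override
    public void close() throws IOException {
        delegate.close();
    }
}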
use of io.prestosql.spi.connector.ConnectorPageSource in project boostkit-bigdata by kunpengcompute.
the class AbstractTestHive method testGetRecordsUnpartitioned.
@Test
public void testGetRecordsUnpartitioned() throws Exception {
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
metadata.beginQuery(session);
ConnectorTableHandle tableHandle = getTableHandle(metadata, tableUnpartitioned);
List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());
Map<String, Integer> columnIndex = indexColumns(columnHandles);
List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
assertThat(splits).hasSameSizeAs(tableUnpartitionedPartitions);
for (ConnectorSplit split : splits) {
HiveSplit hiveSplit = HiveSplitWrapper.getOnlyHiveSplit(split);
assertEquals(hiveSplit.getPartitionKeys(), ImmutableList.of());
long rowNumber = 0;
try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) {
assertPageSourceType(pageSource, TEXTFILE);
MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles));
for (MaterializedRow row : result) {
rowNumber++;
if (rowNumber % 19 == 0) {
assertNull(row.getField(columnIndex.get("t_string")));
} else if (rowNumber % 19 == 1) {
assertEquals(row.getField(columnIndex.get("t_string")), "");
} else {
assertEquals(row.getField(columnIndex.get("t_string")), "unpartitioned");
}
assertEquals(row.getField(columnIndex.get("t_tinyint")), (byte) (1 + rowNumber));
}
}
assertEquals(rowNumber, 100);
}
}
}
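The indexColumns helper used above is defined in the test base class and simply maps each column name to its ordinal position in the column-handle list, so that row.getField(columnIndex.get("t_string")) reads the right field of a MaterializedRow. A possible implementation is sketched below, assuming the handles are HiveColumnHandles; the real helper may differ.

import com.google.common.collect.ImmutableMap;
import io.prestosql.plugin.hive.HiveColumnHandle;
import io.prestosql.spi.connector.ColumnHandle;

import java.util.List;
import java.util.Map;

final class ColumnIndexing {
    private ColumnIndexing() {}

    // Maps each column name to its position in the handle list, which matches
    // the field order of the materialized rows. Sketch only.
    static Map<String, Integer> indexColumns(List<ColumnHandle> columnHandles) {
        ImmutableMap.Builder<String, Integer> index = ImmutableMap.builder();
        int position = 0;
        for (ColumnHandle columnHandle : columnHandles) {
            index.put(((HiveColumnHandle) columnHandle).getName(), position);
            position++;
        }
        return index.build();
    }
}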
use of io.prestosql.spi.connector.ConnectorPageSource in project boostkit-bigdata by kunpengcompute.
the class AbstractTestHiveFileSystem method createTable.
private void createTable(SchemaTableName tableName, HiveStorageFormat storageFormat) throws Exception {
List<ColumnMetadata> columns = ImmutableList.<ColumnMetadata>builder().add(new ColumnMetadata("id", BIGINT)).build();
MaterializedResult data = MaterializedResult.resultBuilder(newSession(), BIGINT).row(1L).row(3L).row(2L).build();
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
// begin creating the table
ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(storageFormat));
ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty());
// write the records
ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle);
sink.appendPage(data.toPage());
Collection<Slice> fragments = getFutureValue(sink.finish());
// commit the table
metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of());
transaction.commit();
// Hack to work around the metastore not being configured for S3 or other file systems.
// The metastore tries to validate the location when creating the
// table, which fails without explicit file system configuration.
// We work around that by using a dummy location when creating the
// table and updating it here to the correct location.
metastoreClient.updateTableLocation(database, tableName.getTableName(), locationService.getTableWriteInfo(((HiveOutputTableHandle) outputHandle).getLocationHandle(), false).getTargetPath().toString());
}
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
// load the new table
ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName);
List<ColumnHandle> columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values());
// verify the metadata
ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName));
assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), columns);
// verify the data
ConnectorSplitSource splitSource = splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING);
ConnectorSplit split = getOnlyElement(getAllSplits(splitSource));
try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) {
MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles));
assertEqualsIgnoreOrder(result.getMaterializedRows(), data.getMaterializedRows());
}
}
}
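The write path above relies on MaterializedResult.toPage() to turn the three test rows into a Page before handing it to the ConnectorPageSink. Building an equivalent single-column BIGINT page by hand looks roughly like the sketch below, which uses the standard type and block-builder API rather than the test's actual code; the helper name is illustrative. For example, bigintPage(1, 3, 2) could then be passed to sink.appendPage(...).

import io.prestosql.spi.Page;
import io.prestosql.spi.block.BlockBuilder;

import static io.prestosql.spi.type.BigintType.BIGINT;

final class SingleColumnPages {
    private SingleColumnPages() {}

    // Builds a one-column BIGINT page containing the given values, comparable
    // to what MaterializedResult.toPage() produces for the "id" column above.
    static Page bigintPage(long... values) {
        BlockBuilder blockBuilder = BIGINT.createBlockBuilder(null, values.length);
        for (long value : values) {
            BIGINT.writeLong(blockBuilder, value);
        }
        return new Page(blockBuilder.build());
    }
}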