Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.
From the class AbstractTestHive, method testGetRecords:
@Test
public void testGetRecords() throws Exception {
    try (Transaction transaction = newTransaction()) {
        ConnectorMetadata metadata = transaction.getMetadata();
        ConnectorSession session = newSession();
        metadata.beginQuery(session);
        ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat);
        ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle);
        List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());
        Map<String, Integer> columnIndex = indexColumns(columnHandles);
        List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
        assertEquals(splits.size(), tablePartitionFormatPartitions.size());
        for (ConnectorSplit split : splits) {
            HiveSplit hiveSplit = (HiveSplit) split;
            List<HivePartitionKey> partitionKeys = hiveSplit.getPartitionKeys();
            String ds = partitionKeys.get(0).getValue();
            String fileFormat = partitionKeys.get(1).getValue();
            HiveStorageFormat fileType = HiveStorageFormat.valueOf(fileFormat.toUpperCase(ENGLISH));
            int dummyPartition = Integer.parseInt(partitionKeys.get(2).getValue());
            long rowNumber = 0;
            long completedBytes = 0;
            try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, hiveSplit, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
                MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles));
                assertPageSourceType(pageSource, fileType);
                for (MaterializedRow row : result) {
                    try {
                        assertValueTypes(row, tableMetadata.getColumns());
                    }
                    catch (RuntimeException e) {
                        throw new RuntimeException("row " + rowNumber, e);
                    }
                    rowNumber++;
                    Object value;
                    value = row.getField(columnIndex.get("t_string"));
                    if (rowNumber % 19 == 0) {
                        assertNull(value);
                    }
                    else if (rowNumber % 19 == 1) {
                        assertEquals(value, "");
                    }
                    else {
                        assertEquals(value, "test");
                    }
                    assertEquals(row.getField(columnIndex.get("t_tinyint")), (byte) (1 + rowNumber));
                    assertEquals(row.getField(columnIndex.get("t_smallint")), (short) (2 + rowNumber));
                    assertEquals(row.getField(columnIndex.get("t_int")), 3 + (int) rowNumber);
                    if (rowNumber % 13 == 0) {
                        assertNull(row.getField(columnIndex.get("t_bigint")));
                    }
                    else {
                        assertEquals(row.getField(columnIndex.get("t_bigint")), 4 + rowNumber);
                    }
                    assertEquals((Float) row.getField(columnIndex.get("t_float")), 5.1f + rowNumber, 0.001);
                    assertEquals(row.getField(columnIndex.get("t_double")), 6.2 + rowNumber);
                    if (rowNumber % 3 == 2) {
                        assertNull(row.getField(columnIndex.get("t_boolean")));
                    }
                    else {
                        assertEquals(row.getField(columnIndex.get("t_boolean")), rowNumber % 3 != 0);
                    }
                    assertEquals(row.getField(columnIndex.get("ds")), ds);
                    assertEquals(row.getField(columnIndex.get("file_format")), fileFormat);
                    assertEquals(row.getField(columnIndex.get("dummy")), dummyPartition);
                    long newCompletedBytes = pageSource.getCompletedBytes();
                    assertTrue(newCompletedBytes >= completedBytes);
                    assertTrue(newCompletedBytes <= hiveSplit.getLength());
                    completedBytes = newCompletedBytes;
                }
                assertTrue(completedBytes <= hiveSplit.getLength());
                assertEquals(rowNumber, 100);
            }
        }
    }
}
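The test reads row fields by column name through an indexColumns helper that is not shown above. A minimal sketch of what such a helper could look like, assuming each handle is a HiveColumnHandle exposing getName() (the cast and builder choice are assumptions, not the project's exact code):

// Minimal sketch (assumption): map each column name to its position in the
// handle list, so MaterializedRow fields can be looked up by name.
private static Map<String, Integer> indexColumns(List<ColumnHandle> columnHandles) {
    ImmutableMap.Builder<String, Integer> index = ImmutableMap.builder();
    int i = 0;
    for (ColumnHandle columnHandle : columnHandles) {
        HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) columnHandle;
        index.put(hiveColumnHandle.getName(), i);
        i++;
    }
    return index.build();
}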
Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.
From the class AbstractTestHive, method assertTableIsBucketed:
private void assertTableIsBucketed(ConnectorTableHandle tableHandle, Transaction transaction, ConnectorSession session) {
    // the bucketed test tables should have ~32 splits
    List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
    assertThat(splits.size()).as("splits.size()").isBetween(31, 32);

    // verify all paths are unique
    Set<String> paths = new HashSet<>();
    for (ConnectorSplit split : splits) {
        assertTrue(paths.add(((HiveSplit) split).getPath()));
    }
}
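Both Hive examples above rely on a getAllSplits helper that is not shown. A hedged sketch of one way to implement it, assuming a splitManager field holding the connector's ConnectorSplitManager and the six-argument getSplits overload used by the Delta Lake example further down:

// Hedged sketch, not the project's exact helper: obtain a split source for the
// table and drain every batch it produces.
private List<ConnectorSplit> getAllSplits(ConnectorTableHandle tableHandle, Transaction transaction, ConnectorSession session) {
    try (ConnectorSplitSource splitSource = splitManager.getSplits(
            transaction.getTransactionHandle(),
            session,
            tableHandle,
            ConnectorSplitManager.SplitSchedulingStrategy.UNGROUPED_SCHEDULING,
            DynamicFilter.EMPTY,
            Constraint.alwaysTrue())) {
        ImmutableList.Builder<ConnectorSplit> splits = ImmutableList.builder();
        while (!splitSource.isFinished()) {
            splits.addAll(splitSource.getNextBatch(NOT_PARTITIONED, 1000).join().getSplits());
        }
        return splits.build();
    }
}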
Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.
From the class TestHiveSplitSource, method testCorrectlyGeneratingInitialRowId:
@Test
public void testCorrectlyGeneratingInitialRowId() {
    HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce(SESSION, "database", "table", 10, 10, DataSize.of(1, MEGABYTE), Integer.MAX_VALUE, new TestingHiveSplitLoader(), Executors.newFixedThreadPool(5), new CounterStat(), false);

    // add 10 splits
    for (int i = 0; i < 10; i++) {
        hiveSplitSource.addToQueue(new TestSplit(i));
        assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1);
    }

    List<ConnectorSplit> splits = getSplits(hiveSplitSource, 10);
    assertEquals(((HiveSplit) splits.get(0)).getSplitNumber(), 0);
    assertEquals(((HiveSplit) splits.get(5)).getSplitNumber(), 5);
    assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 0);
}
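The getSplits(hiveSplitSource, 10) helper is not shown either; a minimal sketch under the assumption that it simply pulls a single batch of at most maxSize splits from the source:

// Minimal sketch (assumption): fetch one batch of up to maxSize splits.
private static List<ConnectorSplit> getSplits(ConnectorSplitSource source, int maxSize) {
    return source.getNextBatch(NOT_PARTITIONED, maxSize).join().getSplits();
}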
Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.
From the class CassandraSplitManager, method getSplitsForPartitions:
private List<ConnectorSplit> getSplitsForPartitions(CassandraTableHandle cassTableHandle, List<CassandraPartition> partitions, String clusteringPredicates) {
    String schema = cassTableHandle.getSchemaName();
    HostAddressFactory hostAddressFactory = new HostAddressFactory();
    ImmutableList.Builder<ConnectorSplit> builder = ImmutableList.builder();

    // For a table with a single partition key column, multiple partitions can be merged into one split
    // by using an IN clause in a single select query, provided the partitions share the same host list.
    // For a table with multiple partition key columns, they cannot be merged into a single select query,
    // so each partition is kept in a separate split.
    boolean singlePartitionKeyColumn = true;
    String partitionKeyColumnName = null;
    if (!partitions.isEmpty()) {
        singlePartitionKeyColumn = partitions.get(0).getTupleDomain().getDomains().get().size() == 1;
        if (singlePartitionKeyColumn) {
            String partitionId = partitions.get(0).getPartitionId();
            partitionKeyColumnName = partitionId.substring(0, partitionId.lastIndexOf('=') - 1);
        }
    }

    Map<Set<String>, Set<String>> hostsToPartitionKeys = new HashMap<>();
    Map<Set<String>, List<HostAddress>> hostMap = new HashMap<>();
    for (CassandraPartition cassandraPartition : partitions) {
        Set<Host> hosts = cassandraSession.getReplicas(schema, cassandraPartition.getKeyAsByteBuffer());
        List<HostAddress> addresses = hostAddressFactory.toHostAddressList(hosts);
        if (singlePartitionKeyColumn) {
            // host ip addresses
            ImmutableSet.Builder<String> sb = ImmutableSet.builder();
            for (HostAddress address : addresses) {
                sb.add(address.getHostText());
            }
            Set<String> hostAddresses = sb.build();

            // partition key values
            Set<String> values = hostsToPartitionKeys.get(hostAddresses);
            if (values == null) {
                values = new HashSet<>();
            }
            String partitionId = cassandraPartition.getPartitionId();
            values.add(partitionId.substring(partitionId.lastIndexOf('=') + 2));
            hostsToPartitionKeys.put(hostAddresses, values);
            hostMap.put(hostAddresses, addresses);
        }
        else {
            builder.add(createSplitForClusteringPredicates(cassandraPartition.getPartitionId(), addresses, clusteringPredicates));
        }
    }

    if (singlePartitionKeyColumn) {
        for (Map.Entry<Set<String>, Set<String>> entry : hostsToPartitionKeys.entrySet()) {
            StringBuilder sb = new StringBuilder(partitionSizeForBatchSelect);
            int size = 0;
            for (String value : entry.getValue()) {
                if (size > 0) {
                    sb.append(",");
                }
                sb.append(value);
                size++;
                if (size > partitionSizeForBatchSelect) {
                    String partitionId = format("%s in (%s)", partitionKeyColumnName, sb);
                    builder.add(createSplitForClusteringPredicates(partitionId, hostMap.get(entry.getKey()), clusteringPredicates));
                    size = 0;
                    sb.setLength(0);
                    sb.trimToSize();
                }
            }
            if (size > 0) {
                String partitionId = format("%s in (%s)", partitionKeyColumnName, sb);
                builder.add(createSplitForClusteringPredicates(partitionId, hostMap.get(entry.getKey()), clusteringPredicates));
            }
        }
    }
    return builder.build();
}
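The createSplitForClusteringPredicates helper called above is not shown. A plausible sketch, assuming CassandraSplit is constructed from a partition id, an optional clustering predicate, and the replica addresses (the constructor shape is an assumption, not confirmed by this excerpt):

// Plausible sketch (constructor shape assumed): attach the clustering predicate,
// if any, to the split for the given partition and replica hosts.
private CassandraSplit createSplitForClusteringPredicates(String partitionId, List<HostAddress> hosts, String clusteringPredicates) {
    if (clusteringPredicates.isEmpty()) {
        return new CassandraSplit(partitionId, null, hosts);
    }
    return new CassandraSplit(partitionId, clusteringPredicates, hosts);
}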
Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.
From the class TestDeltaLakeSplitManager, method getSplits:
private List<DeltaLakeSplit> getSplits(DeltaLakeSplitManager splitManager, DeltaLakeConfig deltaLakeConfig) throws ExecutionException, InterruptedException {
    ConnectorSplitSource splitSource = splitManager.getSplits(
            // ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableHandle handle, SplitSchedulingStrategy splitSchedulingStrategy
            new HiveTransactionHandle(false),
            testingConnectorSessionWithConfig(deltaLakeConfig),
            tableHandle,
            ConnectorSplitManager.SplitSchedulingStrategy.UNGROUPED_SCHEDULING,
            DynamicFilter.EMPTY,
            Constraint.alwaysTrue());
    ImmutableList.Builder<DeltaLakeSplit> splits = ImmutableList.builder();
    while (!splitSource.isFinished()) {
        List<ConnectorSplit> nextBatch = splitSource.getNextBatch(NOT_PARTITIONED, 10).get().getSplits();
        splits.addAll(nextBatch.stream().map(split -> (DeltaLakeSplit) split).collect(Collectors.toList()));
    }
    return splits.build();
}