Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
From the class TestHivePageSink, method createPageSource:
private static ConnectorPageSource createPageSource(HiveTransactionHandle transaction, HiveConfig config, File outputFile) {
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, config.getHiveStorageFormat().getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, config.getHiveStorageFormat().getSerde());
    splitProperties.setProperty("columns", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getName).collect(toImmutableList())));
    splitProperties.setProperty("columns.types", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getHiveType).map(hiveType -> hiveType.getHiveTypeName().toString()).collect(toImmutableList())));
    HiveSplit split = new HiveSplit(SCHEMA_NAME, TABLE_NAME, "", "file:///" + outputFile.getAbsolutePath(), 0, outputFile.length(), outputFile.length(), outputFile.lastModified(), splitProperties, ImmutableList.of(), ImmutableList.of(), OptionalInt.empty(), 0, false, TableToPartitionMapping.empty(), Optional.empty(), Optional.empty(), false, Optional.empty(), 0, SplitWeight.standard());
    ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
    HivePageSourceProvider provider = new HivePageSourceProvider(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, config, getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, config), getDefaultHiveRecordCursorProviders(config, HDFS_ENVIRONMENT), new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT, config), Optional.empty());
    return provider.createPageSource(transaction, getHiveSession(config), split, table, ImmutableList.copyOf(getColumnHandles()), DynamicFilter.EMPTY);
}
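The page source built here is typically drained by the calling test to materialize the rows that were written back to the file. A minimal sketch of that pattern, using only the ConnectorPageSource SPI (getNextPage, isFinished, close); the readAllPages helper name is illustrative and not part of TestHivePageSink, and the same loop appears in BenchmarkHiveFileFormat.read further below:

// Illustrative helper: drain a ConnectorPageSource and materialize every page
private static List<Page> readAllPages(ConnectorPageSource pageSource) throws IOException {
    List<Page> pages = new ArrayList<>();
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page != null) {
            // force lazy blocks to load so the rows can be inspected after the source is closed
            pages.add(page.getLoadedPage());
        }
    }
    pageSource.close();
    return pages;
}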
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
From the class TestNodeLocalDynamicSplitPruning, method testDynamicBucketPruning:
@Test
public void testDynamicBucketPruning() throws IOException {
    HiveConfig config = new HiveConfig();
    HiveTransactionHandle transaction = new HiveTransactionHandle(false);
    try (TempFile tempFile = new TempFile()) {
        ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getTupleDomainForBucketSplitPruning()));
        assertEquals(emptyPageSource.getClass(), EmptyPageSource.class);
        ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getNonSelectiveBucketTupleDomain()));
        assertEquals(nonEmptyPageSource.getClass(), HivePageSource.class);
    }
}
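getDynamicFilter, getTupleDomainForBucketSplitPruning, and getNonSelectiveBucketTupleDomain are helpers defined elsewhere in the test class. As a rough sketch, wrapping a fixed TupleDomain in a static DynamicFilter can look like the following; the method names assume the DynamicFilter SPI of this Trino version and may differ in other releases:

// Sketch: a DynamicFilter that always reports the given, already-complete predicate
private static DynamicFilter getDynamicFilter(TupleDomain<ColumnHandle> tupleDomain) {
    return new DynamicFilter() {
        @Override
        public Set<ColumnHandle> getColumnsCovered() {
            return tupleDomain.getDomains().map(Map::keySet).orElseGet(ImmutableSet::of);
        }

        @Override
        public CompletableFuture<?> isBlocked() {
            // the filter is already complete, so callers never need to wait
            return CompletableFuture.completedFuture(null);
        }

        @Override
        public boolean isComplete() {
            return true;
        }

        @Override
        public boolean isAwaitable() {
            return false;
        }

        @Override
        public TupleDomain<ColumnHandle> getCurrentPredicate() {
            return tupleDomain;
        }
    };
}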
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
From the class TestOrcPageSourceMemoryTracking, method testMaxReadBytes:
@Test(dataProvider = "rowCount")
public void testMaxReadBytes(int rowCount) throws Exception {
    int maxReadBytes = 1_000;
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(new HiveConfig(), new OrcReaderConfig().setMaxBlockSize(DataSize.ofBytes(maxReadBytes)), new OrcWriterConfig(), new ParquetReaderConfig(), new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder().setPropertyMetadata(hiveSessionProperties.getSessionProperties()).build();
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    // Build a table where every row gets larger, so we can test that the "batchSize" reduces
    int numColumns = 5;
    int step = 250;
    ImmutableList.Builder<TestColumn> columnBuilder = ImmutableList.<TestColumn>builder().add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true));
    GrowingTestColumn[] dataColumns = new GrowingTestColumn[numColumns];
    for (int i = 0; i < numColumns; i++) {
        dataColumns[i] = new GrowingTestColumn("p_string" + "_" + i, javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false, step * (i + 1));
        columnBuilder.add(dataColumns[i]);
    }
    List<TestColumn> testColumns = columnBuilder.build();
    File tempFile = File.createTempFile("trino_test_orc_page_source_max_read_bytes", "orc");
    tempFile.delete();
    TestPreparer testPreparer = new TestPreparer(tempFile.getAbsolutePath(), testColumns, rowCount, rowCount);
    ConnectorPageSource pageSource = testPreparer.newPageSource(stats, session);
    try {
        int positionCount = 0;
        while (true) {
            Page page = pageSource.getNextPage();
            if (pageSource.isFinished()) {
                break;
            }
            assertNotNull(page);
            page = page.getLoadedPage();
            positionCount += page.getPositionCount();
            // ignore the first MAX_BATCH_SIZE rows given the sizes are set when loading the blocks
            if (positionCount > MAX_BATCH_SIZE) {
                // either the block is bounded by maxReadBytes or we just load one single large block
                // an error margin MAX_BATCH_SIZE / step is needed given the block sizes are increasing
                assertTrue(page.getSizeInBytes() < maxReadBytes * (MAX_BATCH_SIZE / step) || 1 == page.getPositionCount());
            }
        }
        // verify the stats are correctly recorded
        Distribution distribution = stats.getMaxCombinedBytesPerRow().getAllTime();
        assertEquals((int) distribution.getCount(), 1);
        // the block is a VariableWidthBlock that carries valueIsNull and offsets arrays as overhead
        assertEquals((int) distribution.getMax(), Arrays.stream(dataColumns).mapToInt(GrowingTestColumn::getMaxSize).sum() + (Integer.BYTES + Byte.BYTES) * numColumns);
        pageSource.close();
    } finally {
        tempFile.delete();
    }
}
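The final assertion encodes the expected per-row maximum: the widest value produced by each growing column, plus the per-column overhead of a VariableWidthBlock, which keeps a four-byte offset and a one-byte null flag per position. Restated on its own under the same assumptions as the test; expectedMaxBytesPerRow is just an illustrative name:

// Expected maximum combined bytes per row: widest value of each data column
// plus VariableWidthBlock bookkeeping (int offset + byte null flag) per column
int expectedMaxBytesPerRow = Arrays.stream(dataColumns)
        .mapToInt(GrowingTestColumn::getMaxSize)
        .sum()
        + (Integer.BYTES + Byte.BYTES) * numColumns;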
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
From the class TestOrcPageSourceMemoryTracking, method testPageSource:
private void testPageSource(boolean useCache) throws Exception {
    // Numbers used in assertions in this test may change when implementation is modified,
    // feel free to change them if they break in the future
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    ConnectorPageSource pageSource = testPreparer.newPageSource(stats, useCache ? CACHED_SESSION : UNCACHED_SESSION);
    if (useCache) {
        // file is fully cached
        assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
    } else {
        assertEquals(pageSource.getMemoryUsage(), 0);
    }
    long memoryUsage = -1;
    int totalRows = 0;
    while (totalRows < 20000) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after lazy-loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 270_000, testPreparer.getFileSize() + 280_000);
            } else {
                assertBetweenInclusive(memoryUsage, 460_000L, 469_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    memoryUsage = -1;
    while (totalRows < 40000) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after lazy-loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 270_000, testPreparer.getFileSize() + 280_000);
            } else {
                assertBetweenInclusive(memoryUsage, 460_000L, 469_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    memoryUsage = -1;
    while (totalRows < NUM_ROWS) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 260_000, testPreparer.getFileSize() + 270_000);
            } else {
                assertBetweenInclusive(memoryUsage, 360_000L, 369_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    assertFalse(pageSource.isFinished());
    assertNull(pageSource.getNextPage());
    assertTrue(pageSource.isFinished());
    if (useCache) {
        // file is fully cached
        assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
    } else {
        assertEquals(pageSource.getMemoryUsage(), 0);
    }
    pageSource.close();
}
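The step exercised three times above can be reduced to its core: touching a value in a lazy block forces the underlying ORC column to be read, and only then does the page source's reported memory usage include the decoded data. A compact sketch of that step; the loadAndMeasure helper is illustrative and not part of the test class:

// Illustrative helper: force a lazy block to load and return the resulting memory usage
private static long loadAndMeasure(ConnectorPageSource pageSource, Page page, int channel) {
    Block block = page.getBlock(channel);
    // reading a value triggers loading of the lazy ORC column
    createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
    // memory attributed to the page source now includes the decoded column data
    return pageSource.getMemoryUsage();
}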
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
From the class BenchmarkHiveFileFormat, method read:
@Benchmark
public List<Page> read(CompressionCounter counter) throws IOException {
    if (!fileFormat.supports(data)) {
        throw new RuntimeException(fileFormat + " does not support data set " + dataSet);
    }
    List<Page> pages = new ArrayList<>(100);
    try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader(SESSION, HDFS_ENVIRONMENT, dataFile, data.getColumnNames(), data.getColumnTypes())) {
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage();
            if (page != null) {
                pages.add(page.getLoadedPage());
            }
        }
    }
    counter.inputSize += data.getSize();
    counter.outputSize += dataFile.length();
    return pages;
}
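The read method above is a JMH benchmark. A minimal sketch of an entry point that runs it through the standard JMH runner; the options shown are illustrative, and the actual benchmark class wires its own runner configuration and parameters:

// Illustrative JMH entry point (org.openjdk.jmh.runner.Runner and OptionsBuilder)
public static void main(String[] args) throws RunnerException {
    Options options = new OptionsBuilder()
            .include(BenchmarkHiveFileFormat.class.getSimpleName())
            .build();
    new Runner(options).run();
}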