Use of io.trino.spi.Page in project trino by trinodb.
Class TestOrcPageSourceMemoryTracking, method testMaxReadBytes.
/**
 * Reads an ORC file whose rows keep growing, with the ORC reader's max block size
 * set to 1000 bytes, and checks that pages stay within the expected size bound and
 * that the max-combined-bytes-per-row statistic is recorded.
 *
 * @param rowCount number of rows written to the temporary file (supplied by the "rowCount" data provider)
 * @throws Exception if the temporary file cannot be created, or reading or closing the page source fails
 */
@Test(dataProvider = "rowCount")
public void testMaxReadBytes(int rowCount) throws Exception {
    int maxReadBytes = 1_000;
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(
            new HiveConfig(),
            new OrcReaderConfig().setMaxBlockSize(DataSize.ofBytes(maxReadBytes)),
            new OrcWriterConfig(),
            new ParquetReaderConfig(),
            new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder()
            .setPropertyMetadata(hiveSessionProperties.getSessionProperties())
            .build();
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();

    // Build a table where every row gets larger, so we can test that the "batchSize" reduces
    int numColumns = 5;
    int step = 250;
    ImmutableList.Builder<TestColumn> columnBuilder = ImmutableList.<TestColumn>builder()
            .add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true));
    GrowingTestColumn[] dataColumns = new GrowingTestColumn[numColumns];
    for (int i = 0; i < numColumns; i++) {
        dataColumns[i] = new GrowingTestColumn("p_string" + "_" + i, javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false, step * (i + 1));
        columnBuilder.add(dataColumns[i]);
    }
    List<TestColumn> testColumns = columnBuilder.build();

    // Only the unique path is needed; the empty placeholder file is removed right away
    File tempFile = File.createTempFile("trino_test_orc_page_source_max_read_bytes", "orc");
    tempFile.delete();
    try {
        // Created inside the try so the temporary file is removed even if preparation fails
        TestPreparer testPreparer = new TestPreparer(tempFile.getAbsolutePath(), testColumns, rowCount, rowCount);
        // try-with-resources closes the page source even when an assertion below fails
        try (ConnectorPageSource pageSource = testPreparer.newPageSource(stats, session)) {
            int positionCount = 0;
            while (true) {
                Page page = pageSource.getNextPage();
                // stop as soon as the source reports it is finished; the page from that last call is not inspected
                if (pageSource.isFinished()) {
                    break;
                }
                assertNotNull(page);
                page = page.getLoadedPage();
                positionCount += page.getPositionCount();
                // ignore the first MAX_BATCH_SIZE rows given the sizes are set when loading the blocks
                if (positionCount > MAX_BATCH_SIZE) {
                    // either the block is bounded by maxReadBytes or we just load one single large block
                    // an error margin MAX_BATCH_SIZE / step is needed given the block sizes are increasing
                    assertTrue(page.getSizeInBytes() < maxReadBytes * (MAX_BATCH_SIZE / step) || 1 == page.getPositionCount());
                }
            }

            // verify the stats are correctly recorded
            Distribution distribution = stats.getMaxCombinedBytesPerRow().getAllTime();
            assertEquals((int) distribution.getCount(), 1);
            // the block is VariableWidthBlock that contains valueIsNull and offsets arrays as overhead
            assertEquals((int) distribution.getMax(), Arrays.stream(dataColumns).mapToInt(GrowingTestColumn::getMaxSize).sum() + (Integer.BYTES + Byte.BYTES) * numColumns);
        }
    } finally {
        tempFile.delete();
    }
}
Use of io.trino.spi.Page in project trino by trinodb.
Class TestOrcPageSourceMemoryTracking, method testTableScanOperator.
/**
 * Pulls every page out of a table scan operator and checks the memory reported by the
 * driver context: zero before the first page, a fixed value inside an expected range
 * during each of three consecutive row ranges, and zero again once the operator finishes.
 */
@Test
public void testTableScanOperator() {
    // The expected ranges below are tied to the current implementation;
    // adjust them if an implementation change makes this test fail.
    DriverContext context = testPreparer.newDriverContext();
    SourceOperator scan = testPreparer.newTableScanOperator(context);
    assertEquals(context.getMemoryUsage(), 0);

    // One entry per phase: {row count that ends the phase, lowest expected usage, highest expected usage}
    long[][] phases = {
            {20000, 460000L, 469999L},
            {40000, 460000L, 469999L},
            {NUM_ROWS, 360000L, 369999L},
    };

    int rowsSeen = 0;
    for (long[] phase : phases) {
        // -1 marks "no usage captured yet for this phase"
        long phaseUsage = -1;
        while (rowsSeen < phase[0]) {
            assertFalse(scan.isFinished());
            Page output = scan.getOutput();
            assertNotNull(output);
            output.getBlock(1);
            if (phaseUsage != -1) {
                // usage must stay at the value captured on the first page of the phase
                assertEquals(context.getMemoryUsage(), phaseUsage);
            } else {
                phaseUsage = context.getMemoryUsage();
                assertBetweenInclusive(phaseUsage, phase[1], phase[2]);
            }
            rowsSeen += output.getPositionCount();
        }
    }

    // The operator only reports finished after one more getOutput() call returns null
    assertFalse(scan.isFinished());
    assertNull(scan.getOutput());
    assertTrue(scan.isFinished());
    assertEquals(context.getMemoryUsage(), 0);
}
Use of io.trino.spi.Page in project trino by trinodb.
Class TestOrcPageSourceMemoryTracking, method testPageSource.
/**
 * Reads every page from a page source and checks the memory it reports before the
 * first page, before and after the lazy block of each page is loaded, and after the
 * source is exhausted.
 *
 * @param useCache when true the source is created with {@code CACHED_SESSION} and the expected
 *        usage is offset by the file size; otherwise {@code UNCACHED_SESSION} is used
 * @throws Exception if reading from or closing the page source fails
 */
private void testPageSource(boolean useCache) throws Exception {
    // Numbers used in assertions in this test may change when implementation is modified,
    // feel free to change them if they break in the future
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    // try-with-resources closes the page source even when one of the assertions below fails
    try (ConnectorPageSource pageSource = testPreparer.newPageSource(stats, useCache ? CACHED_SESSION : UNCACHED_SESSION)) {
        if (useCache) {
            // file is fully cached
            assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
        } else {
            assertEquals(pageSource.getMemoryUsage(), 0);
        }

        int totalRows = 0;
        totalRows = readAndVerifyMemoryUsage(
                pageSource,
                useCache,
                totalRows,
                20000,
                useCache ? testPreparer.getFileSize() + 270_000 : 460_000L,
                useCache ? testPreparer.getFileSize() + 280_000 : 469_999L);
        totalRows = readAndVerifyMemoryUsage(
                pageSource,
                useCache,
                totalRows,
                40000,
                useCache ? testPreparer.getFileSize() + 270_000 : 460_000L,
                useCache ? testPreparer.getFileSize() + 280_000 : 469_999L);
        readAndVerifyMemoryUsage(
                pageSource,
                useCache,
                totalRows,
                NUM_ROWS,
                useCache ? testPreparer.getFileSize() + 260_000 : 360_000L,
                useCache ? testPreparer.getFileSize() + 270_000 : 369_999L);

        // The source only reports finished after one more getNextPage() call returns null
        assertFalse(pageSource.isFinished());
        assertNull(pageSource.getNextPage());
        assertTrue(pageSource.isFinished());
        if (useCache) {
            // file is fully cached
            assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
        } else {
            assertEquals(pageSource.getMemoryUsage(), 0);
        }
    }
}

/**
 * Reads pages until the running row count reaches {@code rowLimit}, checking the memory
 * reported by the page source on every page.
 * <p>
 * On the first page the usage is checked before the lazy block is loaded, then captured
 * after loading and checked against the given bounds. On every later page the usage must
 * equal the captured value both before and after loading the block.
 *
 * @param pageSource source to read from
 * @param useCache whether the source was created with the cached session
 * @param totalRows number of rows already read before this call
 * @param rowLimit row count at which reading stops
 * @param loadedLowerBound lowest accepted usage after the first block is loaded (inclusive)
 * @param loadedUpperBound highest accepted usage after the first block is loaded (inclusive)
 * @return the row count after reading
 */
private int readAndVerifyMemoryUsage(
        ConnectorPageSource pageSource,
        boolean useCache,
        int totalRows,
        long rowLimit,
        long loadedLowerBound,
        long loadedUpperBound) {
    // -1 marks "no usage captured yet"
    long memoryUsage = -1;
    while (totalRows < rowLimit) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after lazy-loading the actual block
            assertBetweenInclusive(memoryUsage, loadedLowerBound, loadedUpperBound);
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    return totalRows;
}
Use of io.trino.spi.Page in project trino by trinodb.
Class BenchmarkFileFormatsUtils, method createTpchDataSet.
/**
 * Turns generated rows of a TPC-H table into a list of pages holding the requested columns.
 * <p>
 * Rows are appended until the generator is exhausted or the size of the completed pages
 * reaches {@code MIN_DATA_SIZE}. When the format does not support dates, DATE columns are
 * typed and written as unbounded varchar.
 *
 * @param format file format, consulted only for {@code supportsDate()}
 * @param tpchTable table whose generator supplies the rows
 * @param columns columns to materialize, in output order
 * @return test data holding the column names, column types and built pages
 * @throws IllegalArgumentException if a column has a base type other than
 *         IDENTIFIER, INTEGER, DATE, DOUBLE or VARCHAR
 */
public static <E extends TpchEntity> TestData createTpchDataSet(FileFormat format, TpchTable<E> tpchTable, List<TpchColumn<E>> columns) {
    List<String> names = columns.stream()
            .map(TpchColumn::getColumnName)
            .collect(toList());
    // Swap DATE for unbounded varchar when the format cannot store dates
    List<Type> types = columns.stream()
            .map(BenchmarkFileFormatsUtils::getColumnType)
            .map(type -> !format.supportsDate() && DATE.equals(type) ? createUnboundedVarcharType() : type)
            .collect(toList());

    PageBuilder builder = new PageBuilder(types);
    ImmutableList.Builder<Page> collected = ImmutableList.builder();
    long bytesSoFar = 0;
    for (E entity : tpchTable.createGenerator(10, 1, 1)) {
        builder.declarePosition();
        int channel = 0;
        for (TpchColumn<E> column : columns) {
            BlockBuilder out = builder.getBlockBuilder(channel++);
            switch (column.getType().getBase()) {
                case IDENTIFIER:
                    BIGINT.writeLong(out, column.getIdentifier(entity));
                    break;
                case INTEGER:
                    INTEGER.writeLong(out, column.getInteger(entity));
                    break;
                case DATE:
                    if (!format.supportsDate()) {
                        createUnboundedVarcharType().writeString(out, column.getString(entity));
                        break;
                    }
                    DATE.writeLong(out, column.getDate(entity));
                    break;
                case DOUBLE:
                    DOUBLE.writeDouble(out, column.getDouble(entity));
                    break;
                case VARCHAR:
                    createUnboundedVarcharType().writeSlice(out, Slices.utf8Slice(column.getString(entity)));
                    break;
                default:
                    throw new IllegalArgumentException("Unsupported type " + column.getType());
            }
        }

        if (!builder.isFull()) {
            continue;
        }
        // Flush the full page and stop once enough data has been collected
        Page completed = builder.build();
        collected.add(completed);
        bytesSoFar += completed.getSizeInBytes();
        builder.reset();
        if (bytesSoFar >= MIN_DATA_SIZE) {
            break;
        }
    }

    // Keep whatever rows are left in a partially filled page
    if (!builder.isEmpty()) {
        collected.add(builder.build());
    }
    return new TestData(names, types, collected.build());
}
Use of io.trino.spi.Page in project trino by trinodb.
Class BenchmarkHiveFileFormat, method read.
/**
 * Benchmark that reads the whole data file through the configured file format and
 * returns every page fully loaded.
 *
 * @param counter receives {@code data.getSize()} added to {@code inputSize} and the data
 *        file length added to {@code outputSize}
 * @return the loaded pages, in read order
 * @throws IOException if creating, reading or closing the page source fails
 * @throws RuntimeException if the file format does not support the data set
 */
@Benchmark
public List<Page> read(CompressionCounter counter) throws IOException {
    if (!fileFormat.supports(data)) {
        throw new RuntimeException(fileFormat + " does not support data set " + dataSet);
    }

    List<Page> loaded = new ArrayList<>(100);
    try (ConnectorPageSource reader = fileFormat.createFileFormatReader(
            SESSION,
            HDFS_ENVIRONMENT,
            dataFile,
            data.getColumnNames(),
            data.getColumnTypes())) {
        while (!reader.isFinished()) {
            Page next = reader.getNextPage();
            // the source may hand back null before it is finished; skip it
            if (next == null) {
                continue;
            }
            loaded.add(next.getLoadedPage());
        }
    }

    counter.inputSize += data.getSize();
    counter.outputSize += dataFile.length();
    return loaded;
}
Aggregations