Example 16 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class TestHivePageSink method createPageSource.

private static ConnectorPageSource createPageSource(HiveTransactionHandle transaction, HiveConfig config, File outputFile) {
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, config.getHiveStorageFormat().getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, config.getHiveStorageFormat().getSerde());
    splitProperties.setProperty("columns", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getName).collect(toImmutableList())));
    splitProperties.setProperty("columns.types", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getHiveType).map(hiveType -> hiveType.getHiveTypeName().toString()).collect(toImmutableList())));
    HiveSplit split = new HiveSplit(
            SCHEMA_NAME,
            TABLE_NAME,
            "",
            "file:///" + outputFile.getAbsolutePath(),
            0,
            outputFile.length(),
            outputFile.length(),
            outputFile.lastModified(),
            splitProperties,
            ImmutableList.of(),
            ImmutableList.of(),
            OptionalInt.empty(),
            0,
            false,
            TableToPartitionMapping.empty(),
            Optional.empty(),
            Optional.empty(),
            false,
            Optional.empty(),
            0,
            SplitWeight.standard());
    ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
    HivePageSourceProvider provider = new HivePageSourceProvider(
            TESTING_TYPE_MANAGER,
            HDFS_ENVIRONMENT,
            config,
            getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, config),
            getDefaultHiveRecordCursorProviders(config, HDFS_ENVIRONMENT),
            new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT, config),
            Optional.empty());
    return provider.createPageSource(transaction, getHiveSession(config), split, table, ImmutableList.copyOf(getColumnHandles()), DynamicFilter.EMPTY);
}
Also used : MoreFiles.deleteRecursively(com.google.common.io.MoreFiles.deleteRecursively) MaterializedResult(io.trino.testing.MaterializedResult) Assertions.assertGreaterThan(io.airlift.testing.Assertions.assertGreaterThan) Test(org.testng.annotations.Test) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) TypeOperators(io.trino.spi.type.TypeOperators) SplitWeight(io.trino.spi.SplitWeight) HiveMetastoreFactory(io.trino.plugin.hive.metastore.HiveMetastoreFactory) TpchColumnType(io.trino.tpch.TpchColumnType) Math.round(java.lang.Math.round) Slices(io.airlift.slice.Slices) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Path(org.apache.hadoop.fs.Path) LineItemColumn(io.trino.tpch.LineItemColumn) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) INTEGER(io.trino.spi.type.IntegerType.INTEGER) Assert.assertEquals(io.trino.testing.assertions.Assert.assertEquals) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) HiveTestUtils.getDefaultHiveRecordCursorProviders(io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders) TestingNodeManager(io.trino.testing.TestingNodeManager) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HIVE_DATE(io.trino.plugin.hive.HiveType.HIVE_DATE) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) LineItemGenerator(io.trino.tpch.LineItemGenerator) LineItem(io.trino.tpch.LineItem) List(java.util.List) Stream(java.util.stream.Stream) BIGINT(io.trino.spi.type.BigintType.BIGINT) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) HivePageSinkMetadata(io.trino.plugin.hive.metastore.HivePageSinkMetadata) DATE(io.trino.spi.type.DateType.DATE) Joiner(com.google.common.base.Joiner) JsonCodec(io.airlift.json.JsonCodec) DIRECT_TO_TARGET_NEW_DIRECTORY(io.trino.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY) HiveTestUtils.getDefaultHivePageSourceFactories(io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories) HIVE_DOUBLE(io.trino.plugin.hive.HiveType.HIVE_DOUBLE) PageBuilder(io.trino.spi.PageBuilder) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) VarcharType.createUnboundedVarcharType(io.trino.spi.type.VarcharType.createUnboundedVarcharType) JoinCompiler(io.trino.sql.gen.JoinCompiler) OptionalInt(java.util.OptionalInt) GroupByHashPageIndexerFactory(io.trino.operator.GroupByHashPageIndexerFactory) ArrayList(java.util.ArrayList) HiveMetastore(io.trino.plugin.hive.metastore.HiveMetastore) ALLOW_INSECURE(com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE) ImmutableList(com.google.common.collect.ImmutableList) Files(com.google.common.io.Files) NONE(io.trino.plugin.hive.HiveCompressionCodec.NONE) HiveColumnHandle.createBaseColumn(io.trino.plugin.hive.HiveColumnHandle.createBaseColumn) FileHiveMetastore.createTestingFileHiveMetastore(io.trino.plugin.hive.metastore.file.FileHiveMetastore.createTestingFileHiveMetastore) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) BlockTypeOperators(io.trino.type.BlockTypeOperators) Properties(java.util.Properties) HIVE_LONG(io.trino.plugin.hive.HiveType.HIVE_LONG) HiveTestUtils.getDefaultHiveFileWriterFactories(io.trino.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories) TESTING_TYPE_MANAGER(io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER) 
Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) HiveTestUtils.getHiveSessionProperties(io.trino.plugin.hive.HiveTestUtils.getHiveSessionProperties) ConnectorSession(io.trino.spi.connector.ConnectorSession) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) PAGE_SORTER(io.trino.plugin.hive.HiveTestUtils.PAGE_SORTER) File(java.io.File) HIVE_STRING(io.trino.plugin.hive.HiveType.HIVE_STRING) TpchColumnTypes(io.trino.tpch.TpchColumnTypes) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) HiveTestUtils.getHiveSession(io.trino.plugin.hive.HiveTestUtils.getHiveSession) HIVE_INT(io.trino.plugin.hive.HiveType.HIVE_INT) Collectors.toList(java.util.stream.Collectors.toList) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) Assert.assertTrue(org.testng.Assert.assertTrue) BlockBuilder(io.trino.spi.block.BlockBuilder) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)
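
As a usage note, here is a minimal sketch of how a page source produced by createPageSource could be drained in a test. The helper below is hypothetical (it is not part of TestHivePageSink), but the getNextPage()/isFinished() loop mirrors Examples 18-20.

// Hypothetical helper: drain a ConnectorPageSource and count the rows it
// produces. Both checks are needed, since a source may return null pages
// before isFinished() turns true.
private static long countRows(ConnectorPageSource pageSource) throws IOException {
    long rows = 0;
    try {
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage();
            if (page != null) {
                rows += page.getPositionCount();
            }
        }
    }
    finally {
        pageSource.close();
    }
    return rows;
}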

Example 17 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class TestNodeLocalDynamicSplitPruning method testDynamicBucketPruning.

@Test
public void testDynamicBucketPruning() throws IOException {
    HiveConfig config = new HiveConfig();
    HiveTransactionHandle transaction = new HiveTransactionHandle(false);
    try (TempFile tempFile = new TempFile()) {
        ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getTupleDomainForBucketSplitPruning()));
        assertEquals(emptyPageSource.getClass(), EmptyPageSource.class);
        ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getNonSelectiveBucketTupleDomain()));
        assertEquals(nonEmptyPageSource.getClass(), HivePageSource.class);
    }
}
Also used : TempFile(io.airlift.testing.TempFile) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) Test(org.testng.annotations.Test)
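
The tuple-domain and getDynamicFilter helpers are elided above. As a rough sketch of the idea, and an assumption rather than the test's actual code, a completed DynamicFilter can be wrapped around a fixed TupleDomain like this (uses io.trino.spi.connector.ColumnHandle, io.trino.spi.predicate.TupleDomain, java.util.Set, java.util.Map, and java.util.concurrent.CompletableFuture):

// Sketch only: a DynamicFilter that is complete from the start, so bucket
// pruning can happen at page-source creation time.
private static DynamicFilter completedDynamicFilter(TupleDomain<ColumnHandle> predicate) {
    return new DynamicFilter() {
        @Override
        public Set<ColumnHandle> getColumnsCovered() {
            // columns the filter constrains; empty when the predicate is none()
            return predicate.getDomains().map(Map::keySet).orElse(Set.of());
        }

        @Override
        public CompletableFuture<?> isBlocked() {
            // never blocks: the filter is already complete
            return CompletableFuture.completedFuture(null);
        }

        @Override
        public boolean isComplete() {
            return true;
        }

        @Override
        public boolean isAwaitable() {
            return false;
        }

        @Override
        public TupleDomain<ColumnHandle> getCurrentPredicate() {
            return predicate;
        }
    };
}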

Example 18 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class TestOrcPageSourceMemoryTracking method testMaxReadBytes.

@Test(dataProvider = "rowCount")
public void testMaxReadBytes(int rowCount) throws Exception {
    int maxReadBytes = 1_000;
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(
            new HiveConfig(),
            new OrcReaderConfig().setMaxBlockSize(DataSize.ofBytes(maxReadBytes)),
            new OrcWriterConfig(),
            new ParquetReaderConfig(),
            new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder().setPropertyMetadata(hiveSessionProperties.getSessionProperties()).build();
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    // Build a table where every row is larger than the last, so we can verify that the reader shrinks its "batchSize"
    int numColumns = 5;
    int step = 250;
    ImmutableList.Builder<TestColumn> columnBuilder = ImmutableList.<TestColumn>builder().add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true));
    GrowingTestColumn[] dataColumns = new GrowingTestColumn[numColumns];
    for (int i = 0; i < numColumns; i++) {
        dataColumns[i] = new GrowingTestColumn("p_string" + "_" + i, javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false, step * (i + 1));
        columnBuilder.add(dataColumns[i]);
    }
    List<TestColumn> testColumns = columnBuilder.build();
    File tempFile = File.createTempFile("trino_test_orc_page_source_max_read_bytes", "orc");
    tempFile.delete();
    TestPreparer testPreparer = new TestPreparer(tempFile.getAbsolutePath(), testColumns, rowCount, rowCount);
    ConnectorPageSource pageSource = testPreparer.newPageSource(stats, session);
    try {
        int positionCount = 0;
        while (true) {
            Page page = pageSource.getNextPage();
            if (pageSource.isFinished()) {
                break;
            }
            assertNotNull(page);
            page = page.getLoadedPage();
            positionCount += page.getPositionCount();
            // ignore the first MAX_BATCH_SIZE rows, since block sizes are only set once the blocks have been loaded
            if (positionCount > MAX_BATCH_SIZE) {
                // either the block is bounded by maxReadBytes, or we load just one single large block
                // an error margin of MAX_BATCH_SIZE / step is needed because the block sizes keep increasing
                assertTrue(page.getSizeInBytes() < maxReadBytes * (MAX_BATCH_SIZE / step) || 1 == page.getPositionCount());
            }
        }
        // verify the stats are correctly recorded
        Distribution distribution = stats.getMaxCombinedBytesPerRow().getAllTime();
        assertEquals((int) distribution.getCount(), 1);
        // the block is VariableWidthBlock that contains valueIsNull and offsets arrays as overhead
        assertEquals((int) distribution.getMax(), Arrays.stream(dataColumns).mapToInt(GrowingTestColumn::getMaxSize).sum() + (Integer.BYTES + Byte.BYTES) * numColumns);
        pageSource.close();
    } finally {
        tempFile.delete();
    }
}
Also used : ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) ParquetWriterConfig(io.trino.plugin.hive.parquet.ParquetWriterConfig) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) Page(io.trino.spi.Page) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) Distribution(io.airlift.stats.Distribution) ConnectorSession(io.trino.spi.connector.ConnectorSession) TestingConnectorSession(io.trino.testing.TestingConnectorSession) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) Test(org.testng.annotations.Test)
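
GrowingTestColumn is a test-local class whose implementation is not shown here. Its idea can be sketched, purely illustratively, as a value supplier whose output grows by roughly a fixed number of characters per call (the helper name and the java.util.function.Supplier wrapper are assumptions, not the actual implementation):

// Illustrative sketch only: produce strings that grow by about `step`
// characters per call, so each generated row is wider than the previous one.
static Supplier<String> growingSupplier(Supplier<String> base, int step) {
    StringBuilder buffer = new StringBuilder();
    return () -> {
        int target = buffer.length() + step;
        while (buffer.length() < target) {
            buffer.append(base.get());
        }
        return buffer.toString();
    };
}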

Example 19 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class TestOrcPageSourceMemoryTracking method testPageSource.

private void testPageSource(boolean useCache) throws Exception {
    // The numbers asserted in this test may change when the implementation is modified;
    // feel free to update them if they break in the future.
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    ConnectorPageSource pageSource = testPreparer.newPageSource(stats, useCache ? CACHED_SESSION : UNCACHED_SESSION);
    if (useCache) {
        // file is fully cached
        assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
    } else {
        assertEquals(pageSource.getMemoryUsage(), 0);
    }
    long memoryUsage = -1;
    int totalRows = 0;
    while (totalRows < 20000) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after lazy-loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 270_000, testPreparer.getFileSize() + 280_000);
            } else {
                assertBetweenInclusive(memoryUsage, 460_000L, 469_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    memoryUsage = -1;
    while (totalRows < 40000) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after lazy-loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 270_000, testPreparer.getFileSize() + 280_000);
            } else {
                assertBetweenInclusive(memoryUsage, 460_000L, 469_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    memoryUsage = -1;
    while (totalRows < NUM_ROWS) {
        assertFalse(pageSource.isFinished());
        Page page = pageSource.getNextPage();
        assertNotNull(page);
        Block block = page.getBlock(1);
        if (memoryUsage == -1) {
            // Memory usage before lazy-loading the block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 2000);
            } else {
                assertBetweenInclusive(pageSource.getMemoryUsage(), 0L, 1000L);
            }
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            memoryUsage = pageSource.getMemoryUsage();
            // Memory usage after loading the actual block
            if (useCache) {
                assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize() + 260_000, testPreparer.getFileSize() + 270_000);
            } else {
                assertBetweenInclusive(memoryUsage, 360_000L, 369_999L);
            }
        } else {
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
            // trigger loading for lazy block
            createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
            assertEquals(pageSource.getMemoryUsage(), memoryUsage);
        }
        totalRows += page.getPositionCount();
    }
    assertFalse(pageSource.isFinished());
    assertNull(pageSource.getNextPage());
    assertTrue(pageSource.isFinished());
    if (useCache) {
        // file is fully cached
        assertBetweenInclusive(pageSource.getMemoryUsage(), testPreparer.getFileSize(), testPreparer.getFileSize() + 200);
    } else {
        assertEquals(pageSource.getMemoryUsage(), 0);
    }
    pageSource.close();
}
Also used : Block(io.trino.spi.block.Block) Page(io.trino.spi.Page) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource)
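
The three read loops above differ only in their row bound and expected memory window. The repeated lazy-load trigger could be factored out as below; this helper is hypothetical, but the call it wraps is taken verbatim from the test:

// Hypothetical helper: force materialization of a lazy block by reading
// its last value, which is exactly what the repeated
// createUnboundedVarcharType().getSlice(...) calls above do.
private static void triggerLazyLoad(Block block) {
    createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1);
}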

Example 20 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class BenchmarkHiveFileFormat method read.

@Benchmark
public List<Page> read(CompressionCounter counter) throws IOException {
    if (!fileFormat.supports(data)) {
        throw new RuntimeException(fileFormat + " does not support data set " + dataSet);
    }
    List<Page> pages = new ArrayList<>(100);
    try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader(SESSION, HDFS_ENVIRONMENT, dataFile, data.getColumnNames(), data.getColumnTypes())) {
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage();
            if (page != null) {
                pages.add(page.getLoadedPage());
            }
        }
    }
    counter.inputSize += data.getSize();
    counter.outputSize += dataFile.length();
    return pages;
}
Also used : ArrayList(java.util.ArrayList) Page(io.trino.spi.Page) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) Benchmark(org.openjdk.jmh.annotations.Benchmark)
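
Note that page.getLoadedPage() forces lazy blocks to decode inside the benchmark, so decoding cost is included in the measurement. A JMH benchmark like this is usually launched from a runner; below is a minimal sketch, with illustrative options that are not taken from the original class (assumes org.openjdk.jmh.runner.Runner, org.openjdk.jmh.runner.RunnerException, and org.openjdk.jmh.runner.options.Options/OptionsBuilder on the classpath):

// Hypothetical entry point; the original class may wire this differently.
public static void main(String[] args) throws RunnerException {
    Options options = new OptionsBuilder()
            .include(BenchmarkHiveFileFormat.class.getSimpleName())
            .build();
    new Runner(options).run();
}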

Aggregations

ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource): 50
ConnectorSession (io.trino.spi.connector.ConnectorSession): 23
Page (io.trino.spi.Page): 18
Type (io.trino.spi.type.Type): 18
Test (org.testng.annotations.Test): 17
ImmutableList (com.google.common.collect.ImmutableList): 16
MaterializedResult (io.trino.testing.MaterializedResult): 14
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 13
ColumnHandle (io.trino.spi.connector.ColumnHandle): 13
ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 13
List (java.util.List): 12
Optional (java.util.Optional): 12
ConnectorSplit (io.trino.spi.connector.ConnectorSplit): 11
ImmutableMap (com.google.common.collect.ImmutableMap): 10
TestingConnectorSession (io.trino.testing.TestingConnectorSession): 10
File (java.io.File): 10
Path (org.apache.hadoop.fs.Path): 10
TupleDomain (io.trino.spi.predicate.TupleDomain): 9
IOException (java.io.IOException): 9
ArrayList (java.util.ArrayList): 9