Search in sources :

Example 1 with CheckpointSchemaManager

use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager in project trino by trinodb.

the class TestDeltaLakeFileStatistics method testParseParquetStatistics.

/**
 * Reads the {@code parquet_struct_statistics} checkpoint fixture and checks the
 * statistics attached to one specific add-file entry.
 *
 * <p>The checkpoint is iterated twice: once restricted to {@code METADATA} entries to
 * obtain the single {@link MetadataEntry}, and once restricted to {@code ADD} entries
 * with that metadata entry supplied. Exactly one add entry must reference the expected
 * data file, and it must carry statistics, which are then handed to
 * {@code testStatisticsValues}.
 */
@Test
public void testParseParquetStatistics() throws Exception {
    // Resolve the checkpoint fixture from the test classpath.
    File checkpointFile = new File(getClass().getResource("/databricks/pruning/parquet_struct_statistics/_delta_log/00000000000000000010.checkpoint.parquet").getFile());
    Path checkpointPath = new Path(checkpointFile.toURI());

    TypeManager types = TESTING_TYPE_MANAGER;
    CheckpointSchemaManager schemaManager = new CheckpointSchemaManager(types);

    HdfsConfig config = new HdfsConfig();
    HdfsConfiguration configuration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(config), ImmutableSet.of());
    HdfsEnvironment environment = new HdfsEnvironment(configuration, config, new NoHdfsAuthentication());
    FileSystem fileSystem = environment.getFileSystem(new HdfsEnvironment.HdfsContext(SESSION), checkpointPath);

    // First pass: only METADATA entries; the checkpoint is expected to hold exactly one.
    CheckpointEntryIterator metadataEntries = new CheckpointEntryIterator(
            checkpointPath,
            SESSION,
            fileSystem.getFileStatus(checkpointPath).getLen(),
            schemaManager,
            types,
            ImmutableSet.of(METADATA),
            Optional.empty(),
            environment,
            new FileFormatDataSourceStats(),
            new ParquetReaderConfig().toParquetReaderOptions(),
            true);
    MetadataEntry metadata = getOnlyElement(metadataEntries).getMetaData();

    // Second pass: only ADD entries, this time with the metadata entry supplied.
    CheckpointEntryIterator addEntries = new CheckpointEntryIterator(
            checkpointPath,
            SESSION,
            fileSystem.getFileStatus(checkpointPath).getLen(),
            schemaManager,
            types,
            ImmutableSet.of(CheckpointEntryIterator.EntryType.ADD),
            Optional.of(metadata),
            environment,
            new FileFormatDataSourceStats(),
            new ParquetReaderConfig().toParquetReaderOptions(),
            true);

    DeltaLakeTransactionLogEntry match = null;
    while (addEntries.hasNext()) {
        DeltaLakeTransactionLogEntry candidate = addEntries.next();
        if (candidate.getAdd() == null || !candidate.getAdd().getPath().contains("part-00000-17951bea-0d04-43c1-979c-ea1fac19b382-c000.snappy.parquet")) {
            continue;
        }
        // A second hit would mean the file name is not unique within the checkpoint.
        assertNull(match);
        match = candidate;
    }

    assertNotNull(match);
    assertThat(match.getAdd().getStats()).isPresent();
    testStatisticsValues(match.getAdd().getStats().get());
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) HdfsConfig(io.trino.plugin.hive.HdfsConfig) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) CheckpointEntryIterator(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) FileSystem(org.apache.hadoop.fs.FileSystem) TypeManager(io.trino.spi.type.TypeManager) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) File(java.io.File) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) Test(org.testng.annotations.Test)

Example 2 with CheckpointSchemaManager

use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method setupMetastore.

/**
 * Builds the metastore fixtures shared by the tests in this class.
 *
 * <p>Creates a {@link FileHiveMetastore} whose catalog directory is a {@code metastore}
 * folder inside a fresh temporary directory, registers the {@code db_name} database in
 * it, and assigns the {@code hiveMetastore} and {@code deltaLakeMetastore} fields.
 */
@BeforeClass
public void setupMetastore() {
    TestingConnectorContext connectorContext = new TestingConnectorContext();
    TypeManager types = connectorContext.getTypeManager();
    CheckpointSchemaManager schemaManager = new CheckpointSchemaManager(types);

    HdfsConfig config = new HdfsConfig();
    HdfsConfiguration configuration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(config), ImmutableSet.of());
    HdfsEnvironment environment = new HdfsEnvironment(configuration, config, new NoHdfsAuthentication());

    FileFormatDataSourceStats dataSourceStats = new FileFormatDataSourceStats();
    // NOTE(review): two independent default DeltaLakeConfig instances are passed here;
    // the role of each constructor slot is not visible from this method.
    TransactionLogAccess logAccess = new TransactionLogAccess(
            types,
            schemaManager,
            new DeltaLakeConfig(),
            dataSourceStats,
            environment,
            new ParquetReaderConfig(),
            new DeltaLakeConfig());

    // File-based metastore rooted under a new temporary directory.
    File tempRoot = Files.createTempDir();
    File catalogDir = new File(tempRoot, "metastore");
    hiveMetastore = new FileHiveMetastore(
            new NodeVersion("test_version"),
            environment,
            new MetastoreConfig(),
            new FileHiveMetastoreConfig()
                    .setCatalogDirectory(catalogDir.toURI().toString())
                    .setMetastoreUser("test"));
    hiveMetastore.createDatabase(new Database("db_name", Optional.empty(), Optional.of("test"), Optional.of(PrincipalType.USER), Optional.empty(), ImmutableMap.of()));

    CachingDeltaLakeStatisticsAccess statisticsAccess = new CachingDeltaLakeStatisticsAccess(
            new MetaDirStatisticsAccess(environment, new JsonCodecFactory().jsonCodec(DeltaLakeStatistics.class)));
    deltaLakeMetastore = new HiveMetastoreBackedDeltaLakeMetastore(hiveMetastore, logAccess, types, statisticsAccess);
}
Also used : DeltaLakeConfig(io.trino.plugin.deltalake.DeltaLakeConfig) HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) MetastoreConfig(io.trino.plugin.hive.metastore.MetastoreConfig) FileHiveMetastoreConfig(io.trino.plugin.hive.metastore.file.FileHiveMetastoreConfig) MetaDirStatisticsAccess(io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess) HdfsConfig(io.trino.plugin.hive.HdfsConfig) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) TestingConnectorContext(io.trino.testing.TestingConnectorContext) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) NodeVersion(io.trino.plugin.hive.NodeVersion) FileHiveMetastoreConfig(io.trino.plugin.hive.metastore.file.FileHiveMetastoreConfig) FileHiveMetastore(io.trino.plugin.hive.metastore.file.FileHiveMetastore) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) Database(io.trino.plugin.hive.metastore.Database) TypeManager(io.trino.spi.type.TypeManager) CachingDeltaLakeStatisticsAccess(io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess) File(java.io.File) JsonCodecFactory(io.airlift.json.JsonCodecFactory) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) BeforeClass(org.testng.annotations.BeforeClass)

Example 3 with CheckpointSchemaManager

use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager in project trino by trinodb.

the class AbstractTestDeltaLakeCreateTableStatistics method getAddFileEntries.

/**
 * Returns the active add-file entries of the given table.
 *
 * <p>A fresh {@link TransactionLogAccess} is created on every call. The snapshot is
 * loaded for {@code SCHEMA.tableName} from {@code s3://<bucketName>/<tableName>} and its
 * active files are returned.
 *
 * @param tableName name of the table, also used as the path segment under the bucket
 * @return the result of {@code getActiveFiles} for the loaded snapshot
 * @throws IOException declared by the original signature
 */
protected List<AddFileEntry> getAddFileEntries(String tableName) throws IOException {
    TestingConnectorContext connectorContext = new TestingConnectorContext();

    HdfsConfig config = new HdfsConfig();
    HdfsConfiguration configuration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(config), ImmutableSet.of());
    HdfsEnvironment environment = new HdfsEnvironment(configuration, config, new NoHdfsAuthentication());

    TransactionLogAccess logAccess = new TransactionLogAccess(
            connectorContext.getTypeManager(),
            new CheckpointSchemaManager(connectorContext.getTypeManager()),
            new DeltaLakeConfig(),
            new FileFormatDataSourceStats(),
            environment,
            new ParquetReaderConfig(),
            new DeltaLakeConfig());

    SchemaTableName qualifiedName = new SchemaTableName(SCHEMA, tableName);
    Path location = new Path(format("s3://%s/%s", bucketName, tableName));
    return logAccess.getActiveFiles(logAccess.loadSnapshot(qualifiedName, location, SESSION), SESSION);
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfig(io.trino.plugin.hive.HdfsConfig) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) TestingConnectorContext(io.trino.testing.TestingConnectorContext) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) SchemaTableName(io.trino.spi.connector.SchemaTableName) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig)

Example 4 with CheckpointSchemaManager

use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager in project trino by trinodb.

the class TestTableSnapshot method setUp.

/**
 * Re-initialises the per-test fixtures before each test method.
 *
 * <p>Assigns the {@code checkpointSchemaManager}, {@code tableLocation},
 * {@code accessTrackingFileSystem} and {@code hdfsEnvironment} fields. The table
 * location is the {@code databricks/person} classpath resource, and its file system is
 * wrapped in an {@link AccessTrackingFileSystem}.
 */
@BeforeMethod
public void setUp() throws IOException, URISyntaxException {
    checkpointSchemaManager = new CheckpointSchemaManager(typeManager);

    // Point the table location at the bundled test table.
    URI tableUri = getClass().getClassLoader().getResource("databricks/person").toURI();
    tableLocation = new Path(tableUri);

    // The Hadoop configuration is built with loadDefaults = false.
    FileSystem delegate = tableLocation.getFileSystem(new Configuration(false));
    accessTrackingFileSystem = new AccessTrackingFileSystem(delegate);

    HdfsConfig config = new HdfsConfig();
    HdfsConfiguration configuration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(config), ImmutableSet.of());
    hdfsEnvironment = new HdfsEnvironment(configuration, config, new NoHdfsAuthentication());
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) Configuration(org.apache.hadoop.conf.Configuration) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) AccessTrackingFileSystem(io.trino.plugin.deltalake.AccessTrackingFileSystem) HdfsConfig(io.trino.plugin.hive.HdfsConfig) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) URI(java.net.URI) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) AccessTrackingFileSystem(io.trino.plugin.deltalake.AccessTrackingFileSystem) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) BeforeMethod(org.testng.annotations.BeforeMethod)

Example 5 with CheckpointSchemaManager

use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager in project trino by trinodb.

the class TestTransactionLogAccess method setupTransactionLogAccess.

/**
 * Creates the {@code transactionLogAccess} fixture for a table and loads its snapshot
 * into {@code tableSnapshot}.
 *
 * @param tableName name of the table under test
 * @param tableLocation location passed to the tracking log access and to {@code loadSnapshot}
 * @throws IOException declared by the original signature
 */
private void setupTransactionLogAccess(String tableName, Path tableLocation) throws IOException {
    TestingConnectorContext connectorContext = new TestingConnectorContext();
    TypeManager types = connectorContext.getTypeManager();

    HdfsConfig config = new HdfsConfig();
    HdfsConfiguration configuration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(config), ImmutableSet.of());
    HdfsEnvironment environment = new HdfsEnvironment(configuration, config, new NoHdfsAuthentication());

    transactionLogAccess = new TrackingTransactionLogAccess(
            tableName,
            tableLocation,
            SESSION,
            types,
            new CheckpointSchemaManager(types),
            new DeltaLakeConfig(),
            new FileFormatDataSourceStats(),
            environment,
            new ParquetReaderConfig());

    // The handle is only used to obtain the schema-qualified table name below.
    DeltaLakeTableHandle handle = new DeltaLakeTableHandle(
            "schema",
            tableName,
            "location", // ignored
            Optional.empty(),
            TupleDomain.none(),
            TupleDomain.none(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            0);
    tableSnapshot = transactionLogAccess.loadSnapshot(handle.getSchemaTableName(), tableLocation, SESSION);
}
Also used : HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfig(io.trino.plugin.hive.HdfsConfig) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) TestingConnectorContext(io.trino.testing.TestingConnectorContext) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) TypeManager(io.trino.spi.type.TypeManager) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig)

Aggregations

CheckpointSchemaManager (io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager)5 HdfsConfig (io.trino.plugin.hive.HdfsConfig)5 HdfsConfiguration (io.trino.plugin.hive.HdfsConfiguration)5 HdfsConfigurationInitializer (io.trino.plugin.hive.HdfsConfigurationInitializer)5 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)5 HiveHdfsConfiguration (io.trino.plugin.hive.HiveHdfsConfiguration)5 NoHdfsAuthentication (io.trino.plugin.hive.authentication.NoHdfsAuthentication)5 FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats)4 ParquetReaderConfig (io.trino.plugin.hive.parquet.ParquetReaderConfig)4 TypeManager (io.trino.spi.type.TypeManager)3 TestingConnectorContext (io.trino.testing.TestingConnectorContext)3 Path (org.apache.hadoop.fs.Path)3 TransactionLogAccess (io.trino.plugin.deltalake.transactionlog.TransactionLogAccess)2 File (java.io.File)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 JsonCodecFactory (io.airlift.json.JsonCodecFactory)1 AccessTrackingFileSystem (io.trino.plugin.deltalake.AccessTrackingFileSystem)1 DeltaLakeConfig (io.trino.plugin.deltalake.DeltaLakeConfig)1 CachingDeltaLakeStatisticsAccess (io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess)1 MetaDirStatisticsAccess (io.trino.plugin.deltalake.statistics.MetaDirStatisticsAccess)1