Example 81 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class IcebergTableUtil method getTable.

/**
 * Load the Iceberg table either from the {@link QueryState} or through the configured catalog. Look for the table
 * object stored in the query state. If it's null, the table has not been loaded yet within the same query,
 * so we load it through the Catalogs API and then store it in the query state.
 * @param configuration a Hadoop configuration
 * @param properties controlling properties
 * @return an Iceberg table
 */
static Table getTable(Configuration configuration, Properties properties) {
    String metaTable = properties.getProperty("metaTable");
    String tableName = properties.getProperty(Catalogs.NAME);
    if (metaTable != null) {
        properties.setProperty(Catalogs.NAME, tableName + "." + metaTable);
    }
    String tableIdentifier = properties.getProperty(Catalogs.NAME);
    return SessionStateUtil.getResource(configuration, tableIdentifier)
        .filter(o -> o instanceof Table)
        .map(o -> (Table) o)
        .orElseGet(() -> {
          LOG.debug("Iceberg table {} is not found in QueryState. Loading table from configured catalog", tableIdentifier);
          Table tab = Catalogs.loadTable(configuration, properties);
          SessionStateUtil.addResource(configuration, tableIdentifier, tab);
          return tab;
        });
}
Also used : PartitionTransformSpec(org.apache.hadoop.hive.ql.parse.PartitionTransformSpec) Properties(java.util.Properties) Logger(org.slf4j.Logger) Table(org.apache.iceberg.Table) Catalogs(org.apache.iceberg.mr.Catalogs) LoggerFactory(org.slf4j.LoggerFactory) Schema(org.apache.iceberg.Schema) List(java.util.List) UpdatePartitionSpec(org.apache.iceberg.UpdatePartitionSpec) Configuration(org.apache.hadoop.conf.Configuration) PartitionSpec(org.apache.iceberg.PartitionSpec) SessionStateUtil(org.apache.hadoop.hive.ql.session.SessionStateUtil) QueryState(org.apache.hadoop.hive.ql.QueryState) Expressions(org.apache.iceberg.expressions.Expressions) org.apache.hadoop.hive.metastore.api.hive_metastoreConstants(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants)
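
A minimal usage sketch (not from the Hive source): the property keys come from the method above, but the table name and the optional metaTable value are hypothetical, and the call assumes same-package access since getTable is package-private.

Configuration configuration = new Configuration();
Properties properties = new Properties();
// Catalogs.NAME carries the table identifier; the value is made up for illustration.
properties.setProperty(Catalogs.NAME, "default.customers");
// Optionally address a metadata table, e.g. the history table:
// properties.setProperty("metaTable", "history");
Table table = IcebergTableUtil.getTable(configuration, properties);
// A second call within the same query finds the table in the QueryState and skips the catalog load.
Table cached = IcebergTableUtil.getTable(configuration, properties);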

Example 82 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class IcebergInputFormat method getSplits.

@Override
public List<InputSplit> getSplits(JobContext context) {
    Configuration conf = context.getConfiguration();
    Table table = Optional.ofNullable(
            HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER)))
        .orElseGet(() -> Catalogs.loadTable(conf));
    TableScan scan = createTableScan(table, conf);
    List<InputSplit> splits = Lists.newArrayList();
    boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
    InputFormatConfig.InMemoryDataModel model = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
    try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
        Table serializableTable = SerializableTable.copyOf(table);
        tasksIterable.forEach(task -> {
            if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE || model == InputFormatConfig.InMemoryDataModel.PIG)) {
                // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
                checkResiduals(task);
            }
            splits.add(new IcebergSplit(serializableTable, conf, task));
        });
    } catch (IOException e) {
        throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
    }
    // Only skip serializing the IO config for standard data table scans; for other scan types we
    // wouldn't be able to inject the config into these tasks on the deserializer-side, unlike for standard queries
    if (scan instanceof DataTableScan) {
        HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
    }
    return splits;
}
Also used : TableScan(org.apache.iceberg.TableScan) DataTableScan(org.apache.iceberg.DataTableScan) Table(org.apache.iceberg.Table) SerializableTable(org.apache.iceberg.SerializableTable) CombinedScanTask(org.apache.iceberg.CombinedScanTask) Configuration(org.apache.hadoop.conf.Configuration) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
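
A rough sketch of the configuration this method reads (the config keys are the ones referenced above; the table identifier and the use of a MapReduce Job as the JobContext are assumptions):

Job job = Job.getInstance(new Configuration());   // Job implements JobContext
Configuration conf = job.getConfiguration();
// Without an identifier the method falls back to Catalogs.loadTable(conf).
conf.set(InputFormatConfig.TABLE_IDENTIFIER, "default.customers");
// Keep residual filtering on (the default) and use the generic in-memory data model.
conf.setBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
conf.setEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
List<InputSplit> splits = new IcebergInputFormat<>().getSplits(job);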

Example 83 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class TestHelper method createTable.

public Table createTable(Schema theSchema, PartitionSpec theSpec) {
    Table tbl = tables.create(theSchema, theSpec, properties(), tableIdentifier);
    setTable(tbl);
    return tbl;
}
Also used : Table(org.apache.iceberg.Table)
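
For context, a hypothetical schema and partition spec passed into this helper could look as follows; the column names are made up and helper stands for a configured TestHelper instance.

Schema schema = new Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()),
    Types.NestedField.optional(2, "event_ts", Types.TimestampType.withZone()));
// Partition the table by day on the timestamp column.
PartitionSpec spec = PartitionSpec.builderFor(schema).day("event_ts").build();
Table table = helper.createTable(schema, spec);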

Example 84 with Table

use of org.apache.iceberg.Table in project incubator-gobblin by apache.

the class IcebergMetadataWriterTest method testWriteAddFileGMCE.

@Test(dependsOnGroups = { "hiveMetadataWriterTest" })
public void testWriteAddFileGMCE() throws IOException {
    // Create a copy of gmce typed statically as GenericRecord so it works with the writeEnvelope method
    // without risking a type cast error at runtime.
    GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithAcceptClusters.writeEnvelope(new RecordEnvelope<>(genericGmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(10L))));
    // Test when accept clusters does not contain the gmce cluster, we will skip
    Assert.assertEquals(catalog.listTables(Namespace.of(dbName)).size(), 0);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(10L))));
    Assert.assertEquals(catalog.listTables(Namespace.of(dbName)).size(), 1);
    Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertFalse(table.properties().containsKey("offset.range.testTopic-1"));
    Assert.assertEquals(table.location(), new File(tmpDir, "data/tracking/testIcebergTable/_iceberg_metadata/").getAbsolutePath() + "/" + dbName);
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "1000-2000").build());
    GenericRecord genericGmce_1000_2000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce_1000_2000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(20L))));
    gobblinMCEWriter.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-2000");
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 1);
    // Assert low watermark and high watermark set properly
    Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "9");
    Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "20");
    /*test flush twice*/
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "2000-3000").build());
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyDataFile_2.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    GenericRecord genericGmce_2000_3000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce_2000_3000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(30L))));
    gobblinMCEWriter.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
    Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "20");
    Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "30");
    /* Test it will skip event with lower watermark*/
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "3000-4000").build());
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(30L))));
    gobblinMCEWriter.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
}
Also used : HiveTable(org.apache.gobblin.hive.HiveTable) Table(org.apache.iceberg.Table) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) SchemaBuilder(org.apache.avro.SchemaBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) DataFile(org.apache.gobblin.metadata.DataFile) File(java.io.File) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)
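
The bookkeeping this test asserts on can be read straight off the loaded table; a small sketch (the table identifier is illustrative, the property keys are the ones used above):

Table table = catalog.loadTable(TableIdentifier.of(dbName, "testIcebergTable"));
String offsetRange = table.properties().get("offset.range.testTopic-1");
String lowWatermark = table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1");
String highWatermark = table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1");
int manifestCount = table.currentSnapshot().allManifests().size();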

Example 85 with Table

use of org.apache.iceberg.Table in project incubator-gobblin by apache.

the class IcebergMetadataWriterTest method testWriteAddFileGMCECompleteness.

@Test(dependsOnMethods = { "testChangeProperty" }, groups = { "icebergMetadataWriterTest" })
public void testWriteAddFileGMCECompleteness() throws IOException {
    // Create a copy of gmce typed statically as GenericRecord so it works with the writeEnvelope method
    // without risking a type cast error at runtime.
    gmce.setOperationType(OperationType.add_files);
    File hourlyFile = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/10/data.avro");
    long timestampMillis = 1631811600000L;
    Files.createParentDirs(hourlyFile);
    writeRecord(hourlyFile);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "3000-4000").build());
    GenericRecord genericGmce_3000_4000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_3000_4000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(50L))));
    Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-4000");
    Assert.assertTrue(table.spec().fields().size() == 2);
    Assert.assertEquals(table.spec().fields().get(1).name(), "late");
    // Test when completeness watermark = -1 bootstrap case
    KafkaAuditCountVerifier verifier = Mockito.mock(TestAuditCountVerifier.class);
    Mockito.when(verifier.isComplete("testTopic", timestampMillis - TimeUnit.HOURS.toMillis(1), timestampMillis)).thenReturn(true);
    ((IcebergMetadataWriter) gobblinMCEWriterWithCompletness.metadataWriters.iterator().next()).setAuditCountVerifier(verifier);
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    // completeness watermark = "2021-09-16-10"
    Assert.assertEquals(table.properties().get(TOPIC_NAME_KEY), "testTopic");
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_TIMEZONE_KEY), "America/Los_Angeles");
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
    Iterator<org.apache.iceberg.DataFile> dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    // Test when completeness watermark is still "2021-09-16-10" but have a late file for "2021-09-16-09"
    File hourlyFile1 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/09/data1.avro");
    Files.createParentDirs(hourlyFile1);
    writeRecord(hourlyFile1);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile1.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "4000-5000").build());
    GenericRecord genericGmce_4000_5000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_4000_5000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(55L))));
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
    dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile1.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    Assert.assertEquals((int) dfl.next().partition().get(1, Integer.class), 1);
    // Test when completeness watermark will advance to "2021-09-16-11"
    File hourlyFile2 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/11/data.avro");
    long timestampMillis1 = timestampMillis + TimeUnit.HOURS.toMillis(1);
    Files.createParentDirs(hourlyFile2);
    writeRecord(hourlyFile2);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile2.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "5000-6000").build());
    GenericRecord genericGmce_5000_6000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_5000_6000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(60L))));
    Mockito.when(verifier.isComplete("testTopic", timestampMillis1 - TimeUnit.HOURS.toMillis(1), timestampMillis1)).thenReturn(true);
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis1));
    dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile2.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    Assert.assertTrue(dfl.next().partition().get(1, Integer.class) == 0);
}
Also used : HiveTable(org.apache.gobblin.hive.HiveTable) Table(org.apache.iceberg.Table) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) SchemaBuilder(org.apache.avro.SchemaBuilder) DataFile(org.apache.gobblin.metadata.DataFile) KafkaAuditCountVerifier(org.apache.gobblin.completeness.verifier.KafkaAuditCountVerifier) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)
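
The FindFiles lookup used in the test also works as a standalone check that a file landed in the table; a sketch with a hypothetical path prefix:

Iterator<org.apache.iceberg.DataFile> files = FindFiles.in(table)
    .withMetadataMatching(Expressions.startsWith("file_path", "/tmp/data/tracking/testIcebergTable/hourly/2021/09/16/10"))
    .collect()
    .iterator();
if (files.hasNext()) {
  org.apache.iceberg.DataFile dataFile = files.next();
  // The second partition field is the "late" flag introduced by the completeness logic.
  Integer late = dataFile.partition().get(1, Integer.class);
}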

Aggregations

Table (org.apache.iceberg.Table) 188
Test (org.junit.Test) 132
Schema (org.apache.iceberg.Schema) 66
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier) 56
Record (org.apache.iceberg.data.Record) 56
PartitionSpec (org.apache.iceberg.PartitionSpec) 51
IOException (java.io.IOException) 27
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 27
List (java.util.List) 22
Map (java.util.Map) 20
DataFile (org.apache.iceberg.DataFile) 19
NoSuchTableException (org.apache.iceberg.exceptions.NoSuchTableException) 19
Collectors (java.util.stream.Collectors) 18
BaseTable (org.apache.iceberg.BaseTable) 18
Types (org.apache.iceberg.types.Types) 18
Properties (java.util.Properties) 17
Configuration (org.apache.hadoop.conf.Configuration) 17
Path (org.apache.hadoop.fs.Path) 17
FileFormat (org.apache.iceberg.FileFormat) 16
ArrayList (java.util.ArrayList) 15