Use of org.apache.iceberg.Table in project hive by apache.
The class IcebergTableUtil, method getTable.
/**
 * Load the Iceberg table either from the {@link QueryState} or through the configured catalog. First look for the
 * table object stored in the query state; if it is null, the table has not yet been loaded within the same query,
 * so we load it through the Catalogs API and then cache it in the query state.
 * @param configuration a Hadoop configuration
 * @param properties controlling properties
 * @return an Iceberg table
 */
static Table getTable(Configuration configuration, Properties properties) {
  String metaTable = properties.getProperty("metaTable");
  String tableName = properties.getProperty(Catalogs.NAME);
  if (metaTable != null) {
    properties.setProperty(Catalogs.NAME, tableName + "." + metaTable);
  }
  String tableIdentifier = properties.getProperty(Catalogs.NAME);
  return SessionStateUtil.getResource(configuration, tableIdentifier)
      .filter(o -> o instanceof Table)
      .map(o -> (Table) o)
      .orElseGet(() -> {
        LOG.debug("Iceberg table {} is not found in QueryState. Loading table from configured catalog", tableIdentifier);
        Table tab = Catalogs.loadTable(configuration, properties);
        SessionStateUtil.addResource(configuration, tableIdentifier, tab);
        return tab;
      });
}
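
For orientation, a minimal sketch of a caller, assuming it lives in the same package as IcebergTableUtil (the method is package-private); the table name "default.events" and the "history" metatable below are hypothetical values, not taken from the Hive source:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;

class IcebergTableUtilExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Properties props = new Properties();
    // Catalogs.NAME carries the fully qualified table name; "default.events" is illustrative
    props.setProperty(Catalogs.NAME, "default.events");
    // With metaTable set, getTable rewrites the identifier to "default.events.history"
    // before checking the query state and, on a miss, loading through the Catalogs API
    props.setProperty("metaTable", "history");
    Table table = IcebergTableUtil.getTable(conf, props);
    System.out.println(table.name());
  }
}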
Use of org.apache.iceberg.Table in project hive by apache.
The class IcebergInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext context) {
  Configuration conf = context.getConfiguration();
  Table table = Optional.ofNullable(
          HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER)))
      .orElseGet(() -> Catalogs.loadTable(conf));
  TableScan scan = createTableScan(table, conf);
  List<InputSplit> splits = Lists.newArrayList();
  boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
  InputFormatConfig.InMemoryDataModel model =
      conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
  try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
    Table serializableTable = SerializableTable.copyOf(table);
    tasksIterable.forEach(task -> {
      if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
          model == InputFormatConfig.InMemoryDataModel.PIG)) {
        // TODO: We do not support residual evaluation for the HIVE and PIG in-memory data models yet
        checkResiduals(task);
      }
      splits.add(new IcebergSplit(serializableTable, conf, task));
    });
  } catch (IOException e) {
    throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
  }
  // Skip serializing the FileIO config for data table scans only; for metadata tables some tasks cache the IO
  // object and we wouldn't be able to inject the config into those tasks on the deserializer-side, unlike for
  // standard queries
  if (scan instanceof DataTableScan) {
    HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
  }
  return splits;
}
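
The core of this method is Iceberg's scan-planning API: planTasks() groups file splits into CombinedScanTask bundles, each of which becomes one input split. A standalone sketch of the same pattern against a plain Iceberg table, independent of the Hive wiring:

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

class ScanPlanningExample {
  // Count the combined scan tasks Iceberg plans for a table; each task would back one split.
  static long countTasks(Table table) {
    TableScan scan = table.newScan();
    long count = 0;
    // planTasks() returns a CloseableIterable that must be closed to release scan resources
    try (CloseableIterable<CombinedScanTask> tasks = scan.planTasks()) {
      for (CombinedScanTask task : tasks) {
        count++;
      }
    } catch (IOException e) {
      throw new UncheckedIOException("Failed to close table scan: " + scan, e);
    }
    return count;
  }
}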
Use of org.apache.iceberg.Table in project hive by apache.
The class TestHelper, method createTable.
public Table createTable(Schema theSchema, PartitionSpec theSpec) {
  Table tbl = tables.create(theSchema, theSpec, properties(), tableIdentifier);
  setTable(tbl);
  return tbl;
}
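
A hedged sketch of what the schema and partition spec arguments might look like when exercising this helper; the field names "id" and "ts" and the daily partitioning are illustrative assumptions:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class CreateTableExample {
  // "helper" is assumed to be an initialized TestHelper with its tables and tableIdentifier set up
  static Table createDailyPartitionedTable(TestHelper helper) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()));
    // Partition by day on the timestamp column
    PartitionSpec spec = PartitionSpec.builderFor(schema).day("ts").build();
    return helper.createTable(schema, spec);
  }
}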
Use of org.apache.iceberg.Table in project incubator-gobblin by apache.
The class IcebergMetadataWriterTest, method testWriteAddFileGMCE.
@Test(dependsOnGroups = { "hiveMetadataWriterTest" })
public void testWriteAddFileGMCE() throws IOException {
  // Create a copy of gmce with a static GenericRecord type so it works with the writeEnvelope method
  // without risking a type-cast runtime error.
  GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriterWithAcceptClusters.writeEnvelope(new RecordEnvelope<>(genericGmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(10L))));
  // When the accept clusters do not contain the gmce cluster, the event is skipped
  Assert.assertEquals(catalog.listTables(Namespace.of(dbName)).size(), 0);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(10L))));
  Assert.assertEquals(catalog.listTables(Namespace.of(dbName)).size(), 1);
  Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertFalse(table.properties().containsKey("offset.range.testTopic-1"));
  Assert.assertEquals(table.location(),
      new File(tmpDir, "data/tracking/testIcebergTable/_iceberg_metadata/").getAbsolutePath() + "/" + dbName);
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "1000-2000").build());
  GenericRecord genericGmce_1000_2000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce_1000_2000,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(20L))));
  gobblinMCEWriter.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-2000");
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 1);
  // Assert that the low and high watermarks are set properly
  Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "9");
  Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "20");
  // Test flushing twice
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "2000-3000").build());
  gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder()
      .setFilePath(hourlyDataFile_2.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build()));
  GenericRecord genericGmce_2000_3000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce_2000_3000,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(30L))));
  gobblinMCEWriter.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
  Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "20");
  Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "30");
  // Test that an event with a lower watermark is skipped
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "3000-4000").build());
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(30L))));
  gobblinMCEWriter.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
}
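
The offset-range assertions imply the writer unions per-partition ranges on flush ("0-2000" followed by "2000-3000" yields "0-3000"; note the very first recorded range starts at 0 regardless of the event's "1000-2000" input, which suggests the writer also defaults the low end on first sight). A minimal sketch of that union rule as this test implies it, not the IcebergMetadataWriter's actual code:

class OffsetRangeSketch {
  // Illustrative model of the range merge the assertions above imply; hypothetical helper.
  static String mergeOffsetRange(String existing, String incoming) {
    if (existing == null) {
      return incoming; // first range seen for this topic-partition
    }
    // keep the existing low end, extend to the incoming high end
    String lowEnd = existing.split("-")[0];
    String highEnd = incoming.split("-")[1];
    return lowEnd + "-" + highEnd;
  }
}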
Use of org.apache.iceberg.Table in project incubator-gobblin by apache.
The class IcebergMetadataWriterTest, method testWriteAddFileGMCECompleteness.
@Test(dependsOnMethods = { "testChangeProperty" }, groups = { "icebergMetadataWriterTest" })
public void testWriteAddFileGMCECompleteness() throws IOException {
  // Create a copy of gmce with a static GenericRecord type so it works with the writeEnvelope method
  // without risking a type-cast runtime error.
  gmce.setOperationType(OperationType.add_files);
  File hourlyFile = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/10/data.avro");
  long timestampMillis = 1631811600000L;
  Files.createParentDirs(hourlyFile);
  writeRecord(hourlyFile);
  gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder()
      .setFilePath(hourlyFile.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build()));
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "3000-4000").build());
  GenericRecord genericGmce_3000_4000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_3000_4000,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(50L))));
  Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-4000");
  Assert.assertTrue(table.spec().fields().size() == 2);
  Assert.assertEquals(table.spec().fields().get(1).name(), "late");
  // Test the bootstrap case, where the completeness watermark starts at -1
  KafkaAuditCountVerifier verifier = Mockito.mock(TestAuditCountVerifier.class);
  Mockito.when(verifier.isComplete("testTopic", timestampMillis - TimeUnit.HOURS.toMillis(1), timestampMillis)).thenReturn(true);
  ((IcebergMetadataWriter) gobblinMCEWriterWithCompletness.metadataWriters.iterator().next()).setAuditCountVerifier(verifier);
  gobblinMCEWriterWithCompletness.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  // The completeness watermark is now "2021-09-16-10"
  Assert.assertEquals(table.properties().get(TOPIC_NAME_KEY), "testTopic");
  Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_TIMEZONE_KEY), "America/Los_Angeles");
  Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
  Iterator<org.apache.iceberg.DataFile> dfl = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", hourlyFile.getAbsolutePath()))
      .collect().iterator();
  Assert.assertTrue(dfl.hasNext());
  // Test that the completeness watermark stays at "2021-09-16-10" when a late file arrives for "2021-09-16-09"
  File hourlyFile1 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/09/data1.avro");
  Files.createParentDirs(hourlyFile1);
  writeRecord(hourlyFile1);
  gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder()
      .setFilePath(hourlyFile1.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build()));
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "4000-5000").build());
  GenericRecord genericGmce_4000_5000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_4000_5000,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(55L))));
  gobblinMCEWriterWithCompletness.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
  dfl = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", hourlyFile1.getAbsolutePath()))
      .collect().iterator();
  Assert.assertTrue(dfl.hasNext());
  Assert.assertEquals((int) dfl.next().partition().get(1, Integer.class), 1);
  // Test that the completeness watermark advances to "2021-09-16-11"
  File hourlyFile2 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/11/data.avro");
  long timestampMillis1 = timestampMillis + TimeUnit.HOURS.toMillis(1);
  Files.createParentDirs(hourlyFile2);
  writeRecord(hourlyFile2);
  gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder()
      .setFilePath(hourlyFile2.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build()));
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "5000-6000").build());
  GenericRecord genericGmce_5000_6000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_5000_6000,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(60L))));
  Mockito.when(verifier.isComplete("testTopic", timestampMillis1 - TimeUnit.HOURS.toMillis(1), timestampMillis1)).thenReturn(true);
  gobblinMCEWriterWithCompletness.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis1));
  dfl = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", hourlyFile2.getAbsolutePath()))
      .collect().iterator();
  Assert.assertTrue(dfl.hasNext());
  Assert.assertTrue(dfl.next().partition().get(1, Integer.class) == 0);
}
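
The completeness flow asserted here pivots on KafkaAuditCountVerifier.isComplete(topic, startMillis, endMillis): the watermark advances to an hour boundary only once the verifier confirms the preceding hour window, and a late file (as in the second phase above) leaves it unchanged. A hedged sketch of that advance rule, as an illustrative model of the asserted behavior rather than the writer's actual implementation:

import java.io.IOException;
import java.util.concurrent.TimeUnit;

class CompletionWatermarkSketch {
  // Advance the completion watermark hour by hour while each hour window verifies as complete.
  // Illustrative model only; the throws clause on isComplete is assumed from the test's usage.
  static long advance(KafkaAuditCountVerifier verifier, String topic,
      long watermarkMillis, long targetMillis) throws IOException {
    long watermark = watermarkMillis;
    while (watermark < targetMillis) {
      long next = watermark + TimeUnit.HOURS.toMillis(1);
      if (!verifier.isComplete(topic, watermark, next)) {
        break; // audit counts do not match yet; keep the old watermark (the late-file case)
      }
      watermark = next;
    }
    return watermark;
  }
}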