Search in sources :

Example 1 with KafkaAuditCountVerifier

Use of org.apache.gobblin.completeness.verifier.KafkaAuditCountVerifier in the project incubator-gobblin by Apache.

From the class IcebergMetadataWriterTest, the method testWriteAddFileGMCECompleteness:

@Test(dependsOnMethods = { "testChangeProperty" }, groups = { "icebergMetadataWriterTest" })
public void testWriteAddFileGMCECompleteness() throws IOException {
    // Creating a copy of gmce with static type in GenericRecord to work with writeEnvelop method
    // without risking running into type cast runtime error.
    gmce.setOperationType(OperationType.add_files);
    File hourlyFile = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/10/data.avro");
    // 2021-09-16 10:00:00 America/Los_Angeles — the partition hour of hourlyFile above.
    long timestampMillis = 1631811600000L;
    Files.createParentDirs(hourlyFile);
    writeRecord(hourlyFile);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "3000-4000").build());
    GenericRecord genericGmce_3000_4000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_3000_4000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(50L))));
    Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-4000");
    // Completeness-enabled writer adds a "late" partition column alongside the original spec field.
    Assert.assertEquals(table.spec().fields().size(), 2);
    Assert.assertEquals(table.spec().fields().get(1).name(), "late");
    // Test when completeness watermark = -1 bootstrap case
    KafkaAuditCountVerifier verifier = Mockito.mock(TestAuditCountVerifier.class);
    Mockito.when(verifier.isComplete("testTopic", timestampMillis - TimeUnit.HOURS.toMillis(1), timestampMillis)).thenReturn(true);
    ((IcebergMetadataWriter) gobblinMCEWriterWithCompletness.metadataWriters.iterator().next()).setAuditCountVerifier(verifier);
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    // completeness watermark = "2021-09-16-10" (bootstrap advanced it to the hour of hourlyFile)
    Assert.assertEquals(table.properties().get(TOPIC_NAME_KEY), "testTopic");
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_TIMEZONE_KEY), "America/Los_Angeles");
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
    Iterator<org.apache.iceberg.DataFile> dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    // Test when completeness watermark is still "2021-09-16-10" but have a late file for "2021-09-16-09"
    File hourlyFile1 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/09/data1.avro");
    Files.createParentDirs(hourlyFile1);
    writeRecord(hourlyFile1);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile1.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "4000-5000").build());
    GenericRecord genericGmce_4000_5000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_4000_5000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(55L))));
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    // Watermark must not move backwards for a file older than the current watermark hour.
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis));
    dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile1.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    // The pre-watermark file is flagged late: "late" partition value == 1.
    Assert.assertEquals((int) dfl.next().partition().get(1, Integer.class), 1);
    // Test when completeness watermark will advance to "2021-09-16-11"
    File hourlyFile2 = new File(tmpDir, "data/tracking/testIcebergTable/hourly/2021/09/16/11/data.avro");
    long timestampMillis1 = timestampMillis + TimeUnit.HOURS.toMillis(1);
    Files.createParentDirs(hourlyFile2);
    writeRecord(hourlyFile2);
    gmce.setNewFiles(Lists.newArrayList(DataFile.newBuilder().setFilePath(hourlyFile2.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build()));
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "5000-6000").build());
    GenericRecord genericGmce_5000_6000 = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriterWithCompletness.writeEnvelope(new RecordEnvelope<>(genericGmce_5000_6000, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(60L))));
    Mockito.when(verifier.isComplete("testTopic", timestampMillis1 - TimeUnit.HOURS.toMillis(1), timestampMillis1)).thenReturn(true);
    gobblinMCEWriterWithCompletness.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get(COMPLETION_WATERMARK_KEY), String.valueOf(timestampMillis1));
    dfl = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", hourlyFile2.getAbsolutePath())).collect().iterator();
    Assert.assertTrue(dfl.hasNext());
    // The on-time file is not late: "late" partition value == 0.
    Assert.assertEquals((int) dfl.next().partition().get(1, Integer.class), 0);
}
Also used : HiveTable(org.apache.gobblin.hive.HiveTable) Table(org.apache.iceberg.Table) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) SchemaBuilder(org.apache.avro.SchemaBuilder) DataFile(org.apache.gobblin.metadata.DataFile) KafkaAuditCountVerifier(org.apache.gobblin.completeness.verifier.KafkaAuditCountVerifier) GenericRecord(org.apache.avro.generic.GenericRecord) DataFile(org.apache.gobblin.metadata.DataFile) File(java.io.File) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)

Aggregations

File (java.io.File)1 SchemaBuilder (org.apache.avro.SchemaBuilder)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 KafkaAuditCountVerifier (org.apache.gobblin.completeness.verifier.KafkaAuditCountVerifier)1 HiveTable (org.apache.gobblin.hive.HiveTable)1 DataFile (org.apache.gobblin.metadata.DataFile)1 GobblinEventBuilder (org.apache.gobblin.metrics.event.GobblinEventBuilder)1 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)1 Table (org.apache.iceberg.Table)1 HiveMetastoreTest (org.apache.iceberg.hive.HiveMetastoreTest)1 Test (org.testng.annotations.Test)1