Example 6 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.

From class TestFlinkHoodieBloomIndex, method testCheckExists.
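All of the examples on this page build records the same way: RawTripTestPayload wraps a JSON string, getRowKey() returns its _row_key field, and getPartitionPath() derives the partition from the date portion of the time field (for example, 2016-01-31T03:16:41.415Z maps to partition 2016/01/31, which the commits below confirm). A minimal sketch of that pattern:

RawTripTestPayload payload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieKey key = new HoodieKey(payload.getRowKey(), payload.getPartitionPath()); // ("000", "2016/01/31")
HoodieRecord record = new HoodieAvroRecord(key, payload);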

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have some records to be tagged (two different partitions)
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // record key same as recordStr2
    String recordStr4 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
    HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
    HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
    HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4);
    List<HoodieKey> keys = asList(key1, key2, key3, key4);
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> toTagRecords = new ArrayList<>();
    toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null));
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable);
    Map<HoodieKey, Option<Pair<String, String>>> recordLocations = new HashMap<>();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) : Option.empty());
    }
    // Should not find any files
    for (Option<Pair<String, String>> record : recordLocations.values()) {
        assertFalse(record.isPresent());
    }
    // We create three base files, each having one record (two different partitions)
    String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
    String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2);
    String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4);
    // We do the tag again
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    List<HoodieRecord> toTagRecords1 = new ArrayList<>();
    for (HoodieKey key : keys) {
        toTagRecords1.add(new HoodieAvroRecord(key, null));
    }
    taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable);
    recordLocations.clear();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) : Option.empty());
    }
    // Check results
    for (Map.Entry<HoodieKey, Option<Pair<String, String>>> record : recordLocations.entrySet()) {
        if (record.getKey().getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            assertEquals(fileId1, record.getValue().get().getRight());
        } else if (record.getKey().getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            if (record.getKey().getPartitionPath().equals("2015/01/31")) {
                assertEquals(fileId3, record.getValue().get().getRight());
            } else {
                assertEquals(fileId2, record.getValue().get().getRight());
            }
        } else if (record.getKey().getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.getValue().isPresent());
        }
    }
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) Option(org.apache.hudi.common.util.Option) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) HashMap(java.util.HashMap) Map(java.util.Map) HoodieMapPair(org.apache.hudi.common.data.HoodieMapPair) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
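The map-building loop above appears twice in testCheckExists; an equivalent stream-based version (a sketch with the same semantics, assuming java.util.stream.Collectors is also imported and the tagged keys are unique, as they are in this test) is more compact:

Map<HoodieKey, Option<Pair<String, String>>> locations = taggedRecords.stream()
        .collect(Collectors.toMap(
                HoodieRecord::getKey,
                r -> r.isCurrentLocationKnown()
                        ? Option.of(Pair.of(r.getPartitionPath(), r.getCurrentLocation().getFileId()))
                        : Option.empty()));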

Example 7 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.

From class TestFlinkHoodieBloomIndex, method testLoadInvolvedFiles.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient, false);
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
    // Create some partitions, and put some files
    // "2016/01/21": 0 file
    // "2016/04/01": 1 file (2_0_20160401010101.parquet)
    // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
    testTable.withPartitionMetaFiles("2016/01/21", "2016/04/01", "2015/03/12");
    RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<String> partitions = asList("2016/01/21", "2016/04/01", "2015/03/12");
    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    // Still 0, as there is no valid commit yet
    assertEquals(0, filesList.size());
    testTable.addCommit("20160401010101").withInserts("2016/04/01", "2");
    testTable.addCommit("20150312101010").withInserts("2015/03/12", "1").withInserts("2015/03/12", "3", record1).withInserts("2015/03/12", "4", record2, record3, record4);
    metaClient.reloadActiveTimeline();
    filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    assertEquals(4, filesList.size());
    if (rangePruning) {
        // the first two files (written without records) have no key ranges; the latter two do
        assertNull(filesList.get(0).getRight().getMaxRecordKey());
        assertNull(filesList.get(0).getRight().getMinRecordKey());
        assertFalse(filesList.get(1).getRight().hasKeyRanges());
        assertNotNull(filesList.get(2).getRight().getMaxRecordKey());
        assertNotNull(filesList.get(2).getRight().getMinRecordKey());
        assertTrue(filesList.get(3).getRight().hasKeyRanges());
        // no longer sorted, but should have the same files.
        List<Pair<String, BloomIndexFileInfo>> expected = asList(Pair.of("2016/04/01", new BloomIndexFileInfo("2")), Pair.of("2015/03/12", new BloomIndexFileInfo("1")), Pair.of("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), Pair.of("2015/03/12", new BloomIndexFileInfo("4", "001", "003")));
        assertEquals(expected, filesList);
    }
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) HoodieMapPair(org.apache.hudi.common.data.HoodieMapPair) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
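The rangePruning branch checks whether each base file recorded min/max record keys. BloomIndexFileInfo models both cases through the two constructors already used in the expected list above; restated as a sketch:

BloomIndexFileInfo noRanges = new BloomIndexFileInfo("1");                 // hasKeyRanges() == false
BloomIndexFileInfo withRanges = new BloomIndexFileInfo("4", "001", "003"); // min "001", max "003"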

Example 8 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.

From class TestHoodieClientOnCopyOnWriteStorage, method testHoodieConcatHandleOnDupInserts.

private void testHoodieConcatHandleOnDupInserts(HoodieWriteConfig config, boolean isPrepped) throws Exception {
    HoodieWriteConfig hoodieWriteConfig = getConfigBuilder().withProps(config.getProps()).withMergeAllowDuplicateOnInserts(true).build();
    SparkRDDWriteClient<RawTripTestPayload> client = getHoodieWriteClient(hoodieWriteConfig);
    // Write 1 (only inserts)
    String initCommitTime = "000";
    String newCommitTime = "001";
    int firstInsertRecords = 50;
    insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, firstInsertRecords, SparkRDDWriteClient::insert, isPrepped, true, firstInsertRecords, config.populateMetaFields());
    // Write 2 (inserts with duplicate keys)
    String prevCommitTime = newCommitTime;
    newCommitTime = "004";
    // needs to be larger than firstInsertRecords to guarantee duplicate keys
    int secondInsertRecords = 100;
    List<String> commitTimesBetweenPrevAndNew = Arrays.asList("002", "003");
    final Function2<List<HoodieRecord>, String, Integer> recordGenFunction = generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUpdates);
    writeBatch(client, newCommitTime, prevCommitTime, Option.of(commitTimesBetweenPrevAndNew), initCommitTime, secondInsertRecords, recordGenFunction, SparkRDDWriteClient::insert, true, secondInsertRecords, firstInsertRecords + secondInsertRecords, 2, false, config.populateMetaFields());
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ArrayList(java.util.ArrayList) List(java.util.List)
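The deciding option here is withMergeAllowDuplicateOnInserts(true): duplicate-key inserts are concatenated into the file via the concat handle instead of being merged. A minimal config sketch (the table path and name are placeholders, not from the test):

HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hoodie-table") // placeholder path
        .forTable("test-table")        // placeholder table name
        .withMergeAllowDuplicateOnInserts(true)
        .build();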

Example 9 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.

From class TestHoodieIndex, method testTagLocationAndFetchRecordLocations.

@ParameterizedTest
@MethodSource("regularIndexTypeParams")
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
    setUp(indexType, populateMetaFields, enableMetadataIndex);
    String p1 = "2016/01/31";
    String p2 = "2015/01/31";
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // place same row key under a different partition.
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
    String newCommitTime = writeClient.startCommit();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.upsert(recordRDD, newCommitTime);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    // Should not find any files
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        assertFalse(record.isCurrentLocationKnown());
    }
    // We create three parquet files, each having one record (two different partitions)
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    final String fileId1 = "fileID1";
    final String fileId2 = "fileID2";
    final String fileId3 = "fileID3";
    Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    Path baseFilePath = testTable.forCommit("0000001").withInserts(p1, fileId1, Collections.singletonList(record1));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    partitionToFilesNameLengthMap.clear();
    baseFilePath = testTable.forCommit("0000002").withInserts(p1, fileId2, Collections.singletonList(record2));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000002", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    partitionToFilesNameLengthMap.clear();
    baseFilePath = testTable.forCommit("0000003").withInserts(p2, fileId3, Collections.singletonList(record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation("0000003", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);
    // We do the tag again
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    List<HoodieRecord> records = taggedRecordRDD.collect();
    // Check results
    for (HoodieRecord record : records) {
        if (record.getRecordKey().equals(rowKey1)) {
            if (record.getPartitionPath().equals(p2)) {
                assertEquals(record.getCurrentLocation().getFileId(), fileId3);
            } else {
                assertEquals(record.getCurrentLocation().getFileId(), fileId1);
            }
        } else if (record.getRecordKey().equals(rowKey2)) {
            assertEquals(record.getCurrentLocation().getFileId(), fileId2);
        } else if (record.getRecordKey().equals(rowKey3)) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
    JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable);
    for (Tuple2<HoodieKey, Option<Pair<String, String>>> entry : recordLocations.collect()) {
        if (entry._1.getRecordKey().equals(rowKey1)) {
            assertTrue(entry._2.isPresent(), "Row1 should have been present ");
            if (entry._1.getPartitionPath().equals(p2)) {
                assertTrue(entry._2.isPresent(), "Row1 should have been present ");
                assertEquals(entry._2.get().getRight(), fileId3);
            } else {
                assertEquals(entry._2.get().getRight(), fileId1);
            }
        } else if (entry._1.getRecordKey().equals(rowKey2)) {
            assertTrue(entry._2.isPresent(), "Row2 should have been present ");
            assertEquals(entry._2.get().getRight(), fileId2);
        } else if (entry._1.getRecordKey().equals(rowKey3)) {
            assertFalse(entry._2.isPresent(), "Row3 should have been absent ");
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) Map(java.util.Map) Option(org.apache.hudi.common.util.Option) Tuple2(scala.Tuple2) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
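Each commit in this test registers its base file with the table through a partition -> (fileId, length) map before calling doWriteOperation; the per-commit pattern, extracted from the test above as a sketch:

Map<String, List<Pair<String, Integer>>> files = new HashMap<>();
long len = fs.getFileStatus(baseFilePath).getLen();
files.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, (int) len));
testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), files, false, false);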

Example 10 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in project hudi by apache.

From class TestHoodieIndex, method testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath.

@Test
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
    setUp(IndexType.GLOBAL_SIMPLE, true, true);
    config = getConfigBuilder()
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                    .withIndexType(indexType)
                    .withGlobalSimpleIndexUpdatePartitionPath(true)
                    .withBloomIndexUpdatePartitionPath(true)
                    .build())
            .withMetadataConfig(HoodieMetadataConfig.newBuilder()
                    .enable(true)
                    .withMetadataIndexBloomFilter(true)
                    .withMetadataIndexColumnStats(true)
                    .build())
            .build();
    writeClient = getHoodieWriteClient(config);
    index = writeClient.getIndex();
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable.getMetaClient(), SCHEMA, metadataWriter);
    final String p1 = "2016/01/31";
    final String p2 = "2016/02/28";
    // Create the original partition, and put a record, along with the meta file
    // "2016/01/31": 1 file (1_0_20160131101010.parquet)
    // this record will be saved in table and will be tagged to an empty record
    RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord originalRecord = new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload);
    /*
    This record has the same record key as originalRecord but a different time, hence a different partition.
    Because update-partition-path is enabled for the global index, tagging should
    - tag the original partition of the originalRecord to an empty record for deletion, and
    - tag the new partition of the incomingRecord
    */
    RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}");
    HoodieRecord incomingRecord = new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload);
    /*
    This record has the same record key as originalRecord and the same partition.
    Even with update-partition-path enabled, the global index should just tag the original partition.
    */
    RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}");
    HoodieRecord incomingRecordSamePartition = new HoodieAvroRecord(new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition);
    final String file1P1C0 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
    // Write the original record under partition p1 in commit "1000"
    Path baseFilePath = testTable.forCommit("1000").withInserts(p1, file1P1C0, Collections.singletonList(originalRecord));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, Integer.valueOf((int) baseFileLength))));
    testTable.doWriteOperation("1000", WriteOperationType.INSERT, Arrays.asList(p1), c1PartitionToFilesNameLengthMap, false, false);
    // Write the original record into the base file again
    testTable.withInserts(p1, file1P1C0, originalRecord);
    // test against incoming record with a different partition
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord));
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    assertEquals(2, taggedRecordRDD.count());
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        switch(record.getPartitionPath()) {
            case p1:
                assertEquals("000", record.getRecordKey());
                assertTrue(record.getData() instanceof EmptyHoodieRecordPayload);
                break;
            case p2:
                assertEquals("000", record.getRecordKey());
                assertEquals(incomingPayload.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
                break;
            default:
                fail(String.format("Should not get partition path: %s", record.getPartitionPath()));
        }
    }
    // test against incoming record with the same partition
    JavaRDD<HoodieRecord> recordRDDSamePartition = jsc.parallelize(Collections.singletonList(incomingRecordSamePartition));
    JavaRDD<HoodieRecord> taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable);
    assertEquals(1, taggedRecordRDDSamePartition.count());
    HoodieRecord record = taggedRecordRDDSamePartition.first();
    assertEquals("000", record.getRecordKey());
    assertEquals(p1, record.getPartitionPath());
    assertEquals(incomingPayloadSamePartition.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) ArrayList(java.util.ArrayList) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
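With a global index and update-partition-path enabled, a key that arrives under a new partition is tagged twice: once as a delete (EmptyHoodieRecordPayload) in the old partition and once as an insert in the new one, which is exactly what the two branches of the switch verify. The relevant index options, restated from the config above as a sketch:

HoodieIndexConfig indexConfig = HoodieIndexConfig.newBuilder()
        .withIndexType(IndexType.GLOBAL_SIMPLE)
        .withGlobalSimpleIndexUpdatePartitionPath(true)
        .build();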

Aggregations

RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 30
HoodieKey (org.apache.hudi.common.model.HoodieKey): 30
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 27
ArrayList (java.util.ArrayList): 24
Test (org.junit.jupiter.api.Test): 20
Path (org.apache.hadoop.fs.Path): 16
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 16
List (java.util.List): 14
HoodieTable (org.apache.hudi.table.HoodieTable): 14
Arrays (java.util.Arrays): 12
Map (java.util.Map): 12
Schema (org.apache.avro.Schema): 12
Pair (org.apache.hudi.common.util.collection.Pair): 12
AfterEach (org.junit.jupiter.api.AfterEach): 12
HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable): 11
JavaRDD (org.apache.spark.api.java.JavaRDD): 11
Assertions.assertFalse (org.junit.jupiter.api.Assertions.assertFalse): 11
Assertions.assertTrue (org.junit.jupiter.api.Assertions.assertTrue): 11