
Example 61 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestHoodieIndex method testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath.

@Test
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
    setUp(IndexType.GLOBAL_SIMPLE, true, true);
    config = getConfigBuilder()
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(indexType)
            .withGlobalSimpleIndexUpdatePartitionPath(true)
            .withBloomIndexUpdatePartitionPath(true)
            .build())
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .enable(true)
            .withMetadataIndexBloomFilter(true)
            .withMetadataIndexColumnStats(true)
            .build())
        .build();
    writeClient = getHoodieWriteClient(config);
    index = writeClient.getIndex();
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable.getMetaClient(), SCHEMA, metadataWriter);
    final String p1 = "2016/01/31";
    final String p2 = "2016/02/28";
    // Create the original partition, and put a record, along with the meta file
    // "2016/01/31": 1 file (1_0_20160131101010.parquet)
    // this record is written to the table; when the incoming record moves partitions, it will be re-tagged with an empty (delete) payload
    RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord originalRecord = new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload);
    /*
    This record has the same record key as originalRecord but a different time, and hence a different partition path.
    Because update-partition-path is enabled for the global index, tagging should
    - tag the original partition of originalRecord with an empty payload (i.e. a delete), and
    - tag the new partition of incomingRecord (i.e. an insert)
    */
    RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}");
    HoodieRecord incomingRecord = new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload);
    /*
    This record has the same record key and the same partition path as originalRecord.
    Even with update-partition-path enabled, the index should simply tag it to the original partition.
    */
    RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}");
    HoodieRecord incomingRecordSamePartition = new HoodieAvroRecord(new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition);
    final String file1P1C0 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
    // Write the original record into a new base file under partition p1 and commit it as "1000"
    Path baseFilePath = testTable.forCommit("1000").withInserts(p1, file1P1C0, Collections.singletonList(originalRecord));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, Integer.valueOf((int) baseFileLength))));
    testTable.doWriteOperation("1000", WriteOperationType.INSERT, Arrays.asList(p1), c1PartitionToFilesNameLengthMap, false, false);
    // Re-insert the original record into the same file group (same commit "1000")
    testTable.withInserts(p1, file1P1C0, originalRecord);
    // test against incoming record with a different partition
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord));
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    assertEquals(2, taggedRecordRDD.count());
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        switch(record.getPartitionPath()) {
            case p1:
                assertEquals("000", record.getRecordKey());
                assertTrue(record.getData() instanceof EmptyHoodieRecordPayload);
                break;
            case p2:
                assertEquals("000", record.getRecordKey());
                assertEquals(incomingPayload.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
                break;
            default:
                fail(String.format("Should not get partition path: %s", record.getPartitionPath()));
        }
    }
    // test against incoming record with the same partition
    JavaRDD<HoodieRecord> recordRDDSamePartition = jsc.parallelize(Collections.singletonList(incomingRecordSamePartition));
    JavaRDD<HoodieRecord> taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable);
    assertEquals(1, taggedRecordRDDSamePartition.count());
    HoodieRecord record = taggedRecordRDDSamePartition.first();
    assertEquals("000", record.getRecordKey());
    assertEquals(p1, record.getPartitionPath());
    assertEquals(incomingPayloadSamePartition.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) ArrayList(java.util.ArrayList) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
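The behaviour under test is driven by the update-partition-path switches on the index config. For readers who set Hudi options as raw properties instead of through the builders above, a hedged sketch of some equivalent keys follows; the key names are recalled from HoodieIndexConfig and HoodieMetadataConfig and should be verified against your Hudi version.

// Hedged sketch: raw-property equivalents of some of the builder calls in the test above.
// Key names are assumptions recalled from HoodieIndexConfig/HoodieMetadataConfig; verify against your release.
java.util.Properties props = new java.util.Properties();
props.setProperty("hoodie.index.type", "GLOBAL_SIMPLE");                // withIndexType(IndexType.GLOBAL_SIMPLE)
props.setProperty("hoodie.simple.index.update.partition.path", "true"); // withGlobalSimpleIndexUpdatePartitionPath(true)
props.setProperty("hoodie.bloom.index.update.partition.path", "true");  // withBloomIndexUpdatePartitionPath(true)
props.setProperty("hoodie.metadata.enable", "true");                    // HoodieMetadataConfig.newBuilder().enable(true)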

Example 62 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestHoodieBloomIndex method testLoadInvolvedFiles.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    // Create some partitions, and put some files
    // "2016/01/21": 0 file
    // "2016/04/01": 1 file (2_0_20160401010101.parquet)
    // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
    testTable.withPartitionMetaFiles("2016/01/21", "2016/04/01", "2015/03/12");
    RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    // Still 0, as no valid commit
    assertEquals(0, filesList.size());
    final String fileId1 = "1";
    final String fileId2 = "2";
    final String fileId3 = "3";
    final String fileId4 = "4";
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    String commitTime = "20160401010101";
    Path baseFilePath = testTable.forCommit(commitTime).withInserts(partitions.get(1), fileId2, Collections.emptyList());
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(1), k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Arrays.asList(partitions.get(1)), partitionToFilesNameLengthMap, false, false);
    commitTime = "20150312101010";
    partitionToFilesNameLengthMap.clear();
    testTable.forCommit(commitTime);
    baseFilePath = testTable.withInserts(partitions.get(2), fileId1, Collections.emptyList());
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    baseFilePath = testTable.withInserts(partitions.get(2), fileId3, Collections.singletonList(record1));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    baseFilePath = testTable.withInserts(partitions.get(2), fileId4, Arrays.asList(record2, record3, record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Arrays.asList(partitions.get(2)), partitionToFilesNameLengthMap, false, false);
    filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    assertEquals(4, filesList.size());
    if (rangePruning) {
        // these files will not have the key ranges
        assertNull(filesList.get(0).getRight().getMaxRecordKey());
        assertNull(filesList.get(0).getRight().getMinRecordKey());
        assertFalse(filesList.get(1).getRight().hasKeyRanges());
        assertNotNull(filesList.get(2).getRight().getMaxRecordKey());
        assertNotNull(filesList.get(2).getRight().getMinRecordKey());
        assertTrue(filesList.get(3).getRight().hasKeyRanges());
        // no longer sorted, but should have same files.
        List<ImmutablePair<String, BloomIndexFileInfo>> expected = Arrays.asList(
            new ImmutablePair<>("2016/04/01", new BloomIndexFileInfo("2")),
            new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("1")),
            new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")),
            new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003")));
        assertEquals(expected, filesList);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) TestHoodieMetadataBase(org.apache.hudi.client.functional.TestHoodieMetadataBase) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair)
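As a quick aside on reading the loadColumnRangesFromFiles output: whether range pruning can skip a file depends entirely on whether its BloomIndexFileInfo carries min/max record keys. A minimal sketch, using only the accessors exercised in the assertions above (filesList being the result of index.loadColumnRangesFromFiles as in the test):

    // Minimal sketch: files with key ranges can be range-pruned, files without must always be checked.
    for (Pair<String, BloomIndexFileInfo> entry : filesList) {
        BloomIndexFileInfo info = entry.getRight();
        if (info.hasKeyRanges()) {
            System.out.println(entry.getLeft() + " -> keys in [" + info.getMinRecordKey() + ", " + info.getMaxRecordKey() + "]");
        } else {
            System.out.println(entry.getLeft() + " -> no key ranges recorded (e.g. written from an empty record list)");
        }
    }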

Example 63 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestHoodieBloomIndex method testBloomFilterFalseError.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have two hoodie records
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    // We write record1 to a parquet file, using a bloom filter having both records
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record2.getRecordKey());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter);
    String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1);
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    // We do the tag
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(bloomIndex, recordRDD, table);
    // Check results
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            assertEquals(record.getCurrentLocation().getFileId(), fileId);
        } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
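The point of this test is that a bloom filter answer of "might contain" is not proof of presence; the index still has to check the actual file contents. A standalone sketch of that property, using the same BloomFilterFactory call as above (the key strings here are illustrative placeholders):

    // Sketch: mightContain() can return true for a key that was never written to the data file
    // (the test above forces this by adding record2's key to the filter while only record1 is in the parquet file),
    // so the tagging step must verify candidates against the real keys on storage.
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add("key-actually-written");
    filter.add("key-never-written");             // simulates the false-positive candidate
    filter.mightContain("key-actually-written"); // true
    filter.mightContain("key-never-written");    // also true, although the key is not in the file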

Example 64 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestHoodieBloomIndex method testTagLocation.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have some records to be tagged (two different partitions)
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // place same row key under a different partition.
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(bloomIndex, recordRDD, hoodieTable);
    // Should not find any files
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        assertFalse(record.isCurrentLocationKnown());
    }
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    final String partition1 = "2016/01/31";
    final String partition2 = "2015/01/31";
    // We create three parquet files, one record each, across two different partitions
    final String fileId1 = UUID.randomUUID().toString();
    final String commit1 = "0000001";
    Path baseFilePath = testTable.forCommit(commit1).withInserts(partition1, fileId1, Collections.singletonList(record1));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partition1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commit1, WriteOperationType.UPSERT, Collections.singletonList(partition1), partitionToFilesNameLengthMap, false, false);
    final String fileId2 = UUID.randomUUID().toString();
    final String commit2 = "0000002";
    baseFilePath = testTable.forCommit(commit2).withInserts(partition1, fileId2, Collections.singletonList(record2));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(partition1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commit2, WriteOperationType.UPSERT, Collections.singletonList(partition1), partitionToFilesNameLengthMap, false, false);
    final String fileId3 = UUID.randomUUID().toString();
    final String commit3 = "0000003";
    baseFilePath = testTable.forCommit(commit3).withInserts(partition2, fileId3, Collections.singletonList(record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(partition2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commit3, WriteOperationType.UPSERT, Collections.singletonList(partition2), partitionToFilesNameLengthMap, false, false);
    // We do the tag again
    taggedRecordRDD = tagLocation(bloomIndex, recordRDD, HoodieSparkTable.create(config, context, metaClient));
    // Check results
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        if (record.getRecordKey().equals(rowKey1)) {
            if (record.getPartitionPath().equals(partition2)) {
                assertEquals(record.getCurrentLocation().getFileId(), fileId3);
            } else {
                assertEquals(record.getCurrentLocation().getFileId(), fileId1);
            }
        } else if (record.getRecordKey().equals(rowKey2)) {
            assertEquals(record.getCurrentLocation().getFileId(), fileId2);
        } else if (record.getRecordKey().equals(rowKey3)) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) TestHoodieMetadataBase(org.apache.hudi.client.functional.TestHoodieMetadataBase) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair)
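One detail worth calling out: the second tagLocation call above constructs a fresh HoodieSparkTable inline so that the index sees the file slices added by commits 0000001 through 0000003. A minimal sketch of that refresh step, using only calls already shown in these examples (HoodieTableMetaClient.reload appears in the testBloomFilterFalseError example):

    // Sketch: rebuild the table view after new commits so tagging sees the latest file slices.
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable freshTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> retagged = tagLocation(bloomIndex, recordRDD, freshTable);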

Example 65 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class SparkBulkInsertHelper method bulkInsert.

@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime, HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
    // De-dupe/merge if needed
    HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
    if (performDedupe) {
        dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
    }
    final HoodieData<HoodieRecord<T>> repartitionedRecords;
    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent() ? userDefinedBulkInsertPartitioner.get() : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
    // Only JavaRDD is supported by the Spark partitioner, but the BulkInsertPartitioner API does not enforce this; see TODO HUDI-3463
    repartitionedRecords = HoodieJavaRDD.of((JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));
    // generate new file ID prefixes for each output partition
    final List<String> fileIDPrefixes = IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords).mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(), config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true).flatMap(List::iterator);
    return HoodieJavaRDD.of(writeStatusRDD);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) IntStream(java.util.stream.IntStream) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Option(org.apache.hudi.common.util.Option) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Collectors(java.util.stream.Collectors) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) WriteHandleFactory(org.apache.hudi.io.WriteHandleFactory) List(java.util.List) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) BulkInsertInternalPartitionerFactory(org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory) BulkInsertMapFunction(org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) FSUtils(org.apache.hudi.common.fs.FSUtils) JavaRDD(org.apache.spark.api.java.JavaRDD)
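For orientation, a hedged sketch of a call site for this helper; the parameter order comes from the signature above, but SparkBulkInsertHelper.newInstance() and config.getBulkInsertShuffleParallelism() are assumed accessors (by analogy with HoodieWriteHelper.newInstance() used in the method) and should be checked against your Hudi version.

    // Hedged sketch of invoking bulkInsert; newInstance() and getBulkInsertShuffleParallelism() are assumptions.
    HoodieData<WriteStatus> statuses = SparkBulkInsertHelper.newInstance().bulkInsert(
        records,                                   // HoodieData<HoodieRecord<T>> to write
        instantTime,                               // the commit instant being written
        table,                                     // HoodieTable from HoodieSparkTable.create(...)
        config,                                    // HoodieWriteConfig
        true,                                      // performDedupe
        Option.empty(),                            // no user-defined partitioner -> sort-mode default is used
        false,                                     // useWriterSchema
        config.getBulkInsertShuffleParallelism(),  // parallelism / number of output file groups
        new CreateHandleFactory<>());              // WriteHandleFactory for new base files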

Aggregations

HoodieTable (org.apache.hudi.table.HoodieTable): 133
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 105
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 76
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 75
List (java.util.List): 64
Test (org.junit.jupiter.api.Test): 63
ArrayList (java.util.ArrayList): 58
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 57
WriteStatus (org.apache.hudi.client.WriteStatus): 49
Path (org.apache.hadoop.fs.Path): 48
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 46
Option (org.apache.hudi.common.util.Option): 46
IOException (java.io.IOException): 44
Map (java.util.Map): 44
Collectors (java.util.stream.Collectors): 44
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 43
HashMap (java.util.HashMap): 41
Pair (org.apache.hudi.common.util.collection.Pair): 39
HoodieKey (org.apache.hudi.common.model.HoodieKey): 38
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 38