Search in sources :

Example 6 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieGlobalBloomIndex method tagLocationBacktoRecords.

/**
 * Tagging for global index should only consider the record key.
 */
@Override
protected <R> HoodieData<HoodieRecord<R>> tagLocationBacktoRecords(HoodiePairData<HoodieKey, HoodieRecordLocation> keyLocationPairs, HoodieData<HoodieRecord<R>> records) {
    HoodiePairData<String, HoodieRecord<R>> incomingRowKeyRecordPairs = records.mapToPair(record -> new ImmutablePair<>(record.getRecordKey(), record));
    HoodiePairData<String, Pair<HoodieRecordLocation, HoodieKey>> existingRecordKeyToRecordLocationHoodieKeyMap = keyLocationPairs.mapToPair(p -> new ImmutablePair<>(p.getKey().getRecordKey(), new ImmutablePair<>(p.getValue(), p.getKey())));
    // Here as the records might have more data than rowKeys (some rowKeys' fileId is null), so we do left outer join.
    return incomingRowKeyRecordPairs.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> {
        final HoodieRecord<R> hoodieRecord = record.getLeft();
        final Option<Pair<HoodieRecordLocation, HoodieKey>> recordLocationHoodieKeyPair = record.getRight();
        if (recordLocationHoodieKeyPair.isPresent()) {
            // Record key matched to file
            if (config.getBloomIndexUpdatePartitionPath() && !recordLocationHoodieKeyPair.get().getRight().getPartitionPath().equals(hoodieRecord.getPartitionPath())) {
                // Create an empty record to delete the record in the old partition
                HoodieRecord<R> deleteRecord = new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), new EmptyHoodieRecordPayload());
                deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get().getLeft());
                deleteRecord.seal();
                // Tag the incoming record for inserting to the new partition
                HoodieRecord<R> insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty());
                return Arrays.asList(deleteRecord, insertRecord).iterator();
            } else {
                // When it differs, the record will still be updated at its old partition.
                return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), (HoodieRecordPayload) hoodieRecord.getData()), Option.ofNullable(recordLocationHoodieKeyPair.get().getLeft()))).iterator();
            }
        } else {
            return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator();
        }
    });
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 7 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class TestHoodieConcatHandle method testInsert.

@Test
public void testInsert() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build();
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
    // Get some records belong to the same partition (2021/09/11)
    String insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}";
    String insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}";
    List<HoodieRecord> records1 = new ArrayList<>();
    RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1);
    RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2);
    records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
    records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
    int startInstant = 1;
    String firstCommitTime = makeNewCommitTime(startInstant++);
    // First insert
    writeClient.startCommitWithTime(firstCommitTime);
    writeClient.insert(records1, firstCommitTime);
    String partitionPath = "2021/09/11";
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read out the bloom filter and make sure filter can answer record exist or not
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records1) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}";
    insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}";
    List<HoodieRecord> records2 = new ArrayList<>();
    insertRow1 = new RawTripTestPayload(insertRecordStr1);
    insertRow2 = new RawTripTestPayload(insertRecordStr2);
    // The recordKey of records2 and records1 are the same, but the values of other fields are different
    records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
    records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
    String newCommitTime = makeNewCommitTime(startInstant++);
    writeClient.startCommitWithTime(newCommitTime);
    // Second insert is the same as the _row_key of the first one,test allowDuplicateInserts
    writeClient.insert(records2, newCommitTime);
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // verify new incremental file group is same as the previous one
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    filePath = allFiles[0].getPath();
    // The final result should be a collection of records1 and records2
    records1.addAll(records2);
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
        assertEquals(index + 1, record.get("number"));
        index++;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieKey(org.apache.hudi.common.model.HoodieKey) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 8 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class TestJavaCopyOnWriteActionExecutor method testFileSizeUpsertRecords.

@Test
public void testFileSizeUpsertRecords() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(64 * 1024).hfileMaxFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024).build()).build();
    String instantTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    List<HoodieRecord> records = new ArrayList<>();
    // Approx 1150 records are written for block size of 64KB
    for (int i = 0; i < 2000; i++) {
        String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    // Insert new records
    BaseJavaCommitActionExecutor actionExecutor = new JavaUpsertCommitActionExecutor(context, config, table, instantTime, records);
    Arrays.asList(1).stream().map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator())).map(Transformations::flatten).collect(Collectors.toList());
    // Check the updated file
    int counts = 0;
    for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) {
        if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) {
            LOG.info(file.getName() + "-" + file.length());
            counts++;
        }
    }
    assertEquals(3, counts, "If the number of records are more than 1150, then there should be a new file");
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) File(java.io.File) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) Test(org.junit.jupiter.api.Test)

Example 9 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class TestFlinkHoodieBloomIndex method testCheckUUIDsAgainstOneFile.

@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
    final String partition = "2016/01/31";
    // Create some records to use
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // We write record1, record2 to a base file, but the bloom filter contains (record1,
    // record2, record3).
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record3.getRecordKey());
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
    String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2);
    String filename = testTable.getBaseFileNameById(fileId);
    // The bloom filter contains 3 records
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    assertTrue(filter.mightContain(record3.getRecordKey()));
    assertFalse(filter.mightContain(record4.getRecordKey()));
    // Compare with file
    List<String> uuids = asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
    List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
    assertEquals(results.size(), 2);
    assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
    assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
// assertTrue(results.get(0)._2().equals(filename));
// assertTrue(results.get(1)._2().equals(filename));
}
Also used : Path(org.apache.hadoop.fs.Path) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 10 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class TestFlinkHoodieBloomIndex method testCheckExists.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have some records to be tagged (two different partitions)
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // record key same as recordStr2
    String recordStr4 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
    HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
    HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
    HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4);
    List<HoodieKey> keys = asList(key1, key2, key3, key4);
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> toTagRecords = new ArrayList<>();
    toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null));
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable);
    Map<HoodieKey, Option<Pair<String, String>>> recordLocations = new HashMap<>();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) : Option.empty());
    }
    // Should not find any files
    for (Option<Pair<String, String>> record : recordLocations.values()) {
        assertTrue(!record.isPresent());
    }
    // We create three base file, each having one record. (two different partitions)
    String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
    String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2);
    String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4);
    // We do the tag again
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    List<HoodieRecord> toTagRecords1 = new ArrayList<>();
    for (HoodieKey key : keys) {
        taggedRecords.add(new HoodieAvroRecord(key, null));
    }
    taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable);
    recordLocations.clear();
    for (HoodieRecord taggedRecord : taggedRecords) {
        recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) : Option.empty());
    }
    // Check results
    for (Map.Entry<HoodieKey, Option<Pair<String, String>>> record : recordLocations.entrySet()) {
        if (record.getKey().getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            assertEquals(fileId1, record.getValue().get().getRight());
        } else if (record.getKey().getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertTrue(record.getValue().isPresent());
            if (record.getKey().getPartitionPath().equals("2015/01/31")) {
                assertEquals(fileId3, record.getValue().get().getRight());
            } else {
                assertEquals(fileId2, record.getValue().get().getRight());
            }
        } else if (record.getKey().getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.getValue().isPresent());
        }
    }
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) Option(org.apache.hudi.common.util.Option) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) HashMap(java.util.HashMap) Map(java.util.Map) HoodieMapPair(org.apache.hudi.common.data.HoodieMapPair) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)84 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)72 HoodieKey (org.apache.hudi.common.model.HoodieKey)68 ArrayList (java.util.ArrayList)38 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)37 RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload)31 Test (org.junit.jupiter.api.Test)30 GenericRecord (org.apache.avro.generic.GenericRecord)29 Path (org.apache.hadoop.fs.Path)26 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)25 IOException (java.io.IOException)24 HoodieTable (org.apache.hudi.table.HoodieTable)24 List (java.util.List)23 Schema (org.apache.avro.Schema)23 HashMap (java.util.HashMap)22 Pair (org.apache.hudi.common.util.collection.Pair)21 Map (java.util.Map)20 Collectors (java.util.stream.Collectors)20 Arrays (java.util.Arrays)17 Option (org.apache.hudi.common.util.Option)16