use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
the class TestUpdateSchemaEvolution method prepareFirstRecordCommit.
private WriteStatus prepareFirstRecordCommit(List<String> recordsStrs) throws IOException {
  // Create a bunch of records with an old version of schema
  final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.avsc");
  final HoodieSparkTable table = HoodieSparkTable.create(config, context);
  final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
    List<HoodieRecord> insertRecords = new ArrayList<>();
    for (String recordStr : recordsStrs) {
      RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
      insertRecords.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    Map<String, HoodieRecord> insertRecordMap = insertRecords.stream().collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
    HoodieCreateHandle<?, ?, ?, ?> createHandle = new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
    createHandle.write();
    return createHandle.close().get(0);
  }).collect();
  final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
  FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
  return statuses.get(0);
}
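The pattern this helper repeats for every input JSON string is: parse the string into a RawTripTestPayload, derive the HoodieKey from the payload's row key and partition path, and wrap both in a HoodieAvroRecord. A minimal sketch of just that construction step, with a hypothetical recordJson value shaped like the trip records used elsewhere in these tests:
// Hypothetical input; any JSON carrying the fields RawTripTestPayload expects would do.
String recordJson = "{\"_row_key\":\"key-1\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
RawTripTestPayload payload = new RawTripTestPayload(recordJson);
// The key pairs the record key with its partition path; the payload carries the row data.
HoodieRecord record = new HoodieAvroRecord(new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload);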
use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
the class TestBucketIdentifier method testBucketIdWithSimpleRecordKey.
@Test
public void testBucketIdWithSimpleRecordKey() {
  String recordKeyField = "_row_key";
  String indexKeyField = "_row_key";
  GenericRecord record = KeyGeneratorTestUtilities.getRecord();
  HoodieRecord hoodieRecord = new HoodieAvroRecord(new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
  int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
  assert bucketId == BucketIdentifier.getBucketId(Arrays.asList(record.get(indexKeyField).toString()), 8);
}
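The same record-based getBucketId overload used above can also be applied to a batch of records, for example to group incoming records by their target bucket before dispatching writes. A small sketch under the assumption that a list named records is available and every record carries the "_row_key" index field (java.util.stream.Collectors assumed imported):
// Group HoodieRecords by bucket id (8 buckets), mirroring the single-record lookup asserted above.
Map<Integer, List<HoodieRecord>> recordsByBucket = records.stream()
    .collect(Collectors.groupingBy(r -> BucketIdentifier.getBucketId(r, indexKeyField, 8)));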
use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
the class TestHoodieBucketIndex method testTagLocation.
@Test
public void testTagLocation() throws Exception {
  String rowKey1 = UUID.randomUUID().toString();
  String rowKey2 = UUID.randomUUID().toString();
  String rowKey3 = UUID.randomUUID().toString();
  String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  JavaRDD<HoodieRecord<HoodieAvroRecord>> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
  HoodieWriteConfig config = makeConfig();
  HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
  HoodieBucketIndex bucketIndex = new HoodieBucketIndex(config);
  HoodieData<HoodieRecord<HoodieAvroRecord>> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table);
  assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(r -> r.isCurrentLocationKnown()));
  HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA);
  testTable.addCommit("001").withInserts("2016/01/31", getRecordFileId(record1), record1);
  testTable.addCommit("002").withInserts("2016/01/31", getRecordFileId(record2), record2);
  testTable.addCommit("003").withInserts("2016/01/31", getRecordFileId(record3), record3);
  taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, HoodieSparkTable.create(config, context, metaClient));
  assertFalse(taggedRecordRDD.collectAsList().stream().filter(r -> r.isCurrentLocationKnown()).filter(r -> BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId()) != getRecordBucketId(r)).findAny().isPresent());
  assertTrue(taggedRecordRDD.collectAsList().stream().filter(r -> r.getPartitionPath().equals("2015/01/31") && !r.isCurrentLocationKnown()).count() == 1L);
}
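Read together, the final assertions check that every tagged record's file id encodes the bucket that BucketIdentifier would compute for its key, while the lone 2015/01/31 record, which was never written, stays untagged. The per-record check can also be written out explicitly, as a sketch that reuses the test's own getRecordBucketId helper:
for (HoodieRecord r : taggedRecordRDD.collectAsList()) {
  if (r.isCurrentLocationKnown()) {
    // The bucket encoded in the assigned file id should match the bucket derived from the record key.
    int bucketFromFileId = BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId());
    assertEquals(getRecordBucketId(r), bucketFromFileId);
  }
}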
use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
the class TestSparkHoodieHBaseIndex method testDelete.
@Test
public void testDelete() throws Exception {
  final String newCommitTime = "001";
  final int numRecords = 10;
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
  // Load to memory
  HoodieWriteConfig config = getConfig();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    // Test tagLocation without any entries in index
    JavaRDD<HoodieRecord> records1 = tagLocation(index, writeRecords, hoodieTable);
    assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count());
    // Insert records
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
    assertNoWriteErrors(writeStatues.collect());
    writeClient.commit(newCommitTime, writeStatues);
    // Now tagLocation for these records, hbaseIndex should tag them correctly
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(numRecords, records2.stream().filter(record -> record.isCurrentLocationKnown()).count());
    assertEquals(numRecords, records2.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(numRecords, records2.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
    // Delete all records. This has to be done directly as deleting index entries
    // is not implemented via HoodieWriteClient
    JavaRDD<WriteStatus> deleteWriteStatues = writeStatues.map(w -> {
      WriteStatus newWriteStatus = new WriteStatus(true, 1.0);
      w.getWrittenRecords().forEach(r -> newWriteStatus.markSuccess(new HoodieAvroRecord(r.getKey(), null), Option.empty()));
      assertEquals(w.getTotalRecords(), newWriteStatus.getTotalRecords());
      newWriteStatus.setStat(new HoodieWriteStat());
      return newWriteStatus;
    });
    // if not for this caching, due to RDD chaining/lineage, first time update is called again when subsequent update is called.
    // So caching here to break the chain and so future update does not re-trigger update of older Rdd.
    deleteWriteStatues.cache();
    JavaRDD<WriteStatus> deleteStatus = updateLocation(index, deleteWriteStatues, hoodieTable);
    assertEquals(deleteStatus.count(), deleteWriteStatues.count());
    assertNoWriteErrors(deleteStatus.collect());
    // Ensure no records can be tagged
    List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(0, records3.stream().filter(record -> record.isCurrentLocationKnown()).count());
    assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(0, records3.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  }
}
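The delete step bypasses the write client: it emits WriteStatus objects whose successful records carry a null payload, and updating the index with those statuses removes the corresponding HBase index entries, which is why tagLocation finds nothing afterwards. A compact sketch of building one such status from an earlier write, where writeStatus stands for any element of the writeStatues RDD above:
WriteStatus deleteMarkers = new WriteStatus(true, 1.0);
// A HoodieAvroRecord with a null payload acts as a delete marker for that key's index entry.
writeStatus.getWrittenRecords().forEach(r ->
    deleteMarkers.markSuccess(new HoodieAvroRecord(r.getKey(), null), Option.empty()));
deleteMarkers.setStat(new HoodieWriteStat());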
use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
the class TestSparkHoodieHBaseIndex method testTagLocationAndPartitionPathUpdateWithExplicitRollback.
@Test
public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws Exception {
  final int numRecords = 10;
  final String oldPartitionPath = "1970/01/01";
  final String emptyHoodieRecordPayloadClasssName = EmptyHoodieRecordPayload.class.getName();
  HoodieWriteConfig config = getConfigBuilder(100, true, true).withRollbackUsingMarkers(false).build();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    final String firstCommitTime = writeClient.startCommit();
    List<HoodieRecord> newRecords = dataGen.generateInserts(firstCommitTime, numRecords);
    List<HoodieRecord> oldRecords = new LinkedList();
    for (HoodieRecord newRecord : newRecords) {
      HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath);
      HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData());
      oldRecords.add(hoodieRecord);
    }
    JavaRDD<HoodieRecord> newWriteRecords = jsc().parallelize(newRecords, 1);
    JavaRDD<HoodieRecord> oldWriteRecords = jsc().parallelize(oldRecords, 1);
    // first commit old record
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> beforeFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect();
    JavaRDD<WriteStatus> oldWriteStatues = writeClient.upsert(oldWriteRecords, firstCommitTime);
    updateLocation(index, oldWriteStatues, hoodieTable);
    writeClient.commit(firstCommitTime, oldWriteStatues);
    List<HoodieRecord> afterFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    final String secondCommitTime = writeClient.startCommit();
    List<HoodieRecord> beforeSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
    JavaRDD<WriteStatus> newWriteStatues = writeClient.upsert(newWriteRecords, secondCommitTime);
    updateLocation(index, newWriteStatues, hoodieTable);
    writeClient.commit(secondCommitTime, newWriteStatues);
    List<HoodieRecord> afterSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
    writeClient.rollback(secondCommitTime);
    List<HoodieRecord> afterRollback = tagLocation(index, newWriteRecords, hoodieTable).collect();
    // Verify the first commit
    assertEquals(numRecords, beforeFirstTaggedRecords.stream().filter(record -> record.getCurrentLocation() == null).count());
    assertEquals(numRecords, afterFirstTaggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).count());
    // Verify the second commit
    assertEquals(numRecords, beforeSecondTaggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count());
    assertEquals(numRecords * 2, beforeSecondTaggedRecords.stream().count());
    assertEquals(numRecords, afterSecondTaggedRecords.stream().count());
    assertEquals(numRecords, afterSecondTaggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count());
    // Verify the rollback
    // If an exception occurs after hbase writes the index and the index does not roll back,
    // the currentLocation information will not be returned.
    assertEquals(numRecords, afterRollback.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count());
    assertEquals(numRecords * 2, beforeSecondTaggedRecords.stream().count());
    assertEquals(numRecords, afterRollback.stream().filter(HoodieRecord::isCurrentLocationKnown).filter(record -> record.getCurrentLocation().getInstantTime().equals(firstCommitTime)).count());
  }
}
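The two record sets differ only in partition path: each old record reuses a new record's key and payload but is keyed into the 1970/01/01 partition, which is what forces the index to handle a partition-path update on the second commit. That construction, isolated as a sketch where existing stands for one of the generated records:
// Re-key an existing record into a different partition while keeping its payload.
HoodieKey movedKey = new HoodieKey(existing.getRecordKey(), oldPartitionPath);
HoodieRecord moved = new HoodieAvroRecord(movedKey, (HoodieRecordPayload) existing.getData());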