Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class TestHoodieIndex, method testTagLocationAndFetchRecordLocations:
@ParameterizedTest
@MethodSource("regularIndexTypeParams")
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
  setUp(indexType, populateMetaFields, enableMetadataIndex);
  String p1 = "2016/01/31";
  String p2 = "2015/01/31";
  String rowKey1 = UUID.randomUUID().toString();
  String rowKey2 = UUID.randomUUID().toString();
  String rowKey3 = UUID.randomUUID().toString();
  String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  // place same row key under a different partition.
  String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

  String newCommitTime = writeClient.startCommit();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  writeClient.upsert(recordRDD, newCommitTime);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);

  // Should not find any files
  for (HoodieRecord record : taggedRecordRDD.collect()) {
    assertFalse(record.isCurrentLocationKnown());
  }

  // We create three parquet files, each having one record (two different partitions)
  HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
  final String fileId1 = "fileID1";
  final String fileId2 = "fileID2";
  final String fileId3 = "fileID3";
  Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();

  Path baseFilePath = testTable.forCommit("0000001").withInserts(p1, fileId1, Collections.singletonList(record1));
  long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);

  partitionToFilesNameLengthMap.clear();
  baseFilePath = testTable.forCommit("0000002").withInserts(p1, fileId2, Collections.singletonList(record2));
  baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000002", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);

  partitionToFilesNameLengthMap.clear();
  baseFilePath = testTable.forCommit("0000003").withInserts(p2, fileId3, Collections.singletonList(record4));
  baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
  testTable.doWriteOperation("0000003", WriteOperationType.UPSERT, Arrays.asList(p1, p2), partitionToFilesNameLengthMap, false, false);

  // We do the tag again
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
  List<HoodieRecord> records = taggedRecordRDD.collect();

  // Check results
  for (HoodieRecord record : records) {
    if (record.getRecordKey().equals(rowKey1)) {
      if (record.getPartitionPath().equals(p2)) {
        assertEquals(record.getCurrentLocation().getFileId(), fileId3);
      } else {
        assertEquals(record.getCurrentLocation().getFileId(), fileId1);
      }
    } else if (record.getRecordKey().equals(rowKey2)) {
      assertEquals(record.getCurrentLocation().getFileId(), fileId2);
    } else if (record.getRecordKey().equals(rowKey3)) {
      assertFalse(record.isCurrentLocationKnown());
    }
  }

  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable);
  for (Tuple2<HoodieKey, Option<Pair<String, String>>> entry : recordLocations.collect()) {
    if (entry._1.getRecordKey().equals(rowKey1)) {
      assertTrue(entry._2.isPresent(), "Row1 should have been present ");
      if (entry._1.getPartitionPath().equals(p2)) {
        assertTrue(entry._2.isPresent(), "Row1 should have been present ");
        assertEquals(entry._2.get().getRight(), fileId3);
      } else {
        assertEquals(entry._2.get().getRight(), fileId1);
      }
    } else if (entry._1.getRecordKey().equals(rowKey2)) {
      assertTrue(entry._2.isPresent(), "Row2 should have been present ");
      assertEquals(entry._2.get().getRight(), fileId2);
    } else if (entry._1.getRecordKey().equals(rowKey3)) {
      assertFalse(entry._2.isPresent(), "Row3 should have been absent ");
    }
  }
}
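The assertions above can be read as collapsing each tagged record into its resolved file id. Below is a minimal, hypothetical helper (not part of Hudi) that relies only on the HoodieRecord accessors used in the test; the TaggedRecordSummary class name and the partitionPath-plus-recordKey map key are assumptions made for illustration.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;

public class TaggedRecordSummary {

  // Collapses tagged records into (partitionPath + "/" + recordKey) -> Option<fileId>.
  // The composite key matters because the test above stores rowKey1 under two partitions.
  public static Map<String, Option<String>> summarize(List<HoodieRecord> taggedRecords) {
    Map<String, Option<String>> locations = new HashMap<>();
    for (HoodieRecord record : taggedRecords) {
      Option<String> fileId = record.isCurrentLocationKnown()
          ? Option.of(record.getCurrentLocation().getFileId())
          : Option.empty();
      locations.put(record.getPartitionPath() + "/" + record.getRecordKey(), fileId);
    }
    return locations;
  }
}

With the data above, the entry for rowKey3 would carry Option.empty(), matching the assertFalse(record.isCurrentLocationKnown()) check, while rowKey1 resolves to fileId1 under p1 and fileId3 under p2.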
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class SparkHoodieHBaseIndex, method updateLocationFunction:
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
  return (partition, statusIterator) -> {
    List<WriteStatus> writeStatusList = new ArrayList<>();
    // Grab the global HBase connection
    synchronized (SparkHoodieHBaseIndex.class) {
      if (hbaseConnection == null || hbaseConnection.isClosed()) {
        hbaseConnection = getHBaseConnection();
      }
    }
    final long startTimeForPutsTask = DateTime.now().getMillis();
    LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);
    try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
      final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
      while (statusIterator.hasNext()) {
        WriteStatus writeStatus = statusIterator.next();
        List<Mutation> mutations = new ArrayList<>();
        try {
          long numOfInserts = writeStatus.getStat().getNumInserts();
          LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
          LOG.info("Total inserts in this job: " + this.totalNumInserts);
          LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
          // Any calls beyond `multiPutBatchSize` within a second will be rate limited
          for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
            if (!writeStatus.isErrored(rec.getKey())) {
              Option<HoodieRecordLocation> loc = rec.getNewLocation();
              if (loc.isPresent()) {
                if (rec.getCurrentLocation() != null) {
                  // This is an update, no need to update index
                  continue;
                }
                Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
                put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
                put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
                put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
                mutations.add(put);
              } else {
                // Delete existing index for a deleted record
                Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
                mutations.add(delete);
              }
            }
            if (mutations.size() < multiPutBatchSize) {
              continue;
            }
            doMutations(mutator, mutations, limiter);
          }
          // process remaining puts and deletes, if any
          doMutations(mutator, mutations, limiter);
        } catch (Exception e) {
          Exception we = new Exception("Error updating index for " + writeStatus, e);
          LOG.error(we);
          writeStatus.setGlobalError(we);
        }
        writeStatusList.add(writeStatus);
      }
      final long endPutsTime = DateTime.now().getMillis();
      LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
    } catch (IOException e) {
      throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
    }
    return writeStatusList.iterator();
  };
}
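The per-record branching above reduces to a single Option-valued decision: a newly located record yields a Put, a record with no new location yields a Delete, and an update to an already indexed record yields nothing. The sketch below illustrates that decision only; it is not Hudi code, and the HBaseIndexMutationSketch class and its column-name constants are assumptions standing in for the index's private SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, FILE_NAME_COLUMN, and PARTITION_PATH_COLUMN fields.

import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;

public class HBaseIndexMutationSketch {

  // Hypothetical column names; the real index keeps these as private constants.
  private static final byte[] FAMILY = Bytes.toBytes("_s");
  private static final byte[] COMMIT_TS = Bytes.toBytes("commit_ts");
  private static final byte[] FILE_NAME = Bytes.toBytes("file_name");
  private static final byte[] PARTITION_PATH = Bytes.toBytes("partition_path");

  // Returns the index mutation for a written record, or an empty Option when the
  // existing index entry is still valid (an update within the same file group).
  public static Option<Mutation> toMutation(HoodieRecord record) {
    Option<HoodieRecordLocation> newLocation = record.getNewLocation();
    if (newLocation.isPresent()) {
      if (record.getCurrentLocation() != null) {
        // Update of an already indexed record: key -> location mapping is unchanged.
        return Option.empty();
      }
      Put put = new Put(Bytes.toBytes(record.getRecordKey()));
      put.addColumn(FAMILY, COMMIT_TS, Bytes.toBytes(newLocation.get().getInstantTime()));
      put.addColumn(FAMILY, FILE_NAME, Bytes.toBytes(newLocation.get().getFileId()));
      put.addColumn(FAMILY, PARTITION_PATH, Bytes.toBytes(record.getPartitionPath()));
      return Option.of(put);
    }
    // No new location means the record was deleted; drop its index entry.
    return Option.of(new Delete(Bytes.toBytes(record.getRecordKey())));
  }
}

Keeping the decision as an Option<Mutation> leaves batching and rate limiting to the caller, which is exactly how updateLocationFunction() buffers mutations and flushes them through doMutations() once multiPutBatchSize is reached.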
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class BaseSparkCommitActionExecutor, method mapPartitionsAsRDD:
private HoodieData<WriteStatus> mapPartitionsAsRDD(HoodieData<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
  JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> mappedRDD =
      HoodieJavaPairRDD.getJavaPairRDD(dedupedRecords.mapToPair(
          record -> Pair.of(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)));
  JavaPairRDD<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> partitionedRDD;
  if (table.requireSortedRecords()) {
    // Partition and sort within each partition as a single step. This is faster than partitioning first and then
    // applying a sort.
    Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> comparator =
        (Comparator<Tuple2<HoodieKey, Option<HoodieRecordLocation>>> & Serializable) (t1, t2) -> {
          HoodieKey key1 = t1._1;
          HoodieKey key2 = t2._1;
          return key1.getRecordKey().compareTo(key2.getRecordKey());
        };
    partitionedRDD = mappedRDD.repartitionAndSortWithinPartitions(partitioner, comparator);
  } else {
    // Partition only
    partitionedRDD = mappedRDD.partitionBy(partitioner);
  }
  return HoodieJavaRDD.of(partitionedRDD.map(Tuple2::_2).mapPartitionsWithIndex((partition, recordItr) -> {
    if (WriteOperationType.isChangingRecords(operationType)) {
      return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
    } else {
      return handleInsertPartition(instantTime, partition, recordItr, partitioner);
    }
  }, true).flatMap(List::iterator));
}
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class SparkBulkInsertHelper, method bulkInsert:
@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
    HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
    boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
  // De-dupe/merge if needed
  HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
  if (performDedupe) {
    dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance()
        .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
  }
  final HoodieData<HoodieRecord<T>> repartitionedRecords;
  BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
      ? userDefinedBulkInsertPartitioner.get()
      : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
  // only JavaRDD is supported for Spark partitioner, but it is not enforced by BulkInsertPartitioner API. To improve this, TODO HUDI-3463
  repartitionedRecords = HoodieJavaRDD.of(
      (JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));
  // generate new file ID prefixes for each output partition
  final List<String> fileIDPrefixes =
      IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
  JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords)
      .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(), config,
          table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true)
      .flatMap(List::iterator);
  return HoodieJavaRDD.of(writeStatusRDD);
}
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class RestorePlanActionExecutor, method execute:
@Override
public Option<HoodieRestorePlan> execute() {
  final HoodieInstant restoreInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.RESTORE_ACTION, instantTime);
  try {
    // Get all the commits on the timeline after the provided commit time
    // rollback pending clustering instants first before other instants (See HUDI-3362)
    List<HoodieInstant> pendingClusteringInstantsToRollback = table.getActiveTimeline().filterPendingReplaceTimeline()
        .filter(instant -> ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant))
        .getReverseOrderedInstants()
        .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime))
        .collect(Collectors.toList());
    // Get all the commits on the timeline after the provided commit time
    List<HoodieInstant> commitInstantsToRollback = table.getActiveTimeline().getWriteTimeline()
        .getReverseOrderedInstants()
        .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime))
        .filter(instant -> !pendingClusteringInstantsToRollback.contains(instant))
        .collect(Collectors.toList());
    // Combine both lists - first rollback pending clustering and then rollback all other commits
    List<HoodieInstantInfo> instantsToRollback =
        Stream.concat(pendingClusteringInstantsToRollback.stream(), commitInstantsToRollback.stream())
            .map(entry -> new HoodieInstantInfo(entry.getTimestamp(), entry.getAction()))
            .collect(Collectors.toList());
    HoodieRestorePlan restorePlan = new HoodieRestorePlan(instantsToRollback, LATEST_RESTORE_PLAN_VERSION);
    table.getActiveTimeline().saveToRestoreRequested(restoreInstant, TimelineMetadataUtils.serializeRestorePlan(restorePlan));
    table.getMetaClient().reloadActiveTimeline();
    LOG.info("Requesting Restore with instant time " + restoreInstant);
    return Option.of(restorePlan);
  } catch (IOException e) {
    LOG.error("Got exception when saving restore requested file", e);
    throw new HoodieIOException(e.getMessage(), e);
  }
}
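Both timeline scans boil down to "keep every instant strictly newer than the restore instant, newest first". Hudi instant times are fixed-width timestamp strings, so HoodieActiveTimeline.GREATER_THAN amounts to a lexicographic comparison. The standalone sketch below mirrors that filtering on plain strings; the RollbackCandidates class is illustrative, not a Hudi API.

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

public class RollbackCandidates {

  // Keeps the instant times strictly after the restore instant, newest first,
  // mirroring getReverseOrderedInstants() plus the GREATER_THAN filter above.
  public static List<String> instantsAfter(List<String> instantTimes, String restoreInstantTime) {
    return instantTimes.stream()
        .filter(t -> t.compareTo(restoreInstantTime) > 0)
        .sorted(Comparator.reverseOrder())
        .collect(Collectors.toList());
  }
}

In the real executor this filtering is applied twice, once to pending clustering instants and once to the remaining write commits, and the two lists are concatenated so clustering instants are rolled back first.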