Use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.
The class BootstrapCommand, method createBootstrapIndexReader:
private BootstrapIndex.IndexReader createBootstrapIndexReader() {
  HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
  if (!index.useIndex()) {
    throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info");
  }
  return index.createReader();
}
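A minimal usage sketch of the returned reader (an assumption: IndexReader exposes getIndexedPartitionPaths() and getSourceFileMappingForPartition(String), as the HFile-backed implementation does; the helper method below is hypothetical):

private void printBootstrapMappings() throws Exception {
  // Hypothetical helper: enumerate the indexed partitions and print each
  // Hudi-file-to-source-file mapping recorded in the bootstrap index.
  try (BootstrapIndex.IndexReader reader = createBootstrapIndexReader()) {
    for (String partition : reader.getIndexedPartitionPaths()) {
      reader.getSourceFileMappingForPartition(partition).forEach(System.out::println);
    }
  }
}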
Use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.
The class TestBootstrapIndex, method testNoOpBootstrapIndex:
@Test
public void testNoOpBootstrapIndex() throws IOException {
  // Disable the bootstrap index in the table config and rewrite hoodie.properties.
  Properties props = metaClient.getTableConfig().getProps();
  props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), "false");
  Properties properties = new Properties();
  properties.putAll(props);
  HoodieTableConfig.create(metaClient.getFs(), new Path(metaClient.getMetaPath()), properties);
  // Reload the meta client so the factory sees the updated config.
  metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).build();
  BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
  assert (bootstrapIndex instanceof NoOpBootstrapIndex);
}
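For contrast, a sketch of the enabled path (assuming the table config's default bootstrap index class is the HFile-backed HFileBootstrapIndex, which is the current Hudi default):

// With BOOTSTRAP_INDEX_ENABLE left at its default of "true", the factory
// is expected to resolve to the HFile-backed implementation.
BootstrapIndex enabledIndex = BootstrapIndex.getBootstrapIndex(metaClient);
assert (enabledIndex instanceof HFileBootstrapIndex);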
Use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.
The class TestBootstrap, method testBootstrapCommon:
private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception {
  if (deltaCommit) {
    metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, bootstrapBasePath, true);
  } else {
    metaClient = HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE, bootstrapBasePath, true);
  }
  int totalRecords = 100;
  String keyGeneratorClass = partitioned ? SimpleKeyGenerator.class.getCanonicalName() : NonpartitionedKeyGenerator.class.getCanonicalName();
  final String bootstrapModeSelectorClass;
  final String bootstrapCommitInstantTs;
  final boolean checkNumRawFiles;
  final boolean isBootstrapIndexCreated;
  final int numInstantsAfterBootstrap;
  final List<String> bootstrapInstants;
  switch (mode) {
    case FULL_BOOTSTRAP_MODE:
      bootstrapModeSelectorClass = FullRecordBootstrapModeSelector.class.getCanonicalName();
      bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = false;
      isBootstrapIndexCreated = false;
      numInstantsAfterBootstrap = 1;
      bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs);
      break;
    case METADATA_BOOTSTRAP_MODE:
      bootstrapModeSelectorClass = MetadataOnlyBootstrapModeSelector.class.getCanonicalName();
      bootstrapCommitInstantTs = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = true;
      isBootstrapIndexCreated = true;
      numInstantsAfterBootstrap = 1;
      bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs);
      break;
    default:
      bootstrapModeSelectorClass = TestRandomBootstrapModeSelector.class.getName();
      bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = false;
      isBootstrapIndexCreated = true;
      numInstantsAfterBootstrap = 2;
      bootstrapInstants = Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
      break;
  }
  List<String> partitions = Arrays.asList("2020/04/01", "2020/04/02", "2020/04/03");
  long timestamp = Instant.now().toEpochMilli();
  Schema schema = generateNewDataSetAndReturnSchema(timestamp, totalRecords, partitions, bootstrapBasePath);
  HoodieWriteConfig config = getConfigBuilder(schema.toString())
      .withAutoCommit(true)
      .withSchema(schema.toString())
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build())
      .withBootstrapConfig(HoodieBootstrapConfig.newBuilder()
          .withBootstrapBasePath(bootstrapBasePath)
          .withBootstrapKeyGenClass(keyGeneratorClass)
          .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName())
          .withBootstrapParallelism(3)
          .withBootstrapModeSelector(bootstrapModeSelectorClass)
          .build())
      .build();
  SparkRDDWriteClientOverride client = new SparkRDDWriteClientOverride(context, config);
  client.bootstrap(Option.empty());
  checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
  // Rollback Bootstrap
  HoodieActiveTimeline.deleteInstantFile(metaClient.getFs(), metaClient.getMetaPath(),
      new HoodieInstant(State.COMPLETED, deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, bootstrapCommitInstantTs));
  metaClient.reloadActiveTimeline();
  client.rollbackFailedBootstrap();
  metaClient.reloadActiveTimeline();
  assertEquals(0, metaClient.getCommitsTimeline().countInstants());
  assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context)
      .stream().flatMap(f -> f.getValue().stream()).count());
  BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
  assertFalse(index.useIndex());
  // Run bootstrap again
  client = new SparkRDDWriteClientOverride(context, config);
  client.bootstrap(Option.empty());
  metaClient.reloadActiveTimeline();
  index = BootstrapIndex.getBootstrapIndex(metaClient);
  if (isBootstrapIndexCreated) {
    assertTrue(index.useIndex());
  } else {
    assertFalse(index.useIndex());
  }
  checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
  // Upsert case
  long updateTimestamp = Instant.now().toEpochMilli();
  String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2";
  generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath);
  JavaRDD<HoodieRecord> updateBatch = generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, context), schema);
  String newInstantTs = client.startCommit();
  client.upsert(updateBatch, newInstantTs);
  checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1, updateTimestamp, deltaCommit ? timestamp : updateTimestamp, deltaCommit, true);
  if (deltaCommit) {
    Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
    assertTrue(compactionInstant.isPresent());
    client.compact(compactionInstant.get());
    checkBootstrapResults(totalRecords, schema, compactionInstant.get(), checkNumRawFiles, numInstantsAfterBootstrap + 2, 2, updateTimestamp, updateTimestamp, !deltaCommit, Arrays.asList(compactionInstant.get()), !config.isPreserveHoodieCommitMetadataForCompaction());
  }
}
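In the test class, this helper is exercised through per-mode @Test entry points; a representative sketch (the method name here is hypothetical):

@Test
public void testMetadataBootstrapPartitionedMOR() throws Exception {
  // Hypothetical driver: partitioned MERGE_ON_READ table, metadata-only bootstrap.
  testBootstrapCommon(true, true, EffectiveMode.METADATA_BOOTSTRAP_MODE);
}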
Use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.
The class TestOrcBootstrap, method testBootstrapCommon:
private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception {
  if (deltaCommit) {
    metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, bootstrapBasePath, HoodieFileFormat.ORC);
  } else {
    metaClient = HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE, bootstrapBasePath, HoodieFileFormat.ORC);
  }
  int totalRecords = 100;
  String keyGeneratorClass = partitioned ? SimpleKeyGenerator.class.getCanonicalName() : NonpartitionedKeyGenerator.class.getCanonicalName();
  final String bootstrapModeSelectorClass;
  final String bootstrapCommitInstantTs;
  final boolean checkNumRawFiles;
  final boolean isBootstrapIndexCreated;
  final int numInstantsAfterBootstrap;
  final List<String> bootstrapInstants;
  switch (mode) {
    case FULL_BOOTSTRAP_MODE:
      bootstrapModeSelectorClass = FullRecordBootstrapModeSelector.class.getCanonicalName();
      bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = false;
      isBootstrapIndexCreated = false;
      numInstantsAfterBootstrap = 1;
      bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs);
      break;
    case METADATA_BOOTSTRAP_MODE:
      bootstrapModeSelectorClass = MetadataOnlyBootstrapModeSelector.class.getCanonicalName();
      bootstrapCommitInstantTs = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = true;
      isBootstrapIndexCreated = true;
      numInstantsAfterBootstrap = 1;
      bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs);
      break;
    default:
      bootstrapModeSelectorClass = TestRandomBootstrapModeSelector.class.getName();
      bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
      checkNumRawFiles = false;
      isBootstrapIndexCreated = true;
      numInstantsAfterBootstrap = 2;
      bootstrapInstants = Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
      break;
  }
  List<String> partitions = Arrays.asList("2020/04/01", "2020/04/02", "2020/04/03");
  long timestamp = Instant.now().toEpochMilli();
  Schema schema = generateNewDataSetAndReturnSchema(timestamp, totalRecords, partitions, bootstrapBasePath);
  HoodieWriteConfig config = getConfigBuilder(schema.toString())
      .withAutoCommit(true)
      .withSchema(schema.toString())
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build())
      .withBootstrapConfig(HoodieBootstrapConfig.newBuilder()
          .withBootstrapBasePath(bootstrapBasePath)
          .withBootstrapKeyGenClass(keyGeneratorClass)
          .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName())
          .withBootstrapParallelism(3)
          .withBootstrapModeSelector(bootstrapModeSelectorClass)
          .build())
      .build();
  SparkRDDWriteClientOverride client = new SparkRDDWriteClientOverride(context, config);
  client.bootstrap(Option.empty());
  checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
  // Rollback Bootstrap
  if (deltaCommit) {
    FileCreateUtils.deleteDeltaCommit(metaClient.getBasePath(), bootstrapCommitInstantTs);
  } else {
    FileCreateUtils.deleteCommit(metaClient.getBasePath(), bootstrapCommitInstantTs);
  }
  client.rollbackFailedBootstrap();
  metaClient.reloadActiveTimeline();
  assertEquals(0, metaClient.getCommitsTimeline().countInstants());
  assertEquals(0L, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context)
      .stream().flatMap(f -> f.getValue().stream()).count());
  BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
  assertFalse(index.useIndex());
  // Run bootstrap again
  client = new SparkRDDWriteClientOverride(context, config);
  client.bootstrap(Option.empty());
  metaClient.reloadActiveTimeline();
  index = BootstrapIndex.getBootstrapIndex(metaClient);
  if (isBootstrapIndexCreated) {
    assertTrue(index.useIndex());
  } else {
    assertFalse(index.useIndex());
  }
  checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap, numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
  // Upsert case
  long updateTimestamp = Instant.now().toEpochMilli();
  String updateSPath = tmpFolder.toAbsolutePath().toString() + "/data2";
  generateNewDataSetAndReturnSchema(updateTimestamp, totalRecords, partitions, updateSPath);
  JavaRDD<HoodieRecord> updateBatch = generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, context), schema);
  String newInstantTs = client.startCommit();
  client.upsert(updateBatch, newInstantTs);
  checkBootstrapResults(totalRecords, schema, newInstantTs, false, numInstantsAfterBootstrap + 1, updateTimestamp, deltaCommit ? timestamp : updateTimestamp, deltaCommit, true);
  if (deltaCommit) {
    Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
    assertTrue(compactionInstant.isPresent());
    client.compact(compactionInstant.get());
    checkBootstrapResults(totalRecords, schema, compactionInstant.get(), checkNumRawFiles, numInstantsAfterBootstrap + 2, 2, updateTimestamp, updateTimestamp, !deltaCommit, Arrays.asList(compactionInstant.get()), !config.isPreserveHoodieCommitMetadataForCompaction());
  }
}
Use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.
The class TestBootstrapIndex, method generateBootstrapIndex:
public static Map<String, List<BootstrapFileMapping>> generateBootstrapIndex(HoodieTableMetaClient metaClient, String sourceBasePath,
    String[] partitions, int numEntriesPerPartition) {
  Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapMapping(sourceBasePath, partitions, numEntriesPerPartition);
  BootstrapIndex index = new HFileBootstrapIndex(metaClient);
  // Persist the mapping through the index writer: begin, append one partition
  // at a time, then finish to seal the index.
  try (IndexWriter writer = index.createWriter(sourceBasePath)) {
    writer.begin();
    bootstrapMapping.entrySet().stream().forEach(e -> writer.appendNextPartition(e.getKey(), e.getValue()));
    writer.finish();
  }
  return bootstrapMapping;
}
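A read-back sketch that validates the written index (assumptions: IndexReader is AutoCloseable and exposes getIndexedPartitionPaths() and getSourceFileMappingForPartition(String); the helper itself is hypothetical):

public static void validateBootstrapIndex(HoodieTableMetaClient metaClient, int numEntriesPerPartition) throws Exception {
  // Hypothetical check: re-open the HFile-backed index and verify each
  // indexed partition carries the expected number of file mappings.
  try (BootstrapIndex.IndexReader reader = new HFileBootstrapIndex(metaClient).createReader()) {
    for (String partition : reader.getIndexedPartitionPaths()) {
      List<BootstrapFileMapping> mappings = reader.getSourceFileMappingForPartition(partition);
      assertEquals(numEntriesPerPartition, mappings.size());
    }
  }
}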