Search in sources:

Example 1 with BootstrapIndex

use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.

the class BootstrapCommand method createBootstrapIndexReader.

/**
 * Creates a reader over the bootstrap index of the table currently loaded in the CLI.
 *
 * @return the reader produced by the table's {@link BootstrapIndex}
 * @throws HoodieException if the resolved index reports that it is not in use
 */
private BootstrapIndex.IndexReader createBootstrapIndexReader() {
    BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(HoodieCLI.getTableMetaClient());
    if (bootstrapIndex.useIndex()) {
        return bootstrapIndex.createReader();
    }
    // Without a usable index there is nothing to read from.
    throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info");
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieException(org.apache.hudi.exception.HoodieException) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex)

Example 2 with BootstrapIndex

use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.

the class TestBootstrapIndex method testNoOpBootstrapIndex.

/**
 * Checks that a table whose config sets {@code BOOTSTRAP_INDEX_ENABLE} to {@code "false"} resolves to a
 * {@link NoOpBootstrapIndex}.
 *
 * <p>The updated properties are written via {@code HoodieTableConfig.create} and the meta client is rebuilt
 * before the index is looked up.
 *
 * @throws IOException declared for the table-config write and meta client calls
 */
@Test
public void testNoOpBootstrapIndex() throws IOException {
    Properties props = metaClient.getTableConfig().getProps();
    props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), "false");
    Properties properties = new Properties();
    properties.putAll(props);
    HoodieTableConfig.create(metaClient.getFs(), new Path(metaClient.getMetaPath()), properties);
    metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).build();
    BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
    // Explicit check instead of the `assert` keyword: `assert` is skipped when the JVM runs without -ea,
    // which would let this test pass without verifying anything.
    if (!(bootstrapIndex instanceof NoOpBootstrapIndex)) {
        String actualType = bootstrapIndex == null ? "null" : bootstrapIndex.getClass().getName();
        throw new AssertionError("Expected a NoOpBootstrapIndex but got " + actualType);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodiePath(org.apache.hudi.avro.model.HoodiePath) NoOpBootstrapIndex(org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) Properties(java.util.Properties) NoOpBootstrapIndex(org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex) Test(org.junit.jupiter.api.Test)

Example 3 with BootstrapIndex

use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.

the class TestBootstrap method testBootstrapCommon.

/**
 * Runs one bootstrap scenario end to end: bootstrap, delete the completed bootstrap instant and roll back,
 * bootstrap again, upsert a fresh batch and, for delta-commit tables, schedule and run a compaction.
 * Results are checked after each stage.
 *
 * @param partitioned when true {@code SimpleKeyGenerator} is used, otherwise {@code NonpartitionedKeyGenerator}
 * @param deltaCommit when true the table is initialised as MERGE_ON_READ, otherwise COPY_ON_WRITE
 * @param mode        selects the bootstrap mode selector and the expectations; values other than
 *                    FULL_BOOTSTRAP_MODE and METADATA_BOOTSTRAP_MODE use {@code TestRandomBootstrapModeSelector}
 * @throws Exception if any step of the scenario fails
 */
private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception {
    HoodieTableType tableType = deltaCommit ? HoodieTableType.MERGE_ON_READ : HoodieTableType.COPY_ON_WRITE;
    metaClient = HoodieTestUtils.init(basePath, tableType, bootstrapBasePath, true);

    final int recordCount = 100;
    final String keyGenClassName;
    if (partitioned) {
        keyGenClassName = SimpleKeyGenerator.class.getCanonicalName();
    } else {
        keyGenClassName = NonpartitionedKeyGenerator.class.getCanonicalName();
    }

    // Per-mode expectations.
    final String selectorClassName;
    final String bootstrapInstantTs;
    final boolean verifyRawFileCount;
    final boolean expectIndex;
    final int instantsAfterBootstrap;
    final List<String> expectedBootstrapInstants;
    switch (mode) {
        case FULL_BOOTSTRAP_MODE:
            selectorClassName = FullRecordBootstrapModeSelector.class.getCanonicalName();
            bootstrapInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(bootstrapInstantTs);
            instantsAfterBootstrap = 1;
            verifyRawFileCount = false;
            expectIndex = false;
            break;
        case METADATA_BOOTSTRAP_MODE:
            selectorClassName = MetadataOnlyBootstrapModeSelector.class.getCanonicalName();
            bootstrapInstantTs = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(bootstrapInstantTs);
            instantsAfterBootstrap = 1;
            verifyRawFileCount = true;
            expectIndex = true;
            break;
        default:
            selectorClassName = TestRandomBootstrapModeSelector.class.getName();
            bootstrapInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
            instantsAfterBootstrap = 2;
            verifyRawFileCount = false;
            expectIndex = true;
            break;
    }

    List<String> partitionPaths = Arrays.asList("2020/04/01", "2020/04/02", "2020/04/03");
    long timestamp = Instant.now().toEpochMilli();
    Schema schema = generateNewDataSetAndReturnSchema(timestamp, recordCount, partitionPaths, bootstrapBasePath);
    String schemaJson = schema.toString();

    HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder()
        .withMaxNumDeltaCommitsBeforeCompaction(1)
        .build();
    HoodieBootstrapConfig bootstrapConfig = HoodieBootstrapConfig.newBuilder()
        .withBootstrapBasePath(bootstrapBasePath)
        .withBootstrapKeyGenClass(keyGenClassName)
        .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName())
        .withBootstrapParallelism(3)
        .withBootstrapModeSelector(selectorClassName)
        .build();
    HoodieWriteConfig writeConfig = getConfigBuilder(schemaJson)
        .withAutoCommit(true)
        .withSchema(schemaJson)
        .withCompactionConfig(compactionConfig)
        .withBootstrapConfig(bootstrapConfig)
        .build();

    // First bootstrap.
    SparkRDDWriteClientOverride writeClient = new SparkRDDWriteClientOverride(context, writeConfig);
    writeClient.bootstrap(Option.empty());
    checkBootstrapResults(recordCount, schema, bootstrapInstantTs, verifyRawFileCount, instantsAfterBootstrap, instantsAfterBootstrap, timestamp, timestamp, deltaCommit, expectedBootstrapInstants, true);

    // Rollback Bootstrap: remove the completed instant file, then ask the client to roll back.
    String completedAction = deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION;
    HoodieInstant completedBootstrap = new HoodieInstant(State.COMPLETED, completedAction, bootstrapInstantTs);
    HoodieActiveTimeline.deleteInstantFile(metaClient.getFs(), metaClient.getMetaPath(), completedBootstrap);
    metaClient.reloadActiveTimeline();
    writeClient.rollbackFailedBootstrap();
    metaClient.reloadActiveTimeline();
    assertEquals(0, metaClient.getCommitsTimeline().countInstants());
    long filesLeftInTable = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context)
        .stream()
        .flatMap(folder -> folder.getValue().stream())
        .count();
    assertEquals(0L, filesLeftInTable);
    BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
    assertFalse(bootstrapIndex.useIndex());

    // Run bootstrap again
    writeClient = new SparkRDDWriteClientOverride(context, writeConfig);
    writeClient.bootstrap(Option.empty());
    metaClient.reloadActiveTimeline();
    bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
    assertEquals(expectIndex, bootstrapIndex.useIndex());
    checkBootstrapResults(recordCount, schema, bootstrapInstantTs, verifyRawFileCount, instantsAfterBootstrap, instantsAfterBootstrap, timestamp, timestamp, deltaCommit, expectedBootstrapInstants, true);

    // Upsert case
    long updateTimestamp = Instant.now().toEpochMilli();
    String updateSPath = tmpFolder.toAbsolutePath() + "/data2";
    generateNewDataSetAndReturnSchema(updateTimestamp, recordCount, partitionPaths, updateSPath);
    JavaRDD<HoodieRecord> updateBatch = generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, context), schema);
    String upsertInstantTs = writeClient.startCommit();
    writeClient.upsert(updateBatch, upsertInstantTs);
    checkBootstrapResults(recordCount, schema, upsertInstantTs, false, instantsAfterBootstrap + 1, updateTimestamp, deltaCommit ? timestamp : updateTimestamp, deltaCommit, true);

    // Compaction is only exercised for delta-commit tables.
    if (!deltaCommit) {
        return;
    }
    Option<String> scheduledCompaction = writeClient.scheduleCompaction(Option.empty());
    assertTrue(scheduledCompaction.isPresent());
    String compactionInstantTs = scheduledCompaction.get();
    writeClient.compact(compactionInstantTs);
    // `!deltaCommit` is always false at this point; the expression is kept as written.
    checkBootstrapResults(recordCount, schema, compactionInstantTs, verifyRawFileCount, instantsAfterBootstrap + 2, 2, updateTimestamp, updateTimestamp, !deltaCommit, Arrays.asList(compactionInstantTs), !writeConfig.isPreserveHoodieCommitMetadataForCompaction());
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector)

Example 4 with BootstrapIndex

use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.

the class TestOrcBootstrap method testBootstrapCommon.

/**
 * Runs one ORC bootstrap scenario end to end: bootstrap, delete the completed bootstrap commit and roll back,
 * bootstrap again, upsert a fresh batch and, for delta-commit tables, schedule and run a compaction.
 * Results are checked after each stage.
 *
 * @param partitioned when true {@code SimpleKeyGenerator} is used, otherwise {@code NonpartitionedKeyGenerator}
 * @param deltaCommit when true the table is initialised as MERGE_ON_READ, otherwise COPY_ON_WRITE
 * @param mode        selects the bootstrap mode selector and the expectations; values other than
 *                    FULL_BOOTSTRAP_MODE and METADATA_BOOTSTRAP_MODE use {@code TestRandomBootstrapModeSelector}
 * @throws Exception if any step of the scenario fails
 */
private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, EffectiveMode mode) throws Exception {
    HoodieTableType tableType = deltaCommit ? HoodieTableType.MERGE_ON_READ : HoodieTableType.COPY_ON_WRITE;
    metaClient = HoodieTestUtils.init(basePath, tableType, bootstrapBasePath, HoodieFileFormat.ORC);

    final int recordCount = 100;
    final String keyGenClassName;
    if (partitioned) {
        keyGenClassName = SimpleKeyGenerator.class.getCanonicalName();
    } else {
        keyGenClassName = NonpartitionedKeyGenerator.class.getCanonicalName();
    }

    // Per-mode expectations.
    final String selectorClassName;
    final String bootstrapInstantTs;
    final boolean verifyRawFileCount;
    final boolean expectIndex;
    final int instantsAfterBootstrap;
    final List<String> expectedBootstrapInstants;
    switch (mode) {
        case FULL_BOOTSTRAP_MODE:
            selectorClassName = FullRecordBootstrapModeSelector.class.getCanonicalName();
            bootstrapInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(bootstrapInstantTs);
            instantsAfterBootstrap = 1;
            verifyRawFileCount = false;
            expectIndex = false;
            break;
        case METADATA_BOOTSTRAP_MODE:
            selectorClassName = MetadataOnlyBootstrapModeSelector.class.getCanonicalName();
            bootstrapInstantTs = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(bootstrapInstantTs);
            instantsAfterBootstrap = 1;
            verifyRawFileCount = true;
            expectIndex = true;
            break;
        default:
            selectorClassName = TestRandomBootstrapModeSelector.class.getName();
            bootstrapInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS;
            expectedBootstrapInstants = Arrays.asList(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
            instantsAfterBootstrap = 2;
            verifyRawFileCount = false;
            expectIndex = true;
            break;
    }

    List<String> partitionPaths = Arrays.asList("2020/04/01", "2020/04/02", "2020/04/03");
    long timestamp = Instant.now().toEpochMilli();
    Schema schema = generateNewDataSetAndReturnSchema(timestamp, recordCount, partitionPaths, bootstrapBasePath);
    String schemaJson = schema.toString();

    HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder()
        .withMaxNumDeltaCommitsBeforeCompaction(1)
        .build();
    HoodieBootstrapConfig bootstrapConfig = HoodieBootstrapConfig.newBuilder()
        .withBootstrapBasePath(bootstrapBasePath)
        .withBootstrapKeyGenClass(keyGenClassName)
        .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName())
        .withBootstrapParallelism(3)
        .withBootstrapModeSelector(selectorClassName)
        .build();
    HoodieWriteConfig writeConfig = getConfigBuilder(schemaJson)
        .withAutoCommit(true)
        .withSchema(schemaJson)
        .withCompactionConfig(compactionConfig)
        .withBootstrapConfig(bootstrapConfig)
        .build();

    // First bootstrap.
    SparkRDDWriteClientOverride writeClient = new SparkRDDWriteClientOverride(context, writeConfig);
    writeClient.bootstrap(Option.empty());
    checkBootstrapResults(recordCount, schema, bootstrapInstantTs, verifyRawFileCount, instantsAfterBootstrap, instantsAfterBootstrap, timestamp, timestamp, deltaCommit, expectedBootstrapInstants, true);

    // Rollback Bootstrap: remove the completed commit for this table type, then ask the client to roll back.
    if (!deltaCommit) {
        FileCreateUtils.deleteCommit(metaClient.getBasePath(), bootstrapInstantTs);
    } else {
        FileCreateUtils.deleteDeltaCommit(metaClient.getBasePath(), bootstrapInstantTs);
    }
    writeClient.rollbackFailedBootstrap();
    metaClient.reloadActiveTimeline();
    assertEquals(0, metaClient.getCommitsTimeline().countInstants());
    long filesLeftInTable = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context)
        .stream()
        .flatMap(folder -> folder.getValue().stream())
        .count();
    assertEquals(0L, filesLeftInTable);
    BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
    assertFalse(bootstrapIndex.useIndex());

    // Run bootstrap again
    writeClient = new SparkRDDWriteClientOverride(context, writeConfig);
    writeClient.bootstrap(Option.empty());
    metaClient.reloadActiveTimeline();
    bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient);
    assertEquals(expectIndex, bootstrapIndex.useIndex());
    checkBootstrapResults(recordCount, schema, bootstrapInstantTs, verifyRawFileCount, instantsAfterBootstrap, instantsAfterBootstrap, timestamp, timestamp, deltaCommit, expectedBootstrapInstants, true);

    // Upsert case
    long updateTimestamp = Instant.now().toEpochMilli();
    String updateSPath = tmpFolder.toAbsolutePath() + "/data2";
    generateNewDataSetAndReturnSchema(updateTimestamp, recordCount, partitionPaths, updateSPath);
    JavaRDD<HoodieRecord> updateBatch = generateInputBatch(jsc, BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), updateSPath, context), schema);
    String upsertInstantTs = writeClient.startCommit();
    writeClient.upsert(updateBatch, upsertInstantTs);
    checkBootstrapResults(recordCount, schema, upsertInstantTs, false, instantsAfterBootstrap + 1, updateTimestamp, deltaCommit ? timestamp : updateTimestamp, deltaCommit, true);

    // Compaction is only exercised for delta-commit tables.
    if (!deltaCommit) {
        return;
    }
    Option<String> scheduledCompaction = writeClient.scheduleCompaction(Option.empty());
    assertTrue(scheduledCompaction.isPresent());
    String compactionInstantTs = scheduledCompaction.get();
    writeClient.compact(compactionInstantTs);
    // `!deltaCommit` is always false at this point; the expression is kept as written.
    checkBootstrapResults(recordCount, schema, compactionInstantTs, verifyRawFileCount, instantsAfterBootstrap + 2, 2, updateTimestamp, updateTimestamp, !deltaCommit, Arrays.asList(compactionInstantTs), !writeConfig.isPreserveHoodieCommitMetadataForCompaction());
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector)

Example 5 with BootstrapIndex

use of org.apache.hudi.common.bootstrap.index.BootstrapIndex in project hudi by apache.

the class TestBootstrapIndex method generateBootstrapIndex.

/**
 * Generates a bootstrap mapping for the given partitions and writes it through an {@link HFileBootstrapIndex}
 * writer.
 *
 * @param metaClient             meta client the index is constructed with
 * @param sourceBasePath         base path passed to the mapping generator and to the index writer
 * @param partitions             partitions to generate mappings for
 * @param numEntriesPerPartition number of entries requested from the mapping generator per partition
 * @return the generated mapping, keyed as returned by {@code generateBootstrapMapping}
 */
public static Map<String, List<BootstrapFileMapping>> generateBootstrapIndex(HoodieTableMetaClient metaClient, String sourceBasePath, String[] partitions, int numEntriesPerPartition) {
    Map<String, List<BootstrapFileMapping>> mappingByPartition = generateBootstrapMapping(sourceBasePath, partitions, numEntriesPerPartition);
    BootstrapIndex hfileIndex = new HFileBootstrapIndex(metaClient);
    try (IndexWriter indexWriter = hfileIndex.createWriter(sourceBasePath)) {
        indexWriter.begin();
        // Append every partition's mappings in the map's iteration order.
        for (Map.Entry<String, List<BootstrapFileMapping>> partitionEntry : mappingByPartition.entrySet()) {
            indexWriter.appendNextPartition(partitionEntry.getKey(), partitionEntry.getValue());
        }
        indexWriter.finish();
    }
    return mappingByPartition;
}
Also used : IndexWriter(org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter) ArrayList(java.util.ArrayList) List(java.util.List) NoOpBootstrapIndex(org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex)

Aggregations

BootstrapIndex (org.apache.hudi.common.bootstrap.index.BootstrapIndex): 6
HFileBootstrapIndex (org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex): 3
NoOpBootstrapIndex (org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex): 3
ArrayList (java.util.ArrayList): 2
Schema (org.apache.avro.Schema): 2
FullRecordBootstrapModeSelector (org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector): 2
MetadataOnlyBootstrapModeSelector (org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector): 2
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 2
NonpartitionedKeyGenerator (org.apache.hudi.keygen.NonpartitionedKeyGenerator): 2
SimpleKeyGenerator (org.apache.hudi.keygen.SimpleKeyGenerator): 2
List (java.util.List): 1
Properties (java.util.Properties): 1
Path (org.apache.hadoop.fs.Path): 1
HoodiePath (org.apache.hudi.avro.model.HoodiePath): 1
IndexWriter (org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter): 1
BootstrapFileMapping (org.apache.hudi.common.model.BootstrapFileMapping): 1
HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId): 1
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 1
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 1