
Example 71 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class SparkHoodieBackedTableMetadataWriter method commit.

@Override
protected void commit(String instantTime, Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap, boolean canTriggerTableService) {
    ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet.");
    ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
    HoodieData<HoodieRecord> preppedRecords = prepRecords(partitionRecordsMap);
    JavaRDD<HoodieRecord> preppedRecordRDD = HoodieJavaRDD.getJavaRDD(preppedRecords);
    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
        if (canTriggerTableService) {
            // Trigger compaction before doing the delta commit. This ensures that if this delta commit succeeds in the metadata table
            // but the corresponding commit fails in the data table, the compaction has not folded the uncommitted data into a base file;
            // such data could never be ignored when reading the metadata table, since readers can only filter uncommitted data out of delta commits.
            compactIfNecessary(writeClient, instantTime);
        }
        if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) {
            // if this is a new commit being applied to metadata for the first time
            writeClient.startCommitWithTime(instantTime);
        } else {
            // This code path handles a re-attempted commit that was committed to the metadata table but failed in the data table.
            // For example, say compaction c1 succeeded in the metadata table on its first attempt but failed before committing to the data table.
            // On retry, the data table first rolls back the pending compaction. That rollback is applied to the metadata table, but since all
            // changes to the metadata table are upserts, it only produces a new delta commit.
            // Once the rollback completes, the compaction is retried and eventually reaches this block, where the instant is already part of a
            // completed commit. So we have to manually remove the completed instant and proceed,
            // which is also why withAllowMultiWriteOnSameInstant is enabled for the metadata table.
            HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
            HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant);
            metadataMetaClient.reloadActiveTimeline();
        }
        List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
        statuses.forEach(writeStatus -> {
            if (writeStatus.hasErrors()) {
                throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
            }
        });
        // reload timeline
        metadataMetaClient.reloadActiveTimeline();
        if (canTriggerTableService) {
            cleanIfNecessary(writeClient, instantTime);
            writeClient.archive();
        }
    }
    // Update total size of the metadata and count of base/log files
    metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) WriteStatus(org.apache.hudi.client.WriteStatus)
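
Stripped of the re-attempt handling and the optional table services, the write path above reduces to a short try-with-resources block around the client. The following is a distilled sketch rather than Hudi code (imports elided; the method name and its parameters are placeholders for state the metadata writer already holds):

static void commitToMetadataTable(HoodieSparkEngineContext engineContext, HoodieWriteConfig metadataWriteConfig, JavaRDD<HoodieRecord> preppedRecordRDD, String instantTime) {
    // Same constructor call as in the writer above, including the trailing boolean flag.
    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
        writeClient.startCommitWithTime(instantTime);
        // Upsert the already-prepared records and fail fast if any per-record write reported an error.
        List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
        for (WriteStatus status : statuses) {
            if (status.hasErrors()) {
                throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
            }
        }
    }
}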

Example 72 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestDFSHoodieDatasetInputReader method testSimpleHoodieDatasetReader.

@Test
public void testSimpleHoodieDatasetReader() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfig();
    SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
    String commitTime = client.startCommit();
    HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
    // Insert 100 records across 3 partitions
    List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
    JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
    writeStatuses.count();
    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(), HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
    // Try to read 100 records for the same partition path and same file ID
    JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 1);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 1);
    // Try to read 100 records for 3 partition paths and 3 different file ids
    records = reader.read(3, 3, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
    // Try to read 100 records for 3 partition paths and 50% records from each file
    records = reader.read(3, 3, 0.5);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)
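
Distilled from the test, the write-then-sample flow below isolates the reader API; this is a sketch, not the test itself (imports elided, and the test's makeHoodieClientConfig helper is replaced by a config parameter). As the comments above indicate, the long overload of read caps the absolute number of records, while the double overload samples a fraction of each selected file.

static void writeThenSample(JavaSparkContext jsc, HoodieWriteConfig config) {
    SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
    String commitTime = client.startCommit();
    // Seed the table with one batch of generated records.
    List<HoodieRecord> inserts = new HoodieTestDataGenerator().generateInserts(commitTime, 100);
    client.upsert(jsc.parallelize(inserts), commitTime).count();
    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(), HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
    // At most 100 records drawn from up to 3 partition paths and 3 file ids.
    JavaRDD<GenericRecord> byCount = reader.read(3, 3, 100L);
    // Roughly half of the records from each selected file.
    JavaRDD<GenericRecord> byFraction = reader.read(3, 3, 0.5);
}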

Example 73 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class HoodieWriteClientExample method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
    try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
        // Generator of some records to be loaded in.
        HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
        // initialize the table, if not done already
        Path path = new Path(tablePath);
        FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
        if (!fs.exists(path)) {
            HoodieTableMetaClient.withPropertyBuilder().setTableType(tableType).setTableName(tableName).setPayloadClass(HoodieAvroPayload.class).initTable(jsc.hadoopConfiguration(), tablePath);
        }
        // Create the write client to write some records in
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
                .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
                .withParallelism(2, 2).withDeleteParallelism(2)
                .forTable(tableName)
                .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
                .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
                .build();
        SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
        // inserts
        String newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
        List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
        JavaRDD<HoodieRecord<HoodieAvroPayload>> writeRecords = jsc.parallelize(records, 1);
        client.insert(writeRecords, newCommitTime);
        // updates
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
        records.addAll(toBeUpdated);
        recordsSoFar.addAll(toBeUpdated);
        writeRecords = jsc.parallelize(records, 1);
        client.upsert(writeRecords, newCommitTime);
        // Delete
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        // just delete half of the records
        int numToDelete = recordsSoFar.size() / 2;
        List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
        JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
        client.delete(deleteRecords, newCommitTime);
        // Delete by partition
        newCommitTime = client.startCommit();
        client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
        LOG.info("Starting commit " + newCommitTime);
        // The partition where the data needs to be deleted
        List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
        List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath())).map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
        client.deletePartitions(deleteList, newCommitTime);
        // compaction
        if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
            Option<String> instant = client.scheduleCompaction(Option.empty());
            HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instant.get());
            client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
        }
    }
}
Also used : HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) HoodieExampleSparkUtils(org.apache.hudi.examples.common.HoodieExampleSparkUtils) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkConf(org.apache.spark.SparkConf) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
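
For MERGE_ON_READ tables the example ends with an inline compaction. Pulled out of the flow, that tail is just three client calls; the sketch below adds an isPresent() guard that the full example skips, and client is the SparkRDDWriteClient created above (imports elided).

// Schedule a compaction; the argument is optional extra metadata, and the returned Option
// carries the new compaction instant time (empty when nothing was scheduled).
Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
if (compactionInstant.isPresent()) {
    // Execute the compaction and commit it with the metadata produced by the run.
    HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(compactionInstant.get());
    client.commitCompaction(compactionInstant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
}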

Example 74 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieSnapshotExporter method init.

@BeforeEach
public void init() throws Exception {
    // Initialize test data dirs
    sourcePath = Paths.get(basePath(), "source").toString();
    targetPath = Paths.get(basePath(), "target").toString();
    lfs = (LocalFileSystem) FSUtils.getFs(basePath(), jsc().hadoopConfiguration());
    HoodieTableMetaClient.withPropertyBuilder().setTableType(HoodieTableType.COPY_ON_WRITE).setTableName(TABLE_NAME).setPayloadClass(HoodieAvroPayload.class).initTable(jsc().hadoopConfiguration(), sourcePath);
    // Prepare data as source Hudi dataset
    HoodieWriteConfig cfg = getHoodieWriteConfig(sourcePath);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg);
    writeClient.startCommitWithTime(COMMIT_TIME);
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] { PARTITION_PATH });
    List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
    JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
    writeClient.bulkInsert(recordsRDD, COMMIT_TIME);
    writeClient.close();
    RemoteIterator<LocatedFileStatus> itr = lfs.listFiles(new Path(sourcePath), true);
    while (itr.hasNext()) {
        LOG.info(">>> Prepared test file: " + itr.next().getPath());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) BeforeEach(org.junit.jupiter.api.BeforeEach)
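
In other words, the setup does two things: initialize a COPY_ON_WRITE table at the source path, then bulk-insert one batch of generated records at a fixed commit time. A condensed sketch (imports elided; the test's constants and its getHoodieWriteConfig/getHoodieWriteClient helpers are replaced by plain parameters):

static void prepareSourceTable(JavaSparkContext jsc, String sourcePath, String tableName, HoodieWriteConfig cfg, String commitTime, String partitionPath, int numRecords) throws Exception {
    // Create the table layout under sourcePath if it does not exist yet.
    HoodieTableMetaClient.withPropertyBuilder().setTableType(HoodieTableType.COPY_ON_WRITE).setTableName(tableName).setPayloadClass(HoodieAvroPayload.class).initTable(jsc.hadoopConfiguration(), sourcePath);
    // Bulk-insert generated records under the given commit time, then release the client.
    SparkRDDWriteClient writeClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), cfg);
    writeClient.startCommitWithTime(commitTime);
    List<HoodieRecord> records = new HoodieTestDataGenerator(new String[] { partitionPath }).generateInserts(commitTime, numRecords);
    writeClient.bulkInsert(jsc.parallelize(records, 1), commitTime);
    writeClient.close();
}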

Example 75 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieIncrSource method testHoodieIncrSource.

@Test
public void testHoodieIncrSource() throws IOException {
    HoodieWriteConfig writeConfig = getConfigBuilder(basePath).withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build()).withMetadataConfig(HoodieMetadataConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()).build();
    SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig);
    Pair<String, List<HoodieRecord>> inserts = writeRecords(writeClient, true, null, "100");
    Pair<String, List<HoodieRecord>> inserts2 = writeRecords(writeClient, true, null, "200");
    Pair<String, List<HoodieRecord>> inserts3 = writeRecords(writeClient, true, null, "300");
    Pair<String, List<HoodieRecord>> inserts4 = writeRecords(writeClient, true, null, "400");
    Pair<String, List<HoodieRecord>> inserts5 = writeRecords(writeClient, true, null, "500");
    // read everything up to the latest commit
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.empty(), 500, inserts5.getKey());
    // even if the begin timestamp is archived (100), full table scan should kick in, but should filter for records having commit time > 100
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("100"), 400, inserts5.getKey());
    // even if READ_UPTO_LATEST_COMMIT is set, when the begin timestamp is still in the active timeline, only an incremental read should kick in.
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("400"), 100, inserts5.getKey());
    // read just the latest
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.empty(), 100, inserts5.getKey());
    // ensure checkpoint does not move
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 0, inserts5.getKey());
    Pair<String, List<HoodieRecord>> inserts6 = writeRecords(writeClient, true, null, "600");
    // insert new batch and ensure the checkpoint moves
    readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 100, inserts6.getKey());
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) Test(org.junit.jupiter.api.Test)
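
The archived-checkpoint scenario is only reachable because the write config archives commits aggressively and compacts the metadata table after every delta commit. Pulled out of the test (the getConfigBuilder helper adds further defaults such as schema and base path that are not shown here), the relevant builder chain is roughly:

HoodieWriteConfig writeConfig = getConfigBuilder(basePath)
        // Archive completed commits aggressively (archiveCommitsWith) and retain only one commit
        // for the cleaner (retainCommits), so an older commit time such as "100" soon leaves the active timeline.
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build())
        // Trigger metadata table compaction after every delta commit.
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build())
        .build();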

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 143
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 127
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 113
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 86
Test (org.junit.jupiter.api.Test) 80
WriteStatus (org.apache.hudi.client.WriteStatus) 76
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 74
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator) 61
List (java.util.List) 59
ArrayList (java.util.ArrayList) 51
HoodieTable (org.apache.hudi.table.HoodieTable) 51
Path (org.apache.hadoop.fs.Path) 47
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 47
JavaRDD (org.apache.spark.api.java.JavaRDD) 47
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 44
Collectors (java.util.stream.Collectors) 43
Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals) 43
HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig) 42
HashMap (java.util.HashMap) 41
Properties (java.util.Properties) 41