
Example 11 with HoodieWriteMetadata

Use of org.apache.hudi.table.action.HoodieWriteMetadata in project hudi by apache.

From the class TestHoodieClientMultiWriter, method testMultiWriterWithAsyncTableServicesWithConflict.

private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType tableType) throws Exception {
    // Set up the test table (MERGE_ON_READ needs a dedicated setup)
    if (tableType == HoodieTableType.MERGE_ON_READ) {
        setUpMORTestTable();
    }
    Properties properties = new Properties();
    properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
    properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    // Disable the embedded timeline server; it does not work with multi-writer setups
    HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withAutoClean(false)
            .withInlineCompaction(false)
            .withAsyncClean(true)
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
            .withMaxNumDeltaCommitsBeforeCompaction(2)
            .build())
        .withEmbeddedTimelineServerEnabled(false)
        .withMarkersType(MarkerType.DIRECT.name())
        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
            .withStorageType(FileSystemViewStorageType.MEMORY)
            .build())
        .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
        .withLockConfig(HoodieLockConfig.newBuilder()
            .withLockProvider(InProcessLockProvider.class)
            .build())
        .withAutoCommit(false)
        .withProperties(properties);
    Set<String> validInstants = new HashSet<>();
    // Create the first commit with inserts
    HoodieWriteConfig cfg = writeConfigBuilder.build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    createCommitWithInserts(cfg, client, "000", "001", 200, true);
    validInstants.add("001");
    // Create 2 commits with upserts
    createCommitWithUpserts(cfg, client, "001", "000", "002", 100);
    createCommitWithUpserts(cfg, client, "002", "000", "003", 100);
    validInstants.add("002");
    validInstants.add("003");
    // Three clients running actions in parallel
    final int threadCount = 3;
    final CountDownLatch scheduleCountDownLatch = new CountDownLatch(threadCount);
    final ExecutorService executors = Executors.newFixedThreadPool(threadCount);
    // Write config with clustering enabled
    final HoodieWriteConfig cfg2 = writeConfigBuilder
        .withClusteringConfig(HoodieClusteringConfig.newBuilder()
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(1)
            .build())
        .build();
    final SparkRDDWriteClient client1 = getHoodieWriteClient(cfg2);
    final SparkRDDWriteClient client2 = getHoodieWriteClient(cfg);
    final SparkRDDWriteClient client3 = getHoodieWriteClient(cfg);
    // Create upserts, schedule cleaning, schedule compaction in parallel
    Future future1 = executors.submit(() -> {
        final String newCommitTime = "004";
        final int numRecords = 100;
        final String commitTimeBetweenPrevAndNew = "002";
        // The upsert should go through only after the compaction and cleaning
        // schedules have completed, so wait on the latch here.
        latchCountDownAndWait(scheduleCountDownLatch, 30000);
        if (tableType == HoodieTableType.MERGE_ON_READ) {
            // Since the compaction has already been scheduled, this upsert
            // has to fail
            assertThrows(IllegalArgumentException.class, () -> {
                createCommitWithUpserts(cfg, client1, "003", commitTimeBetweenPrevAndNew, newCommitTime, numRecords);
            });
        } else {
            // There is no compaction for COW, so this upsert has to pass
            assertDoesNotThrow(() -> {
                createCommitWithUpserts(cfg, client1, "003", commitTimeBetweenPrevAndNew, newCommitTime, numRecords);
            });
            validInstants.add(newCommitTime);
        }
    });
    Future future2 = executors.submit(() -> {
        if (tableType == HoodieTableType.MERGE_ON_READ) {
            assertDoesNotThrow(() -> {
                client2.scheduleTableService("005", Option.empty(), TableServiceType.COMPACT);
            });
        }
        latchCountDownAndWait(scheduleCountDownLatch, 30000);
    });
    Future future3 = executors.submit(() -> {
        assertDoesNotThrow(() -> {
            latchCountDownAndWait(scheduleCountDownLatch, 30000);
            client3.scheduleTableService("006", Option.empty(), TableServiceType.CLEAN);
        });
    });
    future1.get();
    future2.get();
    future3.get();
    CountDownLatch runCountDownLatch = new CountDownLatch(threadCount);
    // Create inserts, run cleaning, run compaction in parallel
    future1 = executors.submit(() -> {
        final String newCommitTime = "007";
        final int numRecords = 100;
        latchCountDownAndWait(runCountDownLatch, 30000);
        assertDoesNotThrow(() -> {
            createCommitWithInserts(cfg, client1, "003", newCommitTime, numRecords, true);
            validInstants.add("007");
        });
    });
    future2 = executors.submit(() -> {
        latchCountDownAndWait(runCountDownLatch, 30000);
        if (tableType == HoodieTableType.MERGE_ON_READ) {
            assertDoesNotThrow(() -> {
                HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client2.compact("005");
                client2.commitCompaction("005", compactionMetadata.getCommitMetadata().get(), Option.empty());
                validInstants.add("005");
            });
        }
    });
    future3 = executors.submit(() -> {
        latchCountDownAndWait(runCountDownLatch, 30000);
        assertDoesNotThrow(() -> {
            client3.clean("006", false);
            validInstants.add("006");
        });
    });
    future1.get();
    future2.get();
    future3.get();
    validInstants.addAll(metaClient.reloadActiveTimeline().getCompletedReplaceTimeline()
        .filterCompletedInstants().getInstants()
        .map(HoodieInstant::getTimestamp).collect(Collectors.toSet()));
    Set<String> completedInstants = metaClient.reloadActiveTimeline().getCommitsTimeline()
        .filterCompletedInstants().getInstants()
        .map(HoodieInstant::getTimestamp).collect(Collectors.toSet());
    assertTrue(validInstants.containsAll(completedInstants));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) CountDownLatch(java.util.concurrent.CountDownLatch) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HashSet(java.util.HashSet)
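
The helper latchCountDownAndWait(...) called by each writer thread above is not included in this excerpt. Below is a minimal sketch of what it presumably does (the class name and method body are assumptions inferred from the call sites): each thread counts the shared latch down and then blocks until all participants have arrived or the timeout elapses, so the conflicting actions start at roughly the same moment.

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

final class LatchUtil {
    // Hypothetical helper mirroring the latchCountDownAndWait(latch, 30000) calls above.
    // Counting down first and then awaiting lets all writer threads rendezvous before
    // they race against each other.
    static void latchCountDownAndWait(CountDownLatch latch, long waitTimeMs) {
        latch.countDown();
        try {
            latch.await(waitTimeMs, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}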

Example 12 with HoodieWriteMetadata

Use of org.apache.hudi.table.action.HoodieWriteMetadata in project hudi by apache.

From the class SparkBootstrapCommitActionExecutor, method execute.

@Override
public HoodieBootstrapWriteMetadata<HoodieData<WriteStatus>> execute() {
    validate();
    try {
        HoodieTableMetaClient metaClient = table.getMetaClient();
        Option<HoodieInstant> completedInstant = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
        ValidationUtils.checkArgument(!completedInstant.isPresent(), "Active Timeline is expected to be empty for bootstrap to be performed. " + "If you want to re-bootstrap, please rollback bootstrap first !!");
        Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> partitionSelections = listAndProcessSourcePartitions();
        // First, run the metadata bootstrap, which auto-commits
        Option<HoodieWriteMetadata<HoodieData<WriteStatus>>> metadataResult = metadataBootstrap(partitionSelections.get(BootstrapMode.METADATA_ONLY));
        // If a full-record bootstrap is to be performed, perform that too
        Option<HoodieWriteMetadata<HoodieData<WriteStatus>>> fullBootstrapResult = fullBootstrap(partitionSelections.get(BootstrapMode.FULL_RECORD));
        // Delete the marker directory for the instant
        WriteMarkersFactory.get(config.getMarkersType(), table, instantTime).quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
        return new HoodieBootstrapWriteMetadata(metadataResult, fullBootstrapResult);
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) List(java.util.List) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus)
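
The nested generic on partitionSelections is dense, so here is a short illustrative sketch, not part of the executor itself, of how such a map would be walked; the Pair type is assumed to be Hudi's own Pair, and the stdout logging is purely for illustration. Each BootstrapMode maps to the source partitions selected for that mode, and each partition carries the file statuses to bootstrap.

import java.util.List;
import java.util.Map;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.util.collection.Pair;

final class PartitionSelectionSketch {
    // Prints, for every bootstrap mode (METADATA_ONLY or FULL_RECORD), the selected
    // source partitions and how many source files each one contributes.
    static void describe(Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> partitionSelections) {
        for (Map.Entry<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> entry : partitionSelections.entrySet()) {
            for (Pair<String, List<HoodieFileStatus>> partition : entry.getValue()) {
                System.out.println(entry.getKey() + " bootstrap for partition " + partition.getLeft()
                    + " covering " + partition.getRight().size() + " source files");
            }
        }
    }
}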

Example 13 with HoodieWriteMetadata

Use of org.apache.hudi.table.action.HoodieWriteMetadata in project hudi by apache.

From the class SparkDeletePartitionCommitActionExecutor, method execute.

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
    HoodieTimer timer = new HoodieTimer().startTimer();
    context.setJobStatus(this.getClass().getSimpleName(), "Gather all file ids from all deleting partitions.");
    Map<String, List<String>> partitionToReplaceFileIds = HoodieJavaPairRDD.getJavaPairRDD(
        context.parallelize(partitions).distinct()
            .mapToPair(partitionPath -> Pair.of(partitionPath, getAllExistingFileIds(partitionPath))))
        .collectAsMap();
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = new HoodieWriteMetadata<>();
    result.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
    result.setIndexUpdateDuration(Duration.ofMillis(timer.endTimer()));
    result.setWriteStatuses(context.emptyHoodieData());
    this.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
    this.commitOnAutoCommit(result);
    return result;
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) List(java.util.List) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata)
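
getAllExistingFileIds(partitionPath) is referenced above but not shown in this excerpt. A plausible sketch follows, assuming the executor has access to the table's file system view (the body is an assumption, not quoted from the class): because the replacing commit is not yet complete, it is safe to mark every file id currently visible in the partition as replaced.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.FileSlice;

// Sketch only: collects the file group ids currently visible in the partition so the
// replacecommit can record all of them in partitionToReplaceFileIds.
private List<String> getAllExistingFileIds(String partitionPath) {
    return table.getSliceView().getLatestFileSlices(partitionPath)
        .map(FileSlice::getFileId)
        .distinct()
        .collect(Collectors.toList());
}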

Example 14 with HoodieWriteMetadata

Use of org.apache.hudi.table.action.HoodieWriteMetadata in project hudi by apache.

From the class HoodieFlinkWriteClient, method insertOverwrite.

/**
 * Removes all existing records from the affected partitions and inserts the given HoodieRecords into the table.
 *
 * @param records     HoodieRecords to insert
 * @param instantTime Instant time of the commit
 * @return list of WriteStatus to inspect errors and counts
 */
public List<WriteStatus> insertOverwrite(List<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table = initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient());
    // Create the write handle if it does not already exist
    final HoodieWriteHandle<?, ?, ?, ?> writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), instantTime, table, records.listIterator());
    HoodieWriteMetadata result = ((HoodieFlinkTable<T>) table).insertOverwrite(context, writeHandle, instantTime, records);
    return postWrite(result, instantTime, table);
}
Also used : List(java.util.List) HoodieList(org.apache.hudi.common.data.HoodieList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable)
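
A minimal caller-side sketch (the wrapper class, payload type, and error handling below are assumptions, not part of the Flink write client itself): insert_overwrite replaces whatever currently lives in the touched partitions, and the returned WriteStatus list is where per-record failures surface.

import java.util.List;
import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;

final class InsertOverwriteSketch {
    // Assumes an already configured client and a prepared batch of records; the instant
    // time would normally come from the write coordinator rather than being hard-coded.
    static void overwrite(HoodieFlinkWriteClient<HoodieAvroPayload> client,
                          List<HoodieRecord<HoodieAvroPayload>> records,
                          String instantTime) {
        List<WriteStatus> statuses = client.insertOverwrite(records, instantTime);
        long failed = statuses.stream().filter(WriteStatus::hasErrors).count();
        if (failed > 0) {
            throw new IllegalStateException(failed + " write statuses reported errors during insert_overwrite");
        }
    }
}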

Example 15 with HoodieWriteMetadata

Use of org.apache.hudi.table.action.HoodieWriteMetadata in project hudi by apache.

From the class HoodieWriteClientExample, method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
    try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
        // Generator for the sample records to be written in this example
        HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
        // Initialize the table if it has not been created already
        Path path = new Path(tablePath);
        FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
        if (!fs.exists(path)) {
            HoodieTableMetaClient.withPropertyBuilder()
                .setTableType(tableType)
                .setTableName(tableName)
                .setPayloadClass(HoodieAvroPayload.class)
                .initTable(jsc.hadoopConfiguration(), tablePath);
        }
        // Create the write client to write some records in
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
            .withPath(tablePath)
            .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
            .withParallelism(2, 2)
            .withDeleteParallelism(2)
            .forTable(tableName)
            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
            .build();
        SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
        // inserts
        String newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
        List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
        JavaRDD<HoodieRecord<HoodieAvroPayload>> writeRecords = jsc.parallelize(records, 1);
        client.insert(writeRecords, newCommitTime);
        // updates
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
        records.addAll(toBeUpdated);
        recordsSoFar.addAll(toBeUpdated);
        writeRecords = jsc.parallelize(records, 1);
        client.upsert(writeRecords, newCommitTime);
        // Delete
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        // just delete half of the records
        int numToDelete = recordsSoFar.size() / 2;
        List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
        JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
        client.delete(deleteRecords, newCommitTime);
        // Delete by partition
        newCommitTime = client.startCommit();
        client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
        LOG.info("Starting commit " + newCommitTime);
        // The partitions touched by the record-level delete above
        List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
        List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath())).map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
        client.deletePartitions(deleteList, newCommitTime);
        // compaction
        if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
            Option<String> instant = client.scheduleCompaction(Option.empty());
            HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instant.get());
            client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
        }
    }
}
Also used : HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) HoodieExampleSparkUtils(org.apache.hudi.examples.common.HoodieExampleSparkUtils) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkConf(org.apache.spark.SparkConf) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
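
To sanity-check what the example wrote, the table can be read back through the Hudi Spark datasource. A short verification sketch, assuming a SparkSession built on the same JavaSparkContext (the helper class is an illustration, not part of the example); the selected columns are the standard Hudi meta columns.

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

final class ReadBackSketch {
    // Reads the table back with the Hudi datasource and prints the commit time,
    // record key and partition path of the surviving records.
    static void show(JavaSparkContext jsc, String tablePath) {
        SparkSession spark = SparkSession.builder().config(jsc.getConf()).getOrCreate();
        Dataset<Row> rows = spark.read().format("hudi").load(tablePath);
        rows.select("_hoodie_commit_time", "_hoodie_record_key", "_hoodie_partition_path").show(20, false);
    }
}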

Aggregations

HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 27 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 23 usages
List (java.util.List): 20 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 16 usages
Collectors (java.util.stream.Collectors): 15 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 15 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 14 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 14 usages
IOException (java.io.IOException): 12 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 12 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 12 usages
HoodieData (org.apache.hudi.common.data.HoodieData): 11 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 11 usages
Option (org.apache.hudi.common.util.Option): 11 usages
Path (org.apache.hadoop.fs.Path): 10 usages
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 10 usages
HashMap (java.util.HashMap): 9 usages
Map (java.util.Map): 9 usages
Stream (java.util.stream.Stream): 9 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9 usages