Example 1 with HoodieExampleDataGenerator

Use of org.apache.hudi.examples.common.HoodieExampleDataGenerator in project hudi by apache.

From the class HoodieJavaWriteClientExample, method main. The example generates sample trip records with HoodieExampleDataGenerator, initializes the Hudi table if it does not already exist, and then issues insert, upsert, and delete commits through HoodieJavaWriteClient.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieJavaWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    // Generator of some records to be loaded in.
    HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
    Configuration hadoopConf = new Configuration();
    // initialize the table, if not done already
    Path path = new Path(tablePath);
    FileSystem fs = FSUtils.getFs(tablePath, hadoopConf);
    if (!fs.exists(path)) {
        HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(tableType)
            .setTableName(tableName)
            .setPayloadClassName(HoodieAvroPayload.class.getName())
            .initTable(hadoopConf, tablePath);
    }
    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(tablePath)
        .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .forTable(tableName)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
        .build();
    HoodieJavaWriteClient<HoodieAvroPayload> client = new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(hadoopConf), cfg);
    // inserts
    String newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
    List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
    List<HoodieRecord<HoodieAvroPayload>> writeRecords = recordsSoFar.stream().map(r -> new HoodieAvroRecord<HoodieAvroPayload>(r)).collect(Collectors.toList());
    client.insert(writeRecords, newCommitTime);
    // updates
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
    records.addAll(toBeUpdated);
    recordsSoFar.addAll(toBeUpdated);
    writeRecords = recordsSoFar.stream().map(r -> new HoodieAvroRecord<HoodieAvroPayload>(r)).collect(Collectors.toList());
    client.upsert(writeRecords, newCommitTime);
    // Delete
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    // just delete half of the records
    int numToDelete = recordsSoFar.size() / 2;
    List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
    client.delete(toBeDeleted, newCommitTime);
    client.close();
}
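The method above also refers to LOG and tableType, which are class-level fields of HoodieJavaWriteClientExample rather than locals. A minimal sketch of those declarations, assuming log4j logging (as in the imports listed below) and a COPY_ON_WRITE table type:

private static final Logger LOG = LogManager.getLogger(HoodieJavaWriteClientExample.class);
// Assumed value; any valid HoodieTableType name works here.
private static final String tableType = HoodieTableType.COPY_ON_WRITE.name();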
Also used : ArrayList (java.util.ArrayList) List (java.util.List) Collectors (java.util.stream.Collectors) Configuration (org.apache.hadoop.conf.Configuration) FileSystem (org.apache.hadoop.fs.FileSystem) Path (org.apache.hadoop.fs.Path) FSUtils (org.apache.hudi.common.fs.FSUtils) HoodieAvroPayload (org.apache.hudi.common.model.HoodieAvroPayload) HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey (org.apache.hudi.common.model.HoodieKey) HoodieRecord (org.apache.hudi.common.model.HoodieRecord) HoodieTableType (org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) HoodieJavaWriteClient (org.apache.hudi.client.HoodieJavaWriteClient) HoodieJavaEngineContext (org.apache.hudi.client.common.HoodieJavaEngineContext) HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig) HoodieIndexConfig (org.apache.hudi.config.HoodieIndexConfig) HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) HoodieIndex (org.apache.hudi.index.HoodieIndex) HoodieExampleDataGenerator (org.apache.hudi.examples.common.HoodieExampleDataGenerator) LogManager (org.apache.log4j.LogManager) Logger (org.apache.log4j.Logger)

Example 2 with HoodieExampleDataGenerator

Use of org.apache.hudi.examples.common.HoodieExampleDataGenerator in project hudi by apache.

From the class HoodieWriteClientExample, method main. This variant drives the same flow through SparkRDDWriteClient inside a JavaSparkContext: insert, upsert, delete, delete-by-partition, and, for MERGE_ON_READ tables, an inline compaction.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
    try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
        // Generator of some records to be loaded in.
        HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
        // initialize the table, if not done already
        Path path = new Path(tablePath);
        FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
        if (!fs.exists(path)) {
            HoodieTableMetaClient.withPropertyBuilder()
                .setTableType(tableType)
                .setTableName(tableName)
                .setPayloadClass(HoodieAvroPayload.class)
                .initTable(jsc.hadoopConfiguration(), tablePath);
        }
        // Create the write client to write some records in
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
            .withPath(tablePath)
            .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
            .withParallelism(2, 2)
            .withDeleteParallelism(2)
            .forTable(tableName)
            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
            .build();
        SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
        // inserts
        String newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
        List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
        JavaRDD<HoodieRecord<HoodieAvroPayload>> writeRecords = jsc.parallelize(records, 1);
        client.insert(writeRecords, newCommitTime);
        // updates
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
        records.addAll(toBeUpdated);
        recordsSoFar.addAll(toBeUpdated);
        writeRecords = jsc.parallelize(records, 1);
        client.upsert(writeRecords, newCommitTime);
        // Delete
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        // just delete half of the records
        int numToDelete = recordsSoFar.size() / 2;
        List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
        JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
        client.delete(deleteRecords, newCommitTime);
        // Delete by partition
        newCommitTime = client.startCommit();
        client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
        LOG.info("Starting commit " + newCommitTime);
        // Collect the partitions touched by the key-level deletes above, then
        // drop every other partition seen so far via deletePartitions
        List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
        List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath())).map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
        client.deletePartitions(deleteList, newCommitTime);
        // compaction
        if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
            Option<String> instant = client.scheduleCompaction(Option.empty());
            HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instant.get());
            client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
        }
    }
}
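As in Example 1, LOG and tableType are class-level fields of HoodieWriteClientExample rather than locals. A minimal sketch, again assuming log4j logging and COPY_ON_WRITE; switching the value to HoodieTableType.MERGE_ON_READ.name() would exercise the compaction branch at the end of main:

private static final Logger LOG = LogManager.getLogger(HoodieWriteClientExample.class);
// Assumed value; use HoodieTableType.MERGE_ON_READ.name() to trigger scheduleCompaction/compact above.
private static final String tableType = HoodieTableType.COPY_ON_WRITE.name();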
Also used : HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) HoodieExampleSparkUtils(org.apache.hudi.examples.common.HoodieExampleSparkUtils) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkConf(org.apache.spark.SparkConf) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieKey(org.apache.hudi.common.model.HoodieKey) SparkConf(org.apache.spark.SparkConf) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)

Aggregations

Used in both examples (2): ArrayList (java.util.ArrayList) List (java.util.List) Collectors (java.util.stream.Collectors) FileSystem (org.apache.hadoop.fs.FileSystem) Path (org.apache.hadoop.fs.Path) FSUtils (org.apache.hudi.common.fs.FSUtils) HoodieAvroPayload (org.apache.hudi.common.model.HoodieAvroPayload) HoodieKey (org.apache.hudi.common.model.HoodieKey) HoodieRecord (org.apache.hudi.common.model.HoodieRecord) HoodieTableType (org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig) HoodieIndexConfig (org.apache.hudi.config.HoodieIndexConfig) HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) HoodieExampleDataGenerator (org.apache.hudi.examples.common.HoodieExampleDataGenerator) HoodieIndex (org.apache.hudi.index.HoodieIndex) LogManager (org.apache.log4j.LogManager) Logger (org.apache.log4j.Logger)
Used in one example (1): Configuration (org.apache.hadoop.conf.Configuration) HoodieJavaWriteClient (org.apache.hudi.client.HoodieJavaWriteClient)