Example 26 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by Apache.

From the class TestHoodieBackedMetadata, method testFirstCommitRollback.

// Some operations are not feasible with the test table infra, hence the write client is used to test those cases.
/**
 * Rollback of the first commit should not trigger bootstrap errors at the metadata table.
 */
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testFirstCommitRollback(HoodieTableType tableType) throws Exception {
    init(tableType);
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build())) {
        // Write 1
        String commitTime = "0000001";
        List<HoodieRecord> records = dataGen.generateInserts(commitTime, 20);
        client.startCommitWithTime(commitTime);
        List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), commitTime).collect();
        assertNoWriteErrors(writeStatuses);
        validateMetadata(client);
        // Rollback the first commit
        client.rollback(commitTime);
        // Write 2
        commitTime = "0000002";
        records = dataGen.generateInserts(commitTime, 10);
        client.startCommitWithTime(commitTime);
        writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect();
        assertNoWriteErrors(writeStatuses);
        validateMetadata(client);
    }
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 27 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by Apache.

From the class SparkHoodieBackedTableMetadataWriter, method initialize.

@Override
protected <T extends SpecificRecordBase> void initialize(HoodieEngineContext engineContext, Option<T> actionMetadata, Option<String> inflightInstantTimestamp) {
    try {
        metrics.map(HoodieMetadataMetrics::registry).ifPresent(registry -> {
            if (registry instanceof DistributedRegistry) {
                HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
                ((DistributedRegistry) registry).register(sparkEngineContext.getJavaSparkContext());
            }
        });
        if (enabled) {
            initializeIfNeeded(dataMetaClient, actionMetadata, inflightInstantTimestamp);
        }
    } catch (IOException e) {
        LOG.error("Failed to initialize metadata table. Disabling the writer.", e);
        enabled = false;
    }
}
Also used : DistributedRegistry(org.apache.hudi.metrics.DistributedRegistry) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) IOException(java.io.IOException)
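
Note that this initializer registers the JavaSparkContext with the metrics registry only when a DistributedRegistry is configured, and it deliberately downgrades failures: an IOException during metadata-table initialization is logged and the writer is disabled rather than failing the caller.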

Example 28 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by Apache.

From the class TestDFSHoodieDatasetInputReader, method testSimpleHoodieDatasetReader.

@Test
public void testSimpleHoodieDatasetReader() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfig();
    SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
    String commitTime = client.startCommit();
    HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
    // Insert 100 records across 3 partitions
    List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
    JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
    writeStatuses.count();
    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(), HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
    // Try to read 100 records for the same partition path and same file ID
    JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 1);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 1);
    // Try to read 100 records for 3 partition paths and 3 different file ids
    records = reader.read(3, 3, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
    // Try to read 100 records for 3 partition paths and 50% records from each file
    records = reader.read(3, 3, 0.5);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)
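
As the inline comments indicate, this test exercises two sampling overloads of the reader: read(int, int, long) bounds the total number of records returned, while read(int, int, double) takes a per-file sampling fraction; in both cases the assertions only require that at most 100 records come back, spread across the requested number of partition paths and file IDs.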

Example 29 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by Apache.

From the class HoodieWriteClientExample, method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
    try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
        // Generator of some records to be loaded in.
        HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
        // initialize the table, if not done already
        Path path = new Path(tablePath);
        FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
        if (!fs.exists(path)) {
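            // Note: tableType is a class-level field of HoodieWriteClientExample (not shown in this excerpt).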
            HoodieTableMetaClient.withPropertyBuilder().setTableType(tableType).setTableName(tableName).setPayloadClass(HoodieAvroPayload.class).initTable(jsc.hadoopConfiguration(), tablePath);
        }
        // Create the write client to write some records in
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath).withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).withDeleteParallelism(2).forTable(tableName).withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build()).build();
        SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
        // inserts
        String newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
        List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
        JavaRDD<HoodieRecord<HoodieAvroPayload>> writeRecords = jsc.parallelize(records, 1);
        client.insert(writeRecords, newCommitTime);
        // updates
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
        records.addAll(toBeUpdated);
        recordsSoFar.addAll(toBeUpdated);
        writeRecords = jsc.parallelize(records, 1);
        client.upsert(writeRecords, newCommitTime);
        // Delete
        newCommitTime = client.startCommit();
        LOG.info("Starting commit " + newCommitTime);
        // just delete half of the records
        int numToDelete = recordsSoFar.size() / 2;
        List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
        JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
        client.delete(deleteRecords, newCommitTime);
        // Delete by partition
        newCommitTime = client.startCommit();
        client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
        LOG.info("Starting commit " + newCommitTime);
        // The partition where the data needs to be deleted
        List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
        List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath())).map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
        client.deletePartitions(deleteList, newCommitTime);
        // compaction
        if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
            Option<String> instant = client.scheduleCompaction(Option.empty());
            HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instant.get());
            client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
        }
    }
}
Also used : HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) HoodieExampleSparkUtils(org.apache.hudi.examples.common.HoodieExampleSparkUtils) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkConf(org.apache.spark.SparkConf) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)

Example 30 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by Apache.

From the class UtilitiesTestBase, method setup.

@BeforeEach
public void setup() throws Exception {
    TestDataSource.initDataGen();
    jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[2]");
    context = new HoodieSparkEngineContext(jsc);
    sqlContext = new SQLContext(jsc);
    sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) SQLContext(org.apache.spark.sql.SQLContext) BeforeEach(org.junit.jupiter.api.BeforeEach)
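
A matching teardown typically accompanies this setup so each test starts from a fresh context. A minimal sketch, assuming JUnit 5's @AfterEach; the method name and the choice to stop both the session and the JavaSparkContext are assumptions, not part of the excerpt above:

import org.junit.jupiter.api.AfterEach;

@AfterEach
public void teardown() {
    if (sparkSession != null) {
        // Stopping the session also stops the SparkContext it owns.
        sparkSession.stop();
    }
    if (jsc != null) {
        // SparkContext.stop() is idempotent, so this is safe even if the session already stopped it.
        jsc.stop();
    }
}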

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) - 58 uses
Path (org.apache.hadoop.fs.Path) - 25 uses
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) - 24 uses
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) - 23 uses
ArrayList (java.util.ArrayList) - 19 uses
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) - 19 uses
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) - 17 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) - 17 uses
WriteStatus (org.apache.hudi.client.WriteStatus) - 15 uses
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) - 15 uses
IOException (java.io.IOException) - 14 uses
List (java.util.List) - 14 uses
Option (org.apache.hudi.common.util.Option) - 14 uses
LogManager (org.apache.log4j.LogManager) - 14 uses
Logger (org.apache.log4j.Logger) - 14 uses
Test (org.junit.jupiter.api.Test) - 14 uses
Collectors (java.util.stream.Collectors) - 12 uses
FileStatus (org.apache.hadoop.fs.FileStatus) - 12 uses
FileSystem (org.apache.hadoop.fs.FileSystem) - 12 uses
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) - 11 uses
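
Taken together, the aggregations show the recurring pattern in these examples: wrap a JavaSparkContext in a HoodieSparkEngineContext and hand it to a SparkRDDWriteClient along with a HoodieWriteConfig. The following is a minimal sketch of that pattern only, not code from the hudi project; the application name, table path, table name, and inline schema are placeholder assumptions.

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class EngineContextUsageSketch {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("engine-context-sketch").setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
            // Wrap the JavaSparkContext so Hudi can run its own Spark jobs internally.
            HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
            // Placeholder path, table name, and schema; the examples above build richer configs.
            HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
                    .withPath("/tmp/hoodie/sample_table")
                    .withSchema("{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}")
                    .forTable("sample_table")
                    .build();
            try (SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(engineContext, cfg)) {
                String instantTime = client.startCommit();
                // Generate records and call client.insert(writeRecords, instantTime) as in the examples above.
            }
        }
    }
}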