Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
The class HoodieTestSuiteWriter, method commitCompaction.
public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats,
                             Option<String> instantTime) throws IOException {
  if (!cfg.useDeltaStreamer) {
    Map<String, String> extraMetadata = new HashMap<>();
    /**
     * Store the checkpoint in the commit metadata just like
     * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)}
     */
    extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
    if (generatedDataStats != null && generatedDataStats.count() > 1) {
      // Just stores the path where this batch of data is generated to
      extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0));
    }
    HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
    HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(
        table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
    writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata));
  }
}
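For context, a minimal sketch of how the checkpoint stored above could be read back from the commit metadata of the latest completed commit. This helper is illustrative only, not part of HoodieTestSuiteWriter; it assumes a HoodieTableMetaClient for the same table and reuses the CHECKPOINT_KEY constant written above.

// Hypothetical helper: read the checkpoint written into the commit metadata above.
private static Option<String> readLastCheckpoint(HoodieTableMetaClient metaClient) throws IOException {
  HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  Option<HoodieInstant> lastInstant = timeline.lastInstant();
  if (!lastInstant.isPresent()) {
    return Option.empty();
  }
  HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
      timeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
  return Option.ofNullable(commitMetadata.getExtraMetadata().get(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY));
}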
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
The class UtilHelpers, method createHoodieClient.
/**
 * Build Hoodie write client.
 *
 * @param jsc Java Spark Context
 * @param basePath Base Path
 * @param schemaStr Schema
 * @param parallelism Parallelism
 * @param compactionStrategyClass Compaction strategy class (optional)
 * @param properties Extra properties to apply to the write config
 */
public static SparkRDDWriteClient<HoodieRecordPayload> createHoodieClient(JavaSparkContext jsc, String basePath,
    String schemaStr, int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
  HoodieCompactionConfig compactionConfig = compactionStrategyClass
      .map(strategy -> HoodieCompactionConfig.newBuilder()
          .withInlineCompaction(false)
          .withCompactionStrategy(ReflectionUtils.loadClass(strategy))
          .build())
      .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      .withParallelism(parallelism, parallelism)
      .withBulkInsertParallelism(parallelism)
      .withDeleteParallelism(parallelism)
      .withSchema(schemaStr)
      .combineInput(true, true)
      .withCompactionConfig(compactionConfig)
      .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
      .withProps(properties)
      .build();
  return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), config);
}
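A minimal usage sketch for the helper above, assuming an existing JavaSparkContext (jsc), an Avro schema string (schemaStr), and a JavaRDD<HoodieRecord<HoodieRecordPayload>> of input records; the table path and parallelism are placeholder values.

// Illustrative only: build a client and write one batch of records.
TypedProperties props = new TypedProperties();
try (SparkRDDWriteClient<HoodieRecordPayload> client =
         UtilHelpers.createHoodieClient(jsc, "/tmp/hoodie_table", schemaStr, 2, Option.empty(), props)) {
  String instantTime = client.startCommit();
  JavaRDD<WriteStatus> writeStatuses = client.upsert(records, instantTime);
  // With the default config the write is auto-committed; just surface any per-record failures.
  long errorCount = writeStatuses.filter(WriteStatus::hasErrors).count();
}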
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
The class HoodieCompactor, method doCompact.
private int doCompact(JavaSparkContext jsc) throws Exception {
  // Get schema.
  String schemaStr;
  if (StringUtils.isNullOrEmpty(cfg.schemaFile)) {
    schemaStr = getSchemaFromLatestInstant();
  } else {
    schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile);
  }
  LOG.info("Schema --> : " + schemaStr);
  try (SparkRDDWriteClient<HoodieRecordPayload> client =
           UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
    // If no compaction instant time is given, pick the earliest scheduled (REQUESTED) compaction
    // instant from the active timeline.
    if (StringUtils.isNullOrEmpty(cfg.compactionInstantTime)) {
      HoodieTableMetaClient metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true);
      Option<HoodieInstant> firstCompactionInstant =
          metaClient.getActiveTimeline().firstInstant(HoodieTimeline.COMPACTION_ACTION, HoodieInstant.State.REQUESTED);
      if (firstCompactionInstant.isPresent()) {
        cfg.compactionInstantTime = firstCompactionInstant.get().getTimestamp();
        LOG.info("Found the earliest scheduled compaction instant which will be executed: " + cfg.compactionInstantTime);
      } else {
        throw new HoodieCompactionException("There is no scheduled compaction in the table.");
      }
    }
    HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(cfg.compactionInstantTime);
    return UtilHelpers.handleErrors(compactionMetadata.getCommitMetadata().get(), cfg.compactionInstantTime);
  }
}
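doCompact only executes a compaction that has already been scheduled. A minimal sketch of scheduling one beforehand, assuming a SparkRDDWriteClient built as in createHoodieClient above against the same MERGE_ON_READ table; the client and LOG variables are placeholders.

// Illustrative only: put a REQUESTED compaction instant on the timeline for doCompact to pick up.
Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
if (compactionInstant.isPresent()) {
  LOG.info("Scheduled compaction at instant time " + compactionInstant.get());
} else {
  LOG.info("Nothing to compact; no compaction was scheduled.");
}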
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
The class DeltaSync, method fetchFromSource.
private Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchFromSource(Option<String> resumeCheckpointStr) {
  final Option<JavaRDD<GenericRecord>> avroRDDOptional;
  final String checkpointStr;
  SchemaProvider schemaProvider;
  if (transformer.isPresent()) {
    // Transformation is needed. Fetch new rows in Row format, apply the transformation, and then
    // convert them to generic records for writing.
    InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit);
    Option<Dataset<Row>> transformed =
        dataAndCheckpoint.getBatch().map(data -> transformer.get().apply(jssc, sparkSession, data, props));
    checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
    boolean reconcileSchema = props.getBoolean(DataSourceWriteOptions.RECONCILE_SCHEMA().key());
    if (this.userProvidedSchemaProvider != null && this.userProvidedSchemaProvider.getTargetSchema() != null) {
      // If the target schema is specified through an Avro schema, pass it into the Row-to-Avro
      // conversion to avoid nullability mismatches between the Avro schema and the Row schema.
      avroRDDOptional = transformed.map(t ->
          HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema,
              Option.of(this.userProvidedSchemaProvider.getTargetSchema())).toJavaRDD());
      schemaProvider = this.userProvidedSchemaProvider;
    } else {
      // Use the transformed Row's schema if not overridden. If a target schema is not specified,
      // default to RowBasedSchemaProvider.
      schemaProvider = transformed.map(r -> {
        // Determine the target schema provider; use the latest table schema if reconcileSchema is enabled.
        SchemaProvider targetSchemaProvider = null;
        if (reconcileSchema) {
          targetSchemaProvider = UtilHelpers.createLatestSchemaProvider(r.schema(), jssc, fs, cfg.targetBasePath);
        } else {
          targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc);
        }
        return (SchemaProvider) new DelegatingSchemaProvider(props, jssc, dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider);
      }).orElse(dataAndCheckpoint.getSchemaProvider());
      avroRDDOptional = transformed.map(t ->
          HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema,
              Option.ofNullable(schemaProvider.getTargetSchema())).toJavaRDD());
    }
  } else {
    // Pull the data from the source & prepare the write
    InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint = formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
    avroRDDOptional = dataAndCheckpoint.getBatch();
    checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
    schemaProvider = dataAndCheckpoint.getSchemaProvider();
  }
  if (!cfg.allowCommitOnNoCheckpointChange && Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) {
    LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=("
        + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")");
    String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType));
    hoodieMetrics.updateMetricsForEmptyData(commitActionType);
    return null;
  }
  jssc.setJobGroup(this.getClass().getSimpleName(), "Checking if input is empty");
  if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
    LOG.info("No new data, perform empty commit.");
    return Pair.of(schemaProvider, Pair.of(checkpointStr, jssc.emptyRDD()));
  }
  boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
  JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
  JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
    HoodieRecordPayload payload = shouldCombine
        ? DataSourceUtils.createPayload(cfg.payloadClassName, gr,
            (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false,
                props.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
                    Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))))
        : DataSourceUtils.createPayload(cfg.payloadClassName, gr);
    return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload);
  });
  return Pair.of(schemaProvider, Pair.of(checkpointStr, records));
}
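For the default payload class, the record construction in the map() above boils down to roughly the following. This is a hypothetical helper rather than DeltaSync code; it assumes OverwriteWithLatestAvroPayload and takes the record key and partition path as explicit arguments instead of going through a key generator.

// Hypothetical helper mirroring the map() above for OverwriteWithLatestAvroPayload.
private static HoodieRecord<HoodieRecordPayload> toHoodieRecord(GenericRecord gr, String recordKey,
    String partitionPath, String orderingField) {
  // Ordering value the payload uses to keep the latest record when combining duplicates.
  Comparable orderingVal = (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, orderingField, false, false);
  HoodieRecordPayload payload = new OverwriteWithLatestAvroPayload(gr, orderingVal);
  return new HoodieAvroRecord<>(new HoodieKey(recordKey, partitionPath), payload);
}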
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
The class TestHoodieBackedTableMetadata, method verifyMetadataMergedRecords.
/**
* Verify the metadata table in-memory merged records. Irrespective of key deduplication
* config, the in-memory merged records should always have the key field in the record
* payload fully materialized.
*
* @param metadataMetaClient - Metadata table meta client
* @param logFilePaths - Metadata table log file paths
* @param latestCommitTimestamp - Latest commit timestamp
*/
private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List<String> logFilePaths,
                                         String latestCommitTimestamp) {
  Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
  HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder()
      .withFileSystem(metadataMetaClient.getFs())
      .withBasePath(metadataMetaClient.getBasePath())
      .withLogFilePaths(logFilePaths)
      .withLatestInstantTime(latestCommitTimestamp)
      .withPartition(MetadataPartitionType.FILES.getPartitionPath())
      .withReaderSchema(schema)
      .withMaxMemorySizeInBytes(100000L)
      .withBufferSize(4096)
      .withSpillableMapBasePath(tempDir.toString())
      .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK)
      .build();
  assertDoesNotThrow(() -> {
    logRecordReader.scan();
  }, "Metadata log records materialization failed");
  for (Map.Entry<String, HoodieRecord<? extends HoodieRecordPayload>> entry : logRecordReader.getRecords().entrySet()) {
    assertFalse(entry.getKey().isEmpty());
    assertFalse(entry.getValue().getRecordKey().isEmpty());
    assertEquals(entry.getKey(), entry.getValue().getRecordKey());
  }
}
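As a hypothetical follow-up check (not part of the test above), each merged record's payload can be materialized back into its Avro form via HoodieRecordPayload#getInsertValue; the assertion on the "key" field assumes the HoodieMetadataRecord schema used above.

// Illustrative only: materialize each merged payload and cross-check its key field.
// getInsertValue returns Option.empty() when the payload resolves to a delete.
private void verifyPayloadsMaterialize(HoodieMetadataMergedLogRecordReader logRecordReader, Schema schema) throws IOException {
  for (HoodieRecord<? extends HoodieRecordPayload> record : logRecordReader.getRecords().values()) {
    Option<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);
    if (avroRecord.isPresent()) {
      assertEquals(record.getRecordKey(), ((GenericRecord) avroRecord.get()).get("key").toString());
    }
  }
}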