Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestInputBatch, method getSchemaProviderShouldReturnGivenSchemaProvider:
@Test
public void getSchemaProviderShouldReturnGivenSchemaProvider() {
  SchemaProvider schemaProvider = new RowBasedSchemaProvider(null);
  final InputBatch<String> inputBatch = new InputBatch<>(Option.of("foo"), null, schemaProvider);
  assertSame(schemaProvider, inputBatch.getSchemaProvider());
}
From the class TestInputBatch, method getSchemaProviderShouldReturnNullSchemaProvider:
@Test
public void getSchemaProviderShouldReturnNullSchemaProvider() {
  final InputBatch<String> inputBatch = new InputBatch<>(Option.empty(), null, null);
  SchemaProvider schemaProvider = inputBatch.getSchemaProvider();
  assertTrue(schemaProvider instanceof InputBatch.NullSchemaProvider);
}
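Taken together, these two tests pin down the InputBatch contract: a schema provider passed to the constructor is returned as-is, and a missing one is replaced by InputBatch.NullSchemaProvider. A minimal caller sketch (hypothetical, not from the Hudi codebase) showing why this matters downstream:

InputBatch<String> batch = new InputBatch<>(Option.empty(), null, null);
// getSchemaProvider() never returns null, so no null check is needed
SchemaProvider provider = batch.getSchemaProvider();
if (provider instanceof InputBatch.NullSchemaProvider) {
  // no schema accompanied this batch; a schema must be obtained elsewhere
}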
From the class UtilHelpers, method createLatestSchemaProvider:
/**
 * Create the latest schema provider for the target schema.
 *
 * @param structType spark data type of the incoming batch.
 * @param jssc instance of {@link JavaSparkContext}.
 * @param fs instance of {@link FileSystem}.
 * @param basePath base path of the table.
 * @return the schema provider whose target schema refers to the latest schema (either the incoming schema or the table schema).
 */
public static SchemaProvider createLatestSchemaProvider(StructType structType, JavaSparkContext jssc,
                                                        FileSystem fs, String basePath) {
  SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType);
  Schema writeSchema = rowSchemaProvider.getTargetSchema();
  Schema latestTableSchema = writeSchema;
  try {
    if (FSUtils.isTableExists(basePath, fs)) {
      HoodieTableMetaClient tableMetaClient = HoodieTableMetaClient.builder()
          .setConf(jssc.sc().hadoopConfiguration())
          .setBasePath(basePath)
          .build();
      TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(tableMetaClient);
      // Round-trip the table schema through StructType so its record name and
      // namespace match those produced by RowBasedSchemaProvider.
      latestTableSchema = tableSchemaResolver.getLatestSchema(writeSchema, true,
          (Function1<Schema, Schema>) v1 -> AvroConversionUtils.convertStructTypeToAvroSchema(
              AvroConversionUtils.convertAvroSchemaToStructType(v1),
              RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME,
              RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE));
    }
  } catch (IOException e) {
    LOG.warn("Could not fetch table schema. Falling back to writer schema");
  }
  // Locals referenced from the anonymous class below must be effectively final.
  final Schema finalLatestTableSchema = latestTableSchema;
  return new SchemaProvider(new TypedProperties()) {
    @Override
    public Schema getSourceSchema() {
      return rowSchemaProvider.getSourceSchema();
    }

    @Override
    public Schema getTargetSchema() {
      return finalLatestTableSchema;
    }
  };
}
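A minimal usage sketch (hypothetical; df, jssc, and basePath are assumptions, not part of the snippet above): the helper is handed the Spark schema of an incoming Dataset<Row> and the table's base path, and the returned provider reconciles the two.

// Hypothetical usage; 'df' is an incoming Dataset<Row> and 'basePath' points at the Hudi table
FileSystem fs = FSUtils.getFs(basePath, jssc.hadoopConfiguration());
SchemaProvider provider = UtilHelpers.createLatestSchemaProvider(df.schema(), jssc, fs, basePath);
Schema writerSchema = provider.getSourceSchema(); // schema derived from the incoming rows
Schema targetSchema = provider.getTargetSchema(); // latest table schema, or the writer schema on a first write

Note the design choice: rather than introducing a named subclass, the method returns an anonymous SchemaProvider that pairs the row-derived source schema with the resolved target schema.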
From the class DeltaSync, method fetchFromSource:
private Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchFromSource(Option<String> resumeCheckpointStr) {
  final Option<JavaRDD<GenericRecord>> avroRDDOptional;
  final String checkpointStr;
  SchemaProvider schemaProvider;
  if (transformer.isPresent()) {
    // Transformation is needed. Fetch new rows in Row format, apply the transformation,
    // and then convert them to generic records for writing
    InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit);
    Option<Dataset<Row>> transformed =
        dataAndCheckpoint.getBatch().map(data -> transformer.get().apply(jssc, sparkSession, data, props));
    checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
    boolean reconcileSchema = props.getBoolean(DataSourceWriteOptions.RECONCILE_SCHEMA().key());
    if (this.userProvidedSchemaProvider != null && this.userProvidedSchemaProvider.getTargetSchema() != null) {
      // If the target schema is specified through an Avro schema,
      // pass in the schema for the Row-to-Avro conversion
      // to avoid a nullability mismatch between the Avro schema and the Row schema
      avroRDDOptional = transformed.map(t ->
          HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema,
              Option.of(this.userProvidedSchemaProvider.getTargetSchema())).toJavaRDD());
      schemaProvider = this.userProvidedSchemaProvider;
    } else {
      // Use the transformed Row's schema if not overridden. If a target schema is not specified,
      // default to RowBasedSchemaProvider
      schemaProvider = transformed.map(r -> {
        // Determine the target schema provider. Use the latest table schema if reconcileSchema is enabled.
        SchemaProvider targetSchemaProvider = null;
        if (reconcileSchema) {
          targetSchemaProvider = UtilHelpers.createLatestSchemaProvider(r.schema(), jssc, fs, cfg.targetBasePath);
        } else {
          targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc);
        }
        return (SchemaProvider) new DelegatingSchemaProvider(props, jssc,
            dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider);
      }).orElse(dataAndCheckpoint.getSchemaProvider());
      avroRDDOptional = transformed.map(t ->
          HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema,
              Option.ofNullable(schemaProvider.getTargetSchema())).toJavaRDD());
    }
  } else {
    // Pull the data from the source & prepare the write
    InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint =
        formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
    avroRDDOptional = dataAndCheckpoint.getBatch();
    checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
    schemaProvider = dataAndCheckpoint.getSchemaProvider();
  }
  if (!cfg.allowCommitOnNoCheckpointChange && Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) {
    LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=("
        + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")");
    String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType));
    hoodieMetrics.updateMetricsForEmptyData(commitActionType);
    return null;
  }
  jssc.setJobGroup(this.getClass().getSimpleName(), "Checking if input is empty");
  if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
    LOG.info("No new data, perform empty commit.");
    return Pair.of(schemaProvider, Pair.of(checkpointStr, jssc.emptyRDD()));
  }
  boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
  JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
  JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
    HoodieRecordPayload payload = shouldCombine
        ? DataSourceUtils.createPayload(cfg.payloadClassName, gr,
            (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false,
                props.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
                    Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))))
        : DataSourceUtils.createPayload(cfg.payloadClassName, gr);
    return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload);
  });
  return Pair.of(schemaProvider, Pair.of(checkpointStr, records));
}
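fetchFromSource returns null when the source checkpoint has not moved (and commits on an unchanged checkpoint are disallowed), so callers must guard before unpacking the nested Pair. A minimal caller sketch (hypothetical; not the actual DeltaSync call site):

Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetched = fetchFromSource(resumeCheckpointStr);
if (fetched == null) {
  return; // checkpoint unchanged; nothing to commit
}
SchemaProvider schemaProvider = fetched.getLeft();
String nextCheckpoint = fetched.getRight().getLeft();
JavaRDD<HoodieRecord> records = fetched.getRight().getRight(); // may be an empty RDD, signalling an empty commit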