
Example 6 with TableSchemaResolver

Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by Apache.

From the class CompactionUtil, method setAvroSchema.

/**
 * Sets up the Avro schema string into the given configuration {@code conf}
 * by reading it from the Hoodie table metadata.
 *
 * @param conf       The configuration
 * @param metaClient The Hoodie table meta client used to resolve the schema
 */
public static void setAvroSchema(Configuration conf, HoodieTableMetaClient metaClient) throws Exception {
    TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
    Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchema(false);
    conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, tableAvroSchema.toString());
}
Also used : Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver)
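
A minimal usage sketch, not taken from the Hudi sources: the table base path and the fresh Hadoop configuration below are placeholder assumptions, used only to show how the helper above could be wired into a Flink job's setup (imports mirror the ones listed above).

public static Configuration buildConfWithTableSchema(String tableBasePath) throws Exception {
    // Flink configuration that will carry the resolved Avro schema.
    Configuration flinkConf = new Configuration();
    // Build a meta client for the (hypothetical) table location.
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setBasePath(tableBasePath)
        .setConf(new org.apache.hadoop.conf.Configuration())
        .build();
    // Resolve the table's Avro schema and store it under FlinkOptions.SOURCE_AVRO_SCHEMA.
    CompactionUtil.setAvroSchema(flinkConf, metaClient);
    return flinkConf;
}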

Example 7 with TableSchemaResolver

Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by Apache.

From the class TestHoodieDeltaStreamer, method testSchemaEvolution.

@ParameterizedTest
@MethodSource("schemaEvolArgs")
public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, boolean useSchemaPostProcessor) throws Exception {
    String tableBasePath = dfsBasePath + "/test_table_schema_evolution" + tableType + "_" + useUserProvidedSchema + "_" + useSchemaPostProcessor;
    defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName();
    // Insert data produced with Schema A, pass Schema A
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType);
    cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + dfsBasePath + "/source.avsc");
    cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + dfsBasePath + "/source.avsc");
    cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true");
    if (!useSchemaPostProcessor) {
        cfg.configs.add(SparkAvroPostProcessor.Config.SPARK_AVRO_POST_PROCESSOR_PROP_ENABLE + "=false");
    }
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // Upsert data produced with Schema B, pass Schema B
    cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TripsWithEvolvedOptionalFieldTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType);
    cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + dfsBasePath + "/source.avsc");
    cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + dfsBasePath + "/source_evolved.avsc");
    cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true");
    if (!useSchemaPostProcessor) {
        cfg.configs.add(SparkAvroPostProcessor.Config.SPARK_AVRO_POST_PROCESSOR_PROP_ENABLE + "=false");
    }
    new HoodieDeltaStreamer(cfg, jsc).sync();
    // out of 1000 new records, 500 are inserts, 450 are updates and 50 are deletes.
    TestHelpers.assertRecordCount(1450, tableBasePath + "/*/*", sqlContext);
    TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
    List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*", sqlContext);
    assertEquals(1450, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*").createOrReplaceTempView("tmp_trips");
    long recordCount = sqlContext.sparkSession().sql("select * from tmp_trips where evoluted_optional_union_field is not NULL").count();
    assertEquals(950, recordCount);
    // Upsert data produced with Schema A, pass Schema B
    if (!useUserProvidedSchema) {
        defaultSchemaProviderClassName = TestFileBasedSchemaProviderNullTargetSchema.class.getName();
    }
    cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType);
    cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + dfsBasePath + "/source.avsc");
    if (useUserProvidedSchema) {
        cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + dfsBasePath + "/source_evolved.avsc");
    }
    if (!useSchemaPostProcessor) {
        cfg.configs.add(SparkAvroPostProcessor.Config.SPARK_AVRO_POST_PROCESSOR_PROP_ENABLE + "=false");
    }
    cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true");
    new HoodieDeltaStreamer(cfg, jsc).sync();
    // again, 1000 new records, 500 are inserts, 450 are updates and 50 are deletes.
    TestHelpers.assertRecordCount(1900, tableBasePath + "/*/*", sqlContext);
    TestHelpers.assertCommitMetadata("00002", tableBasePath, dfs, 3);
    counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*", sqlContext);
    assertEquals(1900, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(HoodieTableMetaClient.builder().setBasePath(tableBasePath).setConf(dfs.getConf()).build());
    Schema tableSchema = tableSchemaResolver.getTableAvroSchemaWithoutMetadataFields();
    assertNotNull(tableSchema);
    Schema expectedSchema = new Schema.Parser().parse(dfs.open(new Path(dfsBasePath + "/source_evolved.avsc")));
    if (!useUserProvidedSchema || useSchemaPostProcessor) {
        expectedSchema = AvroConversionUtils.convertStructTypeToAvroSchema(AvroConversionUtils.convertAvroSchemaToStructType(expectedSchema), HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE);
    }
    assertEquals(tableSchema, expectedSchema);
    // clean up and reinit
    UtilitiesTestBase.Helpers.deleteFileFromDfs(FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration()), dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE);
    writeCommonPropsToFile(dfs, dfsBasePath);
    defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName();
}
Also used : Path(org.apache.hadoop.fs.Path) Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) FilebasedSchemaProvider(org.apache.hudi.utilities.schema.FilebasedSchemaProvider) HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) Row(org.apache.spark.sql.Row) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
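
The schema check at the end of the test above can also be exercised on its own. Below is a hedged sketch (the base path and the local copy of source_evolved.avsc are assumptions, not taken from the test setup) that resolves the latest table schema after the evolved writes and compares it with the expected Avro schema.

public static void verifyEvolvedSchema(String tableBasePath, org.apache.hadoop.conf.Configuration hadoopConf) throws Exception {
    // Resolve the latest table schema, excluding the Hudi metadata fields.
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setBasePath(tableBasePath)
        .setConf(hadoopConf)
        .build();
    TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
    Schema latestTableSchema = resolver.getTableAvroSchemaWithoutMetadataFields();
    // Parse the expected evolved schema from a local copy of the avsc file (assumed path).
    Schema expectedSchema = new Schema.Parser().parse(new java.io.File("/tmp/source_evolved.avsc"));
    // After RECONCILE_SCHEMA has taken effect, the resolved schema should carry the evolved field.
    assertEquals(expectedSchema, latestTableSchema);
}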

Example 8 with TableSchemaResolver

Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by Apache.

From the class HoodieCompactor, method compact.

/**
 * Execute compaction operations and report back status.
 */
public HoodieData<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, HoodieTable table, HoodieWriteConfig config, String compactionInstantTime, HoodieCompactionHandler compactionHandler) {
    if (compactionPlan == null || (compactionPlan.getOperations() == null) || (compactionPlan.getOperations().isEmpty())) {
        return context.emptyHoodieData();
    }
    HoodieActiveTimeline timeline = table.getActiveTimeline();
    HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
    // Mark instant as compaction inflight
    timeline.transitionCompactionRequestedToInflight(instant);
    table.getMetaClient().reloadActiveTimeline();
    HoodieTableMetaClient metaClient = table.getMetaClient();
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    // Use the latest table schema as the reader schema, because the schema carried by the write
    // config may not be the same as the table schema.
    try {
        Schema readerSchema = schemaResolver.getTableAvroSchema(false);
        config.setSchema(readerSchema.toString());
    } catch (Exception e) {
        // If there is no commit in the table, just ignore the exception.
    }
    // Compacting is very similar to applying updates to existing file
    List<CompactionOperation> operations = compactionPlan.getOperations().stream().map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
    LOG.info("Compactor compacting " + operations + " files");
    context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
    TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
    return context.parallelize(operations).map(operation -> compact(compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier)).flatMap(List::iterator);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieAccumulator(org.apache.hudi.common.data.HoodieAccumulator) RuntimeStats(org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) CompactionStrategy(org.apache.hudi.table.action.compact.strategy.CompactionStrategy) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieCompactionHandler(org.apache.hudi.table.HoodieCompactionHandler) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
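
The schema-resolution step used by the compactor can be isolated as follows. This is only a sketch under assumed names (metaClient and writeConfig are placeholders); it shows the same pattern of resolving the latest table schema as the reader schema and silently falling back when the table has no completed commits yet.

public static void applyTableSchema(HoodieTableMetaClient metaClient, HoodieWriteConfig writeConfig) {
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    try {
        // false: exclude the Hudi metadata fields from the resolved schema.
        Schema readerSchema = schemaResolver.getTableAvroSchema(false);
        writeConfig.setSchema(readerSchema.toString());
    } catch (Exception e) {
        // No completed commit in the table yet; keep whatever schema is already set on the write config.
    }
}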

Example 9 with TableSchemaResolver

Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by Apache.

From the class HoodieTable, method validateSchema.

/**
 * Ensure that the current writerSchema is compatible with the latest schema of this dataset.
 *
 * When inserting/updating data, we read records using the last used schema and convert them into
 * GenericRecords with the writerSchema. Hence, we need to ensure that this conversion can take place without errors.
 */
private void validateSchema() throws HoodieUpsertException, HoodieInsertException {
    if (!config.getAvroSchemaValidate() || getActiveTimeline().getCommitsTimeline().filterCompletedInstants().empty()) {
        // Check not required
        return;
    }
    Schema tableSchema;
    Schema writerSchema;
    boolean isValid;
    try {
        TableSchemaResolver schemaResolver = new TableSchemaResolver(getMetaClient());
        writerSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema());
        tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaResolver.getTableAvroSchemaWithoutMetadataFields());
        isValid = TableSchemaResolver.isSchemaCompatible(tableSchema, writerSchema);
    } catch (Exception e) {
        throw new HoodieException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e);
    }
    if (!isValid) {
        throw new HoodieException("Failed schema compatibility check for writerSchema :" + writerSchema + ", table schema :" + tableSchema + ", base path :" + metaClient.getBasePath());
    }
}
Also used : Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieException(org.apache.hudi.exception.HoodieException) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) TimeoutException(java.util.concurrent.TimeoutException) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
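
A standalone sketch of the same compatibility check (the writer schema JSON string here is a hypothetical input): it resolves the table schema, adds the Hoodie metadata fields to both sides, and rejects the write when the schemas are incompatible.

public static void checkWriterSchemaCompatibility(HoodieTableMetaClient metaClient, String writerSchemaJson) {
    Schema tableSchema;
    Schema writerSchema;
    boolean isCompatible;
    try {
        TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
        // Add the Hoodie metadata fields to both schemas so they are compared on equal footing.
        tableSchema = HoodieAvroUtils.createHoodieWriteSchema(resolver.getTableAvroSchemaWithoutMetadataFields());
        writerSchema = HoodieAvroUtils.createHoodieWriteSchema(writerSchemaJson);
        isCompatible = TableSchemaResolver.isSchemaCompatible(tableSchema, writerSchema);
    } catch (Exception e) {
        throw new HoodieException("Failed to read schema for base path " + metaClient.getBasePath(), e);
    }
    if (!isCompatible) {
        throw new HoodieException("Writer schema " + writerSchema + " is not compatible with table schema " + tableSchema);
    }
}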

Example 10 with TableSchemaResolver

Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by Apache.

From the class TestHoodieDeltaStreamer, method testParquetDFSSource.

private void testParquetDFSSource(boolean useSchemaProvider, List<String> transformerClassNames, boolean testEmptyBatch) throws Exception {
    prepareParquetDFSSource(useSchemaProvider, transformerClassNames != null);
    String tableBasePath = dfsBasePath + "/test_parquet_table" + testNum;
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, testEmptyBatch ? TestParquetDFSSourceEmptyBatch.class.getName() : ParquetDFSSource.class.getName(), transformerClassNames, PROPS_FILENAME_TEST_PARQUET, false, useSchemaProvider, 100000, false, null, null, "timestamp", null), jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
    testNum++;
    if (testEmptyBatch) {
        prepareParquetDFSFiles(100, PARQUET_SOURCE_ROOT, "2.parquet", false, null, null);
        // parquet source to return empty batch
        TestParquetDFSSourceEmptyBatch.returnEmptyBatch = true;
        deltaStreamer.sync();
        // since we mimicked an empty batch, the total record count should be the same as after the first sync().
        TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(tableBasePath).setConf(jsc.hadoopConfiguration()).build();
        // validate that the resolved table schema is a valid schema taken from the last commit that wrote data.
        TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
        assertNotEquals(Schema.create(Schema.Type.NULL).toString(), tableSchemaResolver.getTableAvroSchema().toString());
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) TestParquetDFSSourceEmptyBatch(org.apache.hudi.utilities.sources.TestParquetDFSSourceEmptyBatch) ParquetDFSSource(org.apache.hudi.utilities.sources.ParquetDFSSource) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver)
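
The schema assertion from the empty-batch branch can be expressed on its own as below. This is a hedged sketch with an assumed metaClient; it illustrates the expectation that, even when the latest commit wrote no records, the resolver still returns the schema from an earlier commit rather than a NULL schema.

public static void assertSchemaSurvivesEmptyCommit(HoodieTableMetaClient metaClient) throws Exception {
    TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
    // The resolved schema should come from the last commit that actually wrote data.
    Schema resolved = resolver.getTableAvroSchema();
    assertNotEquals(Schema.create(Schema.Type.NULL).toString(), resolved.toString());
}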

Aggregations

TableSchemaResolver (org.apache.hudi.common.table.TableSchemaResolver): 15
Schema (org.apache.avro.Schema): 14
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 6
HoodieException (org.apache.hudi.exception.HoodieException): 6
IOException (java.io.IOException): 5
Path (org.apache.hadoop.fs.Path): 4
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 4
Iterator (java.util.Iterator): 3
List (java.util.List): 3
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 3
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 3
Option (org.apache.hudi.common.util.Option): 3
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 3
Collections (java.util.Collections): 2
Collectors.toList (java.util.stream.Collectors.toList): 2
HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils): 2
HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan): 2
WriteStatus (org.apache.hudi.client.WriteStatus): 2
TypedProperties (org.apache.hudi.common.config.TypedProperties): 2
HoodieData (org.apache.hudi.common.data.HoodieData): 2