
Example 11 with HoodieTableType

Use of org.apache.hudi.common.model.HoodieTableType in project hudi by apache.

The class TestHoodieMetadataBootstrap, method testMetadataBootstrapInflightCommit.

@Test
public void testMetadataBootstrapInflightCommit() throws Exception {
    HoodieTableType tableType = COPY_ON_WRITE;
    init(tableType, false);
    doPreBootstrapWriteOperation(testTable, INSERT, "0000001");
    doPreBootstrapWriteOperation(testTable, "0000002");
    // add an inflight commit
    HoodieCommitMetadata inflightCommitMeta = testTable.doWriteOperation("00000007", UPSERT, emptyList(), asList("p1", "p2"), 2, true, true);
    // with the inflight commit present, bootstrap should not happen and the subsequent validation should fail
    bootstrapAndVerifyFailure();
    // once the commit is complete, metadata should get fully synced.
    // in the production code path, SparkHoodieBackedTableMetadataWriter.create() will be called for every commit,
    // which may not be the case here if we directly call HoodieBackedTableMetadataWriter.update(),
    // hence let's first move the commit to complete and invoke the sync directly
    ((HoodieMetadataTestTable) testTable).moveInflightCommitToComplete("00000007", inflightCommitMeta, true);
    syncTableMetadata(writeConfig);
    validateMetadata(testTable);
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), HoodieMetadataTestTable (org.apache.hudi.common.testutils.HoodieMetadataTestTable), Test (org.junit.jupiter.api.Test), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)
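
Many of the bootstrap tests in this class run the same flow for both table types. Below is a minimal sketch of such a variant, assuming the harness helpers shown in Examples 11 and 13 (init, doPreBootstrapWriteOperation, getWriteConfig, initWriteConfigAndMetatableWriter, syncTableMetadata, validateMetadata) and JUnit 5's @EnumSource (org.junit.jupiter.params.provider.EnumSource); the test name is hypothetical.

@ParameterizedTest
@EnumSource(HoodieTableType.class) // runs once for COPY_ON_WRITE and once for MERGE_ON_READ
public void testMetadataBootstrapForTableType(HoodieTableType tableType) throws Exception {
    init(tableType, false);
    doPreBootstrapWriteOperation(testTable, INSERT, "0000001");
    doPreBootstrapWriteOperation(testTable, "0000002");
    // all commits are complete, so bootstrapping the metadata table is expected to succeed
    writeConfig = getWriteConfig(true, true);
    initWriteConfigAndMetatableWriter(writeConfig, true);
    syncTableMetadata(writeConfig);
    validateMetadata(testTable);
}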

Example 12 with HoodieTableType

Use of org.apache.hudi.common.model.HoodieTableType in project hudi by apache.

The class HoodieClientRollbackTestBase, method twoUpsertCommitDataWithTwoPartitions.

protected void twoUpsertCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondUpsert) throws IOException {
    // just generate two partitions
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
    // 1. prepare data
    HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    /**
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    /**
     * Write 2 (updates)
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, records);
    statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    if (commitSecondUpsert) {
        client.commit(newCommitTime, statuses);
    }
    // 2. assert file group and get the first partition file slice
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit2FileGroups.size());
    firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 3. assert file group and get the second partition file slice
    List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, secondPartitionCommit2FileGroups.size());
    secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 4. assert file slices: COPY_ON_WRITE rewrites the base file on the second upsert, so each file group
    //    carries two file slices; MERGE_ON_READ appends log files to the existing slice, so there is only one
    HoodieTableType tableType = this.getTableType();
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) {
        assertEquals(2, firstPartitionCommit2FileSlices.size());
        assertEquals(2, secondPartitionCommit2FileSlices.size());
    } else {
        assertEquals(1, firstPartitionCommit2FileSlices.size());
        assertEquals(1, secondPartitionCommit2FileSlices.size());
    }
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup), SyncableFileSystemView (org.apache.hudi.common.table.view.SyncableFileSystemView), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), WriteStatus (org.apache.hudi.client.WriteStatus)
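
The branch in step 4 encodes a table-type invariant that is easy to lose in the assertions. A small sketch of the same expectation as a standalone helper (the method name is hypothetical):

private static int expectedFileSlicesAfterSecondUpsert(HoodieTableType tableType) {
    // COPY_ON_WRITE rewrites the base file on the second upsert, adding a new file slice per file group;
    // MERGE_ON_READ appends log files to the existing slice, so the slice count stays at one.
    return tableType == HoodieTableType.COPY_ON_WRITE ? 2 : 1;
}

With it, the assertions above collapse to assertEquals(expectedFileSlicesAfterSecondUpsert(tableType), firstPartitionCommit2FileSlices.size()), and likewise for the second partition.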

Example 13 with HoodieTableType

Use of org.apache.hudi.common.model.HoodieTableType in project hudi by apache.

The class TestHoodieMetadataBootstrap, method testMetadataBootstrapWithExtraFiles.

/**
 * Validates that bootstrap considers only files that are part of completed commits and ignores any extra files.
 */
@Test
public void testMetadataBootstrapWithExtraFiles() throws Exception {
    HoodieTableType tableType = COPY_ON_WRITE;
    init(tableType, false);
    doPreBootstrapWriteOperation(testTable, INSERT, "0000001");
    doPreBootstrapWriteOperation(testTable, "0000002");
    doPreBootstrapClean(testTable, "0000003", Arrays.asList("0000001"));
    doPreBootstrapWriteOperation(testTable, "0000005");
    // add a few extra files to the table; bootstrap should not include files that are not part of a completed commit
    String fileName = UUID.randomUUID().toString();
    Path baseFilePath = FileCreateUtils.getBaseFilePath(basePath, "p1", "0000006", fileName);
    FileCreateUtils.createBaseFile(basePath, "p1", "0000006", fileName, 100);
    writeConfig = getWriteConfig(true, true);
    initWriteConfigAndMetatableWriter(writeConfig, true);
    syncTableMetadata(writeConfig);
    // remove those files from the table and then validate
    Files.delete(baseFilePath);
    // validate
    validateMetadata(testTable);
    // after bootstrap, do two writes and validate that it is still functional
    doWriteInsertAndUpsert(testTable);
    validateMetadata(testTable);
}
Also used: Path (java.nio.file.Path), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), Test (org.junit.jupiter.api.Test), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)
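
To make the extra-file behavior explicit, a hedged variant of the middle of this test could assert the orphan file's presence before the sync and its absence before validation; it reuses only the calls already shown above plus java.nio.file.Files and JUnit's assertTrue/assertFalse.

String orphanName = UUID.randomUUID().toString();
Path orphanPath = FileCreateUtils.getBaseFilePath(basePath, "p1", "0000006", orphanName);
FileCreateUtils.createBaseFile(basePath, "p1", "0000006", orphanName, 100);
assertTrue(Files.exists(orphanPath));  // orphan base file is on storage but belongs to no completed commit
syncTableMetadata(writeConfig);
Files.delete(orphanPath);
assertFalse(Files.exists(orphanPath)); // removed again before validation
validateMetadata(testTable);           // passes only if the metadata table never listed the orphan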

Example 14 with HoodieTableType

Use of org.apache.hudi.common.model.HoodieTableType in project hudi by apache.

The class TestUpgradeDowngrade, method twoUpsertCommitDataWithTwoPartitions.

/**
 * Creates two commits, optionally completing the second one.
 *
 * @param firstPartitionCommit2FileSlices list to hold file slices from the first partition.
 * @param secondPartitionCommit2FileSlices list to hold file slices from the second partition.
 * @param cfg instance of {@link HoodieWriteConfig}.
 * @param client instance of {@link SparkRDDWriteClient} to use.
 * @param commitSecondUpsert true if the second upsert should be committed, false otherwise.
 * @return a pair of record lists from the first and second write batches.
 */
private Pair<List<HoodieRecord>, List<HoodieRecord>> twoUpsertCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, SparkRDDWriteClient client, boolean commitSecondUpsert) throws IOException {
    // just generate two partitions
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
    // 1. prepare data
    HoodieTestDataGenerator.writePartitionMetadataDeprecated(metaClient.getFs(), new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
    /**
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    /**
     * Write 2 (updates)
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records2 = dataGen.generateUpdates(newCommitTime, records);
    statuses = client.upsert(jsc.parallelize(records2, 1), newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    if (commitSecondUpsert) {
        client.commit(newCommitTime, statuses);
    }
    // 2. assert filegroup and get the first partition fileslice
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit2FileGroups.size());
    firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 3. assert filegroup and get the second partition fileslice
    List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, secondPartitionCommit2FileGroups.size());
    secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 4. assert file slice counts (two per file group for COPY_ON_WRITE, one for MERGE_ON_READ; see Example 12)
    HoodieTableType tableType = metaClient.getTableType();
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) {
        assertEquals(2, firstPartitionCommit2FileSlices.size());
        assertEquals(2, secondPartitionCommit2FileSlices.size());
    } else {
        assertEquals(1, firstPartitionCommit2FileSlices.size());
        assertEquals(1, secondPartitionCommit2FileSlices.size());
    }
    return Pair.of(records, records2);
}
Also used: SyncableFileSystemView (org.apache.hudi.common.table.view.SyncableFileSystemView), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), WriteStatus (org.apache.hudi.client.WriteStatus), HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup)
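
A hypothetical caller sketch for this helper, leaving the second upsert uncommitted (as a rollback or upgrade/downgrade scenario would) and keeping both record batches for later assertions; Pair is assumed to be Hudi's Pair with getLeft()/getRight(), matching the Pair.of(...) used above.

List<FileSlice> firstPartitionFileSlices = new ArrayList<>();
List<FileSlice> secondPartitionFileSlices = new ArrayList<>();
Pair<List<HoodieRecord>, List<HoodieRecord>> batches =
    twoUpsertCommitDataWithTwoPartitions(firstPartitionFileSlices, secondPartitionFileSlices, cfg, client, false);
List<HoodieRecord> firstBatch = batches.getLeft();   // records written and committed as "001"
List<HoodieRecord> secondBatch = batches.getRight(); // records written as "002" but left uncommitted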

Example 15 with HoodieTableType

Use of org.apache.hudi.common.model.HoodieTableType in project hudi by apache.

The class HoodieTableSource, method getBatchInputFormat.

private InputFormat<RowData, ?> getBatchInputFormat() {
    final Schema tableAvroSchema = getTableAvroSchema();
    final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
    final RowType rowType = (RowType) rowDataType.getLogicalType();
    final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType();
    final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE);
    switch(queryType) {
        case FlinkOptions.QUERY_TYPE_SNAPSHOT:
            final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE));
            switch(tableType) {
                case MERGE_ON_READ:
                    final List<MergeOnReadInputSplit> inputSplits = buildFileIndex();
                    if (inputSplits.size() == 0) {
                        // When there are no input splits, just return an empty source.
                        LOG.warn("No input splits generate for MERGE_ON_READ input format, returns empty collection instead");
                        return InputFormats.EMPTY_INPUT_FORMAT;
                    }
                    return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, inputSplits, false);
                case COPY_ON_WRITE:
                    return baseFileOnlyInputFormat();
                default:
                    throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE));
            }
        case FlinkOptions.QUERY_TYPE_READ_OPTIMIZED:
            return baseFileOnlyInputFormat();
        case FlinkOptions.QUERY_TYPE_INCREMENTAL:
            IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder().conf(conf).path(FilePathUtils.toFlinkPath(path)).maxCompactionMemoryInBytes(maxCompactionMemoryInBytes).requiredPartitions(getRequiredPartitionPaths()).build();
            final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf);
            if (result.isEmpty()) {
                // When there are no input splits, just return an empty source.
                LOG.warn("No input splits generate for incremental read, returns empty collection instead");
                return InputFormats.EMPTY_INPUT_FORMAT;
            }
            return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, result.getInputSplits(), false);
        default:
            String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s', '%s'] are supported now", queryType, FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED, FlinkOptions.QUERY_TYPE_INCREMENTAL);
            throw new HoodieException(errMsg);
    }
}
Also used: MergeOnReadInputSplit (org.apache.hudi.table.format.mor.MergeOnReadInputSplit), IncrementalInputSplits (org.apache.hudi.source.IncrementalInputSplits), ResolvedSchema (org.apache.flink.table.catalog.ResolvedSchema), Schema (org.apache.avro.Schema), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), DataType (org.apache.flink.table.types.DataType), RowType (org.apache.flink.table.types.logical.RowType), HoodieException (org.apache.hudi.exception.HoodieException)
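
A minimal configuration sketch for driving the snapshot branch of this method on a MERGE_ON_READ table, using only the FlinkOptions keys already referenced above; Configuration is org.apache.flink.configuration.Configuration, and the exact key and value strings are assumptions based on the reads in the method.

Configuration conf = new Configuration();
// select the QUERY_TYPE_SNAPSHOT case of the outer switch
conf.setString(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_SNAPSHOT);
// then the MERGE_ON_READ case of the inner switch; HoodieTableType.valueOf(...) parses this back
conf.setString(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());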

Aggregations

HoodieTableType (org.apache.hudi.common.model.HoodieTableType): 15
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 11
Test (org.junit.jupiter.api.Test): 6
WriteStatus (org.apache.hudi.client.WriteStatus): 3
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 3
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 3
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 3
HoodieTable (org.apache.hudi.table.HoodieTable): 3
IOException (java.io.IOException): 2
List (java.util.List): 2
Schema (org.apache.avro.Schema): 2
ResolvedSchema (org.apache.flink.table.catalog.ResolvedSchema): 2
DataType (org.apache.flink.table.types.DataType): 2
RowType (org.apache.flink.table.types.logical.RowType): 2
Path (org.apache.hadoop.fs.Path): 2
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 2
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 2
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 2
SyncableFileSystemView (org.apache.hudi.common.table.view.SyncableFileSystemView): 2
HoodieTableMetadata (org.apache.hudi.metadata.HoodieTableMetadata): 2