Example 11 with HoodieSparkTable

Use of org.apache.hudi.table.HoodieSparkTable in project hudi by apache.

Class HoodieTestSuiteWriter, method commitCompaction.

public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats, Option<String> instantTime) throws IOException {
    if (!cfg.useDeltaStreamer) {
        Map<String, String> extraMetadata = new HashMap<>();
        /**
         * Store the checkpoint in the commit metadata just like
         * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)}.
         */
        extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
        if (generatedDataStats != null && generatedDataStats.count() > 1) {
            // Record only the path where this batch of data was generated.
            extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0));
        }
        }
        HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
        HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
        writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata));
    }
}
Also used : Arrays(java.util.Arrays) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HashSet(java.util.HashSet) DagNode(org.apache.hudi.integ.testsuite.dag.nodes.DagNode) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestSuiteConfig(org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) HoodiePayloadConfig(org.apache.hudi.config.HoodiePayloadConfig) CompactHelpers(org.apache.hudi.table.action.compact.CompactHelpers) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) RollbackNode(org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) CleanNode(org.apache.hudi.integ.testsuite.dag.nodes.CleanNode) ScheduleCompactNode(org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) RDD(org.apache.spark.rdd.RDD) Pair(org.apache.hudi.common.util.collection.Pair) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HashMap(java.util.HashMap) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
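
All of the examples in this listing hinge on HoodieSparkTable.create, either from a write config and engine context alone (as above and in Example 15) or with an explicitly supplied meta client (Examples 12-14). Below is a minimal, self-contained sketch of both overloads, assuming the Hudi Spark client modules are on the classpath and that a Hudi table already exists at the placeholder basePath (otherwise building the meta client fails); the path and app name are placeholders, not values from the examples.

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.spark.api.java.JavaSparkContext;

public class HoodieSparkTableCreateSketch {

    public static void main(String[] args) {
        // Placeholder path; a Hudi table is assumed to already exist here.
        String basePath = "file:///tmp/hudi_trips_table";

        JavaSparkContext jsc = new JavaSparkContext("local[2]", "hoodie-spark-table-sketch");
        HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();

        // Overload from Examples 11 and 15: the meta client is resolved internally from the base path.
        HoodieSparkTable tableFromConfig = HoodieSparkTable.create(config, context);

        // Overload from Examples 12-14: reuse an already-loaded (or reloaded) meta client.
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(jsc.hadoopConfiguration())
            .setBasePath(basePath)
            .build();
        HoodieSparkTable tableFromMetaClient = HoodieSparkTable.create(config, context, metaClient);

        jsc.stop();
    }
}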

Example 12 with HoodieSparkTable

Use of org.apache.hudi.table.HoodieSparkTable in project hudi by apache.

Class TestHoodieIndex, method testTagLocationAndDuplicateUpdate.

@ParameterizedTest
@MethodSource("indexTypeParams")
public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
    setUp(indexType, populateMetaFields, enableMetadataIndex);
    String newCommitTime = "001";
    int totalRecords = 10 + random.nextInt(20);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
    JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords, hoodieTable);
    // Duplicate the upsert and ensure correctness is maintained.
    // This approximately imitates the case where the RDD is recomputed: on recomputation the driver
    // code, including the instant state transitions, is not re-run, so the inflight instant is deleted
    // here to keep the subsequent upsert from running into conflicts.
    metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight"));
    writeClient.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(writeStatuses.collect());
    // Now commit this, update the location of the inserted records, and validate that there are no errors.
    writeClient.commit(newCommitTime, writeStatuses);
    // Now tagLocation for these records; the index should tag them correctly.
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> javaRDD = tagLocation(index, writeRecords, hoodieTable);
    Map<String, String> recordKeyToPartitionPathMap = new HashMap<>();
    List<HoodieRecord> hoodieRecords = writeRecords.collect();
    hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));
    assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
    assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
    javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));
    JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
    JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable);
    List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
    assertEquals(totalRecords, recordLocations.collect().size());
    assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
    recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
    recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieLayoutConfig(org.apache.hudi.config.HoodieLayoutConfig) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Random(java.util.Random) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) SparkBucketIndexPartitioner(org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Tag(org.junit.jupiter.api.Tag) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions(org.apache.hudi.testutils.Assertions) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) Option(org.apache.hudi.common.util.Option) 
MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
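
The test above drives the standard write-client round trip (startCommitWithTime, upsert, commit) before tagging locations a second time. A minimal sketch of that round trip in isolation follows; writeClient and writeRecords are assumed to be supplied by a harness like the one above, and the class and method names are hypothetical.

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;

public class UpsertRoundTripSketch {

    // Sketch only: `writeClient` and `writeRecords` are assumed to be set up by the caller,
    // e.g. through the same kind of test harness used above.
    static <T extends HoodieRecordPayload> JavaRDD<WriteStatus> upsertAndCommit(
            SparkRDDWriteClient<T> writeClient, JavaRDD<HoodieRecord<T>> writeRecords, String instantTime) {
        writeClient.startCommitWithTime(instantTime);
        JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, instantTime);
        // Committing makes the instant visible on the timeline so a later tagLocation can find it.
        writeClient.commit(instantTime, writeStatuses);
        return writeStatuses;
    }
}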

Example 13 with HoodieSparkTable

Use of org.apache.hudi.table.HoodieSparkTable in project hudi by apache.

Class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles.

@Test
public void testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles() throws Exception {
    // Note: this is used because it is the same partition path used in CompactionTestUtils.createCompactionPlan().
    final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(1024).build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.HBASE)
            .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
            .build())
        .build();
    // Create file group with only one log file
    FileCreateUtils.createLogFile(basePath, testPartitionPath, "001", "fg1", 1);
    FileCreateUtils.createDeltaCommit(basePath, "001");
    // Create another file group whose size is set to the max parquet file size, so it should not be considered during small file sizing
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "002", "fg2", 1024);
    FileCreateUtils.createCommit(basePath, "002");
    FileCreateUtils.createLogFile(basePath, testPartitionPath, "003", "fg2", 1);
    FileCreateUtils.createDeltaCommit(basePath, "003");
    // The partitioner will attempt to assign inserts to file groups, including the base file created by the inflight compaction
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    // The default estimated record size will be 1024, based on the last file group created, so only 1 record can be added to the small file
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 1);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
    assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
    assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 should be UPDATE");
    assertEquals("fg1", partitioner.getBucketInfo(0).fileIdPrefix, "Insert should be assigned to fg1");
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkUpsertDeltaCommitPartitioner(org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)
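
The partitioner behaviour under test is driven entirely by the write config: a 1 KB small-file threshold together with a 1 KB max parquet file size forces fg1 to be treated as a small file. Below is a sketch of just that configuration, assuming a plain withPath-based builder is sufficient for illustration; the HBase index settings from the test are omitted, and the class name and base path are placeholders.

import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class SmallFileHandlingConfigSketch {

    // Sketch only: the tiny 1 KB thresholds exist purely so small-file handling
    // triggers immediately on a test-sized dataset.
    static HoodieWriteConfig smallFileConfig(String basePath) {
        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .compactionSmallFileSize(1024)
                .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                .parquetMaxFileSize(1024)
                .build())
            .build();
    }
}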

Example 14 with HoodieSparkTable

Use of org.apache.hudi.table.HoodieSparkTable in project hudi by apache.

Class TestHoodieBloomIndex, method testCheckUUIDsAgainstOneFile.

@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
    final String partition = "2016/01/31";
    // Create some records to use
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // We write record1, record2 to a parquet file, but the bloom filter contains (record1,
    // record2, record3).
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record3.getRecordKey());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter, metadataWriter);
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    final String commitTime = "0000001";
    final String fileId = UUID.randomUUID().toString();
    Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition, fileId, Arrays.asList(record1, record2));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partition, k -> new ArrayList<>()).add(Pair.of(fileId, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition), partitionToFilesNameLengthMap, false, false);
    final String filename = testTable.getBaseFileNameById(fileId);
    // The bloom filter contains 3 records
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    assertTrue(filter.mightContain(record3.getRecordKey()));
    assertFalse(filter.mightContain(record4.getRecordKey()));
    // Compare with file
    List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
    assertEquals(2, results.size());
    assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
    assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
// assertTrue(results.get(0)._2().equals(filename));
// assertTrue(results.get(1)._2().equals(filename));
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) TestHoodieMetadataBase(org.apache.hudi.client.functional.TestHoodieMetadataBase) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) ArrayList(java.util.ArrayList) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
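
The bloom-filter API used in this test is easy to exercise on its own. Here is a small, self-contained sketch using only the factory and probe calls shown above; the class name is a placeholder and the keys are arbitrary strings taken from the test data.

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

public class BloomFilterSketch {

    public static void main(String[] args) {
        // Same factory call as the test: ~10k expected entries, very low false-positive rate.
        BloomFilter filter = BloomFilterFactory.createBloomFilter(
            10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());

        filter.add("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0");

        // No false negatives: an added key always reports true.
        System.out.println(filter.mightContain("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); // true
        // A key that was never added is reported absent with high probability.
        System.out.println(filter.mightContain("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")); // false (almost certainly)
    }
}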

Example 15 with HoodieSparkTable

Use of org.apache.hudi.table.HoodieSparkTable in project hudi by apache.

Class TestUpdateSchemaEvolution, method testSchemaEvolutionOnUpdateMisMatchWithDeleteColumn.

@Test
public void testSchemaEvolutionOnUpdateMisMatchWithDeleteColumn() throws Exception {
    final WriteStatus insertResult = prepareFirstRecordCommit(generateOneRecordForExampleSchema());
    // Now try an update with an evolved schema.
    // The evolved schema is not guaranteed to preserve the original field ordering.
    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchemaDeleteColumn.avsc");
    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\"}";
    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
    String assertMsg = "UpdateFunction when delete column, Parquet/Avro schema mismatch: Avro field 'xxx' not found";
    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, true, InvalidRecordException.class);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)
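
The mismatch in this test comes from writing with a config whose schema dropped a column. A hedged sketch of building such a config with an explicit writer schema follows; the schema JSON and class name are illustrative placeholders, whereas the test's own makeHoodieClientConfig helper loads the evolved schema from the /exampleEvolvedSchemaDeleteColumn.avsc resource instead.

import org.apache.avro.Schema;
import org.apache.hudi.config.HoodieWriteConfig;

public class EvolvedSchemaConfigSketch {

    // Sketch only: an "evolved" record schema that no longer carries the `number` field.
    private static final String EVOLVED_SCHEMA_JSON =
        "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
            + "{\"name\":\"_row_key\",\"type\":\"string\"},"
            + "{\"name\":\"time\",\"type\":\"string\"}]}";

    static HoodieWriteConfig configWithEvolvedSchema(String basePath) {
        Schema evolved = new Schema.Parser().parse(EVOLVED_SCHEMA_JSON);
        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withSchema(evolved.toString())
            .build();
    }
}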

Aggregations

HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 24
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 22
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17
Test (org.junit.jupiter.api.Test): 14
Map (java.util.Map): 9
Arrays (java.util.Arrays): 7
HashMap (java.util.HashMap): 7
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 7
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 7
IOException (java.io.IOException): 6
List (java.util.List): 6
Schema (org.apache.avro.Schema): 6
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 6
Option (org.apache.hudi.common.util.Option): 6
ArrayList (java.util.ArrayList): 5
Collectors (java.util.stream.Collectors): 5
Path (org.apache.hadoop.fs.Path): 5
HoodieKey (org.apache.hudi.common.model.HoodieKey): 5
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 5
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 5