Search in sources:

Example 66 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in the apache/hudi project.

From the class TestSparkHoodieHBaseIndex, method testTagLocationAndPartitionPathUpdate:

@Test
public void testTagLocationAndPartitionPathUpdate() throws Exception {
    final String newCommitTime = "001";
    final int numRecords = 10;
    final String oldPartitionPath = "1970/01/01";
    final String emptyHoodieRecordPayloadClassName = EmptyHoodieRecordPayload.class.getName();
    List<HoodieRecord> newRecords = dataGen.generateInserts(newCommitTime, numRecords);
    List<HoodieRecord> oldRecords = new LinkedList<>();
    for (HoodieRecord newRecord : newRecords) {
        HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath);
        HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData());
        oldRecords.add(hoodieRecord);
    }
    JavaRDD<HoodieRecord> newWriteRecords = jsc().parallelize(newRecords, 1);
    JavaRDD<HoodieRecord> oldWriteRecords = jsc().parallelize(oldRecords, 1);
    HoodieWriteConfig config = getConfig(true, false);
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(getConfig(true, false));
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        // allowed path change test
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        JavaRDD<HoodieRecord> oldHoodieRecord = tagLocation(index, oldWriteRecords, hoodieTable);
        assertEquals(0, oldHoodieRecord.filter(record -> record.isCurrentLocationKnown()).count());
        writeClient.startCommitWithTime(newCommitTime);
        JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(oldWriteRecords, newCommitTime);
        writeClient.commit(newCommitTime, writeStatuses);
        assertNoWriteErrors(writeStatuses.collect());
        updateLocation(index, writeStatuses, hoodieTable);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        List<HoodieRecord> taggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
        assertEquals(numRecords * 2L, taggedRecords.stream().count());
        // Verify the number of deleted records
        assertEquals(numRecords, taggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClassName)).count());
        // Verify the number of inserted records
        assertEquals(numRecords, taggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count());
        // not allowed path change test
        index = new SparkHoodieHBaseIndex(getConfig(false, false));
        List<HoodieRecord> notAllowPathChangeRecords = tagLocation(index, newWriteRecords, hoodieTable).collect();
        assertEquals(numRecords, notAllowPathChangeRecords.stream().count());
        assertEquals(numRecords, taggedRecords.stream().filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown() && hoodieRecord.getKey().getPartitionPath().equals(oldPartitionPath)).count());
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) LinkedList(java.util.LinkedList) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
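
The assertions above hinge on how the HBase index reports a partition path change: the copy under the old partition comes back as a delete (its payload is an EmptyHoodieRecordPayload), while the copy under the new partition comes back untagged, i.e. as a fresh insert. The following is a minimal sketch, not part of the original test, that classifies tagged records along those lines using only the accessors already exercised above:

for (HoodieRecord record : taggedRecords) {
    // Delete marker emitted for the old partition path
    boolean deleteUnderOldPartition = record.getKey().getPartitionPath().equals(oldPartitionPath)
        && record.getData() instanceof EmptyHoodieRecordPayload;
    // Untagged record that will be re-inserted under the new partition path
    boolean insertUnderNewPartition = !record.getKey().getPartitionPath().equals(oldPartitionPath)
        && !record.isCurrentLocationKnown();
    // ... handle each case as needed
}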

Example 67 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in the apache/hudi project.

From the class TestBootstrap, method generateInputBatch:

private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
    List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream().flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath())))).collect(Collectors.toList());
    return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
        try {
            Configuration conf = jsc.hadoopConfiguration();
            AvroReadSupport.setAvroReadSchema(conf, writerSchema);
            Iterator<GenericRecord> recIterator = new ParquetReaderIterator(AvroParquetReader.<GenericRecord>builder(p.getValue()).withConf(conf).build());
            return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
                try {
                    String key = gr.get("_row_key").toString();
                    String pPath = p.getKey();
                    return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
                } catch (IOException e) {
                    throw new HoodieIOException(e.getMessage(), e);
                }
            });
        } catch (IOException ioe) {
            throw new HoodieIOException(ioe.getMessage(), ioe);
        }
    }).collect(Collectors.toList()));
}
Also used : BootstrapUtils(org.apache.hudi.table.action.bootstrap.BootstrapUtils) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) Spliterators(java.util.Spliterators) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Random(java.util.Random) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) LongWritable(org.apache.hadoop.io.LongWritable) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Tag(org.junit.jupiter.api.Tag) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) DataTypes(org.apache.spark.sql.types.DataTypes) Schema(org.apache.avro.Schema) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) Set(java.util.Set) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) TempDir(org.junit.jupiter.api.io.TempDir) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) IntStream(java.util.stream.IntStream) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) Collectors.mapping(java.util.stream.Collectors.mapping) StreamSupport(java.util.stream.StreamSupport) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) SaveMode(org.apache.spark.sql.SaveMode) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) 
BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) JobConf(org.apache.hadoop.mapred.JobConf) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) AfterEach(org.junit.jupiter.api.AfterEach) Collectors.toList(java.util.stream.Collectors.toList) UDF1(org.apache.spark.sql.api.java.UDF1) HoodieKey(org.apache.hudi.common.model.HoodieKey) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) PartitionPathEncodeUtils(org.apache.hudi.common.util.PartitionPathEncodeUtils) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBootstrapConfig(org.apache.hudi.config.HoodieBootstrapConfig)
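
The core pattern in generateInputBatch is plain Java rather than Hudi-specific: it adapts an Iterator (here a ParquetReaderIterator over GenericRecords) into a java.util.stream.Stream via Spliterators so each record can be mapped to a HoodieAvroRecord and collected. A self-contained sketch of that iterator-to-stream pattern, with illustrative names and plain strings instead of Parquet records:

import java.util.Iterator;
import java.util.List;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

public class IteratorToStreamSketch {
    public static void main(String[] args) {
        Iterator<String> recIterator = List.of("a", "b", "c").iterator();
        // Wrap the iterator in a spliterator of unknown size, then stream it lazily
        List<String> upper = StreamSupport
            .stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false)
            .map(String::toUpperCase)
            .collect(Collectors.toList());
        System.out.println(upper); // prints [A, B, C]
    }
}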

Example 68 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in the apache/hudi project.

From the class TestFlinkHoodieBloomIndex, method testBloomFilterFalseError:

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have two hoodie records
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    // We write only record1 to a base file, but the file's bloom filter contains both record keys
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record2.getRecordKey());
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
    String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1);
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    // We do the tag
    List<HoodieRecord> records = asList(record1, record2);
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieFlinkTable.create(config, context, metaClient);
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, table);
    // Check results
    for (HoodieRecord record : taggedRecords) {
        if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            assertEquals(record.getCurrentLocation().getFileId(), fileId);
        } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
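
The "false error" in the test name is a bloom filter false positive from the base file's point of view: record2's key is added to the filter but the record is never written to the file, so the filter claims the key may be present and the index must still leave record2 untagged. A short sketch of that mismatch, using the same factory call as the test; the keys below are hypothetical:

BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
filter.add("key-written-to-file");   // stands in for record1, which is written to the base file
filter.add("key-missing-from-file"); // stands in for record2, which is never written to the file
// Both lookups answer true: a bloom filter has no false negatives, but the second key is
// reported as possibly present even though the base file does not actually contain it.
boolean maybePresentWritten = filter.mightContain("key-written-to-file");
boolean maybePresentMissing = filter.mightContain("key-missing-from-file");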

Example 69 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in the apache/hudi project.

From the class TestFlinkHoodieBloomIndex, method testTagLocation:

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have some records to be tagged (two different partitions)
    String rowKey1 = randomUUID().toString();
    String rowKey2 = randomUUID().toString();
    String rowKey3 = randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    // Place the same row key under a different partition.
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<HoodieRecord> records = asList(record1, record2, record3, record4);
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    HoodieFlinkTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient);
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, hoodieTable);
    // Should not find any files
    for (HoodieRecord record : taggedRecords) {
        assertFalse(record.isCurrentLocationKnown());
    }
    // We create three base files, each having one record (across two different partitions).
    String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
    String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2);
    String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4);
    metaClient.reloadActiveTimeline();
    // We do the tag again
    taggedRecords = tagLocation(bloomIndex, records, HoodieFlinkTable.create(config, context, metaClient));
    // Check results
    for (HoodieRecord record : taggedRecords) {
        if (record.getRecordKey().equals(rowKey1)) {
            if (record.getPartitionPath().equals("2015/01/31")) {
                assertEquals(record.getCurrentLocation().getFileId(), fileId3);
            } else {
                assertEquals(record.getCurrentLocation().getFileId(), fileId1);
            }
        } else if (record.getRecordKey().equals(rowKey2)) {
            assertEquals(record.getCurrentLocation().getFileId(), fileId2);
        } else if (record.getRecordKey().equals(rowKey3)) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
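
After tagging, each record either carries the location of the base file that already contains its key or stays unlocated, meaning it would be handled as a new insert. A minimal sketch, separate from the test, of inspecting that outcome with the accessors used above:

for (HoodieRecord record : taggedRecords) {
    if (record.isCurrentLocationKnown()) {
        System.out.println(record.getRecordKey() + " found in file " + record.getCurrentLocation().getFileId()
            + " under partition " + record.getPartitionPath());
    } else {
        System.out.println(record.getRecordKey() + " would be written as a new insert");
    }
}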

Example 70 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in the apache/hudi project.

From the class TestJavaCopyOnWriteActionExecutor, method newHoodieRecords:

private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
    List<HoodieRecord> records = new ArrayList<>();
    for (int i = 0; i < n; i++) {
        String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i);
        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    return records;
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList)
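
A hedged usage sketch for the helper above; the record count and timestamp are illustrative values, not taken from any particular test:

// Produces 10 records whose "time" field is the supplied timestamp
List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
assertEquals(10, records.size());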

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 84 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 72 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 68 usages
ArrayList (java.util.ArrayList): 38 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 37 usages
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31 usages
Test (org.junit.jupiter.api.Test): 30 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 29 usages
Path (org.apache.hadoop.fs.Path): 26 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 25 usages
IOException (java.io.IOException): 24 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 24 usages
List (java.util.List): 23 usages
Schema (org.apache.avro.Schema): 23 usages
HashMap (java.util.HashMap): 22 usages
Pair (org.apache.hudi.common.util.collection.Pair): 21 usages
Map (java.util.Map): 20 usages
Collectors (java.util.stream.Collectors): 20 usages
Arrays (java.util.Arrays): 17 usages
Option (org.apache.hudi.common.util.Option): 16 usages