
Example 21 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in the apache/hudi project.

From class TestHoodieGlobalBloomIndex, method testLoadInvolvedFiles.

@Test
public void testLoadInvolvedFiles() throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    HoodieGlobalBloomIndex index = new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    // Create some partitions and put some files in them, along with the meta file:
    // "2016/01/21": 0 files
    // "2016/04/01": 1 file  (2_0_20160401010101.parquet)
    // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
    final String p1 = "2016/01/21";
    final String p2 = "2016/04/01";
    final String p3 = "2015/03/12";
    RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // intentionally leave out the partition "2015/03/12" to check whether the global bloom index still picks it up
    List<String> partitions = Arrays.asList(p1, p2);
    // the passed-in partitions are NOT respected by loadColumnRangesFromFiles(...); a global index scans all partitions
    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    // Still 0, as no valid commit
    assertEquals(0, filesList.size());
    final String fileId1 = "1";
    final String fileId2 = "2";
    final String fileId3 = "3";
    final String fileId4 = "4";
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    final String c1 = "20160401010101";
    Path baseFilePath = testTable.forCommit(c1).withInserts(p2, fileId2, Collections.emptyList());
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(c1, WriteOperationType.UPSERT, Collections.singletonList(p2), partitionToFilesNameLengthMap, false, false);
    final String c2 = "20150312101010";
    testTable.forCommit(c2);
    baseFilePath = testTable.withInserts(p3, fileId1, Collections.emptyList());
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(p3, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    baseFilePath = testTable.withInserts(p3, fileId3, Collections.singletonList(record1));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p3, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    baseFilePath = testTable.withInserts(p3, fileId4, Arrays.asList(record2, record3, record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(p3, k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(c2, WriteOperationType.UPSERT, Collections.singletonList(p3), partitionToFilesNameLengthMap, false, false);
    filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
    assertEquals(4, filesList.size());
    Map<String, BloomIndexFileInfo> filesMap = toFileMap(filesList);
    // key ranges checks
    assertNull(filesMap.get("2016/04/01/2").getMaxRecordKey());
    assertNull(filesMap.get("2016/04/01/2").getMinRecordKey());
    assertFalse(filesMap.get("2015/03/12/1").hasKeyRanges());
    assertNotNull(filesMap.get("2015/03/12/3").getMaxRecordKey());
    assertNotNull(filesMap.get("2015/03/12/3").getMinRecordKey());
    assertTrue(filesMap.get("2015/03/12/3").hasKeyRanges());
    Map<String, BloomIndexFileInfo> expected = new HashMap<>();
    expected.put("2016/04/01/2", new BloomIndexFileInfo("2"));
    expected.put("2015/03/12/1", new BloomIndexFileInfo("1"));
    expected.put("2015/03/12/3", new BloomIndexFileInfo("3", "000", "000"));
    expected.put("2015/03/12/4", new BloomIndexFileInfo("4", "001", "003"));
    assertEquals(expected, filesMap);
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), List (java.util.List), Pair (org.apache.hudi.common.util.collection.Pair), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable), Test (org.junit.jupiter.api.Test)
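
All of these tests build records the same way: RawTripTestPayload parses a raw JSON trip record, and the HoodieKey is assembled from the payload's row key and partition path. A minimal standalone sketch of that pattern, assuming (as the tests above imply) that the test utility derives the partition path from the record's time field, e.g. 2016-01-31T03:16:41.415Z maps to 2016/01/31:

import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.RawTripTestPayload;

public class RawTripPayloadSketch {
    public static void main(String[] args) throws Exception {
        // _row_key becomes the record key; the partition path comes from the "time"
        // field (assumption: "2016-01-31T..." maps to "2016/01/31", per the tests above)
        RawTripTestPayload payload = new RawTripTestPayload(
            "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieKey key = new HoodieKey(payload.getRowKey(), payload.getPartitionPath());
        HoodieRecord record = new HoodieAvroRecord(key, payload);
        System.out.println(key.getRecordKey() + " -> " + key.getPartitionPath());
    }
}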

Example 22 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in the apache/hudi project.

From class TestHoodieGlobalBloomIndex, method testTagLocation.

@Test
public void testTagLocation() throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM).withBloomIndexUpdatePartitionPath(false).build()).build();
    HoodieGlobalBloomIndex index = new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter);
    // Create some partitions and put some files in them, along with the meta file:
    // "2016/04/01": 1 file
    // "2015/03/12": 3 files
    // (file ids are random UUIDs in this test)
    final String partition2 = "2016/04/01";
    final String partition3 = "2015/03/12";
    RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    // this record will be saved in the table, and its location will be tagged to the incoming record5
    RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // record5 has the same record key as record4 but a different time, hence a different partition; the global
    // bloom index should still tag it with the original partition of the saved record4
    RawTripTestPayload rowChange5 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-02-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record5 = new HoodieAvroRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5);
    final String fileId1 = UUID.randomUUID().toString();
    final String fileId2 = UUID.randomUUID().toString();
    final String fileId3 = UUID.randomUUID().toString();
    final String fileId4 = UUID.randomUUID().toString();
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    // note: the incoming records carry partition paths derived from their time field, which differ from the
    // partitions written below; a global index must locate them regardless
    String commitTime = "0000001";
    Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition2, fileId1, Collections.singletonList(record1));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partition2, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition2), partitionToFilesNameLengthMap, false, false);
    commitTime = "0000002";
    baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId2, Collections.emptyList());
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(partition3, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), partitionToFilesNameLengthMap, false, false);
    commitTime = "0000003";
    baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId3, Collections.singletonList(record2));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(partition3, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), partitionToFilesNameLengthMap, false, false);
    commitTime = "0000004";
    baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId4, Collections.singletonList(record4));
    baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.clear();
    partitionToFilesNameLengthMap.computeIfAbsent(partition3, k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), partitionToFilesNameLengthMap, false, false);
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record5));
    // tagLocation(...) with a global index looks across all partitions, not just those of the incoming records
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        switch(record.getRecordKey()) {
            case "000":
                assertEquals(record.getCurrentLocation().getFileId(), fileId1);
                assertEquals(((RawTripTestPayload) record.getData()).getJsonData(), rowChange1.getJsonData());
                break;
            case "001":
                assertEquals(record.getCurrentLocation().getFileId(), fileId3);
                assertEquals(((RawTripTestPayload) record.getData()).getJsonData(), rowChange2.getJsonData());
                break;
            case "002":
                assertFalse(record.isCurrentLocationKnown());
                assertEquals(((RawTripTestPayload) record.getData()).getJsonData(), rowChange3.getJsonData());
                break;
            case "003":
                assertEquals(record.getCurrentLocation().getFileId(), fileId4);
                assertEquals(((RawTripTestPayload) record.getData()).getJsonData(), rowChange5.getJsonData());
                break;
            case "004":
                assertEquals(record.getCurrentLocation().getFileId(), fileId4);
                assertEquals(((RawTripTestPayload) record.getData()).getJsonData(), rowChange4.getJsonData());
                break;
            default:
                throw new IllegalArgumentException("Unknown Key: " + record.getRecordKey());
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), List (java.util.List), Pair (org.apache.hudi.common.util.collection.Pair), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable), Test (org.junit.jupiter.api.Test)
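
The config above sets withBloomIndexUpdatePartitionPath(false), which is why record5 (same key as record4, but a different partition) is tagged back to record4's stored partition. A hedged sketch of the two settings, using only the builder calls shown above; the base path is hypothetical, and the behavior noted for true is my reading of the global-index contract, not something this test exercises:

import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;

public class GlobalBloomConfigSketch {
    public static void main(String[] args) {
        String basePath = "/tmp/hudi-table"; // hypothetical path for the sketch
        // false: an incoming record whose key already exists is tagged to the
        // partition where that key is stored (the behavior tested above)
        HoodieWriteConfig keepStoredPartition = HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                .withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM)
                .withBloomIndexUpdatePartitionPath(false)
                .build())
            .build();
        // true (assumption): the record would instead be routed to its new
        // partition, with the old copy deleted
        HoodieWriteConfig moveToNewPartition = HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                .withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM)
                .withBloomIndexUpdatePartitionPath(true)
                .build())
            .build();
    }
}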

Example 23 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in the apache/hudi project.

From class TestUpdateSchemaEvolution, method buildUpdateRecords.

private List<HoodieRecord> buildUpdateRecords(String recordStr, String insertFileId) throws IOException {
    List<HoodieRecord> updateRecords = new ArrayList<>();
    RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
    HoodieRecord record = new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange);
    record.setCurrentLocation(new HoodieRecordLocation("101", insertFileId));
    record.seal();
    updateRecords.add(record);
    return updateRecords;
}
Also used: RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation), ArrayList (java.util.ArrayList)
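
Inlined for self-containment, a hypothetical use of the same pattern: build the update record, pin it to the file group it should update via setCurrentLocation, then seal it, exactly as the helper above does (the file id below is made up for illustration):

import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.testutils.RawTripTestPayload;

public class UpdateRecordSketch {
    public static void main(String[] args) throws Exception {
        RawTripTestPayload payload = new RawTripTestPayload(
            "{\"_row_key\":\"004\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieRecord record = new HoodieAvroRecord(
            new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload);
        // location = (instant time, file id): the file group this update should go to
        record.setCurrentLocation(new HoodieRecordLocation("101", "hypothetical-file-id"));
        // seal() freezes the record so the location cannot be changed afterwards
        record.seal();
        System.out.println(record.getCurrentLocation().getFileId());
    }
}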

Example 24 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in the apache/hudi project.

From class TestUpdateSchemaEvolution, method prepareFirstRecordCommit.

private WriteStatus prepareFirstRecordCommit(List<String> recordsStrs) throws IOException {
    // Create a bunch of records with an old version of the schema
    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.avsc");
    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
    final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
        List<HoodieRecord> insertRecords = new ArrayList<>();
        for (String recordStr : recordsStrs) {
            RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
            insertRecords.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
        }
        Map<String, HoodieRecord> insertRecordMap = insertRecords.stream().collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
        HoodieCreateHandle<?, ?, ?, ?> createHandle = new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
        createHandle.write();
        return createHandle.close().get(0);
    }).collect();
    final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
    FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
    return statuses.get(0);
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload), HoodieCreateHandle (org.apache.hudi.io.HoodieCreateHandle), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), HoodieKey (org.apache.hudi.common.model.HoodieKey), ArrayList (java.util.ArrayList), List (java.util.List), Map (java.util.Map), HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable)
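
The helper ends by faking a completed commit: it drops an empty commit file into the .hoodie folder so the timeline considers instant "100" complete. A minimal sketch of just that step, with a hypothetical base path and a plain Hadoop Configuration standing in for the harness config:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class FakeCommitSketch {
    public static void main(String[] args) throws Exception {
        String basePath = "/tmp/hudi-table"; // hypothetical table location
        // e.g. "100.commit": an empty marker that makes instant "100" look completed
        String commitFileName = HoodieTimeline.makeCommitFileName("100");
        Path commitFile = new Path(basePath + "/.hoodie/" + commitFileName);
        FileSystem fs = FSUtils.getFs(basePath, new Configuration());
        fs.create(commitFile).close(); // close the stream so the empty file is persisted
    }
}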

Example 25 with RawTripTestPayload

Use of org.apache.hudi.common.testutils.RawTripTestPayload in the apache/hudi project.

From class TestHoodieBucketIndex, method testTagLocation.

@Test
public void testTagLocation() throws Exception {
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    JavaRDD<HoodieRecord<HoodieAvroRecord>> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
    HoodieWriteConfig config = makeConfig();
    HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
    HoodieBucketIndex bucketIndex = new HoodieBucketIndex(config);
    HoodieData<HoodieRecord<HoodieAvroRecord>> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table);
    assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(r -> r.isCurrentLocationKnown()));
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA);
    testTable.addCommit("001").withInserts("2016/01/31", getRecordFileId(record1), record1);
    testTable.addCommit("002").withInserts("2016/01/31", getRecordFileId(record2), record2);
    testTable.addCommit("003").withInserts("2016/01/31", getRecordFileId(record3), record3);
    taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, HoodieSparkTable.create(config, context, metaClient));
    assertFalse(taggedRecordRDD.collectAsList().stream().filter(r -> r.isCurrentLocationKnown()).filter(r -> BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId()) != getRecordBucketId(r)).findAny().isPresent());
    assertTrue(taggedRecordRDD.collectAsList().stream().filter(r -> r.getPartitionPath().equals("2015/01/31") && !r.isCurrentLocationKnown()).count() == 1L);
}
Also used: RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable), Test (org.junit.jupiter.api.Test)
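
The final assertions lean on the bucket-index file id convention, where the file id is prefixed with a zero-padded bucket number that BucketIdentifier.bucketIdFromFileId parses back out. A small sketch of that parsing; the example file id and the import path are assumptions based on the hudi source layout, not something shown above:

import org.apache.hudi.index.bucket.BucketIdentifier;

public class BucketIdSketch {
    public static void main(String[] args) {
        // hypothetical bucket-index file id: the leading zero-padded digits encode the bucket
        String fileId = "00000002-aaaa-bbbb-cccc-000000000000-0";
        int bucketId = BucketIdentifier.bucketIdFromFileId(fileId);
        System.out.println(bucketId); // 2, assuming the prefix convention above
    }
}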

Aggregations

RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 30
HoodieKey (org.apache.hudi.common.model.HoodieKey): 30
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 27
ArrayList (java.util.ArrayList): 24
Test (org.junit.jupiter.api.Test): 20
Path (org.apache.hadoop.fs.Path): 16
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 16
List (java.util.List): 14
HoodieTable (org.apache.hudi.table.HoodieTable): 14
Arrays (java.util.Arrays): 12
Map (java.util.Map): 12
Schema (org.apache.avro.Schema): 12
Pair (org.apache.hudi.common.util.collection.Pair): 12
AfterEach (org.junit.jupiter.api.AfterEach): 12
HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable): 11
JavaRDD (org.apache.spark.api.java.JavaRDD): 11
Assertions.assertFalse (org.junit.jupiter.api.Assertions.assertFalse): 11
Assertions.assertTrue (org.junit.jupiter.api.Assertions.assertTrue): 11