
Example 51 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From the class TestCopyOnWriteActionExecutor, method testMetadataAggregateFromWriteStatus.

// Check if record level metadata is aggregated properly at the end of write.
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
    // Build a write config that aggregates record-level metadata via MetadataMergeWriteStatus
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
    String firstCommitTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    // Create some records belonging to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table, firstCommitTime, context.parallelize(records));
    List<WriteStatus> writeStatuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
        return actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator());
    }).flatMap(Transformations::flattenAsIterator).collect();
    Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(writeStatuses);
    assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
    // For metadata key InputRecordCount_1506582000, each record contributes a value of 2,
    // so the merged sum across the 3 records should be 2 * 3 = 6
    assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
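The merge step above sums per-record metadata values across all write statuses. Below is a minimal, self-contained sketch of that aggregation idea; the class MetadataMergeSketch and its mergeMetadata helper are hypothetical illustrations of what MetadataMergeWriteStatus.mergeMetadataForWriteStatuses appears to verify, not Hudi's actual implementation.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MetadataMergeSketch {

    // Merge per-status metadata maps by summing numeric string values per key.
    static Map<String, String> mergeMetadata(List<Map<String, String>> perStatusMetadata) {
        Map<String, String> merged = new HashMap<>();
        for (Map<String, String> metadata : perStatusMetadata) {
            for (Map.Entry<String, String> entry : metadata.entrySet()) {
                merged.merge(entry.getKey(), entry.getValue(),
                    (a, b) -> String.valueOf(Long.parseLong(a) + Long.parseLong(b)));
            }
        }
        return merged;
    }

    public static void main(String[] args) {
        // Three records, each contributing "2" for the same key, merge to "6".
        List<Map<String, String>> statuses = List.of(
            Map.of("InputRecordCount_1506582000", "2"),
            Map.of("InputRecordCount_1506582000", "2"),
            Map.of("InputRecordCount_1506582000", "2"));
        System.out.println(mergeMetadata(statuses)); // {InputRecordCount_1506582000=6}
    }
}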

Example 52 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From the class TestCopyOnWriteActionExecutor, method newHoodieRecords.

private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
    List<HoodieRecord> records = new ArrayList<>();
    for (int i = 0; i < n; i++) {
        String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i);
        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    return records;
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList)
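In this helper, the partition path is not passed in explicitly: RawTripTestPayload presumably derives it from the record's "time" field, which is how the earlier examples map "2016-01-31T03:16:41.415Z" to the partition "2016/01/31". A minimal sketch of that derivation, assuming an ISO-8601 input (PartitionPathSketch is a hypothetical name, not a Hudi class):

import java.time.OffsetDateTime;
import java.time.format.DateTimeFormatter;

public class PartitionPathSketch {

    // Derive a yyyy/MM/dd partition path from an ISO-8601 timestamp.
    static String partitionPath(String isoTime) {
        OffsetDateTime ts = OffsetDateTime.parse(isoTime);
        return ts.format(DateTimeFormatter.ofPattern("yyyy/MM/dd"));
    }

    public static void main(String[] args) {
        System.out.println(partitionPath("2016-01-31T03:16:41.415Z")); // 2016/01/31
    }
}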

Example 53 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From the class TestCopyOnWriteActionExecutor, method testFileSizeUpsertRecords.

@Test
public void testFileSizeUpsertRecords() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(64 * 1024).hfileMaxFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024).build()).build();
    String instantTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> records = new ArrayList<>();
    // Approximately 1150 records fit per file with a 64KB block size
    for (int i = 0; i < 2000; i++) {
        String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    // Insert new records
    BaseSparkCommitActionExecutor actionExecutor = new SparkUpsertCommitActionExecutor(context, config, table, instantTime, context.parallelize(records));
    jsc.parallelize(Arrays.asList(1)).map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator())).map(Transformations::flatten).collect();
    // Check the updated file
    int counts = 0;
    for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) {
        if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) {
            LOG.info(file.getName() + "-" + file.length());
            counts++;
        }
    }
    assertEquals(3, counts, "If the number of records is more than 1150, a new base file should be rolled over");
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) File(java.io.File) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
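The assertion relies on the writer rolling over to a new base file once parquetMaxFileSize is exceeded. Below is a rough, self-contained sketch of size-capped rollover; the class, method, and per-record byte estimate are all hypothetical. Real Parquet sizing includes encoding and footer overhead, which is presumably why the test observes 3 files for 2000 records rather than the 2 this naive model predicts.

import java.util.ArrayList;
import java.util.List;

public class FileRolloverSketch {

    // Distribute records across files, rolling to a new file whenever the
    // estimated size would exceed the cap. Byte counts are illustrative only.
    static List<Integer> recordsPerFile(int totalRecords, long bytesPerRecord, long maxFileSize) {
        List<Integer> files = new ArrayList<>();
        long currentSize = 0;
        int currentCount = 0;
        for (int i = 0; i < totalRecords; i++) {
            if (currentSize + bytesPerRecord > maxFileSize && currentCount > 0) {
                // Cap reached: close the current file and start a new one.
                files.add(currentCount);
                currentSize = 0;
                currentCount = 0;
            }
            currentSize += bytesPerRecord;
            currentCount++;
        }
        if (currentCount > 0) {
            files.add(currentCount);
        }
        return files;
    }

    public static void main(String[] args) {
        // ~57 bytes/record gives roughly 1150 records per 64KB file.
        System.out.println(recordsPerFile(2000, 57, 64 * 1024)); // [1149, 851]
    }
}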

Example 54 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From the class TestBoundedInMemoryQueue, method testRecordReading.

// Test to ensure that we are reading all records from queue iterator in the same order
// without any exceptions.
@SuppressWarnings("unchecked")
@Test
@Timeout(value = 60)
public void testRecordReading() throws Exception {
    final int numRecords = 128;
    final List<HoodieRecord> hoodieRecords = dataGen.generateInserts(instantTime, numRecords);
    final BoundedInMemoryQueue<HoodieRecord, HoodieLazyInsertIterable.HoodieInsertValueGenResult> queue = new BoundedInMemoryQueue<>(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA));
    // Produce
    Future<Boolean> resFuture = executorService.submit(() -> {
        new IteratorBasedQueueProducer<>(hoodieRecords.iterator()).produce(queue);
        queue.close();
        return true;
    });
    final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
    int recordsRead = 0;
    while (queue.iterator().hasNext()) {
        final HoodieAvroRecord originalRecord = (HoodieAvroRecord) originalRecordIterator.next();
        final Option<IndexedRecord> originalInsertValue = originalRecord.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA);
        final HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord> payload = queue.iterator().next();
        // Ensure that record ordering is guaranteed.
        assertEquals(originalRecord, payload.record);
        // Ensure that the cached insert value matches the expected insert value.
        assertEquals(originalInsertValue, ((HoodieAvroRecord) payload.record).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA));
        recordsRead++;
    }
    assertFalse(queue.iterator().hasNext() || originalRecordIterator.hasNext());
    // all the records should be read successfully.
    assertEquals(numRecords, recordsRead);
    // should not throw any exceptions.
    resFuture.get();
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) BoundedInMemoryQueue(org.apache.hudi.common.util.queue.BoundedInMemoryQueue) Test(org.junit.jupiter.api.Test) Timeout(org.junit.jupiter.api.Timeout)
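BoundedInMemoryQueue decouples a producer thread from the consuming iterator while bounding memory, preserving insertion order. A minimal sketch of the same pattern using a plain ArrayBlockingQueue (a stand-in for illustration; Hudi's queue bounds by estimated record size rather than element count, and the POISON_PILL end marker is just a device of this sketch):

import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class BoundedQueueSketch {

    private static final String POISON_PILL = "__EOF__"; // marks end of stream

    public static void main(String[] args) throws Exception {
        BlockingQueue<String> queue = new ArrayBlockingQueue<>(16); // bounded buffer
        ExecutorService executor = Executors.newSingleThreadExecutor();
        List<String> records = List.of("r1", "r2", "r3");

        // Producer: enqueue every record, then signal completion.
        Future<Boolean> producer = executor.submit(() -> {
            for (String record : records) {
                queue.put(record); // blocks when the queue is full, bounding memory
            }
            queue.put(POISON_PILL);
            return true;
        });

        // Consumer: drain in insertion order until the end-of-stream marker.
        for (String record = queue.take(); !record.equals(POISON_PILL); record = queue.take()) {
            System.out.println(record);
        }
        producer.get(); // surfaces any producer-side exception
        executor.shutdown();
    }
}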

Example 55 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From the class TestHoodieBloomIndex, method testCheckUUIDsAgainstOneFile.

@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
    final String partition = "2016/01/31";
    // Create some records to use
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // We write record1, record2 to a parquet file, but the bloom filter contains (record1,
    // record2, record3).
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record3.getRecordKey());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter, metadataWriter);
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    final String commitTime = "0000001";
    final String fileId = UUID.randomUUID().toString();
    Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition, fileId, Arrays.asList(record1, record2));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partition, k -> new ArrayList<>()).add(Pair.of(fileId, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition), partitionToFilesNameLengthMap, false, false);
    final String filename = testTable.getBaseFileNameById(fileId);
    // The bloom filter contains 3 records
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    assertTrue(filter.mightContain(record3.getRecordKey()));
    assertFalse(filter.mightContain(record4.getRecordKey()));
    // Compare with file
    List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
    assertEquals(2, results.size());
    assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
    assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
// assertTrue(results.get(0)._2().equals(filename));
// assertTrue(results.get(1)._2().equals(filename));
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) TestHoodieMetadataBase(org.apache.hudi.client.functional.TestHoodieMetadataBase) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair)
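The test hinges on the bloom filter guaranteeing no false negatives: record3 passes mightContain even though it was never written to the file, so the candidate keys must be verified against the file's actual contents (here via HoodieIndexUtils.filterKeysFromFile). A sketch of that two-phase check using Guava's BloomFilter as an analogous API; the class name, key values, and the keysActuallyInFile set are hypothetical stand-ins for the parquet file lookup:

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class BloomCandidateSketch {

    public static void main(String[] args) {
        // Phase 1: the bloom filter indexes more keys than the file holds.
        BloomFilter<String> filter = BloomFilter.create(
            Funnels.stringFunnel(StandardCharsets.UTF_8), 10_000, 0.0000001);
        filter.put("key1");
        filter.put("key2");
        filter.put("key3"); // indexed in the filter but never written to the file

        Set<String> keysActuallyInFile = Set.of("key1", "key2");
        List<String> lookups = List.of("key1", "key2", "key3", "key4");

        // mightContain never misses a present key but may pass absent ones.
        List<String> candidates = lookups.stream()
            .filter(filter::mightContain)
            .collect(Collectors.toList());
        System.out.println(candidates); // [key1, key2, key3] -- key3 must still be verified

        // Phase 2: check candidates against the file to drop spurious matches.
        List<String> confirmed = candidates.stream()
            .filter(keysActuallyInFile::contains)
            .collect(Collectors.toList());
        System.out.println(confirmed); // [key1, key2]
    }
}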

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 84
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 72
HoodieKey (org.apache.hudi.common.model.HoodieKey): 68
ArrayList (java.util.ArrayList): 38
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 37
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31
Test (org.junit.jupiter.api.Test): 30
GenericRecord (org.apache.avro.generic.GenericRecord): 29
Path (org.apache.hadoop.fs.Path): 26
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 25
IOException (java.io.IOException): 24
HoodieTable (org.apache.hudi.table.HoodieTable): 24
List (java.util.List): 23
Schema (org.apache.avro.Schema): 23
HashMap (java.util.HashMap): 22
Pair (org.apache.hudi.common.util.collection.Pair): 21
Map (java.util.Map): 20
Collectors (java.util.stream.Collectors): 20
Arrays (java.util.Arrays): 17
Option (org.apache.hudi.common.util.Option): 16