Example 1 with HoodieFlinkTable

Use of org.apache.hudi.table.HoodieFlinkTable in project hudi by apache.

From the class HoodieFlinkWriteClient, method insertOverwriteTable.

/**
 * Removes all existing records of the Hoodie table and inserts the given HoodieRecords into the table.
 *
 * @param records     HoodieRecords to insert
 * @param instantTime Instant time of the commit
 * @return list of WriteStatus to inspect errors and counts
 */
public List<WriteStatus> insertOverwriteTable(List<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable table = initTable(WriteOperationType.INSERT_OVERWRITE_TABLE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE_TABLE, table.getMetaClient());
    // create the write handle if not exists
    final HoodieWriteHandle<?, ?, ?, ?> writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), instantTime, table, records.listIterator());
    HoodieWriteMetadata result = ((HoodieFlinkTable<T>) table).insertOverwriteTable(context, writeHandle, instantTime, records);
    return postWrite(result, instantTime, table);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable)
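
For orientation, a minimal sketch of how this method might be reached from application code: build a HoodieWriteConfig, create a HoodieFlinkWriteClient, obtain an instant, and call insertOverwriteTable. The base path, payload type, and error handling are illustrative assumptions, the client is not closed for brevity, and in a real Flink pipeline the instant is normally created by the write coordinator rather than by startCommit. Import locations follow the hudi-flink-client layout and may differ across Hudi versions.

import java.util.List;

import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.config.HoodieWriteConfig;

// Illustrative sketch: replace the whole table with `records`, then check each
// WriteStatus for per-record errors.
static void overwriteWholeTable(List<HoodieRecord<HoodieAvroPayload>> records) {
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi/demo_table")    // hypothetical base path
        .build();
    HoodieFlinkWriteClient<HoodieAvroPayload> client =
        new HoodieFlinkWriteClient<>(HoodieFlinkEngineContext.DEFAULT, writeConfig);
    String instantTime = client.startCommit();    // assumed single-writer instant creation
    List<WriteStatus> statuses = client.insertOverwriteTable(records, instantTime);
    for (WriteStatus status : statuses) {
        if (status.hasErrors()) {
            throw new RuntimeException("Write errors in file group " + status.getFileId());
        }
    }
}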

Example 2 with HoodieFlinkTable

Use of org.apache.hudi.table.HoodieFlinkTable in project hudi by apache.

From the class TestFlinkHoodieBloomIndex, method testCheckUUIDsAgainstOneFile.

@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
    final String partition = "2016/01/31";
    // Create some records to use
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // We write record1, record2 to a base file, but the bloom filter contains (record1,
    // record2, record3).
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record3.getRecordKey());
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
    String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2);
    String filename = testTable.getBaseFileNameById(fileId);
    // The bloom filter contains 3 records
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    assertTrue(filter.mightContain(record3.getRecordKey()));
    assertFalse(filter.mightContain(record4.getRecordKey()));
    // Compare with file
    List<String> uuids = asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
    List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
    assertEquals(results.size(), 2);
    assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
    assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
// assertTrue(results.get(0)._2().equals(filename));
// assertTrue(results.get(1)._2().equals(filename));
}
Also used : Path(org.apache.hadoop.fs.Path) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
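
The point this test turns on is that a bloom filter only yields candidates: record3 was added to the filter but never written to the base file, so mightContain still reports it, and filterKeysFromFile is what finally drops it by checking the file contents. A small illustrative sketch of that asymmetry, using placeholder keys rather than the test's UUIDs:

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

static void bloomFilterGivesCandidatesOnly() {
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add("key-added-but-never-written-to-file");
    // Always true: bloom filters have no false negatives for added keys.
    boolean candidate = filter.mightContain("key-added-but-never-written-to-file");
    // False in practice at this error rate; a rare false positive is possible,
    // which is exactly why candidates are re-checked against the base file.
    boolean unlikely = filter.mightContain("key-never-added");
    System.out.println(candidate + " / " + unlikely);
}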

Example 3 with HoodieFlinkTable

Use of org.apache.hudi.table.HoodieFlinkTable in project hudi by apache.

From the class TestData, method checkWrittenFullData.

/**
 * Checks that the source data are written as expected.
 *
 * <p>Note: Replace this with the Flink reader when it is supported.
 *
 * @param basePath The base path to check; should be a directory
 * @param expected The expected results mapping; keys are the partition paths
 */
public static void checkWrittenFullData(File basePath, Map<String, List<String>> expected) throws IOException {
    // 1. init flink table
    HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.getAbsolutePath());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath.getAbsolutePath()).build();
    HoodieFlinkTable table = HoodieFlinkTable.create(config, HoodieFlinkEngineContext.DEFAULT, metaClient);
    // 2. check each partition data
    expected.forEach((partition, partitionDataSet) -> {
        List<String> readBuffer = new ArrayList<>();
        table.getBaseFileOnlyView().getLatestBaseFiles(partition).forEach(baseFile -> {
            String path = baseFile.getPath();
            try {
                ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(new Path(path)).build();
                GenericRecord nextRecord = reader.read();
                while (nextRecord != null) {
                    readBuffer.add(filterOutVariables(nextRecord));
                    nextRecord = reader.read();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
        assertTrue(partitionDataSet.size() == readBuffer.size() && partitionDataSet.containsAll(readBuffer));
    });
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) GenericRecord(org.apache.avro.generic.GenericRecord)
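
A hypothetical call site for this helper; the partition names and row strings below are placeholders, since the real strings are whatever filterOutVariables produces for the written records:

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

static void verifyWrittenData(File tablePath) throws IOException {
    Map<String, List<String>> expected = new HashMap<>();
    expected.put("par1", Arrays.asList("row-1", "row-2"));    // placeholder rows for partition "par1"
    expected.put("par2", Arrays.asList("row-3"));             // placeholder row for partition "par2"
    TestData.checkWrittenFullData(tablePath, expected);
}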

Example 4 with HoodieFlinkTable

Use of org.apache.hudi.table.HoodieFlinkTable in project hudi by apache.

From the class HoodieFlinkWriteClient, method insertOverwrite.

/**
 * Removes all existing records from the affected partitions and inserts the given HoodieRecords into the table.
 *
 * @param records     HoodieRecords to insert
 * @param instantTime Instant time of the commit
 * @return list of WriteStatus to inspect errors and counts
 */
public List<WriteStatus> insertOverwrite(List<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table = initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient());
    // create the write handle if not exists
    final HoodieWriteHandle<?, ?, ?, ?> writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), instantTime, table, records.listIterator());
    HoodieWriteMetadata result = ((HoodieFlinkTable<T>) table).insertOverwrite(context, writeHandle, instantTime, records);
    return postWrite(result, instantTime, table);
}
Also used : List(java.util.List) HoodieList(org.apache.hudi.common.data.HoodieList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable)
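
For contrast with Example 1: insertOverwrite replaces only the partitions touched by the incoming records, while insertOverwriteTable replaces every partition. A hypothetical helper making that explicit; the client and records are supplied by the caller, and startCommit is assumed as the single-writer way to obtain an instant:

import java.util.List;

import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;

static List<WriteStatus> overwriteAffectedPartitionsOnly(HoodieFlinkWriteClient<HoodieAvroPayload> client,
                                                         List<HoodieRecord<HoodieAvroPayload>> records) {
    String instantTime = client.startCommit();    // assumed single-writer instant creation
    // Unlike insertOverwriteTable (Example 1), only the partitions referenced by
    // `records` are rewritten; data in untouched partitions is kept.
    return client.insertOverwrite(records, instantTime);
}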

Example 5 with HoodieFlinkTable

Use of org.apache.hudi.table.HoodieFlinkTable in project hudi by apache.

From the class TestFlinkHoodieBloomIndex, method testTagLocationWithEmptyList.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testTagLocationWithEmptyList(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) {
    // We have some records to be tagged (two different partitions)
    List<HoodieRecord> records = new ArrayList<>();
    // Also create the metadata and config
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
    // Let's tag
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    assertDoesNotThrow(() -> {
        tagLocation(bloomIndex, records, table);
    }, "EmptyList should not result in IllegalArgumentException: Positive number of slices required");
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Aggregations

HoodieFlinkTable (org.apache.hudi.table.HoodieFlinkTable): 8 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 5 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 4 usages
IOException (java.io.IOException): 3 usages
Path (org.apache.hadoop.fs.Path): 3 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 3 usages
ArrayList (java.util.ArrayList): 2 usages
List (java.util.List): 2 usages
Configuration (org.apache.flink.configuration.Configuration): 2 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 2 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 2 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 2 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 2 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 2 usages
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 2 usages
Option (org.apache.hudi.common.util.Option): 2 usages
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 2 usages
HoodieFlinkWriteableTestTable (org.apache.hudi.testutils.HoodieFlinkWriteableTestTable): 2 usages
MethodSource (org.junit.jupiter.params.provider.MethodSource): 2 usages
Locale (java.util.Locale): 1 usage