Search in sources :

Example 1 with KeyGenerator

use of org.apache.hudi.keygen.KeyGenerator in project hudi by apache.

The method testKeyGeneratorTypes of the class TestCreateAvroKeyGeneratorByTypeWithFactory.

@ParameterizedTest
@MethodSource("configParams")
public void testKeyGeneratorTypes(String keyGenType) throws IOException {
    // Configure the factory purely by generator type and verify the
    // concrete class it instantiates matches the type.
    props.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), keyGenType);
    KeyGeneratorType type = KeyGeneratorType.valueOf(keyGenType);
    KeyGenerator generator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(props);
    // Resolve the generator class expected for this type.
    final Class<?> expectedClass;
    switch(type) {
        case SIMPLE:
            expectedClass = SimpleAvroKeyGenerator.class;
            break;
        case COMPLEX:
            expectedClass = ComplexAvroKeyGenerator.class;
            break;
        case TIMESTAMP:
            expectedClass = TimestampBasedAvroKeyGenerator.class;
            break;
        case CUSTOM:
            expectedClass = CustomAvroKeyGenerator.class;
            break;
        case NON_PARTITION:
            expectedClass = NonpartitionedAvroKeyGenerator.class;
            break;
        case GLOBAL_DELETE:
            expectedClass = GlobalAvroDeleteKeyGenerator.class;
            break;
        default:
            throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGenType);
    }
    Assertions.assertEquals(expectedClass.getName(), generator.getClass().getName());
}
Also used : HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) TimestampBasedAvroKeyGenerator(org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator) CustomAvroKeyGenerator(org.apache.hudi.keygen.CustomAvroKeyGenerator) ComplexAvroKeyGenerator(org.apache.hudi.keygen.ComplexAvroKeyGenerator) NonpartitionedAvroKeyGenerator(org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator) SimpleAvroKeyGenerator(org.apache.hudi.keygen.SimpleAvroKeyGenerator) KeyGeneratorType(org.apache.hudi.keygen.constant.KeyGeneratorType) NonpartitionedAvroKeyGenerator(org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator) SimpleAvroKeyGenerator(org.apache.hudi.keygen.SimpleAvroKeyGenerator) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) ComplexAvroKeyGenerator(org.apache.hudi.keygen.ComplexAvroKeyGenerator) GlobalAvroDeleteKeyGenerator(org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator) TimestampBasedAvroKeyGenerator(org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator) CustomAvroKeyGenerator(org.apache.hudi.keygen.CustomAvroKeyGenerator) GlobalAvroDeleteKeyGenerator(org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 2 with KeyGenerator

use of org.apache.hudi.keygen.KeyGenerator in project hudi by apache.

The method testKeyGeneratorFactory of the class TestHoodieSparkKeyGeneratorFactory.

@Test
public void testKeyGeneratorFactory() throws IOException {
    // Only the generator type is set: factory resolves it to the matching class.
    TypedProperties typeOnlyProps = getCommonProps();
    typeOnlyProps.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.SIMPLE.name());
    KeyGenerator byType = HoodieSparkKeyGeneratorFactory.createKeyGenerator(typeOnlyProps);
    Assertions.assertEquals(SimpleKeyGenerator.class.getName(), byType.getClass().getName());
    // Only the generator class name is set.
    TypedProperties classNameProps = getCommonProps();
    classNameProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName());
    KeyGenerator byClassName = HoodieSparkKeyGeneratorFactory.createKeyGenerator(classNameProps);
    Assertions.assertEquals(SimpleKeyGenerator.class.getName(), byClassName.getClass().getName());
    // Both set on the same props: the explicit class name takes precedence over the type.
    classNameProps.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.CUSTOM.name());
    KeyGenerator byBoth = HoodieSparkKeyGeneratorFactory.createKeyGenerator(classNameProps);
    Assertions.assertEquals(SimpleKeyGenerator.class.getName(), byBoth.getClass().getName());
    // A class that is not a KeyGenerator fails with IOException.
    final TypedProperties badClassProps = getCommonProps();
    badClassProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), TestHoodieSparkKeyGeneratorFactory.class.getName());
    assertThrows(IOException.class, () -> HoodieSparkKeyGeneratorFactory.createKeyGenerator(badClassProps));
    // An unknown type name fails with HoodieKeyGeneratorException.
    final TypedProperties badTypeProps = getCommonProps();
    badTypeProps.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), "wrong_type");
    assertThrows(HoodieKeyGeneratorException.class, () -> HoodieSparkKeyGeneratorFactory.createKeyGenerator(badTypeProps));
}
Also used : SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) TypedProperties(org.apache.hudi.common.config.TypedProperties) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) TestComplexKeyGenerator(org.apache.hudi.keygen.TestComplexKeyGenerator) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Test(org.junit.jupiter.api.Test)

Example 3 with KeyGenerator

use of org.apache.hudi.keygen.KeyGenerator in project hudi by apache.

The method verifyRecordsWritten of the class TestHoodieClientOnCopyOnWriteStorage.

/**
 * Verify data in base files matches expected records and commit time.
 *
 * @param commitTime            commit instant expected in the records' commit-time meta field
 * @param populateMetadataField whether the commit-time meta field is expected to be populated
 * @param expectedRecords       records that should have been written
 * @param allStatus             write statuses pointing at the base files to inspect
 * @param config                write config; decides whether meta fields were persisted
 */
private void verifyRecordsWritten(String commitTime, boolean populateMetadataField, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus, HoodieWriteConfig config) throws IOException {
    List<GenericRecord> writtenRecords = new ArrayList<>();
    Set<String> expectedKeys = verifyRecordKeys(expectedRecords, allStatus, writtenRecords);
    if (!config.populateMetaFields()) {
        // Meta fields were not persisted: recompute each record key via the configured key generator.
        KeyGenerator keyGen = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
        for (GenericRecord written : writtenRecords) {
            if (!populateMetadataField) {
                assertNull(written.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
            }
            String recomputedKey = keyGen.getKey(written).getRecordKey();
            assertTrue(expectedKeys.contains(recomputedKey));
        }
    } else {
        // Meta fields present: read commit time and record key straight from the record.
        for (GenericRecord written : writtenRecords) {
            assertEquals(commitTime, written.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
            assertTrue(expectedKeys.contains(written.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()));
        }
    }
}
Also used : ArrayList(java.util.ArrayList) GenericRecord(org.apache.avro.generic.GenericRecord) TypedProperties(org.apache.hudi.common.config.TypedProperties) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) KeyGenerator(org.apache.hudi.keygen.KeyGenerator)

Example 4 with KeyGenerator

use of org.apache.hudi.keygen.KeyGenerator in project hudi by apache.

The method testAbstractWriterForAllFormats of the class TestAbstractConnectWriter.

@ParameterizedTest
@EnumSource(value = TestInputFormats.class)
public void testAbstractWriterForAllFormats(TestInputFormats inputFormats) throws Exception {
    // Feed the writer records in every supported Kafka serialization format and
    // check the written output against independently-converted expectations.
    Schema schema = schemaProvider.getSourceSchema();
    final List<?> sourceRecords;
    final List<HoodieRecord> expectedRecords;
    final String formatConverter;
    switch(inputFormats) {
        case JSON_STRING: {
            formatConverter = AbstractConnectWriter.KAFKA_STRING_CONVERTER;
            sourceRecords = SchemaTestUtil.generateTestJsonRecords(0, NUM_RECORDS);
            // Decode each JSON string to Avro to build the expected records.
            GenericDatumReader<IndexedRecord> datumReader = new GenericDatumReader<>(schema, schema);
            expectedRecords = ((List<String>) sourceRecords).stream()
                .map(json -> {
                    try {
                        return HoodieAvroUtils.rewriteRecord(
                            (GenericRecord) datumReader.read(null, DecoderFactory.get().jsonDecoder(schema, json)), schema);
                    } catch (IOException exception) {
                        throw new HoodieException("Error converting JSON records to AVRO");
                    }
                })
                .map(avro -> convertToHoodieRecords(avro, avro.get(RECORD_KEY_INDEX).toString(), "000/00/00"))
                .collect(Collectors.toList());
            break;
        }
        case AVRO: {
            formatConverter = AbstractConnectWriter.KAFKA_AVRO_CONVERTER;
            sourceRecords = SchemaTestUtil.generateTestRecords(0, NUM_RECORDS);
            expectedRecords = sourceRecords.stream()
                .map(rec -> HoodieAvroUtils.rewriteRecord((GenericRecord) rec, schema))
                .map(avro -> convertToHoodieRecords(avro, avro.get(RECORD_KEY_INDEX).toString(), "000/00/00"))
                .collect(Collectors.toList());
            break;
        }
        default:
            throw new HoodieException("Unknown test scenario " + inputFormats);
    }
    configs = KafkaConnectConfigs.newBuilder()
        .withProperties(Collections.singletonMap(KafkaConnectConfigs.KAFKA_VALUE_CONVERTER, formatConverter))
        .build();
    AbstractHudiConnectWriterTestWrapper writer =
        new AbstractHudiConnectWriterTestWrapper(configs, keyGenerator, schemaProvider);
    for (int idx = 0; idx < NUM_RECORDS; idx++) {
        writer.writeRecord(getNextKafkaRecord(sourceRecords.get(idx)));
    }
    validateRecords(writer.getWrittenRecords(), expectedRecords);
}
Also used : HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) EnumSource(org.junit.jupiter.params.provider.EnumSource) AbstractConnectWriter(org.apache.hudi.connect.writers.AbstractConnectWriter) ArrayList(java.util.ArrayList) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) TypedProperties(org.apache.hudi.common.config.TypedProperties) Iterator(java.util.Iterator) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) SinkRecord(org.apache.kafka.connect.sink.SinkRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) SchemaProvider(org.apache.hudi.schema.SchemaProvider) Comparator(java.util.Comparator) Collections(java.util.Collections) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) DecoderFactory(org.apache.avro.io.DecoderFactory) KafkaConnectConfigs(org.apache.hudi.connect.writers.KafkaConnectConfigs) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) ArrayList(java.util.ArrayList) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord) 
EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 5 with KeyGenerator

use of org.apache.hudi.keygen.KeyGenerator in project hudi by apache.

The method generateInputRecords of the class SparkFullBootstrapDataProviderBase.

@Override
public JavaRDD<HoodieRecord> generateInputRecords(String tableName, String sourceBasePath, List<Pair<String, List<HoodieFileStatus>>> partitionPathsWithFiles) {
    // Flatten every partition's file statuses into plain path strings for Spark to load.
    String[] sourceFilePaths = partitionPathsWithFiles.stream()
        .map(Pair::getValue)
        .flatMap(statuses -> statuses.stream().map(status -> FileStatusUtils.toPath(status.getPath()).toString()))
        .toArray(String[]::new);
    Dataset sourceDataset = sparkSession.read().format(getFormat()).load(sourceFilePaths);
    try {
        KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
        String structName = tableName + "_record";
        String namespace = "hoodie." + tableName;
        // Convert the rows to Avro generic records, then to Hudi records keyed by the key generator.
        RDD<GenericRecord> avroRecords = HoodieSparkUtils.createRdd(sourceDataset, structName, namespace, false, Option.empty());
        return avroRecords.toJavaRDD().map(avroRecord -> {
            // Ordering value comes from the configured precombine field of each record.
            String orderingVal = HoodieAvroUtils.getNestedFieldValAsString(
                avroRecord,
                props.getString("hoodie.datasource.write.precombine.field"),
                false,
                props.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
                    Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())));
            try {
                return DataSourceUtils.createHoodieRecord(avroRecord, orderingVal, keyGenerator.getKey(avroRecord),
                    props.getString("hoodie.datasource.write.payload.class"));
            } catch (IOException ioe) {
                throw new HoodieIOException(ioe.getMessage(), ioe);
            }
        });
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) TypedProperties(org.apache.hudi.common.config.TypedProperties) Dataset(org.apache.spark.sql.Dataset) KeyGeneratorOptions(org.apache.hudi.keygen.constant.KeyGeneratorOptions) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) DataSourceUtils(org.apache.hudi.DataSourceUtils) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) List(java.util.List) HoodieSparkUtils(org.apache.hudi.HoodieSparkUtils) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RDD(org.apache.spark.rdd.RDD) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) JavaRDD(org.apache.spark.api.java.JavaRDD) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) Pair(org.apache.hudi.common.util.collection.Pair) SparkSession(org.apache.spark.sql.SparkSession) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Dataset(org.apache.spark.sql.Dataset) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) GenericRecord(org.apache.avro.generic.GenericRecord) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

KeyGenerator (org.apache.hudi.keygen.KeyGenerator)7 TypedProperties (org.apache.hudi.common.config.TypedProperties)5 GenericRecord (org.apache.avro.generic.GenericRecord)3 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils)2 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)2 Option (org.apache.hudi.common.util.Option)2 HoodieKeyGeneratorException (org.apache.hudi.exception.HoodieKeyGeneratorException)2 Collections (java.util.Collections)1 Comparator (java.util.Comparator)1 Iterator (java.util.Iterator)1 Collectors (java.util.stream.Collectors)1 Schema (org.apache.avro.Schema)1 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)1 IndexedRecord (org.apache.avro.generic.IndexedRecord)1 DecoderFactory (org.apache.avro.io.DecoderFactory)1 DataSourceUtils (org.apache.hudi.DataSourceUtils)1