Example 31 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From class AbstractConnectWriter, method writeRecord.

@Override
public void writeRecord(SinkRecord record) throws IOException {
    AvroConvertor convertor = new AvroConvertor(schemaProvider.getSourceSchema());
    Option<GenericRecord> avroRecord;
    switch(connectConfigs.getKafkaValueConverter()) {
        case KAFKA_AVRO_CONVERTER:
            avroRecord = Option.of((GenericRecord) record.value());
            break;
        case KAFKA_STRING_CONVERTER:
            avroRecord = Option.of(convertor.fromJson((String) record.value()));
            break;
        case KAFKA_JSON_CONVERTER:
            throw new UnsupportedEncodingException("Currently JSON objects are not supported");
        default:
            throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")");
    }
    // Tag records with a file ID based on kafka partition and hudi partition.
    HoodieRecord<?> hoodieRecord = new HoodieAvroRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord));
    String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath()));
    hoodieRecord.unseal();
    hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId));
    hoodieRecord.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
    hoodieRecord.seal();
    writeHudiRecord(hoodieRecord);
}
Also used : AvroConvertor(org.apache.hudi.utilities.sources.helpers.AvroConvertor) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) UnsupportedEncodingException(java.io.UnsupportedEncodingException) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)
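
For orientation, here is a minimal, self-contained sketch of the construction and location-tagging steps that writeRecord performs. It is hypothetical: the schema, field names, instant time, and file ID are placeholders, the key is built by hand instead of via the configured KeyGenerator, and the Kafka-partition hashing is omitted.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;

public class TagRecordSketch {

    public static void main(String[] args) {
        // Placeholder schema with a record key field and a partition field.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Example\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"part\",\"type\":\"string\"}]}");

        GenericRecord avro = new GenericData.Record(schema);
        avro.put("id", "key-0001");
        avro.put("part", "2022/01/01");

        // In the connector the key comes from the configured KeyGenerator; here it is built by hand.
        HoodieKey key = new HoodieKey("key-0001", "2022/01/01");
        HoodieRecord<?> record = new HoodieAvroRecord<>(key, new HoodieAvroPayload(Option.of(avro)));

        // writeRecord() derives fileId from a hash of (kafkaPartition, partitionPath); literals are used here.
        String instantTime = "20220101000000";
        String fileId = "example-file-id";

        // A HoodieRecord rejects location changes while sealed, so unseal, set both locations, then seal again.
        record.unseal();
        record.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId));
        record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
        record.seal();

        System.out.println(record.getCurrentLocation());
    }
}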

Example 32 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From class TestHoodieHFileReaderWriter, method testWriteReadHFile.

@ParameterizedTest
@MethodSource("populateMetaFieldsAndTestAvroWithMeta")
public void testWriteReadHFile(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
    Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc");
    HoodieHFileWriter writer = createHFileWriter(avroSchema, populateMetaFields);
    List<String> keys = new ArrayList<>();
    Map<String, GenericRecord> recordMap = new HashMap<>();
    for (int i = 0; i < 100; i++) {
        GenericRecord record = new GenericData.Record(avroSchema);
        String key = String.format("%s%04d", "key", i);
        record.put("_row_key", key);
        keys.add(key);
        record.put("time", Integer.toString(RANDOM.nextInt()));
        record.put("number", i);
        if (testAvroWithMeta) {
            // The payload does not matter here: only the HoodieKey is looked up from the
            // second argument (HoodieRecord); the GenericRecord passed in is what gets written.
            writer.writeAvroWithMetadata(record,
                new HoodieAvroRecord(
                    new HoodieKey((String) record.get("_row_key"), Integer.toString((Integer) record.get("number"))),
                    new EmptyHoodieRecordPayload()));
        } else {
            writer.writeAvro(key, record);
        }
        recordMap.put(key, record);
    }
    writer.close();
    Configuration conf = new Configuration();
    CacheConfig cacheConfig = new CacheConfig(conf);
    HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
    List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
    records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
    hoodieHFileReader.close();
    for (int i = 0; i < 2; i++) {
        int randomRowstoFetch = 5 + RANDOM.nextInt(10);
        Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
        List<String> rowsList = new ArrayList<>(rowsToFetch);
        Collections.sort(rowsList);
        hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
        List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
        assertEquals(result.size(), randomRowstoFetch);
        result.forEach(entry -> {
            assertEquals(entry.getSecond(), recordMap.get(entry.getFirst()));
            if (populateMetaFields && testAvroWithMeta) {
                assertNotNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
            } else {
                assertNull(entry.getSecond().get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
            }
        });
        hoodieHFileReader.close();
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig) Pair(org.apache.hadoop.hbase.util.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
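
Note that writeAvroWithMetadata only consults the HoodieRecord argument for its HoodieKey; the payload itself is ignored, which is why an EmptyHoodieRecordPayload is sufficient. Below is a minimal, hypothetical sketch of building such a key-only carrier record; the record key and partition path values are placeholders.

import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;

public class KeyCarrierSketch {

    // Builds a HoodieRecord whose only job is to carry the record key and partition path;
    // EmptyHoodieRecordPayload marks that no data payload is attached.
    static HoodieRecord keyOnlyRecord(String recordKey, String partitionPath) {
        return new HoodieAvroRecord(new HoodieKey(recordKey, partitionPath), new EmptyHoodieRecordPayload());
    }

    public static void main(String[] args) {
        HoodieRecord carrier = keyOnlyRecord("key0001", "0");
        System.out.println(carrier.getRecordKey() + " / " + carrier.getPartitionPath());
    }
}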

Example 33 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From class HoodieTestDataGenerator, method generateUniqueUpdatesStream.

/**
 * Generates deduped updates for keys previously inserted, randomly distributed across those keys.
 *
 * @param instantTime Commit timestamp
 * @param n           Number of unique records
 * @param schemaStr   Schema string identifying which pool of previously inserted keys to draw from
 * @return stream of hoodie record updates
 */
public Stream<HoodieRecord> generateUniqueUpdatesStream(String instantTime, Integer n, String schemaStr) {
    final Set<KeyPartition> used = new HashSet<>();
    int numExistingKeys = numKeysBySchema.getOrDefault(schemaStr, 0);
    Map<Integer, KeyPartition> existingKeys = existingKeysBySchema.get(schemaStr);
    if (n > numExistingKeys) {
        throw new IllegalArgumentException("Requested unique updates is greater than number of available keys");
    }
    return IntStream.range(0, n).boxed().map(i -> {
        int index = numExistingKeys == 1 ? 0 : rand.nextInt(numExistingKeys - 1);
        KeyPartition kp = existingKeys.get(index);
        // Probe forward from the randomly chosen index until an unused KeyPartition is found.
        while (used.contains(kp)) {
            index = (index + 1) % numExistingKeys;
            kp = existingKeys.get(index);
        }
        logger.debug("key getting updated: " + kp.key.getRecordKey());
        used.add(kp);
        try {
            return new HoodieAvroRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false));
        } catch (IOException e) {
            throw new HoodieIOException(e.getMessage(), e);
        }
    });
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) IOException(java.io.IOException) HashSet(java.util.HashSet)
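
The heart of generateUniqueUpdatesStream is its selection loop: pick a random starting index, then probe forward with wrap-around until a key that has not yet been used in this batch is found. Below is a small, self-contained sketch of just that selection logic, with plain strings standing in for KeyPartition entries; it is a simplification, not the Hudi code itself.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

public class UniquePickSketch {

    // Picks n distinct entries from 'pool' by choosing a random start index and
    // probing forward with wrap-around until an unused entry is found.
    static List<String> pickUnique(List<String> pool, int n, Random rand) {
        if (n > pool.size()) {
            throw new IllegalArgumentException("Requested unique picks exceed the number of available keys");
        }
        Set<String> used = new HashSet<>();
        List<String> picked = new ArrayList<>();
        for (int i = 0; i < n; i++) {
            int index = rand.nextInt(pool.size());
            String candidate = pool.get(index);
            while (used.contains(candidate)) {
                index = (index + 1) % pool.size();
                candidate = pool.get(index);
            }
            used.add(candidate);
            picked.add(candidate);
        }
        return picked;
    }

    public static void main(String[] args) {
        List<String> keys = Arrays.asList("key0", "key1", "key2", "key3", "key4");
        System.out.println(pickUnique(keys, 3, new Random(42)));
    }
}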

Example 34 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From class HoodieTestDataGenerator, method generateUniqueDeleteRecordStream.

/**
 * Generates deduped deletes for keys previously inserted, randomly distributed across those keys.
 *
 * @param instantTime Commit timestamp
 * @param n           Number of unique records
 * @return stream of hoodie delete records
 */
public Stream<HoodieRecord> generateUniqueDeleteRecordStream(String instantTime, Integer n) {
    final Set<KeyPartition> used = new HashSet<>();
    Map<Integer, KeyPartition> existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);
    Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);
    if (n > numExistingKeys) {
        throw new IllegalArgumentException("Requested unique deletes is greater than number of available keys");
    }
    List<HoodieRecord> result = new ArrayList<>();
    for (int i = 0; i < n; i++) {
        int index = rand.nextInt(numExistingKeys);
        while (!existingKeys.containsKey(index)) {
            index = (index + 1) % numExistingKeys;
        }
        // swap chosen index with last index and remove last entry.
        KeyPartition kp = existingKeys.remove(index);
        existingKeys.put(index, existingKeys.get(numExistingKeys - 1));
        existingKeys.remove(numExistingKeys - 1);
        numExistingKeys--;
        used.add(kp);
        try {
            result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime)));
        } catch (IOException e) {
            throw new HoodieIOException(e.getMessage(), e);
        }
    }
    numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, numExistingKeys);
    return result.stream();
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HashSet(java.util.HashSet)
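
The notable detail in generateUniqueDeleteRecordStream is the removal strategy: the chosen entry is swapped with the entry at the last index and the map shrinks by one, so the remaining indices stay contiguous and can keep being sampled with rand.nextInt. Below is a small, self-contained sketch of that swap-remove pattern on a plain Map<Integer, String>; it is illustrative only, not Hudi code.

import java.util.HashMap;
import java.util.Map;
import java.util.Random;

public class SwapRemoveSketch {

    // Removes and returns the entry at 'index', moving the last entry into its slot
    // so that keys 0..size-2 remain contiguous after the removal.
    static String swapRemove(Map<Integer, String> entries, int index) {
        int lastIndex = entries.size() - 1;
        String removed = entries.remove(index);
        if (index != lastIndex) {
            entries.put(index, entries.get(lastIndex));
        }
        entries.remove(lastIndex);
        return removed;
    }

    public static void main(String[] args) {
        Map<Integer, String> pool = new HashMap<>();
        for (int i = 0; i < 5; i++) {
            pool.put(i, "key" + i);
        }
        Random rand = new Random(42);
        String victim = swapRemove(pool, rand.nextInt(pool.size()));
        System.out.println("removed " + victim + ", remaining " + pool);
    }
}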

Example 35 with HoodieAvroRecord

Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

From class HoodieTestDataGenerator, method generateUpdatesWithTS.

public List<HoodieRecord> generateUpdatesWithTS(String instantTime, List<HoodieRecord> baseRecords, int ts) throws IOException {
    List<HoodieRecord> updates = new ArrayList<>();
    for (HoodieRecord baseRecord : baseRecords) {
        HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), instantTime, false, ts));
        updates.add(record);
    }
    return updates;
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList)
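
A hypothetical usage sketch for this method: generate a batch of inserts, then derive updates for the same keys that all carry one timestamp value. It assumes the standard HoodieTestDataGenerator setup and its generateInserts helper; the instant times, record count, and timestamp are placeholders.

import java.util.List;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class UpdatesWithTsSketch {

    public static void main(String[] args) throws Exception {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

        // Insert a small batch at instant "001", then generate updates for the same keys
        // at instant "002", all sharing the timestamp value 42.
        List<HoodieRecord> inserts = dataGen.generateInserts("001", 10);
        List<HoodieRecord> updates = dataGen.generateUpdatesWithTS("002", inserts, 42);

        System.out.println(updates.size() + " updates generated");
    }
}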

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 84
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 72
HoodieKey (org.apache.hudi.common.model.HoodieKey): 68
ArrayList (java.util.ArrayList): 38
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 37
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31
Test (org.junit.jupiter.api.Test): 30
GenericRecord (org.apache.avro.generic.GenericRecord): 29
Path (org.apache.hadoop.fs.Path): 26
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 25
IOException (java.io.IOException): 24
HoodieTable (org.apache.hudi.table.HoodieTable): 24
List (java.util.List): 23
Schema (org.apache.avro.Schema): 23
HashMap (java.util.HashMap): 22
Pair (org.apache.hudi.common.util.collection.Pair): 21
Map (java.util.Map): 20
Collectors (java.util.stream.Collectors): 20
Arrays (java.util.Arrays): 17
Option (org.apache.hudi.common.util.Option): 16