Search in sources :

Example 41 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieExampleDataGenerator method generateInsertsStream.

/**
 * Builds a stream of {@code n} new insert records for {@code commitTime}, each placed in a
 * partition path picked at random from {@code partitionPaths}. Every generated key is also
 * recorded in {@code existingKeys}; because the work happens inside the stream's mapping
 * step, that bookkeeping runs as the returned stream is consumed.
 */
public Stream<HoodieRecord<T>> generateInsertsStream(String commitTime, Integer n) {
    // Captured once, before any element is generated, and used as the base slot index below.
    final int firstFreeSlot = getNumExistingKeys();
    return IntStream.range(0, n).mapToObj(offset -> {
        String chosenPartition = partitionPaths[rand.nextInt(partitionPaths.length)];
        HoodieKey recordKey = new HoodieKey(UUID.randomUUID().toString(), chosenPartition);

        // Remember the key together with its partition under the next slot.
        KeyPartition registered = new KeyPartition();
        registered.key = recordKey;
        registered.partitionPath = chosenPartition;
        existingKeys.put(firstFreeSlot + offset, registered);
        numExistingKeys++;

        return new HoodieAvroRecord<>(recordKey, generateRandomValue(recordKey, commitTime));
    });
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey)

Example 42 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class JdbcTestUtils method insert.

/**
 * Generates {@code numRecords} insert records for {@code commitTime} and writes them to the
 * {@code triprec} table as one JDBC batch.
 *
 * <p>Handling of individual records is best effort: a record whose Avro value cannot be read
 * ({@link IOException}) is logged and skipped, and a record whose parameters cannot be bound
 * ({@link SQLException}) is logged and not added to the batch. The prepared statement is
 * always handed to {@code close(...)}, including when generation or batch execution fails.
 *
 * @param commitTime    commit time stored in the {@code commit_time} column of every row
 * @param numRecords    number of records requested from {@code dataGenerator}
 * @param connection    JDBC connection used to prepare the insert statement
 * @param dataGenerator source of the generated records
 * @param props         properties passed to {@code getInsertValue} when reading each payload
 * @return all generated records, including any that were skipped while building the batch
 * @throws SQLException if the statement cannot be prepared or the batch fails to execute
 */
public static List<HoodieRecord> insert(String commitTime, int numRecords, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException {
    PreparedStatement insertStatement = connection.prepareStatement("INSERT INTO triprec (" + "commit_time," + "_row_key," + "rider," + "driver," + "begin_lat," + "begin_lon," + "end_lat," + "end_lon," + "fare) " + "values(?,?,?,?,?,?,?,?,?)");
    try {
        List<HoodieRecord> hoodieRecords = dataGenerator.generateInserts(commitTime, numRecords);
        hoodieRecords.stream().map(r -> {
            try {
                return ((GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get());
            } catch (IOException e) {
                // Best effort: report the unreadable record instead of dropping it silently.
                LOG.warn("Skipping record whose insert value could not be read: " + e.getMessage());
                return null;
            }
        }).filter(Objects::nonNull).forEach(record -> {
            try {
                insertStatement.setString(1, commitTime);
                insertStatement.setString(2, record.get("_row_key").toString());
                insertStatement.setString(3, record.get("rider").toString());
                insertStatement.setString(4, record.get("driver").toString());
                insertStatement.setDouble(5, Double.parseDouble(record.get("begin_lat").toString()));
                insertStatement.setDouble(6, Double.parseDouble(record.get("begin_lon").toString()));
                insertStatement.setDouble(7, Double.parseDouble(record.get("end_lat").toString()));
                insertStatement.setDouble(8, Double.parseDouble(record.get("end_lon").toString()));
                // "fare" is a nested record; only its "amount" field is stored.
                insertStatement.setDouble(9, Double.parseDouble(((GenericRecord) record.get("fare")).get("amount").toString()));
                insertStatement.addBatch();
            } catch (SQLException e) {
                LOG.warn(e.getMessage());
            }
        });
        insertStatement.executeBatch();
        return hoodieRecords;
    } finally {
        // Release the statement on every path, not only after a successful batch.
        close(insertStatement);
    }
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) SQLException(java.sql.SQLException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) PreparedStatement(java.sql.PreparedStatement) IOException(java.io.IOException)

Example 43 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class JdbcTestUtils method update.

/**
 * Generates update records for the given {@code inserts} at {@code commitTime} and applies
 * them to the {@code triprec} table as one JDBC batch, matching rows on {@code _row_key}.
 *
 * <p>Handling of individual records is best effort: a record whose Avro value cannot be read
 * ({@link IOException}) is logged and skipped, and a record whose parameters cannot be bound
 * ({@link SQLException}) is logged and not added to the batch. The prepared statement is
 * always handed to {@code close(...)}, including when generation or batch execution fails.
 *
 * @param commitTime    commit time written to the {@code commit_time} column of every row
 * @param inserts       previously generated records to derive updates from
 * @param connection    JDBC connection used to prepare the update statement
 * @param dataGenerator source of the generated update records
 * @param props         properties passed to {@code getInsertValue} when reading each payload
 * @return all generated update records, including any that were skipped while building the batch
 * @throws SQLException if the statement cannot be prepared or the batch fails to execute
 * @throws IOException  declared for callers; presumably raised by update generation — confirm
 */
public static List<HoodieRecord> update(String commitTime, List<HoodieRecord> inserts, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException, IOException {
    // Note the space after "fare=?": without it the text reads "fare=?where", which relies on
    // the SQL parser splitting the placeholder from the keyword.
    PreparedStatement updateStatement = connection.prepareStatement("UPDATE triprec set commit_time=?," + "_row_key=?," + "rider=?," + "driver=?," + "begin_lat=?," + "begin_lon=?," + "end_lat=?," + "end_lon=?," + "fare=? " + "where _row_key=?");
    try {
        List<HoodieRecord> updateRecords = dataGenerator.generateUpdates(commitTime, inserts);
        updateRecords.stream().map(m -> {
            try {
                return ((GenericRecord) ((HoodieAvroRecord) m).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get());
            } catch (IOException e) {
                // Best effort: report the unreadable record instead of dropping it silently.
                LOG.warn("Skipping record whose update value could not be read: " + e.getMessage());
                return null;
            }
        }).filter(Objects::nonNull).forEach(r -> {
            try {
                updateStatement.setString(1, commitTime);
                updateStatement.setString(2, r.get("_row_key").toString());
                updateStatement.setString(3, r.get("rider").toString());
                updateStatement.setString(4, r.get("driver").toString());
                updateStatement.setDouble(5, Double.parseDouble(r.get("begin_lat").toString()));
                updateStatement.setDouble(6, Double.parseDouble(r.get("begin_lon").toString()));
                updateStatement.setDouble(7, Double.parseDouble(r.get("end_lat").toString()));
                updateStatement.setDouble(8, Double.parseDouble(r.get("end_lon").toString()));
                // "fare" is a nested record; only its "amount" field is stored.
                updateStatement.setDouble(9, Double.parseDouble(((GenericRecord) r.get("fare")).get("amount").toString()));
                // Parameter 10 feeds the WHERE clause that selects the row to update.
                updateStatement.setString(10, r.get("_row_key").toString());
                updateStatement.addBatch();
            } catch (SQLException e) {
                LOG.warn(e.getMessage());
            }
        });
        updateStatement.executeBatch();
        return updateRecords;
    } finally {
        // Release the statement on every path, not only after a successful batch.
        close(updateStatement);
    }
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) SQLException(java.sql.SQLException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) PreparedStatement(java.sql.PreparedStatement) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 44 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieWriteHelper method deduplicateRecords.

/**
 * Collapses records that share a key into a single record by combining their payloads
 * with {@code preCombine}.
 *
 * @param records     records to de-duplicate
 * @param index       index whose {@code isGlobal()} flag selects the grouping key
 * @param parallelism parallelism passed to the reduce step
 * @return one record per distinct grouping key
 */
@Override
public HoodieData<HoodieRecord<T>> deduplicateRecords(HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
    final boolean globalIndex = index.isGlobal();
    return records.mapToPair(rec -> {
        HoodieKey fullKey = rec.getKey();
        // Under a global index, records may differ in partition path, so group by record key only.
        Object groupingKey;
        if (globalIndex) {
            groupingKey = fullKey.getRecordKey();
        } else {
            groupingKey = fullKey;
        }
        return Pair.of(groupingKey, rec);
    }).reduceByKey((first, second) -> {
        @SuppressWarnings("unchecked") T merged = (T) second.getData().preCombine(first.getData());
        // Keep the key of whichever input record's payload equals the merged payload.
        HoodieKey survivingKey;
        if (first.getData().equals(merged)) {
            survivingKey = first.getKey();
        } else {
            survivingKey = second.getKey();
        }
        return new HoodieAvroRecord<>(survivingKey, merged);
    }, parallelism).map(Pair::getRight);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Pair(org.apache.hudi.common.util.collection.Pair) HoodieKey(org.apache.hudi.common.model.HoodieKey) Pair(org.apache.hudi.common.util.collection.Pair)

Example 45 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieGlobalSimpleIndex method getTaggedRecords.

/**
 * Tags each incoming record with the {@link HoodieRecordLocation} of its existing copy, if any.
 *
 * <p>Existing records are matched on record key alone. An incoming record with no match is
 * tagged without a location. A matched record is rewritten to the existing partition and
 * tagged with the existing location, unless partition-path updates are enabled in the config
 * and the partition differs; in that case a delete record for the old partition is emitted
 * together with the incoming record tagged without a location.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s keyed by record key
 * @param existingRecords existing keys with their {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords, HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
    // Re-key existing locations by record key, carrying the partition path next to the location.
    HoodiePairData<String, Pair<String, HoodieRecordLocation>> locationsByRecordKey = existingRecords.mapToPair(existing -> new ImmutablePair<>(existing.getLeft().getRecordKey(), Pair.of(existing.getLeft().getPartitionPath(), existing.getRight())));
    return incomingRecords.leftOuterJoin(locationsByRecordKey).values().flatMap(joined -> {
        HoodieRecord<R> incoming = joined.getLeft();
        Option<Pair<String, HoodieRecordLocation>> match = Option.ofNullable(joined.getRight().orElse(null));

        if (!match.isPresent()) {
            // Nothing stored under this record key yet: tag without a location.
            List<HoodieRecord<R>> untagged = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(incoming, Option.empty()));
            return untagged.iterator();
        }

        String existingPartition = match.get().getKey();
        HoodieRecordLocation existingLocation = match.get().getRight();

        if (!config.getGlobalSimpleIndexUpdatePartitionPath() || incoming.getPartitionPath().equals(existingPartition)) {
            // Ignore the incoming record's partition, whether or not it differs from the old one.
            // When it differs, the record is still updated at its old partition.
            HoodieRecord<R> repartitioned = new HoodieAvroRecord(new HoodieKey(incoming.getRecordKey(), existingPartition), (HoodieRecordPayload) incoming.getData());
            List<HoodieRecord<R>> updated = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(repartitioned, Option.ofNullable(existingLocation)));
            return updated.iterator();
        }

        // Partition changed and partition-path updates are enabled:
        // emit an empty record that deletes the copy in the old partition...
        HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(incoming.getRecordKey(), existingPartition), new EmptyHoodieRecordPayload());
        deleteRecord.setCurrentLocation(existingLocation);
        deleteRecord.seal();
        // ...and tag the incoming record for insertion into the new partition.
        HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(incoming, Option.empty());
        List<HoodieRecord<R>> movedPair = Arrays.asList(deleteRecord, insertRecord);
        return movedPair.iterator();
    });
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 84, HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 72, HoodieKey (org.apache.hudi.common.model.HoodieKey): 68, ArrayList (java.util.ArrayList): 38, HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 37, RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 31, Test (org.junit.jupiter.api.Test): 30, GenericRecord (org.apache.avro.generic.GenericRecord): 29, Path (org.apache.hadoop.fs.Path): 26, ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 25, IOException (java.io.IOException): 24, HoodieTable (org.apache.hudi.table.HoodieTable): 24, List (java.util.List): 23, Schema (org.apache.avro.Schema): 23, HashMap (java.util.HashMap): 22, Pair (org.apache.hudi.common.util.collection.Pair): 21, Map (java.util.Map): 20, Collectors (java.util.stream.Collectors): 20, Arrays (java.util.Arrays): 17, Option (org.apache.hudi.common.util.Option): 16