Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class HoodieExampleDataGenerator, method generateInsertsStream.
/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
*/
public Stream<HoodieRecord<T>> generateInsertsStream(String commitTime, Integer n) {
  int currSize = getNumExistingKeys();
  return IntStream.range(0, n).boxed().map(i -> {
    String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)];
    HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
    KeyPartition kp = new KeyPartition();
    kp.key = key;
    kp.partitionPath = partitionPath;
    existingKeys.put(currSize + i, kp);
    numExistingKeys++;
    return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime));
  });
}
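For context, a minimal usage sketch inside a test method: the returned stream is lazy, so the generator's bookkeeping above (existingKeys, numExistingKeys) only runs once the stream is consumed. The payload type and commit time literal below are illustrative assumptions, not taken from the snippet.

  // Hypothetical usage: collect the lazy stream so the side effects above actually run.
  HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
  List<HoodieRecord<HoodieAvroPayload>> inserts =
      dataGen.generateInsertsStream("20220101000000", 10) // assumed yyyyMMddHHmmss commit time
          .collect(Collectors.toList());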
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class JdbcTestUtils, method insert.
public static List<HoodieRecord> insert(String commitTime, int numRecords, Connection connection,
    HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException {
  PreparedStatement insertStatement = connection.prepareStatement(
      "INSERT INTO triprec (commit_time,_row_key,rider,driver,begin_lat,begin_lon,end_lat,end_lon,fare) "
          + "values(?,?,?,?,?,?,?,?,?)");
  List<HoodieRecord> hoodieRecords = dataGenerator.generateInserts(commitTime, numRecords);
  hoodieRecords.stream().map(r -> {
    try {
      return ((GenericRecord) ((HoodieAvroRecord) r).getData()
          .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get());
    } catch (IOException e) {
      return null;
    }
  }).filter(Objects::nonNull).forEach(record -> {
    try {
      insertStatement.setString(1, commitTime);
      insertStatement.setString(2, record.get("_row_key").toString());
      insertStatement.setString(3, record.get("rider").toString());
      insertStatement.setString(4, record.get("driver").toString());
      insertStatement.setDouble(5, Double.parseDouble(record.get("begin_lat").toString()));
      insertStatement.setDouble(6, Double.parseDouble(record.get("begin_lon").toString()));
      insertStatement.setDouble(7, Double.parseDouble(record.get("end_lat").toString()));
      insertStatement.setDouble(8, Double.parseDouble(record.get("end_lon").toString()));
      insertStatement.setDouble(9, Double.parseDouble(((GenericRecord) record.get("fare")).get("amount").toString()));
      insertStatement.addBatch();
    } catch (SQLException e) {
      LOG.warn(e.getMessage());
    }
  });
  insertStatement.executeBatch();
  close(insertStatement);
  return hoodieRecords;
}
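A hedged usage sketch inside a test method, assuming an in-memory H2 database in which a triprec table with the columns above has already been created; the JDBC URL and commit time are illustrative, not part of the snippet.

  // Hypothetical setup: triprec is assumed to exist with the columns used above.
  Connection conn = DriverManager.getConnection("jdbc:h2:mem:test");
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  List<HoodieRecord> inserted =
      JdbcTestUtils.insert("000", 10, conn, dataGen, new TypedProperties());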
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class JdbcTestUtils, method update.
public static List<HoodieRecord> update(String commitTime, List<HoodieRecord> inserts, Connection connection,
    HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException, IOException {
  PreparedStatement updateStatement = connection.prepareStatement(
      "UPDATE triprec set commit_time=?,_row_key=?,rider=?,driver=?,begin_lat=?,begin_lon=?,end_lat=?,end_lon=?,fare=?"
          + " where _row_key=?");
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates(commitTime, inserts);
  updateRecords.stream().map(m -> {
    try {
      return ((HoodieAvroRecord) m).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get();
    } catch (IOException e) {
      return null;
    }
  }).filter(Objects::nonNull).map(r -> ((GenericRecord) r)).sequential().forEach(r -> {
    try {
      updateStatement.setString(1, commitTime);
      updateStatement.setString(2, r.get("_row_key").toString());
      updateStatement.setString(3, r.get("rider").toString());
      updateStatement.setString(4, r.get("driver").toString());
      updateStatement.setDouble(5, Double.parseDouble(r.get("begin_lat").toString()));
      updateStatement.setDouble(6, Double.parseDouble(r.get("begin_lon").toString()));
      updateStatement.setDouble(7, Double.parseDouble(r.get("end_lat").toString()));
      updateStatement.setDouble(8, Double.parseDouble(r.get("end_lon").toString()));
      updateStatement.setDouble(9, Double.parseDouble(((GenericRecord) r.get("fare")).get("amount").toString()));
      updateStatement.setString(10, r.get("_row_key").toString());
      updateStatement.addBatch();
    } catch (SQLException e) {
      LOG.warn(e.getMessage());
    }
  });
  updateStatement.executeBatch();
  close(updateStatement);
  return updateRecords;
}
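Continuing the hypothetical sketch from the insert example: updates are generated from the previously inserted records and matched by _row_key, so the row count in triprec stays the same while the non-key columns are rewritten.

  // Hypothetical follow-up: rewrite the same rows under a new (illustrative) commit time.
  List<HoodieRecord> updated =
      JdbcTestUtils.update("001", inserted, conn, dataGen, new TypedProperties());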
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class HoodieWriteHelper, method deduplicateRecords.
@Override
public HoodieData<HoodieRecord<T>> deduplicateRecords(HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records.mapToPair(record -> {
    HoodieKey hoodieKey = record.getKey();
    // If the index is global, duplicates may differ in their partition path,
    // so deduplicate on the record key alone; otherwise use the full HoodieKey.
    Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
    return Pair.of(key, record);
  }).reduceByKey((rec1, rec2) -> {
    @SuppressWarnings("unchecked")
    T reducedData = (T) rec2.getData().preCombine(rec1.getData());
    // preCombine returns one of the two payloads; keep the key of the record whose payload won.
    HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
    return new HoodieAvroRecord<>(reducedKey, reducedData);
  }, parallelism).map(Pair::getRight);
}
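To make the reduce step concrete, here is a self-contained sketch of the same key-based merge on plain Java collections, without HoodieData or a Spark engine. Rec and its orderingVal are hypothetical stand-ins for a HoodieRecord payload and the ordering its preCombine applies (e.g. latest-ordering-value-wins).

  import java.util.List;
  import java.util.Map;
  import java.util.function.BinaryOperator;
  import java.util.stream.Collectors;

  // A simplified, engine-free model of the dedup: records sharing a key are merged
  // pairwise, and the payload with the higher ordering value survives.
  public class DedupSketch {
    record Rec(String key, long orderingVal, String data) {}

    public static void main(String[] args) {
      List<Rec> records = List.of(
          new Rec("id1", 1, "old"), new Rec("id1", 2, "new"), new Rec("id2", 1, "only"));
      // Stand-in for preCombine: keep the record with the larger ordering value.
      BinaryOperator<Rec> preCombine = (a, b) -> a.orderingVal() >= b.orderingVal() ? a : b;
      Map<String, Rec> deduped = records.stream()
          .collect(Collectors.toMap(Rec::key, r -> r, preCombine));
      System.out.println(deduped.values()); // id1 -> "new", id2 -> "only"
    }
  }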
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class HoodieGlobalSimpleIndex, method getTaggedRecords.
/**
* Tag records with the right {@link HoodieRecordLocation}.
*
* @param incomingRecords incoming {@link HoodieRecord}s
* @param existingRecords existing records with {@link HoodieRecordLocation}s
* @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
*/
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords,
    HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
  HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords.mapToPair(
      entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), Pair.of(entry.getLeft().getPartitionPath(), entry.getRight())));
  return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values().flatMap(entry -> {
    HoodieRecord<R> inputRecord = entry.getLeft();
    Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null));
    List<HoodieRecord<R>> taggedRecords;
    if (partitionPathLocationPair.isPresent()) {
      String partitionPath = partitionPathLocationPair.get().getKey();
      HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
      if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
        // Create an empty record to delete the record in the old partition
        HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
        deleteRecord.setCurrentLocation(location);
        deleteRecord.seal();
        // Tag the incoming record for inserting to the new partition
        HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
        taggedRecords = Arrays.asList(deleteRecord, insertRecord);
      } else {
        // Ignore the incoming record's partition path: even when it differs from the
        // old one, the record is still updated in its old partition.
        HoodieRecord<R> newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData());
        taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
      }
    } else {
      // No existing record with this key: plain insert with no location to tag.
      taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
    }
    return taggedRecords.iterator();
  });
}
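The branching above reduces to three outcomes per incoming key. The following self-contained sketch models them without Hudi's types; Existing, TaggedOp, and the field names are hypothetical stand-ins, not Hudi classes.

  import java.util.Arrays;
  import java.util.Collections;
  import java.util.List;

  // A simplified model of the global-index tagging decision:
  // new key -> insert; changed partition (with update enabled) -> delete + insert;
  // otherwise -> update in place at the old partition.
  public class GlobalIndexTagSketch {
    record Existing(String partitionPath, String location) {}
    record TaggedOp(String kind, String partitionPath, String location) {}

    static List<TaggedOp> tag(String incomingPartition, Existing existing, boolean updatePartitionPath) {
      if (existing == null) {
        return Collections.singletonList(new TaggedOp("INSERT", incomingPartition, null)); // brand-new key
      }
      if (updatePartitionPath && !incomingPartition.equals(existing.partitionPath())) {
        // Delete from the old partition, insert into the new one.
        return Arrays.asList(
            new TaggedOp("DELETE", existing.partitionPath(), existing.location()),
            new TaggedOp("INSERT", incomingPartition, null));
      }
      // Update in place, ignoring the incoming partition path.
      return Collections.singletonList(new TaggedOp("UPDATE", existing.partitionPath(), existing.location()));
    }

    public static void main(String[] args) {
      System.out.println(tag("2022/01/02", new Existing("2022/01/01", "fg-1"), true));  // delete + insert
      System.out.println(tag("2022/01/02", new Existing("2022/01/01", "fg-1"), false)); // update at old partition
    }
  }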