Use of org.apache.hudi.common.util.collection.ImmutablePair in the Apache Hudi project.
Class S3EventsMetaSelector, method getNextEventsFromQueue.
/**
 * Gets the list of S3 event records from the SQS queue and computes the next checkpoint.
 *
 * @param lastCheckpointStr The last checkpoint instant string, empty if first run.
 * @param sqs               The SQS client used to poll the queue.
 * @param processedMessages Output parameter: cleared on entry, then populated (by
 *                          {@code getValidEvents}) with the raw messages that were read so the
 *                          caller can delete/ack them later.
 * @return A pair of dataset of event records (as JSON strings) and the next checkpoint instant string.
 * @throws HoodieException if the queue cannot be read or a record cannot be serialized.
 */
public Pair<List<String>, String> getNextEventsFromQueue(AmazonSQS sqs, Option<String> lastCheckpointStr, List<Message> processedMessages) {
  processedMessages.clear();
  log.info("Reading messages....");
  try {
    log.info("Start Checkpoint : " + lastCheckpointStr);
    List<Map<String, Object>> eventRecords = getValidEvents(sqs, processedMessages);
    log.info("Number of valid events: " + eventRecords.size());
    List<String> filteredEventRecords = new ArrayList<>(eventRecords.size());
    // Advance the checkpoint to the max event time observed in this batch; when no events
    // were read, fall back to the previous checkpoint (or 0 on the very first run).
    // Instant.parse uses DateTimeFormatter.ISO_INSTANT, and toEpochMilli() is equivalent to
    // the former Date.from(...).getTime() round-trip.
    long newCheckpointTime = eventRecords.stream()
        .mapToLong(eventRecord -> Instant.parse((String) eventRecord.get(S3_MODEL_EVENT_TIME)).toEpochMilli())
        .max()
        .orElse(lastCheckpointStr.map(Long::parseLong).orElse(0L));
    // Reuse a single ObjectMapper for the whole batch instead of allocating one per record;
    // ObjectMapper construction is expensive and the instance is reusable.
    ObjectMapper mapper = new ObjectMapper();
    for (Map<String, Object> eventRecord : eventRecords) {
      // S3 keys arrive URL-encoded; decode '=' ("%3D") so keys match on-storage paths.
      filteredEventRecords.add(mapper.writeValueAsString(eventRecord).replace("%3D", "="));
    }
    return new ImmutablePair<>(filteredEventRecords, String.valueOf(newCheckpointTime));
  } catch (JSONException | IOException e) {
    throw new HoodieException("Unable to read from SQS: ", e);
  }
}
Use of org.apache.hudi.common.util.collection.ImmutablePair in the Apache Hudi project.
Class HoodieGlobalSimpleIndex, method getTaggedRecords.
/**
 * Tag records with right {@link HoodieRecordLocation}.
 *
 * <p>Joins incoming records (keyed by record key) against the existing index entries via a
 * left-outer join. For each incoming record:
 * <ul>
 *   <li>match found, partition differs, and update-partition-path is enabled: emit a delete
 *       (empty payload) for the old partition plus an untagged insert for the new partition;</li>
 *   <li>match found otherwise: keep the record in its OLD partition and tag it with the known
 *       location (the incoming partition path is deliberately ignored);</li>
 *   <li>no match: pass the record through untagged, i.e. treated as an insert.</li>
 * </ul>
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords, HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
// Re-key the existing entries by record key so they can be joined with incomingRecords,
// carrying (partitionPath, location) along as the value.
HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords.mapToPair(entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), Pair.of(entry.getLeft().getPartitionPath(), entry.getRight())));
return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values().flatMap(entry -> {
HoodieRecord<R> inputRecord = entry.getLeft();
// Right side of the left-outer join: present only when the record key exists in the index.
Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null));
List<HoodieRecord<R>> taggedRecords;
if (partitionPathLocationPair.isPresent()) {
// partitionPath/location describe where the record currently lives per the index.
String partitionPath = partitionPathLocationPair.get().getKey();
HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
// Create an empty record to delete the record in the old partition
HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
deleteRecord.setCurrentLocation(location);
deleteRecord.seal();
// Tag the incoming record for inserting to the new partition
HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
taggedRecords = Arrays.asList(deleteRecord, insertRecord);
} else {
// Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
// When it differs, the record will still be updated at its old partition.
HoodieRecord<R> newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData());
taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
}
} else {
// No index entry for this key: leave the record untagged so it is handled as an insert.
taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
}
return taggedRecords.iterator();
});
}
Use of org.apache.hudi.common.util.collection.ImmutablePair in the Apache Hudi project.
Class TestHiveSyncTool, method testUpdateTableComments.
@ParameterizedTest
@MethodSource("syncMode")
public void testUpdateTableComments(String syncMode) throws Exception {
  hiveSyncConfig.syncMode = syncMode;
  String instantTime = "100";
  HiveTestUtil.createCOWTableWithSchema(instantTime, "/simple-test.avsc");

  // Sync the table once so it exists in the metastore before comments are altered.
  HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
  tool.syncHoodieTable();
  HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);

  // Target docs come from the commented variant of the schema, keyed by lower-cased field name;
  // fields without a doc map to the empty string.
  Schema plainSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
  Schema docedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
  Map<String, String> docByFieldName = docedSchema.getFields().stream()
      .collect(Collectors.toMap(
          f -> f.name().toLowerCase(Locale.ROOT),
          f -> StringUtils.isNullOrEmpty(f.doc()) ? "" : f.doc()));

  // Build the alter-comment request: every field whose current doc differs from the target doc,
  // mapped to (hive type name, new comment).
  Map<String, ImmutablePair<String, String>> alterCommentSchema = new HashMap<>();
  for (Field field : plainSchema.getFields()) {
    String fieldName = field.name().toLowerCase(Locale.ROOT);
    if (!docByFieldName.containsKey(fieldName)) {
      continue;
    }
    String targetDoc = docByFieldName.get(fieldName);
    if (!targetDoc.equals(field.doc())) {
      alterCommentSchema.put(fieldName, new ImmutablePair<>(field.schema().getType().name(), targetDoc));
    }
  }
  ddlExecutor.updateTableComments(hiveSyncConfig.tableName, alterCommentSchema);

  // Read back via the metastore client and count the fields that now carry a comment.
  List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
  long commentedFieldCount = fieldSchemas.stream()
      .filter(fieldSchema -> !StringUtils.isNullOrEmpty(fieldSchema.getComment()))
      .count();
  assertEquals(2, commentedFieldCount, "hive schema field comment numbers should match the avro schema field doc numbers");
}
Use of org.apache.hudi.common.util.collection.ImmutablePair in the Apache Hudi project.
Class FSUtils, method parallelizeFilesProcess.
/**
 * Processes the given sub-paths in parallel via the engine context, mapping each sub-path to the
 * result of {@code pairFunction}.
 *
 * @param hoodieEngineContext engine context providing the parallel {@code mapToPair} primitive
 * @param fs                  file system whose configuration is shipped to the workers
 * @param parallelism         requested parallelism; capped at the number of sub-paths
 * @param pairFunction        function applied to each (subPath, serializable conf) pair
 * @param subPaths            sub-paths to process; an empty list yields an empty map
 * @param <T>                 result value type
 * @return map from each sub-path to the function's result for it
 */
public static <T> Map<String, T> parallelizeFilesProcess(HoodieEngineContext hoodieEngineContext, FileSystem fs, int parallelism, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction, List<String> subPaths) {
  if (subPaths.isEmpty()) {
    // Nothing to process; avoid shipping a no-op job to the engine.
    return new HashMap<>();
  }
  // Wrap the Hadoop conf so it can be serialized out to the parallel workers.
  SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
  int actualParallelism = Math.min(subPaths.size(), parallelism);
  return hoodieEngineContext.mapToPair(subPaths,
      subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
      actualParallelism);
}
Aggregations