
Example 11 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class S3EventsMetaSelector, method getNextEventsFromQueue.

/**
 * Get the list of events from the queue.
 *
 * @param lastCheckpointStr The last checkpoint instant string, empty if this is the first run.
 * @return A pair of the list of event records and the next checkpoint instant string.
 */
public Pair<List<String>, String> getNextEventsFromQueue(AmazonSQS sqs, Option<String> lastCheckpointStr, List<Message> processedMessages) {
    processedMessages.clear();
    log.info("Reading messages....");
    try {
        log.info("Start Checkpoint : " + lastCheckpointStr);
        List<Map<String, Object>> eventRecords = getValidEvents(sqs, processedMessages);
        log.info("Number of valid events: " + eventRecords.size());
        List<String> filteredEventRecords = new ArrayList<>();
        long newCheckpointTime = eventRecords.stream()
                .mapToLong(eventRecord -> Date.from(Instant.from(DateTimeFormatter.ISO_INSTANT
                        .parse((String) eventRecord.get(S3_MODEL_EVENT_TIME)))).getTime())
                .max()
                .orElse(lastCheckpointStr.map(Long::parseLong).orElse(0L));
        for (Map<String, Object> eventRecord : eventRecords) {
            filteredEventRecords.add(new ObjectMapper().writeValueAsString(eventRecord).replace("%3D", "="));
        }
        return new ImmutablePair<>(filteredEventRecords, String.valueOf(newCheckpointTime));
    } catch (JSONException | IOException e) {
        throw new HoodieException("Unable to read from SQS: ", e);
    }
}
Also used : ArrayList(java.util.ArrayList) JSONException(org.json.JSONException) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) JSONObject(org.json.JSONObject) Map(java.util.Map) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)
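
The pair returned above is consumed through the getLeft/getRight accessors that ImmutablePair inherits from Pair. A minimal standalone sketch of that shape (assuming hudi-common is on the classpath; the record payload and checkpoint values are made up for illustration):

import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class CheckpointPairDemo {
    public static void main(String[] args) {
        // Stand-in for the value returned by getNextEventsFromQueue (values are invented).
        Pair<List<String>, String> result =
                new ImmutablePair<>(Arrays.asList("{\"eventName\":\"ObjectCreated:Put\"}"), "1650000000000");
        List<String> records = result.getLeft();   // the filtered event records
        String nextCheckpoint = result.getRight(); // the max event time as epoch millis
        System.out.println(records.size() + " record(s), next checkpoint " + nextCheckpoint);
    }
}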

Example 12 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class HoodieGlobalSimpleIndex, method getTaggedRecords.

/**
 * Tag records with the right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(
        HoodiePairData<String, HoodieRecord<R>> incomingRecords,
        HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
    HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey =
            existingRecords.mapToPair(entry -> new ImmutablePair<>(
                    entry.getLeft().getRecordKey(),
                    Pair.of(entry.getLeft().getPartitionPath(), entry.getRight())));
    return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values().flatMap(entry -> {
        HoodieRecord<R> inputRecord = entry.getLeft();
        Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null));
        List<HoodieRecord<R>> taggedRecords;
        if (partitionPathLocationPair.isPresent()) {
            String partitionPath = partitionPathLocationPair.get().getKey();
            HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
            if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
                // Create an empty record to delete the record in the old partition
                HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
                deleteRecord.setCurrentLocation(location);
                deleteRecord.seal();
                // Tag the incoming record for inserting to the new partition
                HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
                taggedRecords = Arrays.asList(deleteRecord, insertRecord);
            } else {
                // Ignore the incoming record's partition path: even when it differs
                // from the old partition, the record is still updated in the old partition.
                HoodieRecord<R> newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData());
                taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
            }
        } else {
            taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
        }
        return taggedRecords.iterator();
    });
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)
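
The interesting move here is the mapToPair step, which re-keys existing records by record key while packing (partitionPath, location) into a nested pair as the value. A minimal sketch of that nesting (plain Strings stand in for HoodieKey fields and HoodieRecordLocation; all values are hypothetical):

import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class NestedPairDemo {
    public static void main(String[] args) {
        String recordKey = "uuid-123";          // stand-in for HoodieKey.getRecordKey()
        String partitionPath = "2022/01/01";    // stand-in for HoodieKey.getPartitionPath()
        String location = "fileId@instant100";  // stand-in for HoodieRecordLocation
        // Re-key by record key, carrying (partitionPath, location) as the value,
        // just like the mapToPair lambda above.
        Pair<String, Pair<String, String>> keyed =
                new ImmutablePair<>(recordKey, Pair.of(partitionPath, location));
        System.out.println(keyed.getLeft() + " -> " + keyed.getRight().getKey()
                + " / " + keyed.getRight().getValue());
    }
}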

Example 13 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class TestHiveSyncTool, method testUpdateTableComments.

@ParameterizedTest
@MethodSource("syncMode")
public void testUpdateTableComments(String syncMode) throws Exception {
    hiveSyncConfig.syncMode = syncMode;
    String commitTime = "100";
    HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc");
    HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
    tool.syncHoodieTable();
    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
    Map<String, ImmutablePair<String, String>> alterCommentSchema = new HashMap<>();
    // generate commented schema field
    Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
    Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
    Map<String, String> fieldsNameAndDoc = commentedSchema.getFields().stream()
            .collect(Collectors.toMap(
                    field -> field.name().toLowerCase(Locale.ROOT),
                    field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
    for (Field field : schema.getFields()) {
        String name = field.name().toLowerCase(Locale.ROOT);
        String comment = fieldsNameAndDoc.get(name);
        if (fieldsNameAndDoc.containsKey(name) && !comment.equals(field.doc())) {
            alterCommentSchema.put(name, new ImmutablePair<>(field.schema().getType().name(), comment));
        }
    }
    ddlExecutor.updateTableComments(hiveSyncConfig.tableName, alterCommentSchema);
    List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
    int commentCnt = 0;
    for (FieldSchema fieldSchema : fieldSchemas) {
        if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
            commentCnt++;
        }
    }
    assertEquals(2, commentCnt, "number of commented fields in the Hive schema should match the number of documented fields in the Avro schema");
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) URISyntaxException(java.net.URISyntaxException) ZonedDateTime(java.time.ZonedDateTime) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HiveTestUtil.ddlExecutor(org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor) Partition(org.apache.hadoop.hive.metastore.api.Partition) ArrayList(java.util.ArrayList) AfterAll(org.junit.jupiter.api.AfterAll) StringUtils(org.apache.hudi.common.util.StringUtils) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Locale(java.util.Locale) Map(java.util.Map) HiveTestUtil.fileSystem(org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) MethodSource(org.junit.jupiter.params.provider.MethodSource) PartitionEventType(org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) Field(org.apache.avro.Schema.Field) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Driver(org.apache.hadoop.hive.ql.Driver) IOException(java.io.IOException) SessionState(org.apache.hadoop.hive.ql.session.SessionState) Collectors(java.util.stream.Collectors) ConfigUtils(org.apache.hudi.hive.util.ConfigUtils) Test(org.junit.jupiter.api.Test) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) HiveTestUtil(org.apache.hudi.hive.testutils.HiveTestUtil) NetworkTestUtils(org.apache.hudi.common.testutils.NetworkTestUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) PartitionEvent(org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HiveTestUtil.hiveSyncConfig(org.apache.hudi.hive.testutils.HiveTestUtil.hiveSyncConfig) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
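
The map handed to updateTableComments keys each lower-cased field name to an ImmutablePair of (type name, new comment). A minimal sketch of building that shape (the field names and comments below are made up, not taken from simple-test.avsc):

import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.common.util.collection.ImmutablePair;

public class AlterCommentDemo {
    public static void main(String[] args) {
        // field name -> (type name, comment), the shape built in the test above
        Map<String, ImmutablePair<String, String>> alterCommentSchema = new HashMap<>();
        alterCommentSchema.put("name", new ImmutablePair<>("STRING", "the user name"));
        alterCommentSchema.put("favorite_number", new ImmutablePair<>("INT", "a favorite number"));
        alterCommentSchema.forEach((field, typeAndComment) ->
                System.out.println(field + ": type=" + typeAndComment.getLeft()
                        + ", comment=" + typeAndComment.getRight()));
    }
}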

Example 14 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class FSUtils, method parallelizeFilesProcess.

public static <T> Map<String, T> parallelizeFilesProcess(
        HoodieEngineContext hoodieEngineContext,
        FileSystem fs,
        int parallelism,
        SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction,
        List<String> subPaths) {
    Map<String, T> result = new HashMap<>();
    if (!subPaths.isEmpty()) {
        SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
        int actualParallelism = Math.min(subPaths.size(), parallelism);
        result = hoodieEngineContext.mapToPair(
                subPaths,
                subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))),
                actualParallelism);
    }
    return result;
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair)
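
Here mapToPair distributes the per-path work across the engine's executors; the key-to-result mapping it produces can be pictured with a single-threaded java.util.stream equivalent. A minimal sketch (perPathWork is a hypothetical stand-in for pairFunction, which in the real method receives the subpath together with a SerializableConfiguration):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class SubPathMapDemo {
    public static void main(String[] args) {
        List<String> subPaths = Arrays.asList("2022/01/01", "2022/01/02");
        // Hypothetical stand-in for pairFunction: any per-path computation works here.
        Function<String, Integer> perPathWork = String::length;
        // Build the same subPath -> result map sequentially.
        Map<String, Integer> result = subPaths.stream()
                .collect(Collectors.toMap(Function.identity(), perPathWork));
        System.out.println(result); // e.g. {2022/01/01=10, 2022/01/02=10}
    }
}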

Aggregations

ImmutablePair (org.apache.hudi.common.util.collection.ImmutablePair): 14
ArrayList (java.util.ArrayList): 11
List (java.util.List): 10
Pair (org.apache.hudi.common.util.collection.Pair): 9
IOException (java.io.IOException): 8
Collectors (java.util.stream.Collectors): 8
Path (org.apache.hadoop.fs.Path): 8
Option (org.apache.hudi.common.util.Option): 8
Map (java.util.Map): 7
Arrays (java.util.Arrays): 5
HashMap (java.util.HashMap): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
TypedProperties (org.apache.hudi.common.config.TypedProperties): 5
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
FileStatus (org.apache.hadoop.fs.FileStatus): 4
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 4
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 4
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 4
LogManager (org.apache.log4j.LogManager): 4