
Example 6 with HoodieMetadataException

use of org.apache.hudi.exception.HoodieMetadataException in project hudi by apache.

the class TestHoodieMetadataBase method getMetadataWriteConfig.

/**
 * Fetching the metadata table's WriteConfig from the data table's writeConfig is not trivial, and the
 * corresponding method is not public in the source code. So, for now, this method mimics the source-code logic.
 */
protected HoodieWriteConfig getMetadataWriteConfig(HoodieWriteConfig writeConfig) {
    int parallelism = writeConfig.getMetadataInsertParallelism();
    int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep());
    int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep());
    // Create the write config for the metadata table by borrowing options from the main write config.
    HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
        .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled())
            .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs())
            .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs())
            .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks())
            .build())
        .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .enable(false)
            .withFileListingParallelism(writeConfig.getFileListingParallelism())
            .build())
        .withAutoCommit(true)
        .withAvroSchemaValidate(true)
        .withEmbeddedTimelineServerEnabled(false)
        .withMarkersType(MarkerType.DIRECT.name())
        .withRollbackUsingMarkers(false)
        .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath()))
        .withSchema(HoodieMetadataRecord.getClassSchema().toString())
        .forTable(writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withAsyncClean(writeConfig.isMetadataAsyncClean())
            .withAutoClean(false)
            .withCleanerParallelism(parallelism)
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
            .retainCommits(writeConfig.getMetadataCleanerCommitsRetained())
            .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax())
            .build())
        .withParallelism(parallelism, parallelism)
        .withDeleteParallelism(parallelism)
        .withRollbackParallelism(parallelism)
        .withFinalizeWriteParallelism(parallelism)
        .withAllowMultiWriteOnSameInstant(true)
        .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
        .withPopulateMetaFields(writeConfig.getMetadataConfig().populateMetaFields());
    // RecordKey properties are needed for the metadata table records
    final Properties properties = new Properties();
    properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), HoodieMetadataPayload.KEY_FIELD_NAME);
    properties.put("hoodie.datasource.write.recordkey.field", HoodieMetadataPayload.KEY_FIELD_NAME);
    builder.withProperties(properties);
    if (writeConfig.isMetricsOn()) {
        builder.withMetricsConfig(HoodieMetricsConfig.newBuilder().withReporterType(writeConfig.getMetricsReporterType().toString()).withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()).on(true).build());
        switch(writeConfig.getMetricsReporterType()) {
            case GRAPHITE:
                builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder().onGraphitePort(writeConfig.getGraphiteServerPort()).toGraphiteHost(writeConfig.getGraphiteServerHost()).usePrefix(writeConfig.getGraphiteMetricPrefix()).build());
                break;
            case JMX:
                builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder().onJmxPort(writeConfig.getJmxPort()).toJmxHost(writeConfig.getJmxHost()).build());
                break;
            case DATADOG:
            case PROMETHEUS:
            case PROMETHEUS_PUSHGATEWAY:
            case CONSOLE:
            case INMEMORY:
            case CLOUDWATCH:
                break;
            default:
                throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType());
        }
    }
    return builder.build();
}
Also used : HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties)
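
A minimal usage sketch for this helper, assuming a test method in a class extending TestHoodieMetadataBase; the writeConfigBuilder helper, the basePath field, and the assertions are illustrative assumptions, not part of the actual Hudi test suite.

@Test
public void sketchMetadataWriteConfigDerivation() {
    // writeConfigBuilder(...) is an assumed helper that produces a data-table HoodieWriteConfig.
    HoodieWriteConfig dataWriteConfig = writeConfigBuilder(basePath, "test_table").build();
    HoodieWriteConfig metadataWriteConfig = getMetadataWriteConfig(dataWriteConfig);
    // The metadata table lives under the data table's base path and carries a derived table name.
    assertEquals(HoodieTableMetadata.getMetadataTableBasePath(dataWriteConfig.getBasePath()), metadataWriteConfig.getBasePath());
    assertEquals(dataWriteConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX, metadataWriteConfig.getTableName());
}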

Example 7 with HoodieMetadataException

use of org.apache.hudi.exception.HoodieMetadataException in project hudi by apache.

the class SparkHoodieBackedTableMetadataWriter method commit.

@Override
protected void commit(String instantTime, Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap, boolean canTriggerTableService) {
    ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet.");
    ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
    HoodieData<HoodieRecord> preppedRecords = prepRecords(partitionRecordsMap);
    JavaRDD<HoodieRecord> preppedRecordRDD = HoodieJavaRDD.getJavaRDD(preppedRecords);
    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
        if (canTriggerTableService) {
            // Trigger compaction before doing the delta commit. This ensures that if this delta commit succeeds in the
            // metadata table but fails in the data table, we will not have compacted the metadata table with that
            // uncommitted data, which readers could never ignore (readers can only filter out data at the delta-commit level).
            compactIfNecessary(writeClient, instantTime);
        }
        if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) {
            // if this is a new commit being applied to metadata for the first time
            writeClient.startCommitWithTime(instantTime);
        } else {
            // This code path refers to a re-attempted commit that was committed to the metadata table but failed in the data table.
            // For example, say compaction c1 succeeded in the metadata table on its first attempt but failed before committing to the data table.
            // When retried, the data table first rolls back the pending compaction. That rollback is applied to the metadata table, but all changes
            // to the metadata table are upserts, so only a new delta commit is created.
            // Once the rollback is complete, the compaction is retried, which eventually hits this code block where the respective instant is
            // already part of a completed commit. So we have to manually remove the completed instant and proceed.
            // This is also the reason withAllowMultiWriteOnSameInstant is enabled for the metadata table.
            HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
            HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant);
            metadataMetaClient.reloadActiveTimeline();
        }
        List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
        statuses.forEach(writeStatus -> {
            if (writeStatus.hasErrors()) {
                throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
            }
        });
        // reload timeline
        metadataMetaClient.reloadActiveTimeline();
        if (canTriggerTableService) {
            cleanIfNecessary(writeClient, instantTime);
            writeClient.archive();
        }
    }
    // Update total size of the metadata and count of base/log files
    metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) WriteStatus(org.apache.hudi.client.WriteStatus)
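
As a variant of the statuses.forEach error check inside commit(...) above, the sketch below also surfaces the first record-level error message, assuming WriteStatus#getErrors() exposes per-record failures as in recent Hudi client APIs; it is an illustration, not the project's implementation.

statuses.stream()
    .filter(WriteStatus::hasErrors)
    .findFirst()
    .ifPresent(status -> {
        // Include the first per-record failure in the message instead of only the instant time.
        Throwable firstError = status.getErrors().values().iterator().next();
        throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime
            + ", first error: " + firstError.getMessage());
    });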

Example 8 with HoodieMetadataException

use of org.apache.hudi.exception.HoodieMetadataException in project hudi by apache.

the class HoodieTableMetadataUtil method processRollbackMetadata.

/**
 * Extracts information about the deleted and append files from the {@code HoodieRollbackMetadata}.
 * <p>
 * During a rollback, files may be deleted (COW, MOR) or rollback blocks may be appended to files (MOR only). This
 * function extracts these file changes for each partition.
 *
 * @param metadataTableTimeline    Current timeline of the Metadata Table
 * @param rollbackMetadata         {@code HoodieRollbackMetadata}
 * @param partitionToDeletedFiles  The {@code Map} to fill with files deleted per partition.
 * @param partitionToAppendedFiles The {@code Map} to fill with files appended per partition and their sizes.
 */
private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline, HoodieRollbackMetadata rollbackMetadata, Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles, Option<String> lastSyncTs) {
    rollbackMetadata.getPartitionMetadata().values().forEach(pm -> {
        final String instantToRollback = rollbackMetadata.getCommitsRollback().get(0);
        // Has this rollback produced new files?
        boolean hasRollbackLogFiles = pm.getRollbackLogFiles() != null && !pm.getRollbackLogFiles().isEmpty();
        boolean hasNonZeroRollbackLogFiles = hasRollbackLogFiles && pm.getRollbackLogFiles().values().stream().mapToLong(Long::longValue).sum() > 0;
        // If instant-to-rollback has not been synced to metadata table yet then there is no need to update metadata
        // This can happen in two cases:
        // Case 1: Metadata Table timeline is behind the instant-to-rollback.
        boolean shouldSkip = lastSyncTs.isPresent() && HoodieTimeline.compareTimestamps(instantToRollback, HoodieTimeline.GREATER_THAN, lastSyncTs.get());
        if (!hasNonZeroRollbackLogFiles && shouldSkip) {
            LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, given metadata table is already synced upto to %s", instantToRollback, lastSyncTs.get()));
            return;
        }
        // Case 2: The instant-to-rollback was never committed to Metadata Table. This can happen if the instant-to-rollback
        // was a failed commit (never completed) as only completed instants are synced to Metadata Table.
        // But the required Metadata Table instants should not have been archived
        HoodieInstant syncedInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantToRollback);
        if (metadataTableTimeline.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())) {
            throw new HoodieMetadataException(String.format("The instant %s required to sync rollback of %s has been archived", syncedInstant, instantToRollback));
        }
        shouldSkip = !metadataTableTimeline.containsInstant(syncedInstant);
        if (!hasNonZeroRollbackLogFiles && shouldSkip) {
            LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, since this instant was never committed to Metadata Table", instantToRollback));
            return;
        }
        final String partition = pm.getPartitionPath();
        if ((!pm.getSuccessDeleteFiles().isEmpty() || !pm.getFailedDeleteFiles().isEmpty()) && !shouldSkip) {
            if (!partitionToDeletedFiles.containsKey(partition)) {
                partitionToDeletedFiles.put(partition, new ArrayList<>());
            }
            // Extract deleted file name from the absolute paths saved in getSuccessDeleteFiles()
            List<String> deletedFiles = pm.getSuccessDeleteFiles().stream().map(p -> new Path(p).getName()).collect(Collectors.toList());
            if (!pm.getFailedDeleteFiles().isEmpty()) {
                deletedFiles.addAll(pm.getFailedDeleteFiles().stream().map(p -> new Path(p).getName()).collect(Collectors.toList()));
            }
            partitionToDeletedFiles.get(partition).addAll(deletedFiles);
        }
        BiFunction<Long, Long, Long> fileMergeFn = (oldSize, newSizeCopy) -> {
            // Keep the larger of the two sizes, as the rollback log file could have been updated after the written log file sizes were computed.
            return oldSize > newSizeCopy ? oldSize : newSizeCopy;
        };
        if (hasRollbackLogFiles) {
            if (!partitionToAppendedFiles.containsKey(partition)) {
                partitionToAppendedFiles.put(partition, new HashMap<>());
            }
            // Extract appended file name from the absolute paths saved in getAppendFiles()
            pm.getRollbackLogFiles().forEach((path, size) -> {
                partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn);
            });
        }
    });
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) 
HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString)
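
The size-merging behavior of fileMergeFn can be shown in isolation; the log file name and sizes below are hypothetical.

Map<String, Long> appendedFiles = new HashMap<>();
BiFunction<Long, Long, Long> fileMergeFn = (oldSize, newSize) -> oldSize > newSize ? oldSize : newSize;
appendedFiles.merge(".f1_20220101000000.log.1", 1024L, fileMergeFn); // first sighting of the rollback log file
appendedFiles.merge(".f1_20220101000000.log.1", 4096L, fileMergeFn); // a later, larger size wins the merge
// appendedFiles now maps the log file name to 4096L, the larger of the two observed sizes.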

Example 9 with HoodieMetadataException

use of org.apache.hudi.exception.HoodieMetadataException in project hudi by apache.

the class HoodieTableMetadataUtil method deleteMetadataTable.

/**
 * Delete the metadata table for the dataset. This is invoked during an upgrade/downgrade operation, during which
 * no other process should be running.
 *
 * @param basePath base path of the dataset
 * @param context  instance of {@link HoodieEngineContext}.
 */
public static void deleteMetadataTable(String basePath, HoodieEngineContext context) {
    final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
    FileSystem fs = FSUtils.getFs(metadataTablePath, context.getHadoopConf().get());
    try {
        fs.delete(new Path(metadataTablePath), true);
    } catch (Exception e) {
        throw new HoodieMetadataException("Failed to remove metadata table from path " + metadataTablePath, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
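
A hedged usage sketch of deleteMetadataTable follows; the base path, the local engine context, and the logger are assumptions chosen for illustration rather than code taken from a Hudi upgrade handler.

String basePath = "/tmp/hoodie/test_table"; // hypothetical dataset base path
HoodieEngineContext engineContext = new HoodieLocalEngineContext(new Configuration()); // assumed local context
try {
    HoodieTableMetadataUtil.deleteMetadataTable(basePath, engineContext);
} catch (HoodieMetadataException e) {
    // The exception wraps the underlying FileSystem failure and names the metadata table path.
    LOG.warn("Could not delete metadata table under " + basePath, e);
}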

Example 10 with HoodieMetadataException

use of org.apache.hudi.exception.HoodieMetadataException in project hudi by apache.

the class BaseTableMetadata method fetchAllFilesInPartitionPaths.

Map<String, FileStatus[]> fetchAllFilesInPartitionPaths(List<Path> partitionPaths) throws IOException {
    Map<String, Path> partitionInfo = new HashMap<>();
    boolean foundNonPartitionedPath = false;
    for (Path partitionPath : partitionPaths) {
        String partitionName = FSUtils.getRelativePartitionPath(new Path(dataBasePath), partitionPath);
        if (partitionName.isEmpty()) {
            if (partitionInfo.size() > 1) {
                throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table");
            }
            partitionInfo.put(NON_PARTITIONED_NAME, partitionPath);
            foundNonPartitionedPath = true;
        } else {
            if (foundNonPartitionedPath) {
                throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table");
            }
            partitionInfo.put(partitionName, partitionPath);
        }
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> partitionsFileStatus = getRecordsByKeys(new ArrayList<>(partitionInfo.keySet()), MetadataPartitionType.FILES.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer()));
    Map<String, FileStatus[]> result = new HashMap<>();
    for (Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : partitionsFileStatus) {
        if (entry.getValue().isPresent()) {
            mayBeHandleSpuriousDeletes(entry.getValue(), entry.getKey());
            result.put(partitionInfo.get(entry.getKey()).toString(), entry.getValue().get().getData().getFileStatuses(hadoopConf.get(), partitionInfo.get(entry.getKey())));
        }
    }
    LOG.info("Listed files in partitions from metadata: partition list =" + Arrays.toString(partitionPaths.toArray()));
    return result;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) Option(org.apache.hudi.common.util.Option) Pair(org.apache.hudi.common.util.collection.Pair)
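
To make the mixed-path guard concrete, here is a hypothetical input that would trip it, assuming dataBasePath is /tmp/hoodie/test_table; the paths are made up for illustration.

List<Path> partitionPaths = Arrays.asList(
    new Path("/tmp/hoodie/test_table"),             // relative partition name resolves to "" -> non-partitioned
    new Path("/tmp/hoodie/test_table/2022/01/01")); // a regular partition path
// fetchAllFilesInPartitionPaths(partitionPaths) would throw HoodieMetadataException:
// "Found mix of partitioned and non partitioned paths while fetching data from metadata table"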

Aggregations

HoodieMetadataException (org.apache.hudi.exception.HoodieMetadataException): 10
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 4
IOException (java.io.IOException): 3
Path (org.apache.hadoop.fs.Path): 3
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 3
Pair (org.apache.hudi.common.util.collection.Pair): 3
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 3
ArrayList (java.util.ArrayList): 2
HashMap (java.util.HashMap): 2
LinkedList (java.util.LinkedList): 2
Map (java.util.Map): 2
Properties (java.util.Properties): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
HoodieAvroUtils.getNestedFieldValAsString (org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString): 2
HoodieMetadataColumnStats (org.apache.hudi.avro.model.HoodieMetadataColumnStats): 2
HoodieRestoreMetadata (org.apache.hudi.avro.model.HoodieRestoreMetadata): 2
HoodieRollbackMetadata (org.apache.hudi.avro.model.HoodieRollbackMetadata): 2
WriteStatus (org.apache.hudi.client.WriteStatus): 2
Option (org.apache.hudi.common.util.Option): 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 2