Example 31 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

From the class TestIncrementalFSViewSync, method testMultipleReplaceSteps.

private void testMultipleReplaceSteps(Map<String, List<String>> instantsToFiles, SyncableFileSystemView view, List<String> instants, int initialExpectedSlicesPerPartition) {
    int expectedSlicesPerPartition = initialExpectedSlicesPerPartition;
    for (int i = 0; i < instants.size(); i++) {
        try {
            generateReplaceInstant(instants.get(i), instantsToFiles);
            view.sync();
            metaClient.reloadActiveTimeline();
            SyncableFileSystemView newView = getFileSystemView(metaClient);
            // each replace adds fileIdsPerPartition.size() new slices per partition and
            // replaces 1 existing fileId per partition, hence the net change of size - 1
            expectedSlicesPerPartition = expectedSlicesPerPartition + fileIdsPerPartition.size() - 1;
            areViewsConsistent(view, newView, expectedSlicesPerPartition * partitions.size());
        } catch (IOException e) {
            throw new HoodieIOException("unable to test replace", e);
        }
    }
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) IOException(java.io.IOException)
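
To make the expected-slice arithmetic concrete, a minimal sketch; the counts (2 partitions, 5 file IDs per partition, 5 initial slices) are illustrative assumptions, not the test's actual fixture:

int expectedSlicesPerPartition = 5;   // assumed initial slices per partition
int fileIdsPerPartitionSize = 5;      // assumed fileIdsPerPartition.size()
int partitionCount = 2;               // assumed partitions.size()
// one replace step: +5 new slices, -1 replaced file group, per partition
expectedSlicesPerPartition = expectedSlicesPerPartition + fileIdsPerPartitionSize - 1;  // 9
int totalExpectedSlices = expectedSlicesPerPartition * partitionCount;                  // 18, the value passed to areViewsConsistent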

Example 32 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

From the class HoodieInputFormatUtils, method refreshFileStatus.

/**
 * Checks the file status for a race condition which can set the file size to 0:
 * 1. HiveInputFormat does super.listStatus() and gets back a FileStatus[].
 * 2. Then it creates the HoodieTableMetaClient for the paths listed.
 * 3. Generation of splits looks at FileStatus size to create splits, which skips this file.
 *
 * @param conf the Hadoop configuration, used to resolve the file system for the path
 * @param dataFile the base file whose listed status may be stale
 * @return the base file, with its FileStatus refreshed if the listed size was 0
 */
private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFile dataFile) {
    Path dataPath = dataFile.getFileStatus().getPath();
    try {
        if (dataFile.getFileSize() == 0) {
            FileSystem fs = dataPath.getFileSystem(conf);
            LOG.info("Refreshing file status " + dataFile.getPath());
            return new HoodieBaseFile(fs.getFileStatus(dataPath), dataFile.getBootstrapBaseFile().orElse(null));
        }
        return dataFile;
    } catch (IOException e) {
        throw new HoodieIOException("Could not get FileStatus on path " + dataPath);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieIOException(org.apache.hudi.exception.HoodieIOException) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException)
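
A minimal sketch of how this refresh might be applied across a listing, assuming a hypothetical baseFiles list, the usual java.util.stream imports, and a caller inside the same class (the method is private):

List<HoodieBaseFile> refreshed = baseFiles.stream()
        .map(f -> refreshFileStatus(conf, f))   // re-stats only files whose listed size was 0
        .collect(Collectors.toList());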

Example 33 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

From the class HoodieInputFormatUtils, method getInputFormat.

public static FileInputFormat getInputFormat(HoodieFileFormat baseFileFormat, boolean realtime, Configuration conf) {
    switch(baseFileFormat) {
        case PARQUET:
            if (realtime) {
                HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat();
                inputFormat.setConf(conf);
                return inputFormat;
            } else {
                HoodieParquetInputFormat inputFormat = new HoodieParquetInputFormat();
                inputFormat.setConf(conf);
                return inputFormat;
            }
        case HFILE:
            if (realtime) {
                HoodieHFileRealtimeInputFormat inputFormat = new HoodieHFileRealtimeInputFormat();
                inputFormat.setConf(conf);
                return inputFormat;
            } else {
                HoodieHFileInputFormat inputFormat = new HoodieHFileInputFormat();
                inputFormat.setConf(conf);
                return inputFormat;
            }
        default:
            throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat);
    }
}
Also used : HoodieHFileInputFormat(org.apache.hudi.hadoop.HoodieHFileInputFormat) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieHFileRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieHFileRealtimeInputFormat) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat)
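
A quick usage sketch of the factory above; the enum constants PARQUET and HFILE come straight from the switch, and the variable names are illustrative:

Configuration conf = new Configuration();
// realtime = true selects the merge-on-read (realtime) variant of the input format
FileInputFormat realtimeParquet = HoodieInputFormatUtils.getInputFormat(HoodieFileFormat.PARQUET, true, conf);
FileInputFormat snapshotHFile = HoodieInputFormatUtils.getInputFormat(HoodieFileFormat.HFILE, false, conf);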

Example 34 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

From the class DFSTestSuitePathSelector, method getNextFilePathsAndMaxModificationTime.

@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
    Integer lastBatchId;
    Integer nextBatchId;
    try {
        if (lastCheckpointStr.isPresent()) {
            lastBatchId = Integer.parseInt(lastCheckpointStr.get());
            nextBatchId = lastBatchId + 1;
        } else {
            lastBatchId = 0;
            nextBatchId = 1;
        }
        // obtain all eligible files for the batch
        List<FileStatus> eligibleFiles = new ArrayList<>();
        FileStatus[] fileStatuses = fs.globStatus(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
        // Say the input data is input/1, input/2, input/5, because batches 3 and 4 were
        // rolled back and 5 is newly generated data. The checkpoint from the latest commit
        // metadata will then be 2, so the next batch id must be corrected to 5 instead of 3.
        Option<String> correctBatchIdDueToRollback = Option.fromJavaOptional(Arrays.stream(fileStatuses)
                .map(f -> f.getPath().getName())
                .filter(name -> Integer.parseInt(name) > lastBatchId)
                // pick the smallest batch id newer than the checkpoint (a numeric-value
                // comparator; the original lambda returned Integer.min, which is not a valid Comparator)
                .min(Comparator.comparingInt(Integer::parseInt)));
        if (correctBatchIdDueToRollback.isPresent() && Integer.parseInt(correctBatchIdDueToRollback.get()) > nextBatchId) {
            nextBatchId = Integer.parseInt(correctBatchIdDueToRollback.get());
        }
        log.info("Using DFSTestSuitePathSelector, checkpoint: " + lastCheckpointStr + " sourceLimit: " + sourceLimit + " lastBatchId: " + lastBatchId + " nextBatchId: " + nextBatchId);
        for (FileStatus fileStatus : fileStatuses) {
            if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
                continue;
            } else if (Integer.parseInt(fileStatus.getPath().getName()) > lastBatchId && Integer.parseInt(fileStatus.getPath().getName()) <= nextBatchId) {
                RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
                while (files.hasNext()) {
                    eligibleFiles.add(files.next());
                }
            }
        }
        // no data to read
        if (eligibleFiles.isEmpty()) {
            return new ImmutablePair<>(Option.empty(), lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
        }
        // read the eligible files out as a comma-separated path list
        String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
        return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to readAvro from source from checkpoint: " + lastCheckpointStr, ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) Comparator(java.util.Comparator) HoodieTestSuiteJob(org.apache.hudi.integ.testsuite.HoodieTestSuiteJob) Logger(org.slf4j.Logger) TypedProperties(org.apache.hudi.common.config.TypedProperties) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) List(java.util.List) DFSPathSelector(org.apache.hudi.utilities.sources.helpers.DFSPathSelector) Configuration(org.apache.hadoop.conf.Configuration) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair)
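
To see why the rollback correction matters, a small worked example using the scenario from the code comment (directories input/1, input/2, input/5 on disk, checkpoint 2); the variables are illustrative stand-ins for the method's state:

int lastBatchId = 2;
int nextBatchId = lastBatchId + 1;             // 3, but input/3 and input/4 were rolled back
int[] batchDirs = {1, 2, 5};                   // batch directory names found on DFS
int smallestNewer = Arrays.stream(batchDirs)
        .filter(b -> b > lastBatchId)          // only batches newer than the checkpoint
        .min()
        .orElse(nextBatchId);
if (smallestNewer > nextBatchId) {
    nextBatchId = smallestNewer;               // corrected to 5, skipping the rolled-back ids
}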

Example 35 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

From the class HoodieMergeOnReadTableInputFormat, method createRealtimeFileStatusUnchecked.

/**
 * Creates {@link RealtimeFileStatus} for the file-slice where base file is present
 */
private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieBaseFile baseFile, Stream<HoodieLogFile> logFiles, String basePath, Option<HoodieInstant> latestCompletedInstantOpt, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
    FileStatus baseFileStatus = getFileStatusUnchecked(baseFile);
    List<HoodieLogFile> sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
    try {
        RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(baseFileStatus, basePath, sortedLogFiles, false, virtualKeyInfoOpt);
        if (latestCompletedInstantOpt.isPresent()) {
            HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get();
            checkState(latestCompletedInstant.isCompleted());
            rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp());
        }
        if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
            rtFileStatus.setBootStrapFileStatus(baseFileStatus);
        }
        return rtFileStatus;
    } catch (IOException e) {
        throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
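
All five examples share the same error-handling shape: perform checked I/O, then rethrow any IOException as the unchecked HoodieIOException with a context message and the cause chained. A minimal sketch of that pattern, assuming a hypothetical doIo() helper that throws IOException:

try {
    doIo(); // hypothetical I/O call that throws java.io.IOException
} catch (IOException e) {
    // chain the cause so the original stack trace is preserved
    throw new HoodieIOException("describe what failed and on which path", e);
}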

Aggregations

HoodieIOException (org.apache.hudi.exception.HoodieIOException): 139 usages
IOException (java.io.IOException): 127
Path (org.apache.hadoop.fs.Path): 45
List (java.util.List): 31
ArrayList (java.util.ArrayList): 30
Option (org.apache.hudi.common.util.Option): 27
Collectors (java.util.stream.Collectors): 26
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 26
Pair (org.apache.hudi.common.util.collection.Pair): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 21
FileSystem (org.apache.hadoop.fs.FileSystem): 20
GenericRecord (org.apache.avro.generic.GenericRecord): 19
HashSet (java.util.HashSet): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 18
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 18
Set (java.util.Set): 17
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 17
HoodieException (org.apache.hudi.exception.HoodieException): 17