Example 41 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class HoodieTestDataGenerator, the method genPseudoRandomUUID:

private static UUID genPseudoRandomUUID(Random r) {
    byte[] bytes = new byte[16];
    r.nextBytes(bytes);
    // Force the version nibble in byte 6 to 4 (random UUID, per RFC 4122).
    bytes[6] &= 0x0f;
    bytes[6] |= 0x40;
    // Force the two variant bits in byte 8 to 10 (IETF variant).
    bytes[8] &= 0x3f;
    bytes[8] |= 0x80;
    try {
        // java.util.UUID's byte[] constructor is private, so reach it via reflection.
        Constructor<UUID> ctor = UUID.class.getDeclaredConstructor(byte[].class);
        ctor.setAccessible(true);
        return ctor.newInstance((Object) bytes);
    } catch (InvocationTargetException | InstantiationException | IllegalAccessException | NoSuchMethodException e) {
        logger.info("Failed to generate pseudo-random UUID!");
        throw new HoodieException(e);
    }
}
Also used : HoodieException(org.apache.hudi.exception.HoodieException) UUID(java.util.UUID) InvocationTargetException(java.lang.reflect.InvocationTargetException)
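
For comparison, here is a minimal sketch of the same version-4 bit-twiddling that avoids reflection by packing the 16 bytes into the public UUID(long, long) constructor via java.nio.ByteBuffer (the method name pseudoRandomUuid is hypothetical, not part of Hudi; it additionally needs an import of java.nio.ByteBuffer):

private static UUID pseudoRandomUuid(Random r) {
    byte[] bytes = new byte[16];
    r.nextBytes(bytes);
    // Same masking as above: version 4 in byte 6, IETF variant in byte 8.
    bytes[6] &= 0x0f;
    bytes[6] |= 0x40;
    bytes[8] &= 0x3f;
    bytes[8] |= 0x80;
    ByteBuffer buf = ByteBuffer.wrap(bytes);
    // UUID(long mostSigBits, long leastSigBits) is public, so no setAccessible call is needed.
    // Arguments evaluate left to right: the first getLong() reads bytes 0-7, the second bytes 8-15.
    return new UUID(buf.getLong(), buf.getLong());
}

Seeding the Random (for example, new Random(42L)) makes the generated UUIDs reproducible across runs, which is presumably why the generator takes a Random rather than calling UUID.randomUUID().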

Example 42 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class FileCreateUtils, the method getBaseFileCountsForPaths:

/**
 * Find the total number of base files for the passed-in paths.
 */
public static Map<String, Long> getBaseFileCountsForPaths(String basePath, FileSystem fs, String... paths) {
    Map<String, Long> toReturn = new HashMap<>();
    try {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(fs.getConf())
            .setBasePath(basePath)
            .setLoadActiveTimelineOnLoad(true)
            .build();
        for (String path : paths) {
            // Build a base-file-only view over the completed commits for the files matching this glob.
            TableFileSystemView.BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient,
                metaClient.getCommitsTimeline().filterCompletedInstants(),
                fs.globStatus(new org.apache.hadoop.fs.Path(path)));
            toReturn.put(path, fileSystemView.getLatestBaseFiles().count());
        }
        return toReturn;
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie table as a dataframe", e);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView)
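
A hedged usage sketch; the partition globs are illustrative, and FSUtils.getFs is Hudi's convenience wrapper around Hadoop's FileSystem.get:

Configuration hadoopConf = new Configuration();
FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
// Count the latest base files under two (assumed) date partitions.
Map<String, Long> counts = FileCreateUtils.getBaseFileCountsForPaths(
    basePath, fs, basePath + "/2016/03/15/*", basePath + "/2016/03/16/*");
counts.forEach((path, n) -> System.out.println(path + " -> " + n + " base file(s)"));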

Example 43 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class TestHoodieRealtimeRecordReader, the method testLogOnlyReader:

@Test
public void testLogOnlyReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ);
    FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
    FileSlice fileSlice = new FileSlice("default", baseInstant, "fileid1");
    try {
        // update files or generate new log file
        int logVersion = 1;
        int baseInstantTs = Integer.parseInt(baseInstant);
        String instantTime = String.valueOf(baseInstantTs + logVersion);
        HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", baseInstant, instantTime, 100, 0, logVersion);
        long size = writer.getCurrentSize();
        writer.close();
        assertTrue(size > 0, "block - size should be > 0");
        HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
        FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
        // create a split with new log file(s)
        fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
        RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(
                new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()),
                basePath.toString(), fileSlice.getLogFiles().collect(Collectors.toList()), false, Option.empty());
        realtimeFileStatus.setMaxCommitTime(instantTime);
        HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath();
        HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] { "" }), realtimePath);
        JobConf newJobConf = new JobConf(baseJobConf);
        List<Schema.Field> fields = schema.getFields();
        setHiveColumnNameProps(fields, newJobConf, false);
        // create a dummy RecordReader to be used by HoodieRealtimeRecordReader
        RecordReader<NullWritable, ArrayWritable> reader = new HoodieRealtimeRecordReader(split, newJobConf, new HoodieEmptyRecordReader(split, newJobConf));
        // use reader to read log file.
        NullWritable key = reader.createKey();
        ArrayWritable value = reader.createValue();
        while (reader.next(key, value)) {
            Writable[] values = value.get();
            assertEquals(instantTime, values[0].toString());
            key = reader.createKey();
            value = reader.createValue();
        }
        reader.close();
    } catch (Exception e) {
        throw new HoodieException(e.getMessage(), e);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) FileSlice(org.apache.hudi.common.model.FileSlice) Schema(org.apache.avro.Schema) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) HoodieException(org.apache.hudi.exception.HoodieException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Field(org.apache.avro.Schema.Field) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) JobConf(org.apache.hadoop.mapred.JobConf) IOException(java.io.IOException) File(java.io.File) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
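
The test leans on a helper, setHiveColumnNameProps, to expose the Avro fields as Hive columns. A minimal sketch of what such a helper typically configures, using the standard Hive column-projection keys from org.apache.hadoop.hive.serde2.ColumnProjectionUtils (this is an assumption about the helper, not its actual body; it also needs an import of java.util.stream.IntStream):

static void setColumnProjection(List<Schema.Field> fields, JobConf jobConf) {
    // Hive readers expect comma-separated column names plus matching 0-based column ids.
    String names = fields.stream().map(Schema.Field::name).collect(Collectors.joining(","));
    String ids = IntStream.range(0, fields.size()).mapToObj(String::valueOf).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ids);
}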

Example 44 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class HoodieRealtimeInputFormatUtils, the method groupLogsByBaseFile:

// Return each base (parquet) file paired with the list of log files in its file group.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
    Set<Path> partitionSet = new HashSet<>(partitionPaths);
    // TODO(vc): Should we handle also non-hoodie splits here?
    Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
    // Get all the base-file/log-files pairs in the required partition paths.
    List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
    partitionSet.forEach(partitionPath -> {
        // for each partition path obtain the data & log file groupings, then map back to inputsplits
        HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
        HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
        String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
        try {
            // Both commit and delta-commits are included - pick the latest completed one
            Option<HoodieInstant> latestCompletedInstant = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
            Stream<FileSlice> latestFileSlices = latestCompletedInstant.map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())).orElse(Stream.empty());
            latestFileSlices.forEach(fileSlice -> {
                List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
                baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
            });
        } catch (Exception e) {
            throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
        }
    });
    return baseAndLogsList;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRealtimeBootstrapBaseFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit) Logger(org.apache.log4j.Logger) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) RealtimeSplit(org.apache.hudi.hadoop.realtime.RealtimeSplit) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) TypeUtils.unsafeCast(org.apache.hudi.TypeUtils.unsafeCast) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Set(java.util.Set) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
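
A hedged usage sketch, assuming a Hadoop Configuration conf and the table basePath are in scope; the partition path is illustrative:

List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groups =
    HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, Arrays.asList(new Path(basePath + "/2016/03/15")));
for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> group : groups) {
    // The base file is wrapped in Option: a log-only file group has delta logs but no parquet file yet.
    String base = group.getLeft().map(HoodieBaseFile::getPath).orElse("<log-only file group>");
    System.out.println(base + " -> " + group.getRight().size() + " log file(s)");
}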

Example 45 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class HoodieRealtimeRecordReaderUtils, the method generateProjectionSchema:

/**
 * Generate a reader schema off the provided writeSchema, to just project out the provided columns.
 */
public static Schema generateProjectionSchema(Schema writeSchema, Map<String, Schema.Field> schemaFieldsMap, List<String> fieldNames) {
    /**
     * Avro and Presto field names seem to be case-sensitive (they support fields differing only in case), whereas
     * Hive/Impala/SparkSQL (by default) are case-insensitive. Spark allows this to be configured via
     * spark.sql.caseSensitive=true.
     *
     * For an RT table with no delta files (for the latest file slice), we translate the parquet schema to Avro here,
     * so field-name case depends on the parquet schema. Hive (1.x/2.x/CDH) translates column projections to
     * lower case.
     */
    List<Schema.Field> projectedFields = new ArrayList<>();
    for (String fn : fieldNames) {
        Schema.Field field = schemaFieldsMap.get(fn.toLowerCase());
        if (field == null) {
            throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! " + "Derived Schema Fields: " + new ArrayList<>(schemaFieldsMap.keySet()));
        } else {
            projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()));
        }
    }
    Schema projectedSchema = Schema.createRecord(writeSchema.getName(), writeSchema.getDoc(), writeSchema.getNamespace(), writeSchema.isError());
    projectedSchema.setFields(projectedFields);
    return projectedSchema;
}
Also used : Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException)
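
A hedged usage sketch, assuming writeSchema is in scope; the projected field names are illustrative (_hoodie_record_key is one of Hudi's standard metadata columns, "rider" is a hypothetical data column):

// Key the lookup map by lower-cased name, since the method lower-cases each requested field before lookup.
Map<String, Schema.Field> fieldsByLowerName = writeSchema.getFields().stream()
    .collect(Collectors.toMap(f -> f.name().toLowerCase(), f -> f));
Schema projected = HoodieRealtimeRecordReaderUtils.generateProjectionSchema(
    writeSchema, fieldsByLowerName, Arrays.asList("_hoodie_record_key", "rider"));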

Aggregations

HoodieException (org.apache.hudi.exception.HoodieException): 171
IOException (java.io.IOException): 87
Path (org.apache.hadoop.fs.Path): 45
Schema (org.apache.avro.Schema): 35
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 35
List (java.util.List): 30
ArrayList (java.util.ArrayList): 27
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23
Collectors (java.util.stream.Collectors): 21
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 19
Option (org.apache.hudi.common.util.Option): 19
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 18
Map (java.util.Map): 16
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 16
GenericRecord (org.apache.avro.generic.GenericRecord): 15
Arrays (java.util.Arrays): 14
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 14
Logger (org.apache.log4j.Logger): 14
FileStatus (org.apache.hadoop.fs.FileStatus): 13
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 13