
Example 21 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

The class OrcUtils, method readAvroSchema.

@Override
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
    try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) {
        if (reader.hasMetadataValue("orc.avro.schema")) {
            ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema");
            byte[] bytes = new byte[metadataValue.remaining()];
            metadataValue.get(bytes);
            return new Schema.Parser().parse(new String(bytes));
        } else {
            TypeDescription orcSchema = reader.getSchema();
            return AvroOrcUtils.createAvroSchema(orcSchema);
        }
    } catch (IOException io) {
        throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
    }
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) Schema(org.apache.avro.Schema) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer)
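
Note that the method first looks for the "orc.avro.schema" metadata key and only falls back to converting the native ORC TypeDescription when that key is absent. A minimal caller sketch (the no-arg OrcUtils construction and the file path are assumptions for illustration, not taken from the example above):

// Hypothetical usage: read the Avro schema embedded in (or derived from) an ORC file.
Configuration conf = new Configuration();
Path orcFilePath = new Path("/tmp/hudi/part-0001.orc"); // assumed path
Schema avroSchema = new OrcUtils().readAvroSchema(conf, orcFilePath);
System.out.println(avroSchema.toString(true)); // pretty-printed schema JSON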

Example 22 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

The class OrcUtils, method readAvroRecords.

/**
 * NOTE: This literally reads the entire file contents, thus should be used with caution.
 */
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
    List<GenericRecord> records = new ArrayList<>();
    try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) {
        TypeDescription orcSchema = reader.getSchema();
        try (RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema))) {
            OrcReaderIterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
            while (iterator.hasNext()) {
                GenericRecord record = iterator.next();
                records.add(record);
            }
        }
    } catch (IOException io) {
        throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io);
    }
    return records;
}
Also used : Options(org.apache.orc.Reader.Options) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RecordReader(org.apache.orc.RecordReader) ArrayList(java.util.ArrayList) Reader(org.apache.orc.Reader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord)
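
Given the Javadoc's caution, this belongs on small files only. A hedged call-site sketch (the path and the OrcUtils wiring are assumptions):

// Hypothetical usage: materialize every record of a small ORC file in memory.
Configuration conf = new Configuration();
Path filePath = new Path("/tmp/hudi/part-0001.orc"); // assumed path
OrcUtils orcUtils = new OrcUtils(); // illustrative wiring
Schema avroSchema = orcUtils.readAvroSchema(conf, filePath);
List<GenericRecord> records = orcUtils.readAvroRecords(conf, filePath, avroSchema);
System.out.println("read " + records.size() + " records");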

Example 23 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

The class OrcUtils, method getHoodieKeyIterator.

/**
 * Provides a closable iterator for reading the given ORC file.
 *
 * @param configuration configuration to build fs object
 * @param filePath      The ORC file path
 * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file
 */
@Override
public ClosableIterator<HoodieKey> getHoodieKeyIterator(Configuration configuration, Path filePath) {
    try {
        Configuration conf = new Configuration(configuration);
        conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
        Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
        Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
        TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
        RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
        List<String> fieldNames = orcSchema.getFieldNames();
        // column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields
        int keyCol = -1;
        int partitionCol = -1;
        for (int i = 0; i < fieldNames.size(); i++) {
            if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
                keyCol = i;
            }
            if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
                partitionCol = i;
            }
        }
        if (keyCol == -1 || partitionCol == -1) {
            throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
        }
        return new OrcReaderIterator<>(recordReader, readSchema, orcSchema);
    } catch (IOException e) {
        throw new HoodieIOException("Failed to open reader from ORC file:" + filePath, e);
    }
}
Also used : Options(org.apache.orc.Reader.Options) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypeDescription(org.apache.orc.TypeDescription)
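
Because ClosableIterator extends AutoCloseable, a try-with-resources call site releases the underlying RecordReader deterministically. A sketch (the path is an assumption):

// Hypothetical usage: stream only the record-key/partition-path columns of an ORC file.
Configuration conf = new Configuration();
Path filePath = new Path("/tmp/hudi/part-0001.orc"); // assumed path
try (ClosableIterator<HoodieKey> keys = new OrcUtils().getHoodieKeyIterator(conf, filePath)) {
    while (keys.hasNext()) {
        HoodieKey key = keys.next();
        System.out.println(key.getPartitionPath() + "/" + key.getRecordKey());
    }
}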

Example 24 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

The class OrcUtils, method readFooter.

@Override
public Map<String, String> readFooter(Configuration conf, boolean required, Path orcFilePath, String... footerNames) {
    try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) {
        Map<String, String> footerVals = new HashMap<>();
        List<UserMetadataItem> metadataItemList = reader.getFileTail().getFooter().getMetadataList();
        Map<String, String> metadata = metadataItemList.stream()
                .collect(Collectors.toMap(UserMetadataItem::getName, metadataItem -> metadataItem.getValue().toStringUtf8()));
        for (String footerName : footerNames) {
            if (metadata.containsKey(footerName)) {
                footerVals.put(footerName, metadata.get(footerName));
            } else if (required) {
                throw new MetadataNotFoundException("Could not find index in ORC footer. Looked for key " + footerName + " in " + orcFilePath);
            }
        }
        return footerVals;
    } catch (IOException io) {
        throw new HoodieIOException("Unable to read footer for ORC file:" + orcFilePath, io);
    }
}
Also used : HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) ByteBuffer(java.nio.ByteBuffer) OrcFile(org.apache.orc.OrcFile) ArrayList(java.util.ArrayList) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) HashSet(java.util.HashSet) Reader(org.apache.orc.Reader) Options(org.apache.orc.Reader.Options) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) RecordReader(org.apache.orc.RecordReader) Set(java.util.Set) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) BytesColumnVector(org.apache.orc.storage.ql.exec.vector.BytesColumnVector) Collectors(java.util.stream.Collectors) List(java.util.List) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) UserMetadataItem(org.apache.orc.OrcProto.UserMetadataItem)
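
Since required is honored per footer name, passing false makes missing entries simply absent from the returned map rather than raising MetadataNotFoundException. A sketch with a hypothetical footer key (not a real Hudi footer name):

// Hypothetical usage: look up an optional user-metadata footer entry.
Configuration conf = new Configuration();
Path orcFilePath = new Path("/tmp/hudi/part-0001.orc"); // assumed path
Map<String, String> footers =
        new OrcUtils().readFooter(conf, false, orcFilePath, "my.custom.footer"); // key is hypothetical
System.out.println(footers.getOrDefault("my.custom.footer", "<not present>"));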

Example 25 with HoodieIOException

Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.

The class ParquetUtils, method filterParquetRowKeys.

/**
 * Read the row keys matching the given filter from the given Parquet file. If the filter is empty,
 * all row keys are returned.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @param filter        record keys filter
 * @param readSchema    schema of columns to be read
 * @return Set of row keys matching the given filter
 */
private static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath, Set<String> filter, Schema readSchema) {
    Option<RecordKeysFilterFunction> filterFunction = Option.empty();
    if (filter != null && !filter.isEmpty()) {
        filterFunction = Option.of(new RecordKeysFilterFunction(filter));
    }
    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    Set<String> rowKeys = new HashSet<>();
    try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
        Object obj = reader.read();
        while (obj != null) {
            if (obj instanceof GenericRecord) {
                String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
                if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
                    rowKeys.add(recordKey);
                }
            }
            obj = reader.read();
        }
    } catch (IOException e) {
        throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
    }
    return rowKeys;
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) Configuration(org.apache.hadoop.conf.Configuration) ParquetReader(org.apache.parquet.hadoop.ParquetReader) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) HashSet(java.util.HashSet)
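
filterParquetRowKeys is private, so callers presumably reach it through ParquetUtils' public entry points. A standalone sketch of the same read-and-filter pattern (the path and candidate keys are assumptions, and the schema projection from the original is omitted for brevity):

// Hypothetical standalone version of the read-and-filter loop above.
Configuration conf = new Configuration();
Path parquetPath = new Path("/tmp/hudi/part-0001.parquet"); // assumed path
Set<String> candidateKeys = new HashSet<>(Arrays.asList("key-1", "key-2")); // assumed keys
Set<String> matched = new HashSet<>();
try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(parquetPath).withConf(conf).build()) {
    GenericRecord record;
    while ((record = reader.read()) != null) {
        String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        if (candidateKeys.contains(key)) {
            matched.add(key);
        }
    }
} catch (IOException e) {
    throw new HoodieIOException("Failed to read row keys from Parquet " + parquetPath, e);
}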

Aggregations

HoodieIOException (org.apache.hudi.exception.HoodieIOException): 139
IOException (java.io.IOException): 127
Path (org.apache.hadoop.fs.Path): 45
List (java.util.List): 31
ArrayList (java.util.ArrayList): 30
Option (org.apache.hudi.common.util.Option): 27
Collectors (java.util.stream.Collectors): 26
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 26
Pair (org.apache.hudi.common.util.collection.Pair): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 21
FileSystem (org.apache.hadoop.fs.FileSystem): 20
GenericRecord (org.apache.avro.generic.GenericRecord): 19
HashSet (java.util.HashSet): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 18
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 18
Set (java.util.Set): 17
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 17
HoodieException (org.apache.hudi.exception.HoodieException): 17