Search in sources :

Example 1 with Options

use of org.apache.orc.Reader.Options in project hudi by apache.

In the class OrcUtils, the method readAvroRecords:

/**
 * Reads every record of the given ORC file into memory as Avro {@link GenericRecord}s.
 *
 * NOTE: This literally reads the entire file contents, thus should be used with caution.
 */
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
    List<GenericRecord> records = new ArrayList<>();
    // Both the file reader and the row reader are AutoCloseable; close the row reader
    // first (inner try), then the file reader.
    try (Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) {
        TypeDescription fileSchema = orcReader.getSchema();
        try (RecordReader rows = orcReader.rows(new Options(configuration).schema(fileSchema))) {
            // Drain the iterator into the result list; equivalent to a hasNext()/next() loop.
            OrcReaderIterator<GenericRecord> avroRecords = new OrcReaderIterator<>(rows, avroSchema, fileSchema);
            avroRecords.forEachRemaining(records::add);
        }
    } catch (IOException io) {
        throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io);
    }
    return records;
}
Also used : Options(org.apache.orc.Reader.Options) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RecordReader(org.apache.orc.RecordReader) ArrayList(java.util.ArrayList) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 2 with Options

use of org.apache.orc.Reader.Options in project hudi by apache.

In the class OrcUtils, the method getHoodieKeyIterator:

/**
 * Provides a closable iterator for reading the given ORC file.
 *
 * @param configuration configuration to build fs object
 * @param filePath      The ORC file path
 * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file
 */
@Override
public ClosableIterator<HoodieKey> getHoodieKeyIterator(Configuration configuration, Path filePath) {
    try {
        Configuration conf = new Configuration(configuration);
        conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
        Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
        Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
        TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
        // Validate the required meta columns BEFORE opening a RecordReader: the field
        // names come from orcSchema (derived from the Avro read schema), so the check
        // does not need the row reader, and doing it first avoids leaking an open
        // RecordReader when the validation below throws.
        List<String> fieldNames = orcSchema.getFieldNames();
        // column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields
        int keyCol = -1;
        int partitionCol = -1;
        for (int i = 0; i < fieldNames.size(); i++) {
            if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
                keyCol = i;
            }
            if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
                partitionCol = i;
            }
        }
        if (keyCol == -1 || partitionCol == -1) {
            try {
                // Best-effort cleanup of the file reader before reporting the real error.
                reader.close();
            } catch (IOException ignored) {
                // The missing-column failure below is the primary error to surface.
            }
            throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
        }
        RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
        // NOTE(review): the returned iterator owns recordReader; the enclosing Reader is
        // never closed on the success path (pre-existing behavior) — confirm whether
        // OrcReaderIterator's close is sufficient to release the underlying file handle.
        return new OrcReaderIterator<>(recordReader, readSchema, orcSchema);
    } catch (IOException e) {
        throw new HoodieIOException("Failed to open reader from ORC file:" + filePath, e);
    }
}
Also used : Options(org.apache.orc.Reader.Options) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypeDescription(org.apache.orc.TypeDescription)

Example 3 with Options

use of org.apache.orc.Reader.Options in project hudi by apache.

In the class OrcUtils, the method filterRowKeys:

/**
 * Read the rowKey list matching the given filter, from the given ORC file. If the filter is empty, then this will
 * return all the rowkeys.
 *
 * @param conf     configuration to build fs object.
 * @param filePath The ORC file path.
 * @param filter   record keys filter
 * @return Set of row keys matching candidateRecordKeys
 * @throws HoodieIOException if the ORC file cannot be opened or read
 */
@Override
public Set<String> filterRowKeys(Configuration conf, Path filePath, Set<String> filter) throws HoodieIOException {
    try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))) {
        TypeDescription schema = reader.getSchema();
        try (RecordReader recordReader = reader.rows(new Options(conf).schema(schema))) {
            Set<String> filteredRowKeys = new HashSet<>();
            List<String> fieldNames = schema.getFieldNames();
            VectorizedRowBatch batch = schema.createRowBatch();
            // column index for the RECORD_KEY_METADATA_FIELD field
            int colIndex = -1;
            for (int i = 0; i < fieldNames.size(); i++) {
                if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
                    colIndex = i;
                    break;
                }
            }
            if (colIndex == -1) {
                throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath));
            }
            // An empty filter means "accept every key"; this is loop-invariant, so hoist
            // the check out of the per-row loop instead of re-evaluating it each row.
            boolean acceptAll = filter.isEmpty();
            while (recordReader.nextBatch(batch)) {
                BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex];
                for (int i = 0; i < batch.size; i++) {
                    String rowKey = rowKeys.toString(i);
                    if (acceptAll || filter.contains(rowKey)) {
                        filteredRowKeys.add(rowKey);
                    }
                }
            }
            return filteredRowKeys;
        }
    } catch (IOException io) {
        throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io);
    }
}
Also used : Options(org.apache.orc.Reader.Options) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) HoodieIOException(org.apache.hudi.exception.HoodieIOException) BytesColumnVector(org.apache.orc.storage.ql.exec.vector.BytesColumnVector) TypeDescription(org.apache.orc.TypeDescription) HashSet(java.util.HashSet)

Example 4 with Options

use of org.apache.orc.Reader.Options in project hudi by apache.

In the class HoodieOrcReader, the method getRecordIterator:

/**
 * Opens the underlying ORC file and returns an iterator over its records, read with
 * the ORC schema derived from the given Avro schema.
 *
 * @param schema Avro schema describing the records to read
 * @return iterator over the file's records
 * @throws IOException declared for interface compatibility; read failures are actually
 *                     wrapped in {@link HoodieIOException}
 */
@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
    try {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema);
        RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
        // Use the diamond operator instead of the raw OrcReaderIterator type, so the
        // result is type-checked as Iterator<R> rather than emitting an unchecked warning.
        return new OrcReaderIterator<>(recordReader, schema, orcSchema);
    } catch (IOException io) {
        throw new HoodieIOException("Unable to create an ORC reader.", io);
    }
}
Also used : Options(org.apache.orc.Reader.Options) HoodieIOException(org.apache.hudi.exception.HoodieIOException) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) RecordReader(org.apache.orc.RecordReader) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Aggregations

IOException (java.io.IOException)4 HoodieIOException (org.apache.hudi.exception.HoodieIOException)4 Reader (org.apache.orc.Reader)4 Options (org.apache.orc.Reader.Options)4 RecordReader (org.apache.orc.RecordReader)4 TypeDescription (org.apache.orc.TypeDescription)4 HoodieException (org.apache.hudi.exception.HoodieException)2 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 Schema (org.apache.avro.Schema)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 Configuration (org.apache.hadoop.conf.Configuration)1 OrcReaderIterator (org.apache.hudi.common.util.OrcReaderIterator)1 BytesColumnVector (org.apache.orc.storage.ql.exec.vector.BytesColumnVector)1 VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch)1