Use of org.apache.orc.Reader.Options in the Apache Hudi project.
The class OrcUtils, method readAvroRecords.
/**
 * NOTE: This literally reads the entire file contents, thus should be used with caution.
 */
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
  List<GenericRecord> records = new ArrayList<>();
  try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) {
    TypeDescription orcSchema = reader.getSchema();
    try (RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema))) {
      OrcReaderIterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
      while (iterator.hasNext()) {
        GenericRecord record = iterator.next();
        records.add(record);
      }
    }
  } catch (IOException io) {
    throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io);
  }
  return records;
}
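Every snippet on this page follows the same core-ORC pattern: open a Reader, then pass a Reader.Options carrying the read schema to Reader#rows to obtain a RecordReader. Below is a minimal, self-contained sketch of that pattern against the plain org.apache.orc API; the file path is an assumption for illustration. The OrcReaderIterator used above evidently wraps this same RecordReader loop and converts each row to an Avro GenericRecord.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class OrcReaderOptionsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path filePath = new Path("/tmp/example.orc"); // assumed path for illustration
    try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))) {
      TypeDescription schema = reader.getSchema();
      // Reader.Options carries the read schema; reader.rows(...) returns a RecordReader over it.
      try (RecordReader rows = reader.rows(new Reader.Options(conf).schema(schema))) {
        VectorizedRowBatch batch = schema.createRowBatch();
        while (rows.nextBatch(batch)) {
          System.out.println("Read a batch of " + batch.size + " rows");
        }
      }
    }
  }
}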
Use of org.apache.orc.Reader.Options in the Apache Hudi project.
The class OrcUtils, method getHoodieKeyIterator.
/**
 * Provides a closable iterator for reading the given ORC file.
 *
 * @param configuration configuration to build fs object
 * @param filePath      The ORC file path
 * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file
 */
@Override
public ClosableIterator<HoodieKey> getHoodieKeyIterator(Configuration configuration, Path filePath) {
  try {
    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
    Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
    TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
    RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
    List<String> fieldNames = orcSchema.getFieldNames();
    // column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields
    int keyCol = -1;
    int partitionCol = -1;
    for (int i = 0; i < fieldNames.size(); i++) {
      if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
        keyCol = i;
      }
      if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
        partitionCol = i;
      }
    }
    if (keyCol == -1 || partitionCol == -1) {
      throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
    }
    return new OrcReaderIterator<>(recordReader, readSchema, orcSchema);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to open reader from ORC file:" + filePath, e);
  }
}
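A hedged usage sketch for this method: the path is illustrative, and it assumes OrcUtils can be instantiated directly and that ClosableIterator extends AutoCloseable (as in Hudi's common utilities), so try-with-resources releases the underlying RecordReader.

// Hypothetical caller; the file path is illustrative, not from the snippet above.
Configuration conf = new Configuration();
Path filePath = new Path("/tmp/hoodie/2021/01/01/some-file.orc");
try (ClosableIterator<HoodieKey> keys = new OrcUtils().getHoodieKeyIterator(conf, filePath)) {
  while (keys.hasNext()) {
    HoodieKey key = keys.next();
    System.out.println(key.getRecordKey() + " in partition " + key.getPartitionPath());
  }
}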
Use of org.apache.orc.Reader.Options in the Apache Hudi project.
The class OrcUtils, method filterRowKeys.
/**
 * Reads the rowKey list matching the given filter from the given ORC file. If the filter is empty,
 * this returns all the rowkeys.
 *
 * @param conf     configuration to build fs object.
 * @param filePath The ORC file path.
 * @param filter   record keys filter
 * @return Set of row keys matching the given filter
 */
@Override
public Set<String> filterRowKeys(Configuration conf, Path filePath, Set<String> filter) throws HoodieIOException {
  try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))) {
    TypeDescription schema = reader.getSchema();
    try (RecordReader recordReader = reader.rows(new Options(conf).schema(schema))) {
      Set<String> filteredRowKeys = new HashSet<>();
      List<String> fieldNames = schema.getFieldNames();
      VectorizedRowBatch batch = schema.createRowBatch();
      // column index for the RECORD_KEY_METADATA_FIELD field
      int colIndex = -1;
      for (int i = 0; i < fieldNames.size(); i++) {
        if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
          colIndex = i;
          break;
        }
      }
      if (colIndex == -1) {
        throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath));
      }
      while (recordReader.nextBatch(batch)) {
        BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex];
        for (int i = 0; i < batch.size; i++) {
          String rowKey = rowKeys.toString(i);
          if (filter.isEmpty() || filter.contains(rowKey)) {
            filteredRowKeys.add(rowKey);
          }
        }
      }
      return filteredRowKeys;
    }
  } catch (IOException io) {
    throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io);
  }
}
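A hedged usage sketch (the candidate keys and path are illustrative, and it again assumes OrcUtils is instantiated directly); note the documented contract that an empty filter returns every row key in the file.

// Hypothetical caller; keys and path are illustrative.
Configuration conf = new Configuration();
Path filePath = new Path("/tmp/hoodie/2021/01/01/some-file.orc");
Set<String> candidates = new HashSet<>(Arrays.asList("key1", "key2", "key3"));
// Returns only the candidates actually present in the file.
Set<String> present = new OrcUtils().filterRowKeys(conf, filePath, candidates);
// An empty filter returns all row keys in the file.
Set<String> allKeys = new OrcUtils().filterRowKeys(conf, filePath, Collections.emptySet());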
Use of org.apache.orc.Reader.Options in the Apache Hudi project.
The class HoodieOrcReader, method getRecordIterator.
@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
  try {
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema);
    RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
    return new OrcReaderIterator<>(recordReader, schema, orcSchema);
  } catch (IOException io) {
    throw new HoodieIOException("Unable to create an ORC reader.", io);
  }
}
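All of the snippets above pass a full read schema to Reader.Options; the same class can also restrict the read to specific columns via include(boolean[]). A minimal sketch of that variant against the core ORC API (the file path and column choice are illustrative):

// Projection sketch: read only the first user column of an assumed file.
Configuration conf = new Configuration();
Path path = new Path("/tmp/example.orc"); // assumed path
try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
  TypeDescription schema = reader.getSchema();
  // include[] is indexed by column id: 0 is the root struct, 1 is its first field.
  boolean[] include = new boolean[schema.getMaximumId() + 1];
  include[0] = true;
  include[1] = true; // project only the first field (illustrative)
  try (RecordReader rows = reader.rows(new Reader.Options(conf).schema(schema).include(include))) {
    VectorizedRowBatch batch = schema.createRowBatch();
    while (rows.nextBatch(batch)) {
      // consume only the projected column(s) from batch.cols
    }
  }
}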