Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache: class OrcUtils, method readAvroSchema().
@Override
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
  try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) {
    if (reader.hasMetadataValue("orc.avro.schema")) {
      // The Avro schema was embedded in the ORC user metadata at write time.
      ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema");
      byte[] bytes = new byte[metadataValue.remaining()];
      metadataValue.get(bytes);
      return new Schema.Parser().parse(new String(bytes));
    } else {
      // No embedded Avro schema: derive one from the native ORC type description.
      TypeDescription orcSchema = reader.getSchema();
      return AvroOrcUtils.createAvroSchema(orcSchema);
    }
  } catch (IOException io) {
    throw new HoodieIOException("Unable to get Avro schema for ORC file: " + orcFilePath, io);
  }
}
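For context, here is a minimal standalone sketch of the same technique, reading the "orc.avro.schema" user-metadata entry with the plain ORC reader API used above. The class name OrcAvroSchemaPeek and the use of args[0] as the file path are illustrative assumptions, not part of Hudi:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

// Illustrative class: prints the Avro schema embedded in an ORC file's user metadata.
public class OrcAvroSchemaPeek {
  public static void main(String[] args) throws Exception {
    Path orcFilePath = new Path(args[0]); // assumption: first argument is the ORC file path
    try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(new Configuration()))) {
      if (reader.hasMetadataValue("orc.avro.schema")) {
        // The writer stored the Avro schema as a UTF-8 string in the file footer.
        ByteBuffer value = reader.getMetadataValue("orc.avro.schema");
        byte[] bytes = new byte[value.remaining()];
        value.get(bytes);
        System.out.println(new String(bytes, StandardCharsets.UTF_8));
      } else {
        // No embedded Avro schema; fall back to the native ORC type description.
        System.out.println(reader.getSchema().toString());
      }
    }
  }
}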
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache: class OrcUtils, method readAvroRecords().
/**
 * NOTE: This reads the entire file contents into memory, so it should be used with caution.
 */
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
  List<GenericRecord> records = new ArrayList<>();
  try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) {
    TypeDescription orcSchema = reader.getSchema();
    try (RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema))) {
      // Iterate the ORC rows, converting each one to an Avro GenericRecord.
      OrcReaderIterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
      while (iterator.hasNext()) {
        records.add(iterator.next());
      }
    }
  } catch (IOException io) {
    throw new HoodieIOException("Unable to create an ORC reader for ORC file: " + filePath, io);
  }
  return records;
}
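A hedged usage sketch for the two methods above. It assumes OrcUtils lives in org.apache.hudi.common.util, exposes an accessible no-arg constructor, and that hudi-common plus the Hadoop and ORC libraries are on the classpath:

import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.OrcUtils;

public class ReadOrcRecordsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path filePath = new Path(args[0]); // assumption: an ORC base file written by Hudi
    OrcUtils orcUtils = new OrcUtils(); // assumption: public no-arg constructor
    // Recover the Avro schema from the file itself, then materialize every record.
    Schema avroSchema = orcUtils.readAvroSchema(conf, filePath);
    List<GenericRecord> records = orcUtils.readAvroRecords(conf, filePath, avroSchema);
    System.out.println("Read " + records.size() + " records");
  }
}

Note the caution in the javadoc: since readAvroRecords buffers the whole file in a List, this pattern only suits small files or tests.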
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache: class OrcUtils, method getHoodieKeyIterator().
/**
 * Provides a closable iterator for reading the given ORC file.
 *
 * @param configuration configuration to build the fs object
 * @param filePath      the ORC file path
 * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file
 */
@Override
public ClosableIterator<HoodieKey> getHoodieKeyIterator(Configuration configuration, Path filePath) {
  try {
    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
    // Project only the record key and partition path meta columns.
    Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
    TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
    RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
    List<String> fieldNames = orcSchema.getFieldNames();
    // Column indices of the RECORD_KEY_METADATA_FIELD and PARTITION_PATH_METADATA_FIELD fields.
    int keyCol = -1;
    int partitionCol = -1;
    for (int i = 0; i < fieldNames.size(); i++) {
      if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
        keyCol = i;
      }
      if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
        partitionCol = i;
      }
    }
    if (keyCol == -1 || partitionCol == -1) {
      throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
    }
    return new OrcReaderIterator<>(recordReader, readSchema, orcSchema);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to open reader from ORC file: " + filePath, e);
  }
}
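Since ClosableIterator is closeable, callers should close it to release the underlying ORC RecordReader. A minimal usage sketch, under the same classpath assumptions as above (HoodieKey's getRecordKey()/getPartitionPath() accessors and the package names are assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.ClosableIterator;
import org.apache.hudi.common.util.OrcUtils;

public class HoodieKeyScanExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path filePath = new Path(args[0]); // assumption: an ORC base file written by Hudi
    OrcUtils orcUtils = new OrcUtils(); // assumption: public no-arg constructor
    // try-with-resources closes the iterator and the ORC reader behind it.
    try (ClosableIterator<HoodieKey> keys = orcUtils.getHoodieKeyIterator(conf, filePath)) {
      while (keys.hasNext()) {
        HoodieKey key = keys.next();
        System.out.println(key.getPartitionPath() + "/" + key.getRecordKey());
      }
    }
  }
}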
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache: class OrcUtils, method readFooter().
@Override
public Map<String, String> readFooter(Configuration conf, boolean required, Path orcFilePath, String... footerNames) {
  try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) {
    Map<String, String> footerVals = new HashMap<>();
    List<UserMetadataItem> metadataItemList = reader.getFileTail().getFooter().getMetadataList();
    // Collect the user-metadata items into a name -> UTF-8 value map.
    Map<String, String> metadata = metadataItemList.stream()
        .collect(Collectors.toMap(UserMetadataItem::getName, metadataItem -> metadataItem.getValue().toStringUtf8()));
    for (String footerName : footerNames) {
      if (metadata.containsKey(footerName)) {
        footerVals.put(footerName, metadata.get(footerName));
      } else if (required) {
        throw new MetadataNotFoundException(
            "Could not find index in ORC footer. Looked for key " + footerName + " in " + orcFilePath);
      }
    }
    return footerVals;
  } catch (IOException io) {
    throw new HoodieIOException("Unable to read footer for ORC file: " + orcFilePath, io);
  }
}
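A small hedged example tying this back to readAvroSchema above: the same "orc.avro.schema" metadata key can be fetched through readFooter, with required=false so a missing key yields an empty map instead of a MetadataNotFoundException. Classpath and constructor assumptions are as in the earlier sketches:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.OrcUtils;

public class OrcFooterExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path orcFilePath = new Path(args[0]); // assumption: an ORC base file written by Hudi
    OrcUtils orcUtils = new OrcUtils(); // assumption: public no-arg constructor
    // required=false: absent keys are simply omitted from the result map.
    Map<String, String> footer = orcUtils.readFooter(conf, false, orcFilePath, "orc.avro.schema");
    footer.forEach((k, v) -> System.out.println(k + " -> " + v));
  }
}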
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache: class ParquetUtils, method filterParquetRowKeys().
/**
 * Read the rowKey list matching the given filter from the given parquet file. If the filter is empty,
 * all row keys are returned.
 *
 * @param configuration configuration to build the fs object
 * @param filePath      the parquet file path
 * @param filter        record keys filter
 * @param readSchema    schema of columns to be read
 * @return set of row keys matching the filter
 */
private static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath, Set<String> filter, Schema readSchema) {
  Option<RecordKeysFilterFunction> filterFunction = Option.empty();
  if (filter != null && !filter.isEmpty()) {
    filterFunction = Option.of(new RecordKeysFilterFunction(filter));
  }
  Configuration conf = new Configuration(configuration);
  conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
  // Project only the columns in readSchema so full rows are not materialized.
  AvroReadSupport.setAvroReadSchema(conf, readSchema);
  AvroReadSupport.setRequestedProjection(conf, readSchema);
  Set<String> rowKeys = new HashSet<>();
  try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
    Object obj = reader.read();
    while (obj != null) {
      if (obj instanceof GenericRecord) {
        String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
          rowKeys.add(recordKey);
        }
      }
      obj = reader.read();
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
  }
  return rowKeys;
}
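Because filterParquetRowKeys is private, external callers go through ParquetUtils' public wrappers. As a standalone illustration of the projection trick it relies on, the following sketch reads only the _hoodie_record_key meta column from a Hudi-written parquet file; the hand-built projection schema and class name are assumptions, not Hudi's own helpers:

import java.util.HashSet;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

public class ParquetRecordKeyScan {
  public static void main(String[] args) throws Exception {
    Path filePath = new Path(args[0]); // assumption: a Hudi-written parquet file
    // Hand-built projection schema with just the record key meta column (an assumption;
    // Hudi's own code obtains such schemas via its helpers).
    Schema readSchema = SchemaBuilder.record("projection").fields()
        .optionalString("_hoodie_record_key")
        .endRecord();
    Configuration conf = new Configuration();
    // Same projection trick as above: only the requested column is decoded.
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    Set<String> rowKeys = new HashSet<>();
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(filePath).withConf(conf).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        Object key = record.get("_hoodie_record_key");
        if (key != null) {
          rowKeys.add(key.toString());
        }
      }
    }
    System.out.println("Found " + rowKeys.size() + " row keys");
  }
}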