Use of org.apache.hudi.common.util.ParquetReaderIterator in the Apache Hudi project.
Class ParquetBootstrapMetadataHandler, method executeBootstrap.
/**
 * Bootstraps a single source parquet file: reads its records, computes the record key of each one and
 * feeds a {@link HoodieRecord} carrying only that key to the bootstrap handle through a bounded
 * in-memory executor.
 *
 * <p>The handle, the executor and the parquet reader are all released before this method returns,
 * whether or not the bootstrap succeeded.
 *
 * @param bootstrapHandle handle that consumes the generated records; always closed by this method
 * @param sourceFilePath  path of the source parquet file to read
 * @param keyGenerator    generator used to compute the record key of every source record
 * @param partitionPath   partition path placed in the {@link HoodieKey} of every generated record
 * @param avroSchema      not referenced by this implementation
 * @throws HoodieException wrapping any failure raised while opening the reader or running the executor
 * @throws Exception       if releasing the handle, the executor or the reader fails
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
  BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
  // Declared outside the try block so the finally block can close it; previously it was never closed.
  ParquetReader<IndexedRecord> reader = null;
  try {
    reader = AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build();
    // The iterator is left as a raw type on purpose: the reader is typed on IndexedRecord while the
    // executor is declared over GenericRecord, so a parameterized iterator would not type-check here.
    wrapper = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
      String recKey = keyGenerator.getKey(inp).getRecordKey();
      // Only the record key is materialized; the payload record uses the key-only schema.
      GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
      gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
      BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
      HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload);
      return rec;
    }, table.getPreExecuteRunnable());
    wrapper.execute();
  } catch (Exception e) {
    throw new HoodieException(e);
  } finally {
    // Nested try/finally so that a failure in one cleanup step never skips the remaining ones.
    try {
      bootstrapHandle.close();
    } finally {
      try {
        if (null != wrapper) {
          wrapper.shutdownNow();
        }
      } finally {
        // Closed last, after the executor has been told to stop, so nothing is still reading from it.
        if (null != reader) {
          reader.close();
        }
      }
    }
  }
}
Use of org.apache.hudi.common.util.ParquetReaderIterator in the Apache Hudi project.
Class TestBootstrap, method generateInputBatch.
/**
 * Builds a test input batch by reading every listed parquet file and turning each row into a
 * {@link HoodieRecord}.
 *
 * <p>The record key is taken from the {@code _row_key} field of the row and the partition path is the
 * key of the pair the file was listed under. All records are collected into an in-memory list before
 * being handed to {@code jsc.parallelize}.
 *
 * <p>Note: the Avro read schema is set on the configuration object returned by
 * {@code jsc.hadoopConfiguration()} itself, not on a copy.
 *
 * @param jsc            Spark context used for its Hadoop configuration and to parallelize the records
 * @param partitionPaths pairs of partition path and the file statuses that belong to that partition
 * @param writerSchema   Avro schema registered as the read schema for the parquet readers
 * @return an RDD holding one record per row found in the listed files
 * @throws HoodieIOException if a file cannot be opened, read into a payload, or closed
 */
private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
  List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream().flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath())))).collect(Collectors.toList());
  return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
    try {
      Configuration conf = jsc.hadoopConfiguration();
      AvroReadSupport.setAvroReadSchema(conf, writerSchema);
      // Fully qualified so this block does not depend on an import that may not exist in the file.
      final org.apache.parquet.hadoop.ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(p.getValue()).withConf(conf).build();
      Iterator<GenericRecord> recIterator = new ParquetReaderIterator<>(reader);
      // flatMap closes each mapped stream once its contents have been consumed, which triggers this
      // handler and releases the parquet reader that was previously left open.
      return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).onClose(() -> {
        try {
          reader.close();
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      }).map(gr -> {
        try {
          String key = gr.get("_row_key").toString();
          String pPath = p.getKey();
          return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      });
    } catch (IOException ioe) {
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
  }).collect(Collectors.toList()));
}
Aggregations