use of org.apache.parquet.hadoop.ParquetFileReader in project ignite by apache.
the class SparkModelParser method parseAndBuildGDBModel.
/**
* Parse and build common GDB model with the custom label mapper.
*
* @param pathToMdl Path to model.
* @param pathToMdlMetaData Path to model meta data.
* @param lbMapper Label mapper.
* @param learningEnvironment Learning environment.
*/
@Nullable
private static Model parseAndBuildGDBModel(String pathToMdl, String pathToMdlMetaData,
    IgniteFunction<Double, Double> lbMapper, LearningEnvironment learningEnvironment) {
    double[] treeWeights = null;
    final Map<Integer, Double> treeWeightsByTreeID = new HashMap<>();

    try (ParquetFileReader r = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(pathToMdlMetaData), new Configuration()))) {
        PageReadStore pagesMetaData;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pagesMetaData = r.readNextRowGroup())) {
            final long rows = pagesMetaData.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                int treeId = g.getInteger(0, 0);
                double treeWeight = g.getDouble(2, 0);
                treeWeightsByTreeID.put(treeId, treeWeight);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file with MetaData by the path: " + pathToMdlMetaData;
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    treeWeights = new double[treeWeightsByTreeID.size()];
    for (int i = 0; i < treeWeights.length; i++)
        treeWeights[i] = treeWeightsByTreeID.get(i);

    try (ParquetFileReader r = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup)g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);

                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                }
                else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }

        final List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));

        return new GDBModel(models, new WeightedPredictionsAggregator(treeWeights), lbMapper);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
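The two try-with-resources blocks above share the same low-level pattern: open the file with ParquetFileReader, walk the row groups with readNextRowGroup(), and materialize each row as an example Group via ColumnIOFactory and GroupRecordConverter. Below is a minimal standalone sketch of that pattern; the SimpleGroupDump class name and the file path are hypothetical, and any Parquet file reachable through HadoopInputFile will do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;

public class SimpleGroupDump {
    public static void main(String[] args) throws IOException {
        String pathToFile = "/tmp/example.parquet"; // hypothetical path

        try (ParquetFileReader r = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(pathToFile), new Configuration()))) {
            // The schema comes from the footer and drives both the column I/O and the converter.
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

            PageReadStore pages;
            // Row groups are read one at a time; each row is materialized as an example Group.
            while (null != (pages = r.readNextRowGroup())) {
                long rows = pages.getRowCount();
                RecordReader<Group> recordReader =
                    colIO.getRecordReader(pages, new GroupRecordConverter(schema));
                for (long i = 0; i < rows; i++) {
                    Group g = recordReader.read();
                    System.out.println(g); // SimpleGroup pretty-prints field names and values
                }
            }
        }
    }
}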
use of org.apache.parquet.hadoop.ParquetFileReader in project flink by apache.
the class ParquetVectorizedInputFormat method createReader.
@Override
public ParquetReader createReader(final Configuration config, final SplitT split) throws IOException {
    final Path filePath = split.path();
    final long splitOffset = split.offset();
    final long splitLength = split.length();

    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(filePath.toUri());
    ParquetMetadata footer =
        readFooter(hadoopConfig.conf(), hadoopPath, range(splitOffset, splitOffset + splitLength));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(hadoopConfig.conf());
    List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

    MessageType requestedSchema = clipParquetSchema(fileSchema);
    ParquetFileReader reader = new ParquetFileReader(
        hadoopConfig.conf(), footer.getFileMetaData(), hadoopPath, blocks, requestedSchema.getColumns());

    long totalRowCount = 0;
    for (BlockMetaData block : blocks) {
        totalRowCount += block.getRowCount();
    }

    checkSchema(fileSchema, requestedSchema);

    final Pool<ParquetReaderBatch<T>> poolOfBatches =
        createPoolOfBatches(split, requestedSchema, numBatchesToCirculate(config));

    return new ParquetReader(reader, requestedSchema, totalRowCount, poolOfBatches);
}
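Flink restricts the footer read to the split's byte range with the readFooter/range helpers and then filters the row groups before constructing the reader. A rough equivalent of its row-count loop, sketched with the ParquetReadOptions API instead, is shown below; the path, split bounds, and SplitRowCount class are hypothetical, and it assumes your Parquet version provides ParquetReadOptions.Builder#withRange.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

import java.io.IOException;

public class SplitRowCount {
    public static void main(String[] args) throws IOException {
        Path path = new Path("/tmp/example.parquet"); // hypothetical
        long splitOffset = 0L;                        // hypothetical split bounds
        long splitLength = 64L * 1024 * 1024;
        Configuration conf = new Configuration();

        // Only row groups that fall inside the requested byte range are exposed by the reader.
        ParquetReadOptions options = ParquetReadOptions.builder()
            .withRange(splitOffset, splitOffset + splitLength)
            .build();

        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(path, conf), options)) {
            long totalRowCount = 0;
            for (BlockMetaData block : reader.getRowGroups()) {
                totalRowCount += block.getRowCount();
            }
            System.out.println("rows in split: " + totalRowCount);
        }
    }
}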
use of org.apache.parquet.hadoop.ParquetFileReader in project alluxio by Alluxio.
the class ParquetReader method create.
/**
* Creates a parquet reader.
*
* @param uri the URI to the input
* @return the reader
* @throws IOException when failed to create the reader
*/
public static ParquetReader create(AlluxioURI uri) throws IOException {
    Path inputPath = new JobPath(uri.getScheme(), uri.getAuthority().toString(), uri.getPath());
    Configuration conf = ReadWriterUtils.readNoCacheConf();
    InputFile inputFile = HadoopInputFile.fromPath(inputPath, conf);
    org.apache.parquet.hadoop.ParquetReader<Record> reader =
        AvroParquetReader.<Record>builder(inputFile)
            .disableCompatibility()
            .withDataModel(GenericData.get())
            .withConf(conf)
            .build();

    Schema schema;
    ParquetMetadata footer;
    try (ParquetFileReader r = new ParquetFileReader(inputFile, ParquetReadOptions.builder().build())) {
        footer = r.getFooter();
        schema = new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
    }
    return new ParquetReader(reader, schema, footer);
}
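Here ParquetFileReader is used only to pull the footer so the Parquet schema can be converted to an Avro Schema, while the actual record reading goes through AvroParquetReader. The sketch below shows what else that footer exposes (writer string, per-row-group counts, and the key/value metadata, which is where parquet-avro stores the original Avro schema); the FooterInspector class and file path are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

import java.io.IOException;
import java.util.Map;

public class FooterInspector {
    public static void main(String[] args) throws IOException {
        InputFile inputFile = HadoopInputFile.fromPath(
            new Path("/tmp/example.parquet"), new Configuration()); // hypothetical path

        try (ParquetFileReader r =
                 new ParquetFileReader(inputFile, ParquetReadOptions.builder().build())) {
            ParquetMetadata footer = r.getFooter();

            // Writer identification and the full Parquet message schema.
            System.out.println("created by: " + footer.getFileMetaData().getCreatedBy());
            System.out.println("schema:\n" + footer.getFileMetaData().getSchema());

            // One BlockMetaData per row group.
            for (BlockMetaData block : footer.getBlocks()) {
                System.out.println("row group rows: " + block.getRowCount());
            }

            // Arbitrary key/value metadata written by the producer (e.g. parquet.avro.schema).
            for (Map.Entry<String, String> e
                     : footer.getFileMetaData().getKeyValueMetaData().entrySet()) {
                System.out.println(e.getKey() + " = " + e.getValue());
            }
        }
    }
}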
use of org.apache.parquet.hadoop.ParquetFileReader in project drill by apache.
the class Metadata method getParquetFileMetadata_v4.
/**
* Get the file metadata for a single file
*
* @param parquetTableMetadata The table metadata to be updated with all the columns' info
* @param footer If non null, use this footer instead of reading it from the file
* @param file The file
* @param fs The file system
* @param allColumnsInteresting If true, read the min/max metadata for all the columns
* @param skipNonInteresting If true, collect info only for the interesting columns
* @param columnSet Specifies specific columns for which min/max metadata is collected
* @param readerConfig Parquet reader configuration options
* @return the file metadata
*/
public static ParquetFileAndRowCountMetadata getParquetFileMetadata_v4(
    ParquetTableMetadata_v4 parquetTableMetadata, ParquetMetadata footer, FileStatus file, FileSystem fs,
    boolean allColumnsInteresting, boolean skipNonInteresting, Set<SchemaPath> columnSet,
    ParquetReaderConfig readerConfig) throws IOException, InterruptedException {
    // if a non-null footer is given, no need to read it again from the file
    ParquetMetadata metadata = footer;
    if (metadata == null) {
        UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
        Configuration conf = new Configuration(fs.getConf());
        try {
            metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
                try (ParquetFileReader parquetFileReader =
                         ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
                    return parquetFileReader.getFooter();
                }
            });
        } catch (Exception e) {
            logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}",
                file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
            throw e;
        }
    }
    FileMetadataCollector metadataCollector = new FileMetadataCollector(
        metadata, file, fs, allColumnsInteresting, skipNonInteresting, columnSet, readerConfig);
    parquetTableMetadata.metadataSummary.columnTypeInfo.putAll(metadataCollector.getColumnTypeInfo());
    return metadataCollector.getFileMetadata();
}
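Stripped of Drill's impersonation (UserGroupInformation.doAs) and metadata-collection machinery, the footer read reduces to opening the file from its FileStatus. The sketch below shows that core step; the FooterFromStatus class and path are hypothetical, and ParquetReadOptions.builder().build() stands in for Drill's readerConfig.toReadOptions().

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

import java.io.IOException;

public class FooterFromStatus {
    // Reads only the footer; the FileStatus avoids an extra getFileStatus call inside open().
    public static ParquetMetadata readFooter(FileSystem fs, Path path) throws IOException {
        Configuration conf = new Configuration(fs.getConf());
        FileStatus status = fs.getFileStatus(path);
        try (ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromStatus(status, conf), ParquetReadOptions.builder().build())) {
            return reader.getFooter();
        }
    }

    public static void main(String[] args) throws IOException {
        Path path = new Path("/tmp/example.parquet"); // hypothetical path
        FileSystem fs = FileSystem.get(new Configuration());
        ParquetMetadata footer = readFooter(fs, path);
        System.out.println(footer.getFileMetaData().getSchema());
    }
}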