
Example 16 with ParquetFileReader

use of org.apache.parquet.hadoop.ParquetFileReader in project ignite by apache.

the class SparkModelParser method parseAndBuildGDBModel.

/**
 * Parses and builds a common GDB model with a custom label mapper.
 *
 * @param pathToMdl Path to the model.
 * @param pathToMdlMetaData Path to the model metadata.
 * @param lbMapper Label mapper.
 * @param learningEnvironment Learning environment.
 */
@Nullable
private static Model parseAndBuildGDBModel(String pathToMdl, String pathToMdlMetaData, IgniteFunction<Double, Double> lbMapper, LearningEnvironment learningEnvironment) {
    double[] treeWeights = null;
    final Map<Integer, Double> treeWeightsByTreeID = new HashMap<>();
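    // First pass: read (tree id -> tree weight) pairs from the model metadata Parquet file.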
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdlMetaData), new Configuration()))) {
        PageReadStore pagesMetaData;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pagesMetaData = r.readNextRowGroup())) {
            final long rows = pagesMetaData.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                int treeId = g.getInteger(0, 0);
                double treeWeight = g.getDouble(2, 0);
                treeWeightsByTreeID.put(treeId, treeWeight);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file with MetaData by the path: " + pathToMdlMetaData;
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
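    // Flatten the weight map into an array; tree IDs are assumed to be contiguous, starting at 0.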
    treeWeights = new double[treeWeightsByTreeID.size()];
    for (int i = 0; i < treeWeights.length; i++) treeWeights[i] = treeWeightsByTreeID.get(i);
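    // Second pass: read the tree nodes from the model Parquet file and rebuild each decision tree.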
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                } else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }
        final List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return new GDBModel(models, new WeightedPredictionsAggregator(treeWeights), lbMapper);
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) RecordReader(org.apache.parquet.io.RecordReader) ArrayList(java.util.ArrayList) GDBModel(org.apache.ignite.ml.composition.boosting.GDBModel) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) PageReadStore(org.apache.parquet.column.page.PageReadStore) MessageType(org.apache.parquet.schema.MessageType) Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) IgniteModel(org.apache.ignite.ml.IgniteModel) Nullable(org.jetbrains.annotations.Nullable)
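
The method above makes two passes with the same low-level pattern: open the file with ParquetFileReader, take the MessageType schema from the footer, then walk each row group with a RecordReader built from a GroupRecordConverter. Below is a minimal, self-contained sketch of that pattern on its own; the class name ParquetRowGroupDump, the file path argument and the println are illustrative assumptions, not part of the Ignite source.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class ParquetRowGroupDump {

    // Reads every row group of a Parquet file and prints each record using the example Group API.
    public static void dump(String path) throws IOException {
        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(new Path(path), new Configuration()))) {
            MessageType schema = reader.getFooter().getFileMetaData().getSchema();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                RecordReader<Group> recordReader =
                    columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
                for (long i = 0; i < rowGroup.getRowCount(); i++) {
                    // Group.toString() renders the record field by field.
                    System.out.println(recordReader.read());
                }
            }
        }
    }
}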

Example 17 with ParquetFileReader

use of org.apache.parquet.hadoop.ParquetFileReader in project flink by apache.

the class ParquetVectorizedInputFormat method createReader.

@Override
public ParquetReader createReader(final Configuration config, final SplitT split) throws IOException {
    final Path filePath = split.path();
    final long splitOffset = split.offset();
    final long splitLength = split.length();
    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(filePath.toUri());
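    // Read only the footer metadata for the row groups that belong to this split's byte range.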
    ParquetMetadata footer = readFooter(hadoopConfig.conf(), hadoopPath, range(splitOffset, splitOffset + splitLength));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(hadoopConfig.conf());
    List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    MessageType requestedSchema = clipParquetSchema(fileSchema);
    ParquetFileReader reader = new ParquetFileReader(hadoopConfig.conf(), footer.getFileMetaData(), hadoopPath, blocks, requestedSchema.getColumns());
    long totalRowCount = 0;
    for (BlockMetaData block : blocks) {
        totalRowCount += block.getRowCount();
    }
    checkSchema(fileSchema, requestedSchema);
    final Pool<ParquetReaderBatch<T>> poolOfBatches = createPoolOfBatches(split, requestedSchema, numBatchesToCirculate(config));
    return new ParquetReader(reader, requestedSchema, totalRowCount, poolOfBatches);
}
Also used : Path(org.apache.flink.core.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType)
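
The static imports behind readFooter, range and filterRowGroups above resolve, in the parquet-hadoop API, to ParquetFileReader.readFooter, ParquetMetadataConverter.range and RowGroupFilter.filterRowGroups. Below is a hedged sketch with those calls written out, counting the rows of the row groups that belong to a byte-range split; FilterCompat.NOOP stands in for the real predicate filter, and the class and method names are assumptions.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class SplitRowCount {

    public static long rowsInSplit(Configuration conf, Path path, long splitOffset, long splitLength)
            throws IOException {
        // readFooter with range(...) only returns the row groups that fall inside the byte range.
        ParquetMetadata footer = ParquetFileReader.readFooter(
            conf, path, ParquetMetadataConverter.range(splitOffset, splitOffset + splitLength));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        // FilterCompat.NOOP keeps every row group the footer already scoped to the range.
        List<BlockMetaData> blocks =
            RowGroupFilter.filterRowGroups(FilterCompat.NOOP, footer.getBlocks(), fileSchema);
        long totalRowCount = 0;
        for (BlockMetaData block : blocks) {
            totalRowCount += block.getRowCount();
        }
        return totalRowCount;
    }
}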

Example 18 with ParquetFileReader

use of org.apache.parquet.hadoop.ParquetFileReader in project alluxio by Alluxio.

the class ParquetReader method create.

/**
 * Creates a parquet reader.
 *
 * @param uri the URI to the input
 * @return the reader
 * @throws IOException if the reader cannot be created
 */
public static ParquetReader create(AlluxioURI uri) throws IOException {
    Path inputPath = new JobPath(uri.getScheme(), uri.getAuthority().toString(), uri.getPath());
    Configuration conf = ReadWriterUtils.readNoCacheConf();
    InputFile inputFile = HadoopInputFile.fromPath(inputPath, conf);
    org.apache.parquet.hadoop.ParquetReader<Record> reader = AvroParquetReader.<Record>builder(inputFile).disableCompatibility().withDataModel(GenericData.get()).withConf(conf).build();
    Schema schema;
    ParquetMetadata footer;
    try (ParquetFileReader r = new ParquetFileReader(inputFile, ParquetReadOptions.builder().build())) {
        footer = r.getFooter();
        schema = new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
    }
    return new ParquetReader(reader, schema, footer);
}
Also used : JobPath(alluxio.job.plan.transform.format.JobPath) Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) JobPath(alluxio.job.plan.transform.format.JobPath) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Schema(org.apache.avro.Schema) TableSchema(alluxio.job.plan.transform.format.TableSchema) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) InputFile(org.apache.parquet.io.InputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) Record(org.apache.avro.generic.GenericData.Record)
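
In this example the footer pass only extracts the Avro schema; the AvroParquetReader built on the first line is what actually streams records. A minimal sketch of iterating that reader, using the same builder chain; the standalone main method, the path argument and the println are assumptions for illustration.

import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class AvroRecordDump {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        InputFile inputFile = HadoopInputFile.fromPath(new Path(args[0]), conf);
        try (org.apache.parquet.hadoop.ParquetReader<Record> reader = AvroParquetReader.<Record>builder(inputFile)
                .disableCompatibility()
                .withDataModel(GenericData.get())
                .withConf(conf)
                .build()) {
            Record record;
            // read() returns null once the file is exhausted.
            while ((record = reader.read()) != null) {
                System.out.println(record);
            }
        }
    }
}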

Example 19 with ParquetFileReader

use of org.apache.parquet.hadoop.ParquetFileReader in project drill by apache.

the class Metadata method getParquetFileMetadata_v4.

/**
 * Get the file metadata for a single file
 *
 * @param parquetTableMetadata The table metadata to be updated with all the columns' info
 * @param footer If non-null, use this footer instead of reading it from the file
 * @param file The file
 * @param fs The file system the file resides on
 * @param allColumnsInteresting If true, read the min/max metadata for all the columns
 * @param skipNonInteresting If true, collect info only for the interesting columns
 * @param columnSet Specifies the columns for which min/max metadata is collected
 * @param readerConfig Parquet reader configuration options
 * @return the file metadata
 */
public static ParquetFileAndRowCountMetadata getParquetFileMetadata_v4(ParquetTableMetadata_v4 parquetTableMetadata, ParquetMetadata footer, FileStatus file, FileSystem fs, boolean allColumnsInteresting, boolean skipNonInteresting, Set<SchemaPath> columnSet, ParquetReaderConfig readerConfig) throws IOException, InterruptedException {
    // if a non-null footer is given, no need to read it again from the file
    ParquetMetadata metadata = footer;
    if (metadata == null) {
        UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
        Configuration conf = new Configuration(fs.getConf());
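        // Read the footer via doAs as the Drill process user rather than the query user.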
        try {
            metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
                try (ParquetFileReader parquetFileReader = ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
                    return parquetFileReader.getFooter();
                }
            });
        } catch (Exception e) {
            logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}", file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
            throw e;
        }
    }
    FileMetadataCollector metadataCollector = new FileMetadataCollector(metadata, file, fs, allColumnsInteresting, skipNonInteresting, columnSet, readerConfig);
    parquetTableMetadata.metadataSummary.columnTypeInfo.putAll(metadataCollector.getColumnTypeInfo());
    return metadataCollector.getFileMetadata();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
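
The doAs wrapper above opens the file as the process user instead of the calling user. A stripped-down sketch of the same pattern, with the caller supplying the UserGroupInformation; the helper class name and the default ParquetReadOptions are assumptions.

import java.io.IOException;
import java.security.PrivilegedExceptionAction;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterAsUser {

    // Reads a Parquet footer while running as the supplied user.
    public static ParquetMetadata readFooterAs(UserGroupInformation ugi, FileStatus file, Configuration conf)
            throws IOException, InterruptedException {
        return ugi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
            try (ParquetFileReader reader = ParquetFileReader.open(
                    HadoopInputFile.fromStatus(file, conf), ParquetReadOptions.builder().build())) {
                return reader.getFooter();
            }
        });
    }
}

In the Drill method above, the ugi argument corresponds to ImpersonationUtil.getProcessUserUGI() and the read options come from readerConfig.toReadOptions().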

Aggregations

ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 19 uses
MessageType (org.apache.parquet.schema.MessageType): 13 uses
Configuration (org.apache.hadoop.conf.Configuration): 11 uses
Path (org.apache.hadoop.fs.Path): 11 uses
PageReadStore (org.apache.parquet.column.page.PageReadStore): 10 uses
IOException (java.io.IOException): 9 uses
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7 uses
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7 uses
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7 uses
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 7 uses
RecordReader (org.apache.parquet.io.RecordReader): 7 uses
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 6 uses
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 uses
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4 uses
DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector): 4 uses
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4 uses
ArrayList (java.util.ArrayList): 3 uses
TreeMap (java.util.TreeMap): 3 uses
NodeData (org.apache.ignite.ml.tree.NodeData): 3 uses
HashSet (java.util.HashSet): 2 uses