
Example 6 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in the parquet-mr project by Apache.

From the class CheckParquet251Command, method check:

private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
        // create fake metadata that will read corrupt stats and return them
        FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
        // get just the binary columns
        List<ColumnDescriptor> columns = Lists.newArrayList();
        Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
                return input != null && input.getType() == BINARY;
            }
        }));
        // now check to see if the data is actually corrupt
        ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
        try {
            PageStatsValidator validator = new PageStatsValidator();
            for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
                validator.validate(columns, pages);
            }
        } catch (BadStatsException e) {
            return e.getMessage();
        }
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PageReadStore(org.apache.parquet.column.page.PageReadStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) Nullable(javax.annotation.Nullable) Predicate(com.google.common.base.Predicate)
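
The example above relies on the older static ParquetFileReader.readFooter entry point and the deprecated ParquetFileReader(Configuration, ...) constructor. Below is a minimal sketch (not taken from parquet-mr) of reading the same footer metadata through ParquetFileReader.open and HadoopInputFile; the class and method names FooterCreatedByCheck and readCreatedBy are illustrative only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterCreatedByCheck {

    // Returns the "created_by" string recorded in the file footer, which the
    // check above feeds into CorruptStatistics.shouldIgnoreStatistics.
    public static String readCreatedBy(String file, Configuration conf) throws IOException {
        // try-with-resources closes the underlying input stream
        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(new Path(file), conf))) {
            ParquetMetadata footer = reader.getFooter();
            return footer.getFileMetaData().getCreatedBy();
        }
    }
}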

Example 7 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in the hive project by Apache.

From the class VectorizedParquetRecordReader, method initialize:

@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException, HiveException {
    // the oldSplit may be null during the split phase
    if (oldSplit == null) {
        return;
    }
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
    if (mapWork != null) {
        parts = mapWork.getPathToPartitionInfo();
    }
    ParquetInputSplit split = (ParquetInputSplit) oldSplit;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    Object cacheKey = null;
    CacheTag cacheTag = null;
    // TODO: also support fileKey in splits, like OrcSplit does
    if (metadataCache != null) {
        if (cacheKey == null) {
            cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
        }
    }
    if (cacheKey != null) {
        if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
            PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
            cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
        }
        // If we are going to use cache, change the path to depend on file ID for extra consistency.
        FileSystem fs = file.getFileSystem(configuration);
        if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
            file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
        }
    }
    if (rowGroupOffsets == null) {
        // TODO check whether rowGroupOffSets can be null
        // then we need to apply the predicate push down filter
        footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    this.writerTimezone = DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
    requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
    Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
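
When row groups were already selected on the client, the initializer above matches them against the footer by starting offset. A standalone sketch of that matching step follows; the helper name RowGroupSelector is assumed and not part of Hive.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.parquet.hadoop.metadata.BlockMetaData;

public final class RowGroupSelector {

    // Keeps only the blocks whose starting position appears in rowGroupOffsets,
    // mirroring the matching loop in initialize above.
    public static List<BlockMetaData> selectByOffset(List<BlockMetaData> footerBlocks,
                                                     long[] rowGroupOffsets) {
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        List<BlockMetaData> selected = new ArrayList<>();
        for (BlockMetaData block : footerBlocks) {
            if (offsets.contains(block.getStartingPos())) {
                selected.add(block);
            }
        }
        return selected;
    }
}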

Example 8 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in the parquet-mr project by Apache.

From the class DumpCommand, method dump:

public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
    Configuration conf = new Configuration();
    List<BlockMetaData> blocks = meta.getBlocks();
    List<ColumnDescriptor> columns = schema.getColumns();
    if (showColumns != null) {
        columns = new ArrayList<ColumnDescriptor>();
        for (ColumnDescriptor column : schema.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(column.getPath());
            if (showColumns.contains(path)) {
                columns.add(column);
            }
        }
    }
    ParquetFileReader freader = null;
    if (showmd) {
        try {
            long group = 0;
            for (BlockMetaData block : blocks) {
                if (group != 0)
                    out.println();
                out.format("row group %d%n", group++);
                out.rule('-');
                List<ColumnChunkMetaData> ccmds = block.getColumns();
                if (showColumns != null) {
                    ccmds = new ArrayList<ColumnChunkMetaData>();
                    for (ColumnChunkMetaData ccmd : block.getColumns()) {
                        String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                        if (showColumns.contains(path)) {
                            ccmds.add(ccmd);
                        }
                    }
                }
                MetadataUtils.showDetails(out, ccmds);
                List<BlockMetaData> rblocks = Collections.singletonList(block);
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    out.incrementTabLevel();
                    for (ColumnDescriptor column : columns) {
                        out.println();
                        dump(out, store, column);
                    }
                    out.decrementTabLevel();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            }
        } finally {
            if (freader != null) {
                freader.close();
            }
        }
    }
    if (showdt) {
        boolean first = true;
        for (ColumnDescriptor column : columns) {
            if (!first || showmd)
                out.println();
            first = false;
            out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
            out.rule('-');
            try {
                long page = 1;
                long total = blocks.size();
                long offset = 1;
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
                    dump(out, crstore, column, page++, total, offset);
                    offset += store.getRowCount();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            } finally {
                out.flushColumns();
                if (freader != null) {
                    freader.close();
                }
            }
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Configuration(org.apache.hadoop.conf.Configuration) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnReadStoreImpl(org.apache.parquet.column.impl.ColumnReadStoreImpl) PageReadStore(org.apache.parquet.column.page.PageReadStore)
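
DumpCommand drives ParquetFileReader one row group at a time and hands each PageReadStore to ColumnReadStoreImpl. A small sketch of that pattern is shown below, counting non-null values of one column via definition levels; the class name ColumnNullCounter is illustrative, and the GroupRecordConverter root converter stands in for DumpCommand's own DumpGroupConverter.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.schema.MessageType;

public final class ColumnNullCounter {

    // Returns how many of the column's values in this row group are non-null.
    public static long countDefined(PageReadStore store, MessageType schema,
                                    ColumnDescriptor column, String createdBy) {
        ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(
            store, new GroupRecordConverter(schema).getRootConverter(), schema, createdBy);
        ColumnReader creader = crstore.getColumnReader(column);
        int maxDefinition = column.getMaxDefinitionLevel();
        long defined = 0;
        long total = creader.getTotalValueCount();
        for (long i = 0; i < total; i++) {
            // a value is present only when its definition level reaches the maximum
            if (creader.getCurrentDefinitionLevel() == maxDefinition) {
                defined++;
            }
            // advance to the next value without materializing it
            creader.consume();
        }
        return defined;
    }
}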

Example 9 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in the hive project by Apache.

From the class ParquetRecordWriterWrapper, method close:

@Override
public void close(final Reporter reporter) throws IOException {
    try {
        realWriter.close(taskContext);
    } catch (final InterruptedException e) {
        throw new IOException(e);
    }
    // Collect file stats
    try {
        ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(this.file, this.jobConf));
        long totalSize = 0;
        for (BlockMetaData block : reader.getFooter().getBlocks()) {
            totalSize += block.getTotalByteSize();
        }
        stats = new SerDeStats();
        stats.setRowCount(reader.getRecordCount());
        stats.setRawDataSize(totalSize);
    } catch (IOException e) {
    // Ignore
    }
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) IOException(java.io.IOException)
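
The close method above does not close the ParquetFileReader it opens for stats collection. Below is a minimal sketch (an assumed helper, not Hive code) of gathering the same footer statistics with try-with-resources; getTotalByteSize reports uncompressed bytes while getCompressedSize reports bytes as stored on disk.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public final class ParquetFileStats {

    public static void printStats(Path file, Configuration conf) throws IOException {
        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
            long totalSize = 0;       // uncompressed bytes across all row groups
            long compressedSize = 0;  // bytes as stored on disk
            for (BlockMetaData block : reader.getFooter().getBlocks()) {
                totalSize += block.getTotalByteSize();
                compressedSize += block.getCompressedSize();
            }
            System.out.printf("rows=%d raw=%d compressed=%d%n",
                reader.getRecordCount(), totalSize, compressedSize);
        }
    }
}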

Example 10 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in the ignite project by Apache.

From the class SparkModelParser, method loadKMeansModel:

/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to the model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    Vector[] centers = null;
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int) pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);
                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);
                centers[i] = new DenseVector(amountOfCoefficients);
                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return new KMeansModel(centers, new EuclideanDistance());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) EuclideanDistance(org.apache.ignite.ml.math.distances.EuclideanDistance) PageReadStore(org.apache.parquet.column.page.PageReadStore) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) MessageType(org.apache.parquet.schema.MessageType)
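
Driving MessageColumnIO and RecordReader by hand, as above, gives row-group-level control over reading. When plain record iteration is enough, the same Groups can be read through the higher-level ParquetReader with the example GroupReadSupport; the sketch below is not from Ignite and assumes the parquet-hadoop example classes are on the classpath.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public final class GroupDumper {

    public static void printGroups(String pathToMdl) throws IOException {
        Configuration conf = new Configuration();
        try (ParquetReader<Group> reader = ParquetReader
                 .builder(new GroupReadSupport(), new Path(pathToMdl))
                 .withConf(conf)
                 .build()) {
            Group g;
            while ((g = reader.read()) != null) {
                // Group.toString() renders every field, which is handy for inspection
                System.out.println(g);
            }
        }
    }
}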

Aggregations

ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 19
MessageType (org.apache.parquet.schema.MessageType): 13
Configuration (org.apache.hadoop.conf.Configuration): 11
Path (org.apache.hadoop.fs.Path): 11
PageReadStore (org.apache.parquet.column.page.PageReadStore): 10
IOException (java.io.IOException): 9
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 7
RecordReader (org.apache.parquet.io.RecordReader): 7
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 6
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4
DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector): 4
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4
ArrayList (java.util.ArrayList): 3
TreeMap (java.util.TreeMap): 3
NodeData (org.apache.ignite.ml.tree.NodeData): 3
HashSet (java.util.HashSet): 2