Search in sources :

Example 66 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.

The class ParquetRecordReaderTest, method validateFooters.

/**
 * Asserts that the footers read back from the test output match the expected
 * layout: exactly three files whose names start with "part", each existing on
 * disk, each containing two row groups, and each carrying the expected
 * key/value metadata entries ("foo" -> "bar" plus a self-referential
 * filename entry).
 *
 * @param metadata the footers read back from the written parquet files
 */
private void validateFooters(final List<Footer> metadata) {
    // Parameterized logging: avoids building the list's string form
    // when debug logging is disabled.
    logger.debug("{}", metadata);
    assertEquals(3, metadata.size());
    for (Footer footer : metadata) {
        final File file = new File(footer.getFile().toUri());
        // The file name doubles as the assertion message for easier diagnosis.
        assertTrue(file.getName(), file.getName().startsWith("part"));
        assertTrue(file.getPath(), file.exists());
        final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
        assertEquals(2, parquetMetadata.getBlocks().size());
        final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
        assertEquals("bar", keyValueMetaData.get("foo"));
        // Each file is expected to contain an entry keyed by its own name.
        assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
    }
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Footer(org.apache.parquet.hadoop.Footer) File(java.io.File)

Example 67 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

The class Metadata, method getParquetFileMetadata_v4.

/**
 * Get the file metadata for a single file.
 *
 * @param parquetTableMetadata The table metadata to be updated with all the columns' info
 * @param footer If non null, use this footer instead of reading it from the file
 * @param file The file
 * @param fs The file system the file lives on (used to read the footer when one is not supplied)
 * @param allColumnsInteresting If true, read the min/max metadata for all the columns
 * @param skipNonInteresting If true, collect info only for the interesting columns
 * @param columnSet Specifies specific columns for which min/max metadata is collected
 * @param readerConfig for the options
 * @return the file metadata
 * @throws IOException if the footer cannot be read from the file
 * @throws InterruptedException if the impersonated read is interrupted
 */
public static ParquetFileAndRowCountMetadata getParquetFileMetadata_v4(ParquetTableMetadata_v4 parquetTableMetadata, ParquetMetadata footer, FileStatus file, FileSystem fs, boolean allColumnsInteresting, boolean skipNonInteresting, Set<SchemaPath> columnSet, ParquetReaderConfig readerConfig) throws IOException, InterruptedException {
    // if a non-null footer is given, no need to read it again from the file
    ParquetMetadata metadata = footer;
    if (metadata == null) {
        // Read the footer as the Drill process user so impersonated queries
        // can still access the underlying file.
        UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
        // Copy the fs config so the lambda captures an effectively-final reference.
        Configuration conf = new Configuration(fs.getConf());
        try {
            metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
                // try-with-resources guarantees the reader is closed even if
                // getFooter() throws.
                try (ParquetFileReader parquetFileReader = ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
                    return parquetFileReader.getFooter();
                }
            });
        } catch (Exception e) {
            // Log with full context (path, owner, acting user) before rethrowing;
            // the cause is preserved for the caller.
            logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}", file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
            throw e;
        }
    }
    FileMetadataCollector metadataCollector = new FileMetadataCollector(metadata, file, fs, allColumnsInteresting, skipNonInteresting, columnSet, readerConfig);
    // Side effect: merge this file's column type info into the table-level summary.
    parquetTableMetadata.metadataSummary.columnTypeInfo.putAll(metadataCollector.getColumnTypeInfo());
    return metadataCollector.getFileMetadata();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)

Example 68 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

The class ParquetFileWriter, method mergeFooters.

/**
 * Will merge the metadata of all the footers together.
 * Every footer must live under {@code root}; block paths are rewritten to be
 * relative to it.
 * @param root the directory containing all footers
 * @param footers the list of file footers to merge
 * @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
 * @return the global meta data for all the footers
 * @throws ParquetEncodingException if any footer lies outside {@code root}
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        // Relativize against the root and strip any leading slashes so the
        // stored block path is root-relative.
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        // Read the footer's metadata once per iteration instead of twice.
        final ParquetMetadata footerMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(footerMetadata.getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footerMetadata.getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    // NOTE(review): fileMetaData stays null when footers is empty, which would
    // NPE below — callers appear to guarantee a non-empty list; confirm.
    return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetEncodingException(org.apache.parquet.io.ParquetEncodingException) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)

Example 69 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

The class ParquetFileWriter, method writeMergedMetadataFile.

/**
 * Merges the metadata of the given files into one metadata file written at
 * {@code outputPath}. The schemas must be compatible and the extraMetaData
 * exactly equal across all inputs; this supports combining two directories of
 * parquet files (written with compatible schemas and equal extraMetaData) into
 * a single directory.
 * @param files a list of files to merge metadata from
 * @param outputPath path to write merged metadata to
 * @param conf a configuration
 * @throws IOException if there is an error while reading or writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static void writeMergedMetadataFile(List<Path> files, Path outputPath, Configuration conf) throws IOException {
    // Merge first, then resolve the target file system and write the result.
    writeMetadataFile(outputPath, mergeMetadataFiles(files, conf), outputPath.getFileSystem(conf));
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)

Example 70 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

The class ParquetFileWriter, method end.

/**
 * ends a file once all blocks have been written.
 * closes the file.
 * Writes, in order, the column indexes, offset indexes and bloom filters,
 * then the footer, and finally closes the output stream. The write order
 * determines the on-disk layout and must not be changed.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
    // Advance the writer state; state.end() presumably rejects out-of-order
    // calls (e.g. ending twice) — confirm against the STATE implementation.
    state = state.end();
    // Index structures are serialized after the row groups, before the footer.
    serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor);
    serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor);
    serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor);
    LOG.debug("{}: end", out.getPos());
    // Build the footer from the accumulated schema, caller-supplied metadata
    // and block list, then write it and close the stream.
    this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
    serializeFooter(footer, out, fileEncryptor);
    out.close();
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Util.writeFileMetaData(org.apache.parquet.format.Util.writeFileMetaData) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)76 Path (org.apache.hadoop.fs.Path)39 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)27 Configuration (org.apache.hadoop.conf.Configuration)21 MessageType (org.apache.parquet.schema.MessageType)21 ArrayList (java.util.ArrayList)19 IOException (java.io.IOException)18 Test (org.junit.Test)17 FileSystem (org.apache.hadoop.fs.FileSystem)16 Map (java.util.Map)11 FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)11 File (java.io.File)10 FileStatus (org.apache.hadoop.fs.FileStatus)10 ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath)9 HashMap (java.util.HashMap)8 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)7 List (java.util.List)6 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)6 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)6 ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)6