
Example 26 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.

The class TestMergeMetadataFiles, method testMergeMetadataFiles.

@Test
public void testMergeMetadataFiles() throws Exception {
    WrittenFileInfo info = writeFiles(false);
    ParquetMetadata commonMeta1 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath1, ParquetMetadataConverter.NO_FILTER);
    ParquetMetadata commonMeta2 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath2, ParquetMetadataConverter.NO_FILTER);
    ParquetMetadata meta1 = ParquetFileReader.readFooter(info.conf, info.metaPath1, ParquetMetadataConverter.NO_FILTER);
    ParquetMetadata meta2 = ParquetFileReader.readFooter(info.conf, info.metaPath2, ParquetMetadataConverter.NO_FILTER);
    assertTrue(commonMeta1.getBlocks().isEmpty());
    assertTrue(commonMeta2.getBlocks().isEmpty());
    assertEquals(commonMeta1.getFileMetaData().getSchema(), commonMeta2.getFileMetaData().getSchema());
    assertFalse(meta1.getBlocks().isEmpty());
    assertFalse(meta2.getBlocks().isEmpty());
    assertEquals(meta1.getFileMetaData().getSchema(), meta2.getFileMetaData().getSchema());
    assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), commonMeta2.getFileMetaData().getKeyValueMetaData());
    assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), meta2.getFileMetaData().getKeyValueMetaData());
    // test file serialization
    Path mergedOut = new Path(new File(temp.getRoot(), "merged_meta").getAbsolutePath());
    Path mergedCommonOut = new Path(new File(temp.getRoot(), "merged_common_meta").getAbsolutePath());
    ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.metaPath1, info.metaPath2), mergedOut, info.conf);
    ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.commonMetaPath1, info.commonMetaPath2), mergedCommonOut, info.conf);
    ParquetMetadata mergedMeta = ParquetFileReader.readFooter(info.conf, mergedOut, ParquetMetadataConverter.NO_FILTER);
    ParquetMetadata mergedCommonMeta = ParquetFileReader.readFooter(info.conf, mergedCommonOut, ParquetMetadataConverter.NO_FILTER);
    // ideally we'd assert equality here, but BlockMetaData and its references don't implement equals
    assertEquals(meta1.getBlocks().size() + meta2.getBlocks().size(), mergedMeta.getBlocks().size());
    assertTrue(mergedCommonMeta.getBlocks().isEmpty());
    assertEquals(meta1.getFileMetaData().getSchema(), mergedMeta.getFileMetaData().getSchema());
    assertEquals(commonMeta1.getFileMetaData().getSchema(), mergedCommonMeta.getFileMetaData().getSchema());
    assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), mergedMeta.getFileMetaData().getKeyValueMetaData());
    assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), mergedCommonMeta.getFileMetaData().getKeyValueMetaData());
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
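
The assertions above boil down to one merge-and-read-back pattern. As a minimal standalone sketch, assuming two pre-existing metadata summary files at hypothetical paths (note that writeMergedMetadataFile, like summary files in general, is deprecated):

Configuration conf = new Configuration();
Path merged = new Path("/tmp/merged_metadata"); // hypothetical output location
// merge the footers of two summary files into a single file on disk
ParquetFileWriter.writeMergedMetadataFile(
    Arrays.asList(new Path("/data/part1/_metadata"), new Path("/data/part2/_metadata")),
    merged, conf);
// read the merged footer back, exactly as the test does
ParquetMetadata mergedFooter =
    ParquetFileReader.readFooter(conf, merged, ParquetMetadataConverter.NO_FILTER);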

Example 27 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.

The class ShowSchemaCommand, method execute.

@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);
    String[] args = options.getArgs();
    String input = args[0];
    Configuration conf = new Configuration();
    ParquetMetadata metaData;
    Path path = new Path(input);
    FileSystem fs = path.getFileSystem(conf);
    Path file;
    if (fs.isDirectory(path)) {
        FileStatus[] statuses = fs.listStatus(path, HiddenFileFilter.INSTANCE);
        if (statuses.length == 0) {
            throw new RuntimeException("Directory " + path.toString() + " is empty");
        }
        file = statuses[0].getPath();
    } else {
        file = path;
    }
    metaData = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    Main.out.println(schema);
    if (options.hasOption('d')) {
        PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter().build();
        MetadataUtils.showDetails(out, metaData);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FileSystem(org.apache.hadoop.fs.FileSystem) PrettyPrintWriter(org.apache.parquet.tools.util.PrettyPrintWriter) MessageType(org.apache.parquet.schema.MessageType)
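
The directory branch just picks the first visible file; the core of the command is reading one footer and printing its schema. A minimal sketch of that core, assuming a hypothetical single-file input:

Configuration conf = new Configuration();
Path file = new Path("/data/example.parquet"); // hypothetical input
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
// MessageType.toString() renders the Parquet schema in message syntax
System.out.println(metaData.getFileMetaData().getSchema());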

Example 28 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.

The class ParquetMetadataCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);
    console.info("\nFile path:  {}", source);
    console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());
    Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
    if (kv != null && !kv.isEmpty()) {
        console.info("Properties:");
        String format = "  %" + maxSize(kv.keySet()) + "s: %s";
        for (Map.Entry<String, String> entry : kv.entrySet()) {
            console.info(String.format(format, entry.getKey(), entry.getValue()));
        }
    } else {
        console.info("Properties: (none)");
    }
    MessageType schema = footer.getFileMetaData().getSchema();
    console.info("Schema:\n{}", schema);
    List<BlockMetaData> rowGroups = footer.getBlocks();
    for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
        printRowGroup(console, index, rowGroups.get(index), schema);
    }
    console.info("");
    return 0;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingsAsString(org.apache.parquet.cli.Util.encodingsAsString) Util.encodingStatsAsString(org.apache.parquet.cli.Util.encodingStatsAsString) Map(java.util.Map) MessageType(org.apache.parquet.schema.MessageType)
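
printRowGroup is project-internal and not shown here, but the same loop can surface basic per-row-group figures using only public BlockMetaData accessors. A sketch, reusing the command's console logger:

for (BlockMetaData block : footer.getBlocks()) {
    // row count, total byte size, and column-chunk count for each row group
    console.info(String.format("rows=%d bytes=%d columns=%d",
        block.getRowCount(), block.getTotalByteSize(), block.getColumns().size()));
}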

Example 29 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.

The class CheckParquet251Command, method check.

private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
        // create fake metadata that will read corrupt stats and return them
        FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
        // get just the binary columns
        List<ColumnDescriptor> columns = Lists.newArrayList();
        Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
                return input != null && input.getType() == BINARY;
            }
        }));
        // now check to see if the data is actually corrupt
        ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
        try {
            PageStatsValidator validator = new PageStatsValidator();
            for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
                validator.validate(columns, pages);
            }
        } catch (BadStatsException e) {
            return e.getMessage();
        }
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PageReadStore(org.apache.parquet.column.page.PageReadStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) Nullable(javax.annotation.Nullable) Predicate(com.google.common.base.Predicate)
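
The anonymous Guava Predicate above only keeps BINARY columns. On Java 8+ the same selection can be written as a stream; this is a behavior-preserving sketch, not the project's actual code:

List<ColumnDescriptor> columns = meta.getSchema().getColumns().stream()
    // BINARY is the statically imported PrimitiveType.PrimitiveTypeName.BINARY
    .filter(c -> c != null && c.getType() == BINARY)
    .collect(Collectors.toList());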

Example 30 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.

The class ParquetFileWriter, method mergeMetadataFiles.

/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible and that the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
    Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
    GlobalMetaData globalMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Path p : files) {
        ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
        FileMetaData fmd = pmd.getFileMetaData();
        globalMetaData = mergeInto(fmd, globalMetaData, true);
        blocks.addAll(pmd.getBlocks());
    }
    // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
    return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Also used : ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData) Util.writeFileMetaData(org.apache.parquet.format.Util.writeFileMetaData) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)
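
This is the in-memory counterpart of writeMergedMetadataFile from Example 26: it returns the merged footer instead of serializing it. A minimal caller sketch with hypothetical paths (per the comment in the method, the final merge throws if the file schemas are not compatible):

Configuration conf = new Configuration();
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(
    Arrays.asList(new Path("/data/a/_metadata"), new Path("/data/b/_metadata")), conf);
System.out.println("merged row groups: " + merged.getBlocks().size());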

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76 usages
Path (org.apache.hadoop.fs.Path): 39 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 21 usages
MessageType (org.apache.parquet.schema.MessageType): 21 usages
ArrayList (java.util.ArrayList): 19 usages
IOException (java.io.IOException): 18 usages
Test (org.junit.Test): 17 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 16 usages
Map (java.util.Map): 11 usages
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11 usages
File (java.io.File): 10 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 10 usages
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9 usages
HashMap (java.util.HashMap): 8 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7 usages
List (java.util.List): 6 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6 usages