Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.
The class TestMergeMetadataFiles, method testMergeMetadataFiles.
@Test
public void testMergeMetadataFiles() throws Exception {
  WrittenFileInfo info = writeFiles(false);
  ParquetMetadata commonMeta1 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata commonMeta2 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath2, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta1 = ParquetFileReader.readFooter(info.conf, info.metaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta2 = ParquetFileReader.readFooter(info.conf, info.metaPath2, ParquetMetadataConverter.NO_FILTER);
  assertTrue(commonMeta1.getBlocks().isEmpty());
  assertTrue(commonMeta2.getBlocks().isEmpty());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), commonMeta2.getFileMetaData().getSchema());
  assertFalse(meta1.getBlocks().isEmpty());
  assertFalse(meta2.getBlocks().isEmpty());
  assertEquals(meta1.getFileMetaData().getSchema(), meta2.getFileMetaData().getSchema());
  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), commonMeta2.getFileMetaData().getKeyValueMetaData());
  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), meta2.getFileMetaData().getKeyValueMetaData());
  // test file serialization
  Path mergedOut = new Path(new File(temp.getRoot(), "merged_meta").getAbsolutePath());
  Path mergedCommonOut = new Path(new File(temp.getRoot(), "merged_common_meta").getAbsolutePath());
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.metaPath1, info.metaPath2), mergedOut, info.conf);
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.commonMetaPath1, info.commonMetaPath2), mergedCommonOut, info.conf);
  ParquetMetadata mergedMeta = ParquetFileReader.readFooter(info.conf, mergedOut, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata mergedCommonMeta = ParquetFileReader.readFooter(info.conf, mergedCommonOut, ParquetMetadataConverter.NO_FILTER);
  // ideally we'd assert equality here, but BlockMetaData and its references don't implement equals
  assertEquals(meta1.getBlocks().size() + meta2.getBlocks().size(), mergedMeta.getBlocks().size());
  assertTrue(mergedCommonMeta.getBlocks().isEmpty());
  assertEquals(meta1.getFileMetaData().getSchema(), mergedMeta.getFileMetaData().getSchema());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), mergedCommonMeta.getFileMetaData().getSchema());
  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), mergedMeta.getFileMetaData().getKeyValueMetaData());
  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), mergedCommonMeta.getFileMetaData().getKeyValueMetaData());
}
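The writeFiles helper and WrittenFileInfo are test fixtures not shown in this excerpt. Reduced to its core, the footer-reading pattern the test exercises looks like the following standalone sketch; the input path is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ReadFooterSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // hypothetical path; point this at a real data, _metadata, or _common_metadata file
    Path path = new Path("/tmp/example/_metadata");
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
    // _common_metadata files carry only the schema and key/value metadata, so
    // getBlocks() is empty; data and _metadata files list their row groups
    System.out.println("row groups: " + footer.getBlocks().size());
    System.out.println(footer.getFileMetaData().getSchema());
  }
}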
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.
The class ShowSchemaCommand, method execute.
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  ParquetMetadata metaData;
  Path path = new Path(input);
  FileSystem fs = path.getFileSystem(conf);
  Path file;
  if (fs.isDirectory(path)) {
    FileStatus[] statuses = fs.listStatus(path, HiddenFileFilter.INSTANCE);
    if (statuses.length == 0) {
      throw new RuntimeException("Directory " + path.toString() + " is empty");
    }
    file = statuses[0].getPath();
  } else {
    file = path;
  }
  metaData = ParquetFileReader.readFooter(conf, file, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  Main.out.println(schema);
  if (options.hasOption('d')) {
    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter().build();
    MetadataUtils.showDetails(out, metaData);
  }
}
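The same resolution logic (when given a directory, read the footer of the first visible file, since every file in the directory shares one schema) works outside the tools CLI as well. A minimal sketch, assuming only parquet-hadoop on the classpath; the input path is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HiddenFileFilter;

public class SchemaDumpSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example"); // hypothetical file or directory
    FileSystem fs = path.getFileSystem(conf);
    Path file = path;
    if (fs.isDirectory(path)) {
      // skip hidden files (e.g. _SUCCESS); any visible data file carries the schema
      FileStatus[] statuses = fs.listStatus(path, HiddenFileFilter.INSTANCE);
      file = statuses[0].getPath();
    }
    System.out.println(
        ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema());
  }
}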
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.
The class ParquetMetadataCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);
  console.info("\nFile path: {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());
  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = " %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }
  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);
  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }
  console.info("");
  return 0;
}
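The printRowGroup helper is not part of this excerpt. A hypothetical sketch of what such a helper might print, using only the public BlockMetaData and ColumnChunkMetaData accessors; the output layout is an assumption, not the actual parquet-cli implementation, and console is assumed to be an org.slf4j.Logger as at the call site above.

// hypothetical helper; the signature mirrors the call site above
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  console.info(String.format("\nRow group %d: %d rows, %d bytes",
      index, rowGroup.getRowCount(), rowGroup.getTotalByteSize()));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    console.info(String.format("  %s  codec=%s  values=%d  compressed=%d bytes",
        column.getPath(), column.getCodec(), column.getValueCount(), column.getTotalSize()));
  }
}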
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.
The class CheckParquet251Command, method check.
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

      @Override
      public boolean apply(@Nullable ColumnDescriptor input) {
        return input != null && input.getType() == BINARY;
      }
    }));
    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }
  return null;
}
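The gate at the top of the method comes from PARQUET-251: min/max statistics written for BINARY columns by affected parquet-mr versions are untrustworthy, so the command only inspects pages when the writer version is suspect. A minimal sketch of using that check directly; the createdBy string below is a made-up example, not taken from a real file.

import org.apache.parquet.CorruptStatistics;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

// true when statistics from this writer version must be ignored for BINARY columns
boolean suspect = CorruptStatistics.shouldIgnoreStatistics(
    "parquet-mr version 1.7.0 (build 1234567)", // hypothetical createdBy value
    BINARY);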
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by Apache.
The class ParquetFileWriter, method mergeMetadataFiles.
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }
  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
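A minimal usage sketch tying this method to the writeMergedMetadataFile call from the test above: merge footers in memory, or write the merged footer straight to a new metadata file. Both entry points are deprecated along with metadata files themselves; the paths below are hypothetical.

Configuration conf = new Configuration();
List<Path> parts = Arrays.asList(
    new Path("/tmp/job1/_metadata"), // hypothetical inputs
    new Path("/tmp/job2/_metadata"));
// merge in memory; throws if the schemas are incompatible
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(parts, conf);
// or write the merged footer directly to a new metadata file
ParquetFileWriter.writeMergedMetadataFile(parts, new Path("/tmp/merged/_metadata"), conf);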