Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto: class ParquetRecordReaderTest, method validateFooters.
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());
  for (Footer footer : metadata) {
    final File file = new File(footer.getFile().toUri());
    assertTrue(file.getName(), file.getName().startsWith("part"));
    assertTrue(file.getPath(), file.exists());
    final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
    assertEquals(2, parquetMetadata.getBlocks().size());
    final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", keyValueMetaData.get("foo"));
    assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
  }
}
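For context, a minimal sketch of how a List<Footer> like the one validated above could be produced and inspected. The directory path is hypothetical, and ParquetFileReader.readFooters is deprecated in recent parquet-hadoop releases but still available:

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterInspector {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path dir = new Path("file:///tmp/parquet-output"); // hypothetical directory of part-* files

    // One Footer per parquet file found under the directory.
    List<Footer> footers = ParquetFileReader.readFooters(conf, dir);

    for (Footer footer : footers) {
      ParquetMetadata md = footer.getParquetMetadata();
      Map<String, String> kv = md.getFileMetaData().getKeyValueMetaData();
      System.out.printf("%s: %d row groups, foo=%s%n",
          footer.getFile(), md.getBlocks().size(), kv.get("foo"));
    }
  }
}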
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache: class Metadata, method getParquetFileMetadata_v4.
/**
 * Get the file metadata for a single file.
 *
 * @param parquetTableMetadata the table metadata to be updated with all the columns' info
 * @param footer if non-null, use this footer instead of reading it from the file
 * @param file the file
 * @param fs the file system the file belongs to
 * @param allColumnsInteresting if true, read the min/max metadata for all the columns
 * @param skipNonInteresting if true, collect info only for the interesting columns
 * @param columnSet specifies the columns for which min/max metadata is collected
 * @param readerConfig the parquet reader configuration options
 * @return the file metadata
 */
public static ParquetFileAndRowCountMetadata getParquetFileMetadata_v4(
    ParquetTableMetadata_v4 parquetTableMetadata, ParquetMetadata footer, FileStatus file, FileSystem fs,
    boolean allColumnsInteresting, boolean skipNonInteresting, Set<SchemaPath> columnSet,
    ParquetReaderConfig readerConfig) throws IOException, InterruptedException {
  // if a non-null footer is given, no need to read it again from the file
  ParquetMetadata metadata = footer;
  if (metadata == null) {
    UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
    Configuration conf = new Configuration(fs.getConf());
    try {
      metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
        try (ParquetFileReader parquetFileReader =
            ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
          return parquetFileReader.getFooter();
        }
      });
    } catch (Exception e) {
      logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}",
          file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
      throw e;
    }
  }
  FileMetadataCollector metadataCollector = new FileMetadataCollector(
      metadata, file, fs, allColumnsInteresting, skipNonInteresting, columnSet, readerConfig);
  parquetTableMetadata.metadataSummary.columnTypeInfo.putAll(metadataCollector.getColumnTypeInfo());
  return metadataCollector.getFileMetadata();
}
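Outside of Drill's impersonation machinery, the same footer read can be done directly. A minimal sketch assuming a plain Hadoop Configuration, with HadoopReadOptions standing in for Drill's readerConfig.toReadOptions():

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class SingleFooterRead {
  // Reads only the footer (file metadata and row-group index) of one file.
  static ParquetMetadata readFooter(Path file, Configuration conf) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(file, conf),
        HadoopReadOptions.builder(conf).build())) {
      return reader.getFooter();
    }
  }
}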
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache: class ParquetFileWriter, method mergeFooters.
/**
 * Merges the metadata of all the footers together.
 *
 * @param root the directory containing all footers
 * @param footers the list of file footers to merge
 * @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
 * @return the global metadata for all the footers
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
}
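The per-block setPath call above is what lets the merged footer, placed at the root directory, still locate each row group's file. A standalone sketch of that relativization step, using plain strings and a hypothetical helper name:

// Hypothetical helper mirroring mergeFooters' path handling: strip the root
// prefix and any leading separators so block paths resolve against the root.
static String relativize(String rootPath, String footerPath) {
  if (!footerPath.startsWith(rootPath)) {
    throw new IllegalArgumentException(footerPath + " is not under root " + rootPath);
  }
  String rel = footerPath.substring(rootPath.length());
  while (rel.startsWith("/")) {
    rel = rel.substring(1);
  }
  return rel; // relativize("/data/out", "/data/out/part-0.parquet") -> "part-0.parquet"
}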
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache: class ParquetFileWriter, method writeMergedMetadataFile.
/**
 * Given a list of metadata files, merge them into a single metadata file.
 * Requires that the schemas be compatible, and the extraMetaData be exactly equal.
 * This is useful when merging two directories of parquet files into a single directory,
 * as long as both directories were written with compatible schemas and equal extraMetaData.
 *
 * @param files a list of files to merge metadata from
 * @param outputPath path to write merged metadata to
 * @param conf a configuration
 * @throws IOException if there is an error while reading or writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static void writeMergedMetadataFile(List<Path> files, Path outputPath, Configuration conf) throws IOException {
  ParquetMetadata merged = mergeMetadataFiles(files, conf);
  writeMetadataFile(outputPath, merged, outputPath.getFileSystem(conf));
}
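A minimal sketch of calling this deprecated API. The input and output paths are hypothetical, and the source directories must have been written with compatible schemas and identical extraMetaData:

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;

public class MergeMetadataFiles {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical metadata files from two compatible directories.
    List<Path> sources = Arrays.asList(
        new Path("file:///tmp/dir1/_metadata"),
        new Path("file:///tmp/dir2/_metadata"));
    Path merged = new Path("file:///tmp/merged/_metadata");

    ParquetFileWriter.writeMergedMetadataFile(sources, merged, conf);
  }
}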
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache: class ParquetFileWriter, method end.
/**
 * Ends a file once all blocks have been written and closes the file.
 *
 * @param extraMetaData the extra metadata to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor);
  serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor);
  serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor);
  LOG.debug("{}: end", out.getPos());
  this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out, fileEncryptor);
  out.close();
}
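To place end() in the writer lifecycle, a minimal sketch that opens a ParquetFileWriter, writes no row groups, and closes the file with custom key/value metadata. The schema and output path are illustrative, and the Configuration-based constructor is deprecated but still present:

import java.io.IOException;
import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterLifecycle {
  public static void main(String[] args) throws IOException {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; }");
    Configuration conf = new Configuration();
    Path out = new Path("file:///tmp/example.parquet"); // hypothetical target

    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, out);
    writer.start();
    // startBlock/startColumn/endColumn/endBlock calls would go here.
    // end() validates the writer state, serializes the footer (including the
    // extra key/value metadata) and closes the underlying output stream.
    writer.end(Collections.singletonMap("foo", "bar"));
  }
}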