use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
the class ParquetFileWriter method mergeInto.
/**
 * Folds the metadata of one file into an accumulated {@link GlobalMetaData}.
 *
 * <p>The accumulated schema is merged with the file's schema (through the schema overload of
 * {@code mergeInto}) whenever the two differ. Each key/value pair of the file is added to the set
 * of values recorded for that key, and the file's created-by value is added to the set of writers.
 *
 * <p>The per-key value sets of {@code mergedMetadata} are copied before new values are added, so
 * the instance passed in is not modified by this call.
 *
 * @param toMerge the file metadata to add
 * @param mergedMetadata the metadata accumulated so far, or {@code null} when nothing has been merged yet
 * @param strict forwarded to the schema merge
 * @return a new {@code GlobalMetaData} holding the combined schema, key/value sets and writers
 */
static GlobalMetaData mergeInto(FileMetaData toMerge, GlobalMetaData mergedMetadata, boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    // A plain putAll would share the value sets with mergedMetadata, and the add() below would
    // then silently change the caller's object. Copy each set so the input stays untouched.
    for (Entry<String, Set<String>> previous : mergedMetadata.getKeyValueMetaData().entrySet()) {
      Set<String> previousValues = previous.getValue();
      newKeyValues.put(
          previous.getKey(),
          previousValues == null ? null : new LinkedHashSet<String>(previousValues));
    }
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  // Only invoke the schema merge when there is something to reconcile: either no schema has been
  // accumulated yet, or the accumulated one differs from this file's schema.
  if ((schema == null && toMerge.getSchema() != null) || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      // LinkedHashSet keeps the values in the order the files were merged.
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(schema, newKeyValues, createdBy);
}
use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
the class ParquetFileWriter method mergeFooters.
/**
 * Merges the footers of several files located under a common root into one {@link ParquetMetadata}.
 *
 * <p>Each footer's file metadata is folded into a single {@link GlobalMetaData}, which is then
 * collapsed with {@code merge()}. Every block of every footer is collected, and its path is set
 * (in place, on the footer's own block objects) to the file's path relative to {@code root}.
 *
 * @param root the directory that must contain every footer's file
 * @param footers the footers to merge; must not be empty
 * @return the merged metadata with all blocks, in footer order
 * @throws ParquetEncodingException if a footer's file is not located under {@code root}
 * @throws IllegalArgumentException if {@code footers} is empty
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  int rootLength = rootPath.length();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    // A bare prefix test would accept "/a/bc/f" for root "/a/b" and yield the bogus relative
    // path "c/f". The prefix must end on a path-component boundary.
    boolean underRoot = footerPath.startsWith(rootPath)
        && (rootLength == 0
            || rootPath.endsWith("/")
            || footerPath.length() == rootLength
            || footerPath.charAt(rootLength) == '/');
    if (!underRoot) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    // Make the path relative to the root by dropping the prefix and any leading separators.
    footerPath = footerPath.substring(rootLength);
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  if (fileMetaData == null) {
    // No footer was merged; fail with a clear message rather than a NullPointerException below.
    throw new IllegalArgumentException("Cannot merge an empty list of footers");
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
the class ParquetFileWriter method mergeMetadataFiles.
/**
 * Reads the footer of each given metadata file and combines them into a single
 * {@link ParquetMetadata}.
 *
 * <p>The file metadata is accumulated with strict merging, so the schemas must be compatible and
 * the extra metadata must be exactly equal. The blocks of all files are concatenated in input order.
 *
 * @param files the metadata files to merge; must not be empty
 * @param conf the configuration used to read the footers
 * @return the merged metadata
 * @throws IOException propagated from reading a footer
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  List<BlockMetaData> allBlocks = new ArrayList<BlockMetaData>();
  GlobalMetaData accumulated = null;
  for (Path file : files) {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    accumulated = mergeInto(footer.getFileMetaData(), accumulated, true);
    allBlocks.addAll(footer.getBlocks());
  }

  // Collapsing the accumulated metadata into one FileMetaData throws if the inputs are not compatible.
  FileMetaData collapsed = accumulated.merge();
  return new ParquetMetadata(collapsed, allBlocks);
}
use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project drill by apache.
the class ParquetFileWriter method mergeMetadataFiles.
/**
 * Reads the footer of each given metadata file and combines them into a single
 * {@link ParquetMetadata}.
 *
 * <p>The file metadata is accumulated with strict merging, so the schemas must be compatible.
 * Keys that carry several values are resolved by the supplied strategy when the accumulated
 * metadata is collapsed. The blocks of all files are concatenated in input order.
 *
 * @param files a list of files to merge metadata from; must not be empty
 * @param conf the configuration used to read the footers
 * @param keyValueMetadataMergeStrategy strategy to merge values for the same key, if there are multiple
 * @return merged parquet metadata for the files
 * @throws IOException propagated from reading a footer
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf, KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  List<BlockMetaData> allBlocks = new ArrayList<BlockMetaData>();
  GlobalMetaData accumulated = null;
  for (Path file : files) {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    accumulated = mergeInto(footer.getFileMetaData(), accumulated, true);
    allBlocks.addAll(footer.getBlocks());
  }

  // Collapsing the accumulated metadata into one FileMetaData throws if the inputs are not compatible.
  FileMetaData collapsed = accumulated.merge(keyValueMetadataMergeStrategy);
  return new ParquetMetadata(collapsed, allBlocks);
}
use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project drill by apache.
the class ParquetFileWriter method getGlobalMetaData.
/**
 * Accumulates the file metadata of all given footers into one {@link GlobalMetaData}.
 *
 * @param footers the footers whose file metadata is merged, in iteration order
 * @param strict forwarded to each {@code mergeInto} call
 * @return the accumulated metadata, or {@code null} when {@code footers} is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData accumulated = null;
  for (Footer current : footers) {
    accumulated = mergeInto(current.getParquetMetadata().getFileMetaData(), accumulated, strict);
  }
  return accumulated;
}
Aggregations