Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
The class ParquetFileWriter, method getGlobalMetaData.
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData fileMetaData = null;
  for (Footer footer : footers) {
    ParquetMetadata currentMetadata = footer.getParquetMetadata();
    fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
  }
  return fileMetaData;
}
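A minimal usage sketch, assuming the caller sits inside the org.apache.parquet.hadoop package (getGlobalMetaData is package-private) and reads footers with the deprecated ParquetFileReader.readAllFootersInParallel; the directory path is made up:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;

// Sketch only: this would have to live in org.apache.parquet.hadoop because
// getGlobalMetaData is package-private; readAllFootersInParallel is
// deprecated but still present.
public class GlobalMetaDataExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    List<Footer> footers =
        ParquetFileReader.readAllFootersInParallel(conf, new Path("/tmp/parquet-dir"));
    GlobalMetaData global = ParquetFileWriter.getGlobalMetaData(footers, true);
    System.out.println(global.getSchema());           // merged schema
    System.out.println(global.getKeyValueMetaData()); // Map<String, Set<String>>
    System.out.println(global.getCreatedBy());        // set of writer versions
  }
}

Passing strict = true makes the underlying schema merge reject conflicting column types instead of attempting a union.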
Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
The class ParquetLoader, method initSchema.
private void initSchema(Job job) throws IOException {
  if (schema != null) {
    return;
  }
  if (schema == null && requestedSchema != null) {
    // this is only true in front-end
    schema = requestedSchema;
  }
  if (schema == null) {
    // no requested schema => use the schema from the file
    final GlobalMetaData globalMetaData = getParquetInputFormat().getGlobalMetaData(job);
    schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
  }
  if (isElephantBirdCompatible(job)) {
    convertToElephantBirdCompatibleSchema(schema);
  }
}
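Where the fallback schema comes from: the merged key/value map groups, per key, every distinct value found across the files. A hedged illustration of the input getPigSchemaFromMultipleFiles has to reconcile; the key name "pig.schema" and the schema strings are assumptions about what parquet-pig stores:

import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

// Hedged illustration: each file may carry its own Pig schema in the
// key/value metadata, and GlobalMetaData collects the distinct values per key.
// "pig.schema" is assumed to be the key parquet-pig writes.
public class PigSchemaInputExample {
  public static void main(String[] args) {
    Map<String, Set<String>> merged = new LinkedHashMap<String, Set<String>>();
    Set<String> pigSchemas = new LinkedHashSet<String>();
    pigSchemas.add("a: int, b: chararray");          // written by one job
    pigSchemas.add("a: int, b: chararray, c: long"); // written by a later job
    merged.put("pig.schema", pigSchemas);
    // With no requested schema, ParquetLoader must reconcile these distinct
    // per-file schemas into one Pig schema.
    System.out.println(merged);
  }
}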
Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project parquet-mr by apache.
The class ClientSideMetadataSplitStrategy, method getSplits.
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(configuration, footers, maxSplitSize, minSplitSize, readContext);
}
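A hedged driver-side sketch of the knobs this method reads. The mapred.* keys are the legacy Hadoop split-size properties quoted above (newer Hadoop spells them mapreduce.input.fileinputformat.split.minsize/maxsize), and STRICT_TYPE_CHECKING is the public constant on ParquetInputFormat:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

// Sketch: configure split planning before submitting the job.
public class SplitConfigExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.setLong("mapred.max.split.size", 256L * 1024 * 1024); // 256 MB cap
    conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);  // 64 MB floor
    // Disable strict schema checking across footers (the default is true).
    conf.setBoolean(ParquetInputFormat.STRICT_TYPE_CHECKING, false);
  }
}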
Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project drill by apache.
The class ParquetFileWriter, method mergeFooters.
/**
 * Will merge the metadata of all the footers together
 * @param root the directory containing all the footers
 * @param footers the list of file footers to merge
 * @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
 * @return the global meta data for all the footers
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
}
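To make the block-path relativization concrete, a self-contained sketch of the same string manipulation; the paths are invented:

// With root /warehouse/out, a footer at /warehouse/out/part-00000.parquet
// yields the block path "part-00000.parquet".
public class RelativizeExample {
  public static void main(String[] args) {
    String rootPath = "/warehouse/out";
    String footerPath = "/warehouse/out/part-00000.parquet";
    if (!footerPath.startsWith(rootPath)) {
      throw new IllegalArgumentException(footerPath + " not under " + rootPath);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    System.out.println(footerPath); // part-00000.parquet
  }
}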
Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in project drill by apache.
The class ParquetFileWriter, method mergeInto.
static GlobalMetaData mergeInto(FileMetaData toMerge, GlobalMetaData mergedMetadata, boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(schema, newKeyValues, createdBy);
}
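A self-contained illustration of the key/value accumulation above: the same key seen in two files ends up mapped to an ordered set of both values, and duplicates collapse. The key and values are invented:

import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

// Mirrors the per-entry loop body in mergeInto.
public class KeyValueMergeExample {
  public static void main(String[] args) {
    Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
    mergeEntry(newKeyValues, "writer.model", "pig");
    mergeEntry(newKeyValues, "writer.model", "hive");
    mergeEntry(newKeyValues, "writer.model", "pig"); // duplicate, no effect
    System.out.println(newKeyValues); // {writer.model=[pig, hive]}
  }

  static void mergeEntry(Map<String, Set<String>> kv, String key, String value) {
    Set<String> values = kv.get(key);
    if (values == null) {
      values = new LinkedHashSet<String>();
      kv.put(key, values);
    }
    values.add(value);
  }
}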