Search in sources :

Example 6 with GlobalMetaData

Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in the apache/parquet-mr project.

In the class ParquetFileWriter, the method getGlobalMetaData:

/**
 * Folds the file metadata of every footer in the list into a single
 * {@link GlobalMetaData}.
 *
 * @param footers the footers whose file metadata should be combined
 * @param strict whether schema merging should reject incompatible schemas
 * @return the merged global metadata, or {@code null} when the list is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData merged = null;
    for (Footer footer : footers) {
        FileMetaData current = footer.getParquetMetadata().getFileMetaData();
        merged = mergeInto(current, merged, strict);
    }
    return merged;
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)

Example 7 with GlobalMetaData

Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in the apache/parquet-mr project.

In the class ParquetLoader, the method initSchema:

/**
 * Lazily initializes the Pig schema for this loader.
 *
 * <p>Resolution order: an already-set schema wins; otherwise the requested
 * schema (only set on the front-end) is used; otherwise the schema is derived
 * from the files' global metadata. Finally the schema may be converted for
 * Elephant Bird compatibility.
 *
 * @param job the job, used to read the input format's global metadata and
 *            the Elephant Bird compatibility flag
 * @throws IOException if reading the global metadata fails
 */
private void initSchema(Job job) throws IOException {
    if (schema != null) {
        // already initialized
        return;
    }
    // NOTE: after the early return above, schema is known to be null, so the
    // original redundant "schema == null &&" guard has been dropped.
    if (requestedSchema != null) {
        // this is only true in front-end
        schema = requestedSchema;
    } else {
        // no requested schema => use the schema from the file
        final GlobalMetaData globalMetaData = getParquetInputFormat().getGlobalMetaData(job);
        schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
    }
    if (isElephantBirdCompatible(job)) {
        convertToElephantBirdCompatibleSchema(schema);
    }
}
Also used : GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)

Example 8 with GlobalMetaData

Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in the apache/parquet-mr project.

In the class ClientSideMetadataSplitStrategy, the method getSplits:

/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
    // Read split-size bounds and the strictness flag from the configuration.
    final boolean strict = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    final long max = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long min = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
    if (max < 0 || min < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + max + "; minSplitSize = " + min);
    }
    // Merge all footers into one global view, then let the read support
    // decide the read schema before planning the splits.
    final GlobalMetaData global = ParquetFileWriter.getGlobalMetaData(footers, strict);
    final InitContext initContext = new InitContext(configuration, global.getKeyValueMetaData(), global.getSchema());
    final ReadContext readContext = getReadSupport(configuration).init(initContext);
    return new ClientSideMetadataSplitStrategy().getSplits(configuration, footers, max, min, readContext);
}
Also used : InitContext(org.apache.parquet.hadoop.api.InitContext) ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) ReadContext(org.apache.parquet.hadoop.api.ReadSupport.ReadContext) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)

Example 9 with GlobalMetaData

Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in the apache/drill project.

In the class ParquetFileWriter, the method mergeFooters:

/**
 * Will merge the metadata of all the footers together
 * @param root the directory containing all footers
 * @param footers the list files footers to merge; must be non-empty
 * @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
 * @return the global meta data for all the footers
 * @throws IllegalArgumentException if {@code footers} is empty
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        // every merged file must live under the given root directory
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        // make the block paths relative to the root
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    if (fileMetaData == null) {
        // The original code would throw a bare NullPointerException below when
        // the list is empty; fail fast with an explicit message instead.
        throw new IllegalArgumentException("Cannot merge footers: the list of footers is empty");
    }
    return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetEncodingException(org.apache.parquet.io.ParquetEncodingException) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData)

Example 10 with GlobalMetaData

Use of org.apache.parquet.hadoop.metadata.GlobalMetaData in the apache/drill project.

In the class ParquetFileWriter, the method mergeInto:

/**
 * Merges one file's metadata into an accumulated {@link GlobalMetaData}.
 *
 * <p>The schema is merged via the schema-level {@code mergeInto} overload when
 * it differs from the accumulated one; key/value metadata values are collected
 * per key in insertion order; every file's created-by string is retained.
 *
 * @param toMerge the file metadata to fold in
 * @param mergedMetadata the accumulator so far, or {@code null} on the first call
 * @param strict whether schema merging should reject incompatible schemas
 * @return a new accumulator containing the merged state
 */
static GlobalMetaData mergeInto(FileMetaData toMerge, GlobalMetaData mergedMetadata, boolean strict) {
    MessageType mergedSchema = null;
    Map<String, Set<String>> keyValues = new HashMap<String, Set<String>>();
    Set<String> createdBySet = new HashSet<String>();
    if (mergedMetadata != null) {
        // start from the accumulated state
        mergedSchema = mergedMetadata.getSchema();
        keyValues.putAll(mergedMetadata.getKeyValueMetaData());
        createdBySet.addAll(mergedMetadata.getCreatedBy());
    }
    MessageType incomingSchema = toMerge.getSchema();
    // only invoke the schema merge when the incoming schema actually differs
    boolean schemasDiffer = (mergedSchema == null)
        ? (incomingSchema != null)
        : !mergedSchema.equals(incomingSchema);
    if (schemasDiffer) {
        mergedSchema = mergeInto(incomingSchema, mergedSchema, strict);
    }
    for (Entry<String, String> kv : toMerge.getKeyValueMetaData().entrySet()) {
        Set<String> bucket = keyValues.get(kv.getKey());
        if (bucket == null) {
            // LinkedHashSet keeps the values in the order they were first seen
            bucket = new LinkedHashSet<String>();
            keyValues.put(kv.getKey(), bucket);
        }
        bucket.add(kv.getValue());
    }
    createdBySet.add(toMerge.getCreatedBy());
    return new GlobalMetaData(mergedSchema, keyValues, createdBySet);
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) GlobalMetaData(org.apache.parquet.hadoop.metadata.GlobalMetaData) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData)10 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)6 ArrayList (java.util.ArrayList)4 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)4 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 LinkedHashSet (java.util.LinkedHashSet)2 Set (java.util.Set)2 Path (org.apache.hadoop.fs.Path)2 Util.writeFileMetaData (org.apache.parquet.format.Util.writeFileMetaData)2 ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath)2 FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)2 ParquetEncodingException (org.apache.parquet.io.ParquetEncodingException)2 MessageType (org.apache.parquet.schema.MessageType)2 InitContext (org.apache.parquet.hadoop.api.InitContext)1 ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext)1 ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException)1