Example 1 with FileMetaData

Use of org.apache.parquet.hadoop.metadata.FileMetaData in project hive by apache.

From the class ParquetRecordReaderBase, method setTimeZoneConversion:

/**
   * Sets the TimeZone conversion for Parquet timestamp columns.
   *
   * @param configuration Configuration object where to get and set the TimeZone conversion
   * @param finalPath     path to the parquet file
   */
protected void setTimeZoneConversion(Configuration configuration, Path finalPath) {
    ParquetMetadata parquetMetadata;
    String timeZoneID;
    try {
        parquetMetadata = ParquetFileReader.readFooter(configuration, finalPath, ParquetMetadataConverter.NO_FILTER);
    } catch (IOException e) {
        // If an error occurred while reading the file, then we just skip the TimeZone setting.
        // This error will probably occur on any other part of the code.
        LOG.debug("Could not read parquet file footer at " + finalPath + ". Cannot determine " + "parquet file timezone", e);
        return;
    }
    boolean skipConversion = HiveConf.getBoolVar(configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") || skipConversion) {
        // Impala writes timestamp values using GMT only. We should not try to convert Impala
        // files to other type of timezones.
        timeZoneID = ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE;
    } else {
        // PARQUET_INT96_WRITE_ZONE_PROPERTY is a table property used to determine which timezone
        // conversion to apply when reading Parquet INT96 timestamps.
        timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
        if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
            throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
        }
    }
    // 'timeZoneID' should be valid, since we did not throw an exception above
    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getTimeZone(timeZoneID).getID());
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), IOException (java.io.IOException), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
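
To make the writer check above concrete, here is a minimal standalone sketch (not part of the Hive source; the class name and the command-line path argument are placeholders) that reads a footer the same way and reports whether the file was written by parquet-mr, which is what setTimeZoneConversion uses to decide whether any timezone adjustment should be applied.

import java.util.TimeZone;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class WriterCheckSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input: path to a Parquet file passed on the command line.
        Path path = new Path(args[0]);

        // Read only the footer; NO_FILTER keeps all row-group metadata.
        ParquetMetadata footer =
            ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        FileMetaData meta = footer.getFileMetaData();

        // Files not written by parquet-mr (for example, written by Impala) store INT96
        // timestamps without adjustment, so no timezone conversion should be applied.
        String createdBy = meta.getCreatedBy();
        boolean writtenByParquetMr = createdBy != null && createdBy.startsWith("parquet-mr");
        System.out.println("created_by = " + createdBy);
        System.out.println("eligible for timezone conversion = " + writtenByParquetMr);
        System.out.println("JVM default zone = " + TimeZone.getDefault().getID());
    }
}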

Example 2 with FileMetaData

Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.

From the class InternalParquetRecordReader, method initialize:

public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
    // initialize a ReadContext for this file
    this.reader = reader;
    FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
    this.fileSchema = parquetFileMetadata.getSchema();
    Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
    this.requestedSchema = readContext.getRequestedSchema();
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    this.total = reader.getRecordCount();
    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
    reader.setRequestedSchema(requestedSchema);
    LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Also used: ReadSupport (org.apache.parquet.hadoop.api.ReadSupport), InitContext (org.apache.parquet.hadoop.api.InitContext), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory)
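
For context, initialize() above is normally reached through the high-level ParquetReader rather than called directly, with a ReadSupport implementation supplying the ReadContext. The following is a minimal sketch, not taken from parquet-mr's own examples, that reads records with GroupReadSupport; the input path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadGroupsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input: path to a Parquet file passed on the command line.
        Path path = new Path(args[0]);

        try (ParquetReader<Group> reader =
                 ParquetReader.builder(new GroupReadSupport(), path)
                              .withConf(conf)
                              .build()) {
            // Each read() materializes one record through the converters
            // prepared by the record reader's initialization.
            for (Group record = reader.read(); record != null; record = reader.read()) {
                System.out.println(record);
            }
        }
    }
}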

Example 3 with FileMetaData

Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.

From the class CheckParquet251Command, method check:

private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
        // create fake metadata that will read corrupt stats and return them
        FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
        // get just the binary columns
        List<ColumnDescriptor> columns = Lists.newArrayList();
        Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
                return input != null && input.getType() == BINARY;
            }
        }));
        // now check to see if the data is actually corrupt
        ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
        try {
            PageStatsValidator validator = new PageStatsValidator();
            for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
                validator.validate(columns, pages);
            }
        } catch (BadStatsException e) {
            return e.getMessage();
        }
    }
    return null;
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), PageReadStore (org.apache.parquet.column.page.PageReadStore), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), Nullable (javax.annotation.Nullable), Predicate (com.google.common.base.Predicate)
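
The corruption guard in check() hinges on CorruptStatistics.shouldIgnoreStatistics. As a minimal sketch (not part of the parquet-mr tooling; the class name and path argument are placeholders), the same check can be run per column to report which column statistics are trustworthy for a file's created_by string:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.CorruptStatistics;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

public class StatsTrustSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input: path to a Parquet file passed on the command line.
        Path path = new Path(args[0]);

        ParquetMetadata footer =
            ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        FileMetaData meta = footer.getFileMetaData();
        String createdBy = meta.getCreatedBy();

        // Report, column by column, whether statistics written by this writer
        // version should be ignored (the same check used above for BINARY columns).
        for (ColumnDescriptor column : meta.getSchema().getColumns()) {
            PrimitiveTypeName type = column.getType();
            boolean ignore = CorruptStatistics.shouldIgnoreStatistics(createdBy, type);
            System.out.println(column + " -> statistics trusted: " + !ignore);
        }
    }
}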

Example 4 with FileMetaData

Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.

From the class ParquetFileWriter, method mergeMetadataFiles:

/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible and that the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
    Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
    GlobalMetaData globalMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Path p : files) {
        ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
        FileMetaData fmd = pmd.getFileMetaData();
        globalMetaData = mergeInto(fmd, globalMetaData, true);
        blocks.addAll(pmd.getBlocks());
    }
    // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
    return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Also used: ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), Path (org.apache.hadoop.fs.Path), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ArrayList (java.util.ArrayList), GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData), Util.writeFileMetaData (org.apache.parquet.format.Util.writeFileMetaData), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
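
A minimal sketch of calling the deprecated helper above, assuming two hypothetical _metadata summary files passed as command-line arguments; the merge throws if the schemas are incompatible or the extra key/value metadata differs:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class MergeMetadataSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical inputs: two _metadata summary files to merge.
        List<Path> metadataFiles = Arrays.asList(new Path(args[0]), new Path(args[1]));

        // Throws IOException or IllegalStateException if the footers cannot be merged.
        ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, conf);

        System.out.println("merged schema:\n" + merged.getFileMetaData().getSchema());
        System.out.println("row groups: " + merged.getBlocks().size());
    }
}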

Example 5 with FileMetaData

Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.

From the class ParquetRecordReaderWrapper, method getSplit:

/**
 * Gets a ParquetInputSplit corresponding to a split given by Hive.
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) oldSplit;
        final long splitStart = fileSplit.getStart();
        final long splitLength = fileSplit.getLength();
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
        return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), FileSplit (org.apache.hadoop.mapred.FileSplit), JobConf (org.apache.hadoop.mapred.JobConf), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
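
getSplit() reads the footer with SKIP_ROW_GROUPS because only the file-level schema and key/value metadata are needed to build the read context. Below is a minimal standalone sketch of that footer-only read (the class name and path argument are placeholders, not part of Hive or parquet-mr):

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

public class FooterOnlySketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input: path to a Parquet file passed on the command line.
        Path path = new Path(args[0]);

        // Skip per-row-group metadata; cheaper when only the schema and
        // key/value metadata are needed.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, SKIP_ROW_GROUPS);
        FileMetaData fileMetaData = footer.getFileMetaData();

        System.out.println(fileMetaData.getSchema());
        for (Map.Entry<String, String> e : fileMetaData.getKeyValueMetaData().entrySet()) {
            System.out.println(e.getKey() + " = " + e.getValue());
        }
    }
}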

Aggregations

FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 16 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 11 usages
Path (org.apache.hadoop.fs.Path): 10 usages
ArrayList (java.util.ArrayList): 7 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 7 usages
Domain (com.facebook.presto.common.predicate.Domain): 3 usages
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 3 usages
Type (com.facebook.presto.common.type.Type): 3 usages
TypeManager (com.facebook.presto.common.type.TypeManager): 3 usages
FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats): 3 usages
HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment): 3 usages
IOException (java.io.IOException): 3 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 3 usages
RuntimeStats (com.facebook.presto.common.RuntimeStats): 2 usages
HdfsContext (com.facebook.presto.hive.HdfsContext): 2 usages
HdfsParquetDataSource.buildHdfsParquetDataSource (com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource): 2 usages
ParquetPageSource (com.facebook.presto.hive.parquet.ParquetPageSource): 2 usages
AggregatedMemoryContext (com.facebook.presto.memory.context.AggregatedMemoryContext): 2 usages
AggregatedMemoryContext.newSimpleAggregatedMemoryContext (com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext): 2 usages
Field (com.facebook.presto.parquet.Field): 2 usages