Use of org.apache.parquet.hadoop.metadata.FileMetaData in project hive by apache.
The class ParquetRecordReaderBase, method setTimeZoneConversion:
/**
 * Sets the TimeZone conversion for Parquet timestamp columns.
 *
 * @param configuration Configuration object used to get and set the TimeZone conversion
 * @param finalPath path to the Parquet file
 */
protected void setTimeZoneConversion(Configuration configuration, Path finalPath) {
  ParquetMetadata parquetMetadata;
  String timeZoneID;
  try {
    parquetMetadata = ParquetFileReader.readFooter(configuration, finalPath,
        ParquetMetadataConverter.NO_FILTER);
  } catch (IOException e) {
    // If an error occurred while reading the file, then we just skip the TimeZone setting.
    // This error will probably occur on any other part of the code.
    LOG.debug("Could not read parquet file footer at " + finalPath
        + ". Cannot determine parquet file timezone", e);
    return;
  }
  boolean skipConversion = HiveConf.getBoolVar(configuration,
      HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") || skipConversion) {
    // Impala writes timestamp values using GMT only. We should not try to convert Impala
    // files to other timezones.
    timeZoneID = ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE;
  } else {
    // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect which timezone conversion
    // to use when reading Parquet timestamps.
    timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
        ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
    if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
      throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
    }
  }
  // 'timeZoneID' is valid at this point, since no exception was thrown above
  configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
      TimeZone.getTimeZone(timeZoneID).getID());
}
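The createdBy check is the central use of FileMetaData here. A minimal, self-contained sketch of that test in isolation (the helper class name is hypothetical, not Hive code; it reuses only the readFooter and getCreatedBy calls shown above):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;

public final class CreatedByCheck {
  // Returns true when the footer says the file was written by parquet-mr; files written by
  // other engines (e.g. Impala) report a different createdBy string, so the method above
  // keeps the no-adjustment timezone for them.
  static boolean writtenByParquetMr(Configuration conf, Path file) throws IOException {
    FileMetaData meta = ParquetFileReader
        .readFooter(conf, file, ParquetMetadataConverter.NO_FILTER)
        .getFileMetaData();
    String createdBy = meta.getCreatedBy();
    return createdBy != null && createdBy.startsWith("parquet-mr");
  }
}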
Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.
The class InternalParquetRecordReader, method initialize:
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(
      new InitContext(configuration, toSetMultiMap(fileMetadata), fileSchema));
  // the footer's createdBy string is passed along so column I/O can account for the writer version
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  // the ReadSupport decides which columns are actually requested (projection)
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
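For context, a small standalone sketch (the class name and output format are assumptions, not part of parquet-mr) that pulls the same three pieces of FileMetaData the initializer relies on: the file schema, the key/value metadata handed to ReadSupport.init, and the createdBy string:

import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;

public final class FooterDump {
  public static void main(String[] args) throws IOException {
    FileMetaData meta = ParquetFileReader
        .readFooter(new Configuration(), new Path(args[0]), ParquetMetadataConverter.NO_FILTER)
        .getFileMetaData();
    System.out.println("created by: " + meta.getCreatedBy());
    System.out.println("schema:\n" + meta.getSchema());
    for (Map.Entry<String, String> e : meta.getKeyValueMetaData().entrySet()) {
      System.out.println("key/value : " + e.getKey() + " = " + e.getValue());
    }
  }
}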
Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.
The class CheckParquet251Command, method check:
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));
    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }
  return null;
}
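The command's gate on createdBy can be shown on its own. A hedged sketch (hypothetical helper, not part of parquet-mr) of the same decision: statistics written by parquet-mr versions affected by PARQUET-251 are only suspect for BINARY columns, so other files can skip the page-level re-validation entirely:

import org.apache.parquet.CorruptStatistics;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

final class StatsTrustCheck {
  // Delegates parsing of the created_by string to parquet-mr's own version check.
  static boolean binaryStatsSuspect(FileMetaData meta) {
    return CorruptStatistics.shouldIgnoreStatistics(meta.getCreatedBy(), BINARY);
  }
}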
Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.
The class ParquetFileWriter, method mergeMetadataFiles:
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }
  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
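A possible caller, sketched under the assumption of two summary metadata files whose paths arrive on the command line (the class name is made up for illustration); as the Javadoc above states, the merge throws if the schemas are incompatible or the extra metadata differs:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public final class MergeMetadataExample {
  public static void main(String[] args) throws Exception {
    List<Path> parts = Arrays.asList(new Path(args[0]), new Path(args[1]));
    ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(parts, new Configuration());
    System.out.println("merged footer covers " + merged.getBlocks().size()
        + " row groups, created by " + merged.getFileMetaData().getCreatedBy());
  }
}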
Use of org.apache.parquet.hadoop.metadata.FileMetaData in project parquet-mr by apache.
The class ParquetRecordReaderWrapper, method getSplit:
/**
 * Gets a ParquetInputSplit corresponding to a split given by Hive.
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext = new DataWritableReadSupport().init(
        cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
    return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength,
        fileSplit.getLocations(), null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
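As a rough companion sketch (hypothetical helper, not Hive code), the same SKIP_ROW_GROUPS footer read can be used to count fields directly on the file schema, rather than on the projected Hive schema negotiated through DataWritableReadSupport above:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

final class FileSchemaFieldCount {
  // Reads only the footer (row-group metadata is skipped) and reports the top-level field count.
  static int fieldCount(Configuration conf, Path file) throws IOException {
    FileMetaData meta = ParquetFileReader.readFooter(conf, file, SKIP_ROW_GROUPS).getFileMetaData();
    return meta.getSchema().getFieldCount();
  }
}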