Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project hive by apache.
The class VectorizedParquetRecordReader, method initialize:
public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
  jobConf = configuration;
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readFooter(configuration, file, NO_FILTER);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  MessageType tableSchema;
  if (indexAccess) {
    List<Integer> indexSequence = new ArrayList<>();
    // Generates a sequence list of indexes
    for (int i = 0; i < columnNamesList.size(); i++) {
      indexSequence.add(i);
    }
    tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
  } else {
    tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
  }
  indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
  if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
    requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
  } else {
    requestedSchema = fileSchema;
  }
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
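The else branch above is the core ParquetMetadata usage: read the full footer, then keep only the row groups whose starting positions were selected on the client. A minimal, self-contained sketch of that idea follows; the class and method names are illustrative, not part of Hive.

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupOffsetSketch {

  // Reads the full footer and keeps only the row groups whose starting positions
  // were selected on the client and shipped with the split.
  static List<BlockMetaData> blocksAtOffsets(Configuration conf, Path file, long[] rowGroupOffsets)
      throws IOException {
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    Set<Long> wanted = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      wanted.add(offset);
    }
    List<BlockMetaData> blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (wanted.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    return blocks;
  }
}

The task-side branch in the Hive code reaches the same selection differently: it passes range(split.getStart(), split.getEnd()) to readFooter so the footer itself only reports row groups inside the split, and then applies the predicate push-down filter.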
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project hive by apache.
The class ParquetRecordReaderBase, method setTimeZoneConversion:
/**
 * Sets the TimeZone conversion for Parquet timestamp columns.
 *
 * @param configuration Configuration object where to get and set the TimeZone conversion
 * @param finalPath path to the parquet file
 */
protected void setTimeZoneConversion(Configuration configuration, Path finalPath) {
  ParquetMetadata parquetMetadata;
  String timeZoneID;
  try {
    parquetMetadata = ParquetFileReader.readFooter(configuration, finalPath, ParquetMetadataConverter.NO_FILTER);
  } catch (IOException e) {
    // If an error occurred while reading the file, then we just skip the TimeZone setting.
    // This error will probably occur on any other part of the code.
    LOG.debug("Could not read parquet file footer at " + finalPath + ". Cannot determine "
        + "parquet file timezone", e);
    return;
  }
  boolean skipConversion = HiveConf.getBoolVar(configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") || skipConversion) {
    // Impala writes timestamp values using GMT only. We should not try to convert Impala
    // files to other types of timezones.
    timeZoneID = ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE;
  } else {
    // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect what timezone conversion
    // to use when reading Parquet timestamps.
    timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
        ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
    if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
      throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
    }
  }
  // 'timeZoneID' should be valid, since we did not throw exception above
  configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getTimeZone(timeZoneID).getID());
}
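The ParquetMetadata call that drives this decision is getFileMetaData().getCreatedBy(), which identifies the writer of the file. A minimal sketch of that check on its own is shown below; the class and method names are illustrative, not Hive code.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class CreatedByCheckSketch {

  // Returns true if the footer reports parquet-mr as the writer; files from other writers
  // (for example Impala) can then be treated differently, as in the Hive method above.
  static boolean writtenByParquetMr(Configuration conf, Path file) throws IOException {
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    String createdBy = footer.getFileMetaData().getCreatedBy();
    return createdBy != null && createdBy.startsWith("parquet-mr");
  }
}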
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
The class HiveDrillNativeScanBatchCreator, method getBatch:
@Override
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
  final HiveTableWithColumnCache table = config.getTable();
  final List<InputSplit> splits = config.getInputSplits();
  final List<HivePartition> partitions = config.getPartitions();
  final List<SchemaPath> columns = config.getColumns();
  final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
  List<Map<String, String>> implicitColumns = Lists.newLinkedList();
  boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
  final boolean hasPartitions = (partitions != null && partitions.size() > 0);
  final List<String[]> partitionColumns = Lists.newArrayList();
  final List<Integer> selectedPartitionColumns = Lists.newArrayList();
  List<SchemaPath> newColumns = columns;
  if (!selectAllQuery) {
    // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
    // ParquetRecordReader. Partition columns are passed to ScanBatch.
    newColumns = Lists.newArrayList();
    Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
    for (SchemaPath column : columns) {
      Matcher m = pattern.matcher(column.getAsUnescapedPath());
      if (m.matches()) {
        selectedPartitionColumns.add(Integer.parseInt(column.getAsUnescapedPath().substring(partitionDesignator.length())));
      } else {
        newColumns.add(column);
      }
    }
  }
  final OperatorContext oContext = context.newOperatorContext(config);
  int currentPartitionIndex = 0;
  final List<RecordReader> readers = Lists.newArrayList();
  final HiveConf conf = config.getHiveConf();
  // TODO: In future we can get this cache from Metadata cached on filesystem.
  final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
  Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
  try {
    for (InputSplit split : splits) {
      final FileSplit fileSplit = (FileSplit) split;
      final Path finalPath = fileSplit.getPath();
      final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
      final FileSystem fs = finalPath.getFileSystem(cloneJob);
      ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
      if (parquetMetadata == null) {
        parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
        footerCache.put(finalPath.toString(), parquetMetadata);
      }
      final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
      for (int rowGroupNum : rowGroupNums) {
        // DRILL-5009: Skip the row group if the row count is zero
        if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
          continue;
        }
        // Drill has only ever written a single row group per file, only detect corruption
        // in the first row group
        ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
            ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
        if (logger.isDebugEnabled()) {
          logger.debug(containsCorruptDates.toString());
        }
        readers.add(new ParquetRecordReader(context,
            Path.getPathWithoutSchemeAndAuthority(finalPath).toString(),
            rowGroupNum, fs,
            CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0),
            parquetMetadata, newColumns, containsCorruptDates));
        Map<String, String> implicitValues = Maps.newLinkedHashMap();
        if (hasPartitions) {
          List<String> values = partitions.get(currentPartitionIndex).getValues();
          for (int i = 0; i < values.size(); i++) {
            if (selectAllQuery || selectedPartitionColumns.contains(i)) {
              implicitValues.put(partitionDesignator + i, values.get(i));
            }
          }
        }
        implicitColumns.add(implicitValues);
        if (implicitValues.size() > mapWithMaxColumns.size()) {
          mapWithMaxColumns = implicitValues;
        }
      }
      currentPartitionIndex++;
    }
  } catch (final IOException | RuntimeException e) {
    AutoCloseables.close(e, readers);
    throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
  }
  // all readers should have the same number of implicit columns, add missing ones with value null
  mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
  for (Map<String, String> map : implicitColumns) {
    map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
  }
  // create an empty RecordReader to output the schema
  if (readers.size() == 0) {
    readers.add(new HiveDefaultReader(table, null, null, columns, context, conf,
        ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
  }
  return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);
}
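Because several input splits can point into the same Parquet file, the method above caches footers by file path so each footer is read only once per batch creation. A minimal sketch of that caching pattern follows; the class and method names are illustrative, not Drill code.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterCacheSketch {

  // Footers keyed by the file's path string, mirroring the footerCache map above.
  private final Map<String, ParquetMetadata> footerCache = new HashMap<>();

  ParquetMetadata footerFor(Configuration conf, Path file) throws IOException {
    String key = file.toString();
    ParquetMetadata footer = footerCache.get(key);
    if (footer == null) {
      // Only hit the filesystem on a cache miss.
      footer = ParquetFileReader.readFooter(conf, file);
      footerCache.put(key, footer);
    }
    return footer;
  }
}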
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project h2o-3 by h2oai.
The class ParquetParser, method parseChunk:
@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
  if (!(din instanceof FVecParseReader)) {
    // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
    throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
  }
  Chunk chunk = ((FVecParseReader) din).getChunk();
  Vec vec = chunk.vec();
  // extract metadata, we want to read only the row groups that have centers in this chunk
  ParquetMetadataConverter.MetadataFilter chunkFilter =
      ParquetMetadataConverter.range(chunk.start(), chunk.start() + chunk.len());
  ParquetMetadata metadata = VecParquetReader.readFooter(_metadata, chunkFilter);
  if (metadata.getBlocks().isEmpty()) {
    Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
    return dout;
  }
  Log.info("Processing ", metadata.getBlocks().size(), " blocks of chunk #", cidx);
  VecParquetReader reader = new VecParquetReader(vec, metadata, dout, _setup.getColumnTypes());
  try {
    Integer recordNumber;
    do {
      recordNumber = reader.read();
    } while (recordNumber != null);
  } catch (IOException e) {
    throw new RuntimeException("Failed to parse records", e);
  }
  return dout;
}
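Here the footer is parsed with a range metadata filter, so only row groups whose midpoints fall inside the chunk's byte range are returned, and an empty block list means the chunk has nothing to parse. A rough sketch of the same filter is shown below; note that the H2O code above reads the footer from serialized bytes via VecParquetReader, whereas this sketch reads it from a file path, and the names are illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RangeFilterSketch {

  // Returns the number of row groups assigned to the byte range [start, start + len).
  static int rowGroupsInRange(Configuration conf, Path file, long start, long len) throws IOException {
    ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(start, start + len);
    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, file, filter);
    // An empty block list means no row-group midpoint lies in this range, so there is nothing to read.
    return metadata.getBlocks().size();
  }
}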
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
The class Metadata, method getParquetFileMetadata_v3:
/**
 * Get the metadata for a single Parquet file.
 *
 * @param parquetTableMetadata table-level metadata that accumulates the column type info
 * @param file status of the Parquet file whose footer is read
 * @return the per-file metadata (row groups, column chunks, and statistics)
 * @throws IOException if the file footer cannot be read
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();
  // Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }
  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v3 columnMetadata;
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(),
          colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale,
          colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info. We'll merge it into one list
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats when they are not null
        Object minValue = null;
        Object maxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
          minValue = stats.genericGetMin();
          maxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
            maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
          }
        }
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }
    // Note we still read the schema even if there are no values in the RowGroup
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
        getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
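This method walks every row group and column chunk in the footer and records min/max/null-count statistics when they are present. A minimal standalone sketch of that traversal follows; it is not Drill code, and the class and method names are illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnStatsSketch {

  // Prints per-column-chunk statistics for every row group in the file's footer.
  static void printColumnStats(Configuration conf, Path file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, file);
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
      for (ColumnChunkMetaData col : rowGroup.getColumns()) {
        Statistics<?> stats = col.getStatistics();
        if (stats != null && !stats.isEmpty()) {
          System.out.println(col.getPath().toDotString()
              + " type=" + col.getType()
              + " min=" + stats.genericGetMin()
              + " max=" + stats.genericGetMax()
              + " nulls=" + stats.getNumNulls());
        } else {
          System.out.println(col.getPath().toDotString() + " has no statistics");
        }
      }
    }
  }
}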