Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.
From class Metadata, method readBlockMeta.
/**
 * Reads the parquet metadata from a file.
 *
 * @param path path to the metadata file
 * @param dirsOnly true to read a {@link Metadata#METADATA_DIRECTORIES_FILENAME} file,
 *                 false to read a {@link Metadata#OLD_METADATA_FILENAME} file
 * @param metaContext current metadata context
 * @param fs file system the metadata file resides on
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
  String metadataParentDirPath = metadataParentDir.toUri().getPath();
  ObjectMapper mapper = new ObjectMapper();
  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class, new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
  serialModule.addKeyDeserializer(Metadata_V3.ColumnTypeMetadata_v3.Key.class, new Metadata_V3.ColumnTypeMetadata_v3.Key.DeSerializer());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);
  boolean isFileMetadata = path.toString().endsWith(METADATA_FILENAME);
  boolean isSummaryFile = path.toString().endsWith(METADATA_SUMMARY_FILENAME);
  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  try (InputStream is = fs.open(path)) {
    boolean alreadyCheckedModification;
    boolean newMetadata = false;
    alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);
    if (dirsOnly) {
      parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
      if (timer != null) {
        logger.debug("Took {} ms to read directories from directory cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
      if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path, metadataParentDir, metaContext, fs)) {
        parquetTableMetadataDirs = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getRight();
        newMetadata = true;
      }
    } else {
      if (isFileMetadata) {
        parquetTableMetadata.assignFiles((mapper.readValue(is, FileMetadata.class)).getFiles());
        if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(4, 0)) {
          ((ParquetTableMetadata_v4) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
        }
        if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, metaContext, fs)) {
          parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
          newMetadata = true;
        }
      } else if (isSummaryFile) {
        MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
        parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
      } else {
        parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
        if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(3, 0)) {
          ((Metadata_V3.ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
        }
        if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, metaContext, fs)) {
          parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
          newMetadata = true;
        }
      }
      if (timer != null) {
        logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      if (!isSummaryFile) {
        List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
        if (files != null) {
          for (ParquetFileMetadata file : files) {
            // DRILL-5009: Remove empty row groups unless it is the only row group
            List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
            if (rowGroups.size() == 1) {
              continue;
            }
            rowGroups.removeIf(r -> r.getRowCount() == 0);
          }
        }
      }
      if (newMetadata) {
        // if new metadata files were created, invalidate the existing metadata context
        metaContext.clear();
      }
    }
  } catch (IOException e) {
    logger.error("Failed to read '{}' metadata file", path, e);
    metaContext.setMetadataCacheCorrupted(true);
  }
}
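
The Jackson setup above (custom (de)serializer modules, the Afterburner module, and FAIL_ON_UNKNOWN_PROPERTIES disabled so cache files written by other versions still parse) can be exercised in isolation. A minimal sketch, with a hypothetical CacheFile bean standing in for Drill's metadata classes:

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.module.afterburner.AfterburnerModule;

public class MapperSetupSketch {
  // Hypothetical stand-in for Drill's metadata beans
  public static class CacheFile {
    public String metadata_version;
  }

  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    AfterburnerModule module = new AfterburnerModule();
    module.setUseOptimizedBeanDeserializer(true); // same flag readBlockMeta sets
    mapper.registerModule(module);
    // tolerate fields written by metadata versions this reader does not know
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    CacheFile parsed = mapper.readValue("{\"metadata_version\":\"v4\",\"unknown\":1}", CacheFile.class);
    System.out.println(parsed.metadata_version); // prints: v4
  }
}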
Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.
From class MetadataPathUtils, method convertToFilesWithAbsolutePaths.
/**
* Convert a list of files with relative paths to files with absolute ones
*
* @param files list of files with relative paths
* @param baseDir base parent directory
* @return list of files with absolute paths
*/
public static List<? extends ParquetFileMetadata> convertToFilesWithAbsolutePaths(List<? extends ParquetFileMetadata> files, String baseDir) {
  if (!files.isEmpty()) {
    List<ParquetFileMetadata> filesWithAbsolutePaths = new ArrayList<>();
    for (ParquetFileMetadata file : files) {
      Path relativePath = file.getPath();
      ParquetFileMetadata fileWithAbsolutePath = null;
      // create a new file if the old one contains a relative path, otherwise reuse the old file
      if (file instanceof ParquetFileMetadata_v4) {
        fileWithAbsolutePath = (relativePath.isAbsolute()) ? file
            : new ParquetFileMetadata_v4(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V4.RowGroupMetadata_v4>) file.getRowGroups());
      } else if (file instanceof ParquetFileMetadata_v3) {
        fileWithAbsolutePath = (relativePath.isAbsolute()) ? file
            : new ParquetFileMetadata_v3(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V3.RowGroupMetadata_v3>) file.getRowGroups());
      }
      filesWithAbsolutePaths.add(fileWithAbsolutePath);
    }
    return filesWithAbsolutePaths;
  }
  return files;
}
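
The conversion relies on two Hadoop Path behaviors: isAbsolute() and the parent/child constructor that resolves a relative child against a base directory. A standalone sketch with made-up paths:

import org.apache.hadoop.fs.Path;

public class PathResolutionSketch {
  public static void main(String[] args) {
    String baseDir = "/data/warehouse/table"; // hypothetical base directory
    Path relative = new Path("1995/Q1/part-0.parquet"); // as stored in a cache file with relative paths
    Path absolute = new Path("/elsewhere/part-1.parquet");
    // mirrors the ternary in convertToFilesWithAbsolutePaths
    System.out.println(relative.isAbsolute() ? relative : new Path(baseDir, relative));
    // -> /data/warehouse/table/1995/Q1/part-0.parquet
    System.out.println(absolute.isAbsolute() ? absolute : new Path(baseDir, absolute));
    // -> /elsewhere/part-1.parquet (already absolute, kept unchanged)
  }
}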
Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.
From class ParquetReaderUtility, method transformBinaryInMetadataCache.
/**
 * Transforms min / max binary statistics values into byte arrays.
 * The transformation logic depends on the metadata file version.
 *
 * @param parquetTableMetadata table metadata that should be corrected
 * @param readerConfig parquet reader config
 */
public static void transformBinaryInMetadataCache(ParquetTableMetadataBase parquetTableMetadata, ParquetReaderConfig readerConfig) {
  // Collect the names of the columns with BINARY data type
  // from the metadata cache file, for v2 and all v3 versions
  Set<List<String>> columnsNames = getBinaryColumnsNames(parquetTableMetadata);
  boolean allowBinaryMetadata = allowBinaryMetadata(parquetTableMetadata.getDrillVersion(), readerConfig);
  MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
  // Setting min / max values for ParquetTableMetadata_v1
  if (metadataVersion.isEqualTo(1, 0)) {
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
        Long rowCount = rowGroupMetadata.getRowCount();
        for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
          if (columnMetadata.getPrimitiveType() == PrimitiveTypeName.BINARY || columnMetadata.getPrimitiveType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
            setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, false);
          }
        }
      }
    }
    return;
  }
  // Variables needed for debugging only
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  int maxRowGroups = 0;
  int minRowGroups = Integer.MAX_VALUE;
  int maxNumColumns = 0;
  // Setting min / max values for v2, v3 and v4 versions; for versions v3_3 and above the values need decoding
  boolean needDecoding = metadataVersion.isAtLeast(3, 3);
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    if (timer != null) {
      // for debugging only
      maxRowGroups = Math.max(maxRowGroups, file.getRowGroups().size());
      minRowGroups = Math.min(minRowGroups, file.getRowGroups().size());
    }
    for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
      Long rowCount = rowGroupMetadata.getRowCount();
      if (timer != null) {
        // for debugging only
        maxNumColumns = Math.max(maxNumColumns, rowGroupMetadata.getColumns().size());
      }
      for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
        if (columnsNames.contains(Arrays.asList(columnMetadata.getName()))) {
          setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, needDecoding);
        }
      }
    }
  }
  if (timer != null) {
    // log a debug message and stop the timer
    String reportRG = maxRowGroups == 1 ? "1 rowgroup" : "between " + minRowGroups + "-" + maxRowGroups + " rowgroups";
    logger.debug("Transforming binary in metadata cache took {} ms ({} files, {} per file, max {} columns)", timer.elapsed(TimeUnit.MILLISECONDS), parquetTableMetadata.getFiles().size(), reportRG, maxNumColumns);
    timer.stop();
  }
}
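
The needDecoding flag is the heart of the version gate: pre-v3_3 cache files store binary min / max statistics as plain strings, while v3_3 and later store them encoded. A hedged sketch of what setMinMaxValues has to do with that flag; Base64 is stated here as an assumption about the encoding, not something the snippet itself confirms:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class MinMaxDecodeSketch {
  // hypothetical helper mirroring the needDecoding branch of setMinMaxValues
  static byte[] toBinaryValue(String storedValue, boolean needDecoding) {
    return needDecoding
        ? Base64.getDecoder().decode(storedValue) // assumed v3_3+ encoding
        : storedValue.getBytes(StandardCharsets.UTF_8); // pre-v3_3: raw string bytes
  }

  public static void main(String[] args) {
    System.out.println(new String(toBinaryValue("aGVsbG8=", true), StandardCharsets.UTF_8)); // hello
    System.out.println(new String(toBinaryValue("hello", false), StandardCharsets.UTF_8)); // hello
  }
}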
Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.
From class ParquetReaderUtility, method correctDatesInMetadataCache.
public static void correctDatesInMetadataCache(ParquetTableMetadataBase parquetTableMetadata) {
  MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
  DateCorruptionStatus cacheFileCanContainsCorruptDates = metadataVersion.isAtLeast(3, 0)
      ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION
      : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
  if (cacheFileCanContainsCorruptDates == DateCorruptionStatus.META_UNCLEAR_TEST_VALUES) {
    // Look up the names of the DATE columns in the metadata cache file ("metadata_version" : "v2")
    String[] names = new String[0];
    if (metadataVersion.isEqualTo(2, 0)) {
      for (ColumnTypeMetadata_v2 columnTypeMetadata : ((ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
        if (OriginalType.DATE.equals(columnTypeMetadata.originalType)) {
          names = columnTypeMetadata.name;
        }
      }
    }
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      // Drill has only ever written a single row group per file, so the statistics
      // only need to be corrected on the first row group
      RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
      Long rowCount = rowGroupMetadata.getRowCount();
      for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
        if (metadataVersion.isEqualTo(1, 0)) {
          // Setting min / max values for ParquetTableMetadata_v1
          OriginalType originalType = columnMetadata.getOriginalType();
          if (OriginalType.DATE.equals(originalType) && columnMetadata.hasSingleValue(rowCount)
              && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
            columnMetadata.setMax(newMinMax);
            columnMetadata.setMin(newMinMax);
          }
        } else if (metadataVersion.isEqualTo(2, 0) && columnMetadata.getName() != null && Arrays.equals(columnMetadata.getName(), names)
            && columnMetadata.hasSingleValue(rowCount) && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
          // Setting the max value for ParquetTableMetadata_v2
          int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
          columnMetadata.setMax(newMax);
        }
      }
    }
  }
}
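
The actual correction, ParquetReaderUtility.autoCorrectCorruptedDate, undoes a fixed shift: affected writers added the Julian day number of the Unix epoch twice to the days-since-epoch value (DRILL-4203). A standalone sketch of that arithmetic; the constant 2440588 is restated here as an assumption rather than taken from the snippet above:

public class DateCorrectionSketch {
  // Julian day number of 1970-01-01 (assumed value of Drill's
  // JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH constant)
  static final int JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH = 2440588;

  // mirrors autoCorrectCorruptedDate: subtract the doubled epoch offset
  static int autoCorrectCorruptedDate(int corruptedDate) {
    return corruptedDate - 2 * JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH;
  }

  public static void main(String[] args) {
    int corrupted = 10957 + 2 * JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH; // 2000-01-01 written with the doubled shift
    System.out.println(autoCorrectCorruptedDate(corrupted)); // 10957 = days from 1970-01-01 to 2000-01-01
  }
}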