use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.
the class TestParquetMetadataVersion method testZeroMinorVersion.
/**
 * Checks that the version string "4.0" (zero minor component) is parsed into a
 * {@code MetadataVersion} equal to the one built from explicit major 4 and minor 0.
 */
@Test
public void testZeroMinorVersion() throws Exception {
  MetadataVersion parsed = new MetadataVersion("4.0");
  MetadataVersion expected = new MetadataVersion(4, 0);
  assertEquals("Parquet metadata version is parsed incorrectly", expected, parsed);
}
use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.
the class ParquetReaderUtility method transformBinaryInMetadataCache.
/**
 * Transforms values for min / max binary statistics to byte array.
 * Transformation logic depends on metadata file version:
 * <ul>
 *   <li>version 1.0: every BINARY / FIXED_LEN_BYTE_ARRAY column is converted, without decoding;</li>
 *   <li>any other version: only the columns reported by {@code getBinaryColumnsNames} are converted,
 *       and decoding is requested for version 3.3 and above.</li>
 * </ul>
 * When debug logging is enabled, the elapsed time and a few size statistics are logged.
 *
 * @param parquetTableMetadata table metadata that should be corrected
 * @param readerConfig parquet reader config
 */
public static void transformBinaryInMetadataCache(ParquetTableMetadataBase parquetTableMetadata, ParquetReaderConfig readerConfig) {
  // Looking for the names of the columns with BINARY data type
  // in the metadata cache file for V2 and all v3 versions
  Set<List<String>> columnsNames = getBinaryColumnsNames(parquetTableMetadata);
  boolean allowBinaryMetadata = allowBinaryMetadata(parquetTableMetadata.getDrillVersion(), readerConfig);
  MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());

  // Setting Min / Max values for ParquetTableMetadata_v1
  if (metadataVersion.isEqualTo(1, 0)) {
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
        Long rowCount = rowGroupMetadata.getRowCount();
        for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
          if (columnMetadata.getPrimitiveType() == PrimitiveTypeName.BINARY
              || columnMetadata.getPrimitiveType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
            setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, false);
          }
        }
      }
    }
    return;
  }

  // Variables needed for debugging only; the timer doubles as the "debug enabled" flag
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  int maxRowGroups = 0;
  int minRowGroups = Integer.MAX_VALUE;
  int maxNumColumns = 0;

  // Setting Min / Max values for V2, V3 and V4 versions; for versions V3_3 and above need to do decoding
  boolean needDecoding = metadataVersion.isAtLeast(3, 3);
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    if (timer != null) {
      // for debugging only
      maxRowGroups = Math.max(maxRowGroups, file.getRowGroups().size());
      minRowGroups = Math.min(minRowGroups, file.getRowGroups().size());
    }
    for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
      Long rowCount = rowGroupMetadata.getRowCount();
      if (timer != null) {
        // for debugging only
        maxNumColumns = Math.max(maxNumColumns, rowGroupMetadata.getColumns().size());
      }
      for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
        if (columnsNames.contains(Arrays.asList(columnMetadata.getName()))) {
          setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, needDecoding);
        }
      }
    }
  }

  if (timer != null) {
    // log a debug message and stop the timer
    if (minRowGroups > maxRowGroups) {
      // No file was visited, so the minimum still holds its Integer.MAX_VALUE sentinel
      minRowGroups = maxRowGroups;
    }
    String reportRG = 1 == maxRowGroups
        ? "1 rowgroup"
        : "between " + minRowGroups + "-" + maxRowGroups + " rowgroups";
    logger.debug("Transforming binary in metadata cache took {} ms ({} files, {} per file, max {} columns)",
        timer.elapsed(TimeUnit.MILLISECONDS), parquetTableMetadata.getFiles().size(), reportRG, maxNumColumns);
    timer.stop();
  }
}
use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.
the class ParquetTableMetadataUtils method getRowGroupFields.
/**
 * Builds an insertion-ordered map from column path to drill type for the given {@code rowGroup}.
 * <p>
 * For metadata versions higher than 4.0 whose table metadata does not mark all columns as
 * interesting, the non-interesting columns from the table-level column type info are added
 * first; the columns of the row group itself are added afterwards.
 *
 * @param parquetTableMetadata the source of primitive and original column types
 * @param rowGroup row group whose columns should be discovered
 * @return map of column names with their drill types
 */
public static Map<SchemaPath, TypeProtos.MajorType> getRowGroupFields(MetadataBase.ParquetTableMetadataBase parquetTableMetadata, MetadataBase.RowGroupMetadata rowGroup) {
  Map<SchemaPath, TypeProtos.MajorType> fields = new LinkedHashMap<>();

  boolean hasNonInterestingColumns =
      new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isHigherThan(4, 0)
          && !((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).isAllColumnsInteresting();

  if (hasNonInterestingColumns) {
    // adds non-interesting fields from table metadata
    for (MetadataBase.ColumnTypeMetadata typeInfo : parquetTableMetadata.getColumnTypeInfoList()) {
      Metadata_V4.ColumnTypeMetadata_v4 typeInfoV4 = (Metadata_V4.ColumnTypeMetadata_v4) typeInfo;
      if (typeInfoV4.isInteresting) {
        continue;
      }
      TypeProtos.MajorType majorType =
          getColumnType(typeInfoV4.name, typeInfoV4.primitiveType, typeInfoV4.originalType, parquetTableMetadata);
      SchemaPath path = SchemaPath.getCompoundPath(typeInfoV4.name);
      putType(fields, path, majorType);
    }
  }

  // columns physically present in the row group
  for (MetadataBase.ColumnMetadata rowGroupColumn : rowGroup.getColumns()) {
    TypeProtos.MajorType majorType = getColumnType(parquetTableMetadata, rowGroupColumn);
    SchemaPath path = SchemaPath.getCompoundPath(rowGroupColumn.getName());
    putType(fields, path, majorType);
  }
  return fields;
}
use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.
the class ParquetTableMetadataUtils method getColumnType.
/**
 * Builds the drill major type for a column described by its name parts and parquet types.
 * Scale and precision are read from the table metadata only for metadata version 3.0 and
 * above; for older versions both are 0.
 *
 * @param name column name parts
 * @param primitiveType parquet primitive type of the column
 * @param originalType parquet original type of the column
 * @param parquetTableMetadata table metadata used to look up scale, precision and data mode
 * @return major type with the data mode resolved by {@code getDataMode}
 */
private static TypeProtos.MajorType getColumnType(String[] name, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
  MetadataVersion version = new MetadataVersion(parquetTableMetadata.getMetadataVersion());

  // only ColumnTypeMetadata_v3 and ColumnTypeMetadata_v4 store information about scale, precision,
  // repetition level and definition level
  boolean storesScaleAndPrecision = version.isAtLeast(3, 0);
  int scale = storesScaleAndPrecision ? parquetTableMetadata.getScale(name) : 0;
  int precision = storesScaleAndPrecision ? parquetTableMetadata.getPrecision(name) : 0;

  TypeProtos.DataMode dataMode = getDataMode(parquetTableMetadata, version, name);
  return TypeProtos.MajorType
      .newBuilder(ParquetReaderUtility.getType(primitiveType, originalType, precision, scale))
      .setMode(dataMode)
      .build();
}
use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.
the class ParquetReaderUtility method correctDatesInMetadataCache.
/**
 * Corrects possibly corrupted DATE statistics stored in the metadata cache.
 * <p>
 * Cache files of version 3.0 and above are treated as showing no corruption and are left
 * untouched. For older files, only the first row group of each file is inspected: a DATE
 * column holding a single value whose max exceeds
 * {@code ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD} gets the auto-corrected value.
 * Version 1.0 rewrites both min and max; version 2.0 rewrites max only.
 *
 * @param parquetTableMetadata table metadata whose date statistics should be corrected in place
 */
public static void correctDatesInMetadataCache(ParquetTableMetadataBase parquetTableMetadata) {
  MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
  if (metadataVersion.isAtLeast(3, 0)) {
    // the metadata shows no corruption, nothing to correct
    return;
  }
  boolean isV1 = metadataVersion.isEqualTo(1, 0);
  boolean isV2 = metadataVersion.isEqualTo(2, 0);

  // Looking for the DATE data type of column names in the metadata cache file ("metadata_version" : "v2").
  // Every DATE column is collected; keeping only one name would leave the other DATE columns uncorrected.
  Set<List<String>> dateColumnNames = new java.util.HashSet<>();
  if (isV2) {
    for (ColumnTypeMetadata_v2 columnTypeMetadata : ((ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
      if (OriginalType.DATE.equals(columnTypeMetadata.originalType) && columnTypeMetadata.name != null) {
        dateColumnNames.add(Arrays.asList(columnTypeMetadata.name));
      }
    }
  }

  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    if (file.getRowGroups().isEmpty()) {
      // no row group means no statistics to correct
      continue;
    }
    // Drill has only ever written a single row group per file, only need to correct the statistics
    // on the first row group
    RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
    Long rowCount = rowGroupMetadata.getRowCount();
    for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
      if (isV1) {
        // Setting Min/Max values for ParquetTableMetadata_v1
        if (OriginalType.DATE.equals(columnMetadata.getOriginalType())
            && columnMetadata.hasSingleValue(rowCount)
            && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
          int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
          columnMetadata.setMax(newMinMax);
          columnMetadata.setMin(newMinMax);
        }
      } else if (isV2) {
        // Setting Max values for ParquetTableMetadata_v2
        String[] columnName = columnMetadata.getName();
        if (columnName != null
            && dateColumnNames.contains(Arrays.asList(columnName))
            && columnMetadata.hasSingleValue(rowCount)
            && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
          int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
          columnMetadata.setMax(newMax);
        }
      }
    }
  }
}
Aggregations