use of org.apache.parquet.schema.OriginalType in project hive by apache.
the class HiveParquetSchemaTestUtils method testConversion.
public static void testConversion(final String columnNamesStr, final String columnsTypeStr,
    final String actualSchema) throws Exception {
  final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(actualSchema);
  assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema,
      expectedMT, messageTypeFound);
  // The original types must be checked manually because PrimitiveType.equals ignores them.
  List<Type> expectedFields = expectedMT.getFields();
  List<Type> actualFields = messageTypeFound.getFields();
  for (int i = 0, n = expectedFields.size(); i < n; ++i) {
    OriginalType exp = expectedFields.get(i).getOriginalType();
    OriginalType act = actualFields.get(i).getOriginalType();
    assertEquals("Original types of the field do not match", exp, act);
  }
}
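For a concrete sense of what this helper asserts, a hypothetical invocation might look like the following. The message name hive_schema, the optional repetition, and the int32/binary (UTF8) mappings are assumptions about HiveSchemaConverter's output conventions, not verbatim from its source:

  // Hypothetical test case: the expected schema text assumes Hive's default
  // converter conventions (message name, optional fields, UTF8 strings).
  testConversion(
      "id,name",
      "int,string",
      "message hive_schema {\n"
          + "  optional int32 id;\n"
          + "  optional binary name (UTF8);\n"
          + "}");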
use of org.apache.parquet.schema.OriginalType in project drill by apache.
the class ParquetMetaStatCollector method collectColStat.
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  // map from column to ColumnMetadata
  final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
  // map from column name to column statistics
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
    SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
    columnMetadataMap.put(schemaPath, columnMetadata);
  }
  for (final SchemaPath schemaPath : fields) {
    final PrimitiveType.PrimitiveTypeName primitiveType;
    final OriginalType originalType;
    final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
    if (columnMetadata != null) {
      final Object min = columnMetadata.getMinValue();
      final Object max = columnMetadata.getMaxValue();
      final Long numNull = columnMetadata.getNulls();
      primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
      originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
      final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
      statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
    } else {
      final String columnName = schemaPath.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(schemaPath, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
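The implicit-column branch deserves a note: a directory (partition) value is modeled as a required VARCHAR whose min and max statistics are both the directory string, so downstream pruning can treat the column as a constant. A standalone sketch of that representation, reusing only the calls that appear in the method above (the "2016" value is a made-up example):

  // Sketch of the implicit-column representation. BinaryStatistics and
  // setMinMaxFromBytes come from parquet-column; Types and TypeProtos come
  // from Drill. The directory value "2016" is a hypothetical example.
  byte[] dirValue = "2016".getBytes();
  Statistics stat = new BinaryStatistics();
  stat.setNumNulls(0);
  stat.setMinMaxFromBytes(dirValue, dirValue);
  TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
  ColumnStatistics constantColumn = new ColumnStatistics(stat, type);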
use of org.apache.parquet.schema.OriginalType in project drill by apache.
the class ParquetReaderUtility method correctDatesInMetadataCache.
public static void correctDatesInMetadataCache(Metadata.ParquetTableMetadataBase parquetTableMetadata) {
  DateCorruptionStatus cacheFileCanContainsCorruptDates =
      parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3
          ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION
          : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
  if (cacheFileCanContainsCorruptDates == DateCorruptionStatus.META_UNCLEAR_TEST_VALUES) {
    // Look for the names of columns with the DATE data type in the metadata cache file ("metadata_version" : "v2")
    String[] names = new String[0];
    if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v2) {
      for (Metadata.ColumnTypeMetadata_v2 columnTypeMetadata :
          ((Metadata.ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
        if (OriginalType.DATE.equals(columnTypeMetadata.originalType)) {
          names = columnTypeMetadata.name;
        }
      }
    }
    for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      // Drill has only ever written a single row group per file, so only the statistics
      // on the first row group need to be corrected.
      Metadata.RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
      for (Metadata.ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
        // Setting min/max values for ParquetTableMetadata_v1
        if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v1) {
          OriginalType originalType = columnMetadata.getOriginalType();
          if (OriginalType.DATE.equals(originalType) && columnMetadata.hasSingleValue()
              && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
            columnMetadata.setMax(newMinMax);
            columnMetadata.setMin(newMinMax);
          }
        // Setting max values for ParquetTableMetadata_v2
        } else if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v2
            && columnMetadata.getName() != null
            && Arrays.equals(columnMetadata.getName(), names)
            && columnMetadata.hasSingleValue()
            && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
          int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
          columnMetadata.setMax(newMax);
        }
      }
    }
  }
}
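The correction itself subtracts a fixed shift from the corrupted epoch-day value. A plausible sketch of autoCorrectCorruptedDate follows; the shift constant (twice the Julian day number of the Unix epoch, 2440588) is an assumption about Drill's implementation, so consult ParquetReaderUtility for the authoritative logic:

  // Hedged sketch: affected writers effectively added the Julian-to-Unix epoch
  // offset twice, so subtracting 2 * 2440588 days is assumed to restore the
  // true epoch-day value. The constant is an assumption, not verbatim source.
  private static final int CORRECT_CORRUPT_DATE_SHIFT = 2 * 2440588;

  public static int autoCorrectCorruptedDate(int corruptedDate) {
    return corruptedDate - CORRECT_CORRUPT_DATE_SHIFT;
  }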
use of org.apache.parquet.schema.OriginalType in project hive by apache.
the class DataWritableWriter method createWriter.
/**
 * Creates a writer for the specific object inspector. The returned writer will be used
 * to call the Parquet API for the specific data type.
 * @param inspector The object inspector used to get the correct value type.
 * @param type Type that contains information about the type schema.
 * @return A ParquetWriter object used to call the Parquet API for the specific data type.
 */
private DataWriter createWriter(ObjectInspector inspector, Type type) {
  if (type.isPrimitive()) {
    checkInspectorCategory(inspector, ObjectInspector.Category.PRIMITIVE);
    PrimitiveObjectInspector primitiveInspector = (PrimitiveObjectInspector) inspector;
    switch (primitiveInspector.getPrimitiveCategory()) {
      case BOOLEAN:
        return new BooleanDataWriter((BooleanObjectInspector) inspector);
      case BYTE:
        return new ByteDataWriter((ByteObjectInspector) inspector);
      case SHORT:
        return new ShortDataWriter((ShortObjectInspector) inspector);
      case INT:
        return new IntDataWriter((IntObjectInspector) inspector);
      case LONG:
        return new LongDataWriter((LongObjectInspector) inspector);
      case FLOAT:
        return new FloatDataWriter((FloatObjectInspector) inspector);
      case DOUBLE:
        return new DoubleDataWriter((DoubleObjectInspector) inspector);
      case STRING:
        return new StringDataWriter((StringObjectInspector) inspector);
      case CHAR:
        return new CharDataWriter((HiveCharObjectInspector) inspector);
      case VARCHAR:
        return new VarcharDataWriter((HiveVarcharObjectInspector) inspector);
      case BINARY:
        return new BinaryDataWriter((BinaryObjectInspector) inspector);
      case TIMESTAMP:
        return new TimestampDataWriter((TimestampObjectInspector) inspector);
      case DECIMAL:
        return new DecimalDataWriter((HiveDecimalObjectInspector) inspector);
      case DATE:
        return new DateDataWriter((DateObjectInspector) inspector);
      default:
        throw new IllegalArgumentException("Unsupported primitive data type: " + primitiveInspector.getPrimitiveCategory());
    }
  } else {
    GroupType groupType = type.asGroupType();
    OriginalType originalType = type.getOriginalType();
    if (originalType != null && originalType.equals(OriginalType.LIST)) {
      checkInspectorCategory(inspector, ObjectInspector.Category.LIST);
      return new ListDataWriter((ListObjectInspector) inspector, groupType);
    } else if (originalType != null && originalType.equals(OriginalType.MAP)) {
      checkInspectorCategory(inspector, ObjectInspector.Category.MAP);
      return new MapDataWriter((MapObjectInspector) inspector, groupType);
    } else {
      checkInspectorCategory(inspector, ObjectInspector.Category.STRUCT);
      return new StructDataWriter((StructObjectInspector) inspector, groupType);
    }
  }
}
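To illustrate the nested-type dispatch, the snippet below builds a schema whose group is annotated with LIST and confirms that getOriginalType() reports it, which is exactly the condition that routes createWriter to ListDataWriter. The inner field names (bag, array_element) follow a common Hive list layout but are an assumption here:

  // Illustration of the dispatch condition: a group annotated with (LIST)
  // reports OriginalType.LIST. The inner field names are assumed, not taken
  // from Hive's source.
  MessageType schema = MessageTypeParser.parseMessageType(
      "message example {\n"
    + "  optional group tags (LIST) {\n"
    + "    repeated group bag {\n"
    + "      optional binary array_element (UTF8);\n"
    + "    }\n"
    + "  }\n"
    + "}");
  Type tags = schema.getType("tags");
  assert tags.getOriginalType() == OriginalType.LIST;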
use of org.apache.parquet.schema.OriginalType in project drill by apache.
the class ParquetGroupScan method checkForPartitionColumn.
/**
 * When reading the very first footer, any column is a potential partition column. So for the first footer, we check
 * every column to see if it is single valued, and if so, add it to the list of potential partition columns. For the
 * remaining footers, we will not find any new partition columns, but we may discover that what was previously a
 * potential partition column no longer qualifies, so it needs to be removed from the list.
 * @param columnMetadata metadata of the column being examined
 * @param first whether this is the first footer being read
 * @return whether the column is a potential partition column
 */
private boolean checkForPartitionColumn(ColumnMetadata columnMetadata, boolean first) {
  SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
  final PrimitiveTypeName primitiveType;
  final OriginalType originalType;
  if (this.parquetTableMetadata.hasColumnMetadata()) {
    primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
    originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
  } else {
    primitiveType = columnMetadata.getPrimitiveType();
    originalType = columnMetadata.getOriginalType();
  }
  if (first) {
    if (hasSingleValue(columnMetadata)) {
      partitionColTypeMap.put(schemaPath, getType(primitiveType, originalType));
      return true;
    } else {
      return false;
    }
  } else {
    if (!partitionColTypeMap.containsKey(schemaPath)) {
      return false;
    } else {
      if (!hasSingleValue(columnMetadata)) {
        partitionColTypeMap.remove(schemaPath);
        return false;
      }
      if (!getType(primitiveType, originalType).equals(partitionColTypeMap.get(schemaPath))) {
        partitionColTypeMap.remove(schemaPath);
        return false;
      }
    }
  }
  return true;
}
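getType, used above to record each partition column's type, combines the Parquet primitive type with the original type to pick a Drill major type. A hedged sketch of that kind of mapping follows; only a few representative cases are shown, and the real Drill method, which covers the full matrix, is the authority:

  // Hedged sketch of the primitive/original type mapping performed by getType.
  // Only representative cases are shown; the actual Drill method handles many
  // more types and data modes.
  private TypeProtos.MajorType getTypeSketch(PrimitiveTypeName primitiveType, OriginalType originalType) {
    if (originalType != null) {
      switch (originalType) {
        case DATE:
          return Types.required(TypeProtos.MinorType.DATE);
        case UTF8:
          return Types.required(TypeProtos.MinorType.VARCHAR);
        default:
          break;
      }
    }
    switch (primitiveType) {
      case INT32:
        return Types.required(TypeProtos.MinorType.INT);
      case INT64:
        return Types.required(TypeProtos.MinorType.BIGINT);
      default:
        throw new UnsupportedOperationException("Unsupported type: " + primitiveType);
    }
  }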