use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
the class TestParquetPredicateUtils method testParquetTupleDomainMap.
/**
 * Checks that a not-null predicate on a {@code map<int,int>} Hive column
 * yields no per-column domains after translation by {@code getParquetTupleDomain}.
 */
@Test
public void testParquetTupleDomainMap() {
    HiveColumnHandle mapColumn = new HiveColumnHandle(
            "my_map",
            HiveType.valueOf("map<int,int>"),
            parseTypeSignature(StandardTypes.MAP),
            0,
            REGULAR,
            Optional.empty(),
            Optional.empty());

    // Both method handles point at the same helper of this test class.
    MapType intToIntMap = new MapType(
            INTEGER,
            INTEGER,
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"));
    Domain notNullDomain = Domain.notNull(intToIntMap);
    TupleDomain<HiveColumnHandle> hivePredicate = withColumnDomains(ImmutableMap.of(mapColumn, notNullDomain));

    // File schema: optional group "my_map" -> repeated group "map" -> (required key, optional value)
    PrimitiveType keyField = new PrimitiveType(REQUIRED, INT32, "key");
    PrimitiveType valueField = new PrimitiveType(OPTIONAL, INT32, "value");
    GroupType entries = new GroupType(REPEATED, "map", keyField, valueField);
    GroupType mapGroup = new GroupType(OPTIONAL, "my_map", entries);
    MessageType schema = new MessageType("hive_schema", mapGroup);

    Map<List<String>, RichColumnDescriptor> descriptors = getDescriptors(schema, schema);
    TupleDomain<ColumnDescriptor> parquetPredicate = getParquetTupleDomain(descriptors, hivePredicate);

    assertTrue(parquetPredicate.getDomains().get().isEmpty());
}
use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
the class TestParquetPredicateUtils method testParquetTupleDomainPrimitiveArray.
/**
 * Checks that a not-null predicate on an {@code array<int>} Hive column
 * yields no per-column domains after translation by {@code getParquetTupleDomain}.
 */
@Test
public void testParquetTupleDomainPrimitiveArray() {
    HiveColumnHandle arrayColumn = new HiveColumnHandle(
            "my_array",
            HiveType.valueOf("array<int>"),
            parseTypeSignature(StandardTypes.ARRAY),
            0,
            REGULAR,
            Optional.empty(),
            Optional.empty());

    Domain notNullDomain = Domain.notNull(new ArrayType(INTEGER));
    TupleDomain<HiveColumnHandle> hivePredicate = withColumnDomains(ImmutableMap.of(arrayColumn, notNullDomain));

    // File schema: optional group "my_array" -> repeated group "bag" -> optional int32 "array_element"
    PrimitiveType elementField = new PrimitiveType(OPTIONAL, INT32, "array_element");
    GroupType bag = new GroupType(REPEATED, "bag", elementField);
    GroupType arrayGroup = new GroupType(OPTIONAL, "my_array", bag);
    MessageType schema = new MessageType("hive_schema", arrayGroup);

    Map<List<String>, RichColumnDescriptor> descriptors = getDescriptors(schema, schema);
    TupleDomain<ColumnDescriptor> parquetPredicate = getParquetTupleDomain(descriptors, hivePredicate);

    assertTrue(parquetPredicate.getDomains().get().isEmpty());
}
use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.
the class TestParquetPredicateUtils method testParquetTupleDomainStruct.
/**
 * Checks that a not-null predicate on a {@code struct<a:int,b:int>} Hive column
 * yields no per-column domains after translation by {@code getParquetTupleDomain}.
 */
@Test
public void testParquetTupleDomainStruct() {
    HiveColumnHandle structColumn = new HiveColumnHandle(
            "my_struct",
            HiveType.valueOf("struct<a:int,b:int>"),
            parseTypeSignature(StandardTypes.ROW),
            0,
            REGULAR,
            Optional.empty(),
            Optional.empty());

    // The predicate type is a single-field row whose field is named "my_struct".
    RowType structType = RowType.from(ImmutableList.of(new RowType.Field(Optional.of("my_struct"), INTEGER)));
    Domain notNullDomain = Domain.notNull(structType);
    TupleDomain<HiveColumnHandle> hivePredicate = withColumnDomains(ImmutableMap.of(structColumn, notNullDomain));

    // File schema: optional group "my_struct" with two optional int32 members "a" and "b"
    PrimitiveType memberA = new PrimitiveType(OPTIONAL, INT32, "a");
    PrimitiveType memberB = new PrimitiveType(OPTIONAL, INT32, "b");
    GroupType structGroup = new GroupType(OPTIONAL, "my_struct", memberA, memberB);
    MessageType schema = new MessageType("hive_schema", structGroup);

    Map<List<String>, RichColumnDescriptor> descriptors = getDescriptors(schema, schema);
    TupleDomain<ColumnDescriptor> parquetPredicate = getParquetTupleDomain(descriptors, hivePredicate);

    assertTrue(parquetPredicate.getDomains().get().isEmpty());
}
use of org.apache.parquet.schema.PrimitiveType in project druid by druid-io.
the class ParquetGroupConverter method convertPrimitiveField.
/**
 * Convert a primitive group field to an "ingestion friendly" java object.
 *
 * <p>If the field's type carries a logical ({@link OriginalType}) annotation, the value is converted
 * according to that annotation; otherwise the raw primitive type name decides the conversion.
 *
 * <p>Conversion is best-effort: any exception raised while reading or converting the value
 * (including the "should never happen" {@code RE}s thrown below) is swallowed and {@code null} is returned.
 *
 * @param g              group holding the field
 * @param fieldIndex     position of the field in {@code g.getType().getFields()}
 * @param index          value index passed through to the group's typed getters
 * @param binaryAsString when {@code true}, un-annotated BINARY / FIXED_LEN_BYTE_ARRAY values are returned via
 *                       {@code StringUtils.fromUtf8} instead of as a raw {@code byte[]}
 * @return "ingestion ready" java object, or null
 */
@Nullable
private static Object convertPrimitiveField(Group g, int fieldIndex, int index, boolean binaryAsString) {
    PrimitiveType pt = (PrimitiveType) g.getType().getFields().get(fieldIndex);
    OriginalType ot = pt.getOriginalType();
    try {
        if (ot != null) {
            // convert logical types
            switch (ot) {
                case DATE:
                    long ts = g.getInteger(fieldIndex, index) * MILLIS_IN_DAY;
                    return ts;
                case TIME_MICROS:
                    return g.getLong(fieldIndex, index);
                case TIME_MILLIS:
                    return g.getInteger(fieldIndex, index);
                case TIMESTAMP_MICROS:
                    return TimeUnit.MILLISECONDS.convert(g.getLong(fieldIndex, index), TimeUnit.MICROSECONDS);
                case TIMESTAMP_MILLIS:
                    return g.getLong(fieldIndex, index);
                case INTERVAL:
                    /*
                    INTERVAL is used for an interval of time. It must annotate a fixed_len_byte_array of length 12.
                    This array stores three little-endian unsigned integers that represent durations at different
                    granularities of time. The first stores a number in months, the second stores a number in days,
                    and the third stores a number in milliseconds. This representation is independent of any particular
                    timezone or date.

                    Each component in this representation is independent of the others. For example, there is no
                    requirement that a large number of days should be expressed as a mix of months and days because
                    there is not a constant conversion from days to months.

                    The sort order used for INTERVAL is undefined. When writing data, no min/max statistics should be
                    saved for this type and if such non-compliant statistics are found during reading, they must be
                    ignored.
                    */
                    Binary intervalVal = g.getBinary(fieldIndex, index);
                    IntBuffer intBuf = intervalVal.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
                    int months = intBuf.get(0);
                    int days = intBuf.get(1);
                    int millis = intBuf.get(2);
                    // Build an ISO-8601 period from the month/day parts; millis are added afterwards.
                    StringBuilder periodBuilder = new StringBuilder("P");
                    if (months > 0) {
                        periodBuilder.append(months).append("M");
                    }
                    if (days > 0) {
                        periodBuilder.append(days).append("D");
                    }
                    if (periodBuilder.length() > 1) {
                        Period p = Period.parse(periodBuilder.toString());
                        return p.toStandardDuration().plus(millis);
                    } else {
                        // neither months nor days present: the interval is just the millisecond part
                        return new Duration(millis);
                    }
                case INT_8:
                case INT_16:
                case INT_32:
                    return g.getInteger(fieldIndex, index);
                case INT_64:
                    return g.getLong(fieldIndex, index);
                // todo: unsigned values are currently read with the signed getters
                case UINT_8:
                case UINT_16:
                case UINT_32:
                    return g.getInteger(fieldIndex, index);
                case UINT_64:
                    return g.getLong(fieldIndex, index);
                case DECIMAL:
                    /*
                    DECIMAL can be used to annotate the following types:
                      int32: for 1 <= precision <= 9
                      int64: for 1 <= precision <= 18; precision < 10 will produce a warning
                      fixed_len_byte_array: precision is limited by the array size. Length n can
                        store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits
                      binary: precision is not limited, but is required. The minimum number of bytes to store
                        the unscaled value should be used.

                    In every case the stored number is the UNSCALED value; the decimal it represents is
                    unscaledValue * 10^(-scale), so the scale must be applied for the integer encodings too.
                    */
                    int scale = pt.getDecimalMetadata().getScale();
                    switch (pt.getPrimitiveTypeName()) {
                        case INT32:
                            return BigDecimal.valueOf(g.getInteger(fieldIndex, index), scale);
                        case INT64:
                            return BigDecimal.valueOf(g.getLong(fieldIndex, index), scale);
                        case FIXED_LEN_BYTE_ARRAY:
                        case BINARY:
                            int precision = pt.getDecimalMetadata().getPrecision();
                            Binary value = g.getBinary(fieldIndex, index);
                            return convertBinaryToDecimal(value, precision, scale);
                        default:
                            throw new RE("Unknown 'DECIMAL' type supplied to primitive conversion: %s (this should never happen)", pt.getPrimitiveTypeName());
                    }
                case UTF8:
                case ENUM:
                case JSON:
                    return g.getString(fieldIndex, index);
                case LIST:
                case MAP:
                case MAP_KEY_VALUE:
                case BSON:
                default:
                    throw new RE("Non-primitive supplied to primitive conversion: %s (this should never happen)", ot.name());
            }
        } else {
            // fallback to handling the raw primitive type if no logical type mapping
            switch (pt.getPrimitiveTypeName()) {
                case BOOLEAN:
                    return g.getBoolean(fieldIndex, index);
                case INT32:
                    return g.getInteger(fieldIndex, index);
                case INT64:
                    return g.getLong(fieldIndex, index);
                case FLOAT:
                    return g.getFloat(fieldIndex, index);
                case DOUBLE:
                    return g.getDouble(fieldIndex, index);
                case INT96:
                    Binary tsBin = g.getInt96(fieldIndex, index);
                    return convertInt96BinaryToTimestamp(tsBin);
                case FIXED_LEN_BYTE_ARRAY:
                case BINARY:
                    Binary bin = g.getBinary(fieldIndex, index);
                    byte[] bytes = bin.getBytes();
                    if (binaryAsString) {
                        return StringUtils.fromUtf8(bytes);
                    } else {
                        return bytes;
                    }
                default:
                    throw new RE("Unknown primitive conversion: %s", pt.getPrimitiveTypeName());
            }
        }
    } catch (Exception ignored) {
        // Deliberate best-effort: an unreadable or unconvertible value is reported as null, per the contract above.
        return null;
    }
}
use of org.apache.parquet.schema.PrimitiveType in project drill by apache.
the class ParquetRecordWriter method addElementType.
/**
 * Sets the element type of {@code listBuilder} from Drill's {@code elementField}.
 *
 * <p>A non-repeated field becomes the list element directly, renamed to {@code ELEMENT}.
 * A repeated field becomes a nested required list: a MAP minor type contributes a required
 * group of its children types, anything else recurses on the field returned by
 * {@code getDataField}.
 *
 * @param listBuilder list schema builder
 * @param elementField Drill's type of list elements
 */
private void addElementType(ListBuilder<GroupType> listBuilder, MaterializedField elementField) {
    if (elementField.getDataMode() != DataMode.REPEATED) {
        Type source = getType(elementField);
        Type renamed;
        // rename it to 'element' according to Parquet list schema
        if (source.isPrimitive()) {
            PrimitiveType primitive = source.asPrimitiveType();
            // NOTE(review): only repetition, primitive type name and original type are carried over here;
            // confirm no other type metadata needs to survive the rename.
            renamed = new PrimitiveType(primitive.getRepetition(), primitive.getPrimitiveTypeName(), ELEMENT, primitive.getOriginalType());
        } else {
            GroupType group = source.asGroupType();
            renamed = new GroupType(group.getRepetition(), ELEMENT, group.getFields());
        }
        listBuilder.element(renamed);
        return;
    }

    // Repeated element: describe it as a nested list and attach that as this list's element type.
    ListBuilder<GroupType> nested = org.apache.parquet.schema.Types.requiredList();
    boolean repeatedMap = elementField.getType().getMinorType() == MinorType.MAP;
    if (repeatedMap) {
        nested.element(new GroupType(Repetition.REQUIRED, ELEMENT, getChildrenTypes(elementField)));
    } else {
        addElementType(nested, getDataField(elementField));
    }
    listBuilder.setElementType(nested.named(ELEMENT));
}
Aggregations