Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
The class DataWritableReadSupport, method init.
/**
 * Creates the Parquet-side read context with the requested schema during the init phase.
 *
 * <p>When a column projection is configured, a Hive table schema is synthesized from the
 * file schema (missing columns become OPTIONAL BINARY placeholders to tolerate schema
 * evolution) and stored in the context metadata; the requested schema is then restricted
 * to the projected column indexes.
 *
 * @param configuration used to read the wanted columns ({@code IOConstants.COLUMNS}) and
 *                      the projected column indexes
 * @param keyValueMetaData unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext carrying the requested schema plus the Hive table
 *         schema under {@code HIVE_SCHEMA_KEY}
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns == null) {
    // No projection requested: expose the full file schema.
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
  final List<String> listColumns = getColumns(columns);
  final List<Type> typeListTable = new ArrayList<Type>(listColumns.size());
  for (final String col : listColumns) {
    // listColumns contains partition columns which are metadata only
    if (fileSchema.containsField(col)) {
      typeListTable.add(fileSchema.getType(col));
    } else {
      // Column absent from the file: declare it as OPTIONAL BINARY so that
      // schema evolution (newly added columns) does not break reads.
      typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
    }
  }
  final MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
  contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());
  // Restrict the requested schema to the projected column indexes.
  final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
  final List<Type> typeListWanted = new ArrayList<Type>(indexColumnsWanted.size());
  for (final Integer idx : indexColumnsWanted) {
    typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
  }
  // (The original code assigned tableSchema to requestedSchemaByUser first, but that
  // value was always overwritten here — the dead assignment has been removed.)
  final MessageType requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration);
  return new ReadContext(requestedSchemaByUser, contextMetadata);
}
Use of org.apache.parquet.schema.PrimitiveType in project Drill by Apache.
The class DrillParquetGroupConverter, method getVarDecimalConverter.
/**
 * Builds a converter for a DECIMAL-annotated column backed by Drill's VARDECIMAL
 * vector, choosing a list-based or scalar writer to match the column's repetition.
 */
private PrimitiveConverter getVarDecimalConverter(String name, PrimitiveType type) {
  final int scale = type.getDecimalMetadata().getScale();
  final int precision = type.getDecimalMetadata().getPrecision();
  final VarDecimalWriter writer;
  if (type.isRepetition(Repetition.REPEATED)) {
    writer = getWriter(name, (m, f) -> m.list(f).varDecimal(precision, scale), l -> l.list().varDecimal(precision, scale));
  } else {
    writer = getWriter(name, (m, f) -> m.varDecimal(f, precision, scale), l -> l.varDecimal(precision, scale));
  }
  return new DrillVarDecimalConverter(writer, precision, scale, mutator.getManagedBuffer());
}
Use of org.apache.parquet.schema.PrimitiveType in project Drill by Apache.
The class DrillParquetGroupConverter, method getConverterForType.
/**
 * Maps a Parquet primitive type (plus its original-type / logical-type annotation,
 * when present) to the Drill converter that writes decoded values into the matching
 * vector writer.
 *
 * @param name column name used to look up or create the destination writer
 * @param type Parquet primitive type of the column being read
 * @return the converter for this column
 * @throws UnsupportedOperationException when the primitive/annotation pair is unsupported
 */
protected PrimitiveConverter getConverterForType(String name, PrimitiveType type) {
switch(type.getPrimitiveTypeName()) {
case INT32:
{
// Plain INT32 with no annotation maps straight to Drill INT.
if (type.getOriginalType() == null) {
return getIntConverter(name, type);
}
switch(type.getOriginalType()) {
case UINT_8:
case UINT_16:
case UINT_32:
case INT_8:
case INT_16:
case INT_32:
{
// All (u)int widths up to 32 bits share the INT converter.
return getIntConverter(name, type);
}
case DECIMAL:
{
// Decimal reads are gated by an option; throws if disabled.
ParquetReaderUtility.checkDecimalTypeEnabled(options);
return getVarDecimalConverter(name, type);
}
case DATE:
{
DateWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).date(), l -> l.list().date()) : getWriter(name, (m, f) -> m.date(f), l -> l.date());
// Older writers produced corrupted dates (DRILL-4203); pick a converter
// matching the corruption status detected from file metadata.
switch(containsCorruptedDates) {
case META_SHOWS_CORRUPTION:
return new DrillCorruptedDateConverter(writer);
case META_SHOWS_NO_CORRUPTION:
return new DrillDateConverter(writer);
case META_UNCLEAR_TEST_VALUES:
return new CorruptionDetectingDateConverter(writer);
default:
throw new DrillRuntimeException(String.format("Issue setting up parquet reader for date type, " + "unrecognized date corruption status %s. See DRILL-4203 for more info.", containsCorruptedDates));
}
}
case TIME_MILLIS:
{
TimeWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).time(), l -> l.list().time()) : getWriter(name, (m, f) -> m.time(f), l -> l.time());
return new DrillTimeConverter(writer);
}
default:
{
throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
}
}
}
case INT64:
{
// Plain INT64 with no annotation maps to Drill BIGINT.
if (type.getOriginalType() == null) {
return getBigIntConverter(name, type);
}
switch(type.getOriginalType()) {
case UINT_64:
case INT_64:
return getBigIntConverter(name, type);
case TIMESTAMP_MICROS:
{
TimeStampWriter writer = getTimeStampWriter(name, type);
return new DrillTimeStampMicrosConverter(writer);
}
case TIME_MICROS:
{
TimeWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).time(), l -> l.list().time()) : getWriter(name, MapWriter::time, ListWriter::time);
return new DrillTimeMicrosConverter(writer);
}
case DECIMAL:
{
ParquetReaderUtility.checkDecimalTypeEnabled(options);
return getVarDecimalConverter(name, type);
}
case TIMESTAMP_MILLIS:
{
TimeStampWriter writer = getTimeStampWriter(name, type);
return new DrillTimeStampConverter(writer);
}
default:
{
throw new UnsupportedOperationException("Unsupported type " + type.getOriginalType());
}
}
}
case INT96:
{
// TODO: replace null with TIMESTAMP_NANOS once parquet support such type annotation.
if (type.getOriginalType() == null) {
// Option controls whether unannotated INT96 (e.g. Impala timestamps) is
// interpreted as a timestamp or passed through as raw varbinary.
if (options.getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
TimeStampWriter writer = getTimeStampWriter(name, type);
return new DrillFixedBinaryToTimeStampConverter(writer);
} else {
VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, (m, f) -> m.varBinary(f), listWriter -> listWriter.varBinary());
return new DrillFixedBinaryToVarbinaryConverter(writer, ParquetColumnMetadata.getTypeLengthInBits(type.getPrimitiveTypeName()) / 8, mutator.getManagedBuffer());
}
}
}
// NOTE(review): an INT96 column with a non-null original type falls through into the
// FLOAT case below, which would build a Float4Writer for it — this looks unintended;
// confirm against upstream Drill before relying on it.
case FLOAT:
{
Float4Writer writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).float4(), l -> l.list().float4()) : getWriter(name, (m, f) -> m.float4(f), l -> l.float4());
return new DrillFloat4Converter(writer);
}
case DOUBLE:
{
Float8Writer writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).float8(), l -> l.list().float8()) : getWriter(name, (m, f) -> m.float8(f), l -> l.float8());
return new DrillFloat8Converter(writer);
}
case BOOLEAN:
{
BitWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).bit(), l -> l.list().bit()) : getWriter(name, (m, f) -> m.bit(f), l -> l.bit());
return new DrillBoolConverter(writer);
}
case BINARY:
{
// Dispatch on the logical type annotation: DECIMAL -> vardecimal,
// STRING/ENUM -> varchar; any other (or no) annotation falls back to varbinary.
LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter> typeAnnotationVisitor = new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter>() {
@Override
public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
ParquetReaderUtility.checkDecimalTypeEnabled(options);
return Optional.of(getVarDecimalConverter(name, type));
}
@Override
public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
return Optional.of(getVarCharConverter(name, type));
}
@Override
public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation stringLogicalType) {
return Optional.of(getVarCharConverter(name, type));
}
};
// Fallback when no annotation (or an unhandled one) is present.
Supplier<PrimitiveConverter> converterSupplier = () -> {
VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, MapWriter::varBinary, ListWriter::varBinary);
return new DrillVarBinaryConverter(writer, mutator.getManagedBuffer());
};
return Optional.ofNullable(type.getLogicalTypeAnnotation()).map(typeAnnotation -> typeAnnotation.accept(typeAnnotationVisitor)).flatMap(Function.identity()).orElseGet(converterSupplier);
}
case FIXED_LEN_BYTE_ARRAY:
// DECIMAL -> vardecimal, INTERVAL -> interval; otherwise treat the fixed-width
// bytes as varbinary of the declared type length.
LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter> typeAnnotationVisitor = new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter>() {
@Override
public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
ParquetReaderUtility.checkDecimalTypeEnabled(options);
return Optional.of(getVarDecimalConverter(name, type));
}
@Override
public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
IntervalWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).interval(), l -> l.list().interval()) : getWriter(name, MapWriter::interval, ListWriter::interval);
return Optional.of(new DrillFixedLengthByteArrayToInterval(writer));
}
};
Supplier<PrimitiveConverter> converterSupplier = () -> {
VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, MapWriter::varBinary, ListWriter::varBinary);
return new DrillFixedBinaryToVarbinaryConverter(writer, type.getTypeLength(), mutator.getManagedBuffer());
};
return Optional.ofNullable(type.getLogicalTypeAnnotation()).map(typeAnnotation -> typeAnnotation.accept(typeAnnotationVisitor)).flatMap(Function.identity()).orElseGet(converterSupplier);
default:
throw new UnsupportedOperationException("Unsupported type: " + type.getPrimitiveTypeName());
}
}
Use of org.apache.parquet.schema.PrimitiveType in project Drill by Apache.
The class ParquetSchemaMerge, method main.
/**
 * Demonstrates Parquet schema merging: builds two "root" schemas whose nested group
 * "a.b" differs (empty vs. containing an optional INT32 field "d"), unions them with
 * {@code MessageType.union}, and logs the merged schema.
 */
public static void main(String[] args) {
  // Schema 1: root { optional group a { required group b {} } }
  // (The original code also built an unused INT32 field "c" here; removed as dead code.)
  GroupType b = new GroupType(Repetition.REQUIRED, "b");
  GroupType a = new GroupType(Repetition.OPTIONAL, "a", b);
  MessageType message1 = new MessageType("root", a);
  // Schema 2: root { optional group a { optional group b { optional int32 d } } }
  PrimitiveType d = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "d");
  GroupType b2 = new GroupType(Repetition.OPTIONAL, "b", d);
  GroupType a2 = new GroupType(Repetition.OPTIONAL, "a", b2);
  MessageType message2 = new MessageType("root", a2);
  // The merged schema carries the superset of both structures.
  MessageType message3 = message1.union(message2);
  StringBuilder builder = new StringBuilder();
  message3.writeToStringBuilder(builder, "");
  logger.info(builder.toString());
}
Use of org.apache.parquet.schema.PrimitiveType in project Presto by prestodb.
The class TestColumnIndexBuilder, method testBuildBinaryUtf8.
/**
 * Exercises ColumnIndexBuilder for a required BINARY column annotated as UTF8:
 * builds a column index from per-page statistics and verifies page count, min/max
 * size, boundary-order detection (UNORDERED, ASCENDING, DESCENDING), null
 * counts/pages, min/max values, and page filtering for eq/notEq/gt/gtEq/lt/ltEq
 * plus user-defined predicates.
 */
@Test
public void testBuildBinaryUtf8() {
PrimitiveType type = Types.required(BINARY).as(UTF8).named("test_binary_utf8");
ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
// assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class));
// An empty builder must not produce a column index.
assertNull(builder.build());
Operators.BinaryColumn col = binaryColumn("test_col");
// Round 1: overlapping page value ranges -> UNORDERED boundary order.
// Trailing nulls passed to stats(...) appear to represent null values in the page
// (see the null counts asserted below).
StatsBuilder sb = new StatsBuilder();
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, stringBinary("Jeltz"), stringBinary("Slartibartfast"), null, null));
builder.add(sb.stats(type, null, null, null, null, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Prefect")));
builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Trilian"), null));
builder.add(sb.stats(type, stringBinary("Beeblebrox")));
builder.add(sb.stats(type, null, null));
assertEquals(8, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
ColumnIndex columnIndex = builder.build();
assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 2, 2, 5, 2, 0, 1, 0, 2);
assertCorrectNullPages(columnIndex, true, false, true, true, false, false, false, true);
assertCorrectValues(columnIndex.getMaxValues(), null, stringBinary("Slartibartfast"), null, null, stringBinary("Prefect"), stringBinary("Trilian"), stringBinary("Beeblebrox"), null);
assertCorrectValues(columnIndex.getMinValues(), null, stringBinary("Jeltz"), null, null, stringBinary("Beeblebrox"), stringBinary("Dent"), stringBinary("Beeblebrox"), null);
// Filtering yields the page indexes that may contain matching rows.
assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 1, 4, 5);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 5, 7);
assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Beeblebrox")), 0, 1, 2, 3, 4, 5, 7);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5, 6);
assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1, 5);
assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 4, 5);
assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 4, 6);
assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 4, 5, 6);
assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 4, 6);
assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 7);
// Round 2: non-decreasing page ranges -> ASCENDING boundary order.
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Dent"), null, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null, null, null, null));
builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Jeltz")));
builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Prefect"), null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, stringBinary("Slartibartfast")));
builder.add(sb.stats(type, null, null));
assertEquals(8, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 2, 2, 5, 0, 1, 2, 0, 2);
assertCorrectNullPages(columnIndex, false, true, true, false, false, true, false, true);
assertCorrectValues(columnIndex.getMaxValues(), stringBinary("Dent"), null, null, stringBinary("Jeltz"), stringBinary("Prefect"), null, stringBinary("Slartibartfast"), null);
assertCorrectValues(columnIndex.getMinValues(), stringBinary("Beeblebrox"), null, null, stringBinary("Dent"), stringBinary("Dent"), null, stringBinary("Slartibartfast"), null);
assertCorrectFiltering(columnIndex, eq(col, stringBinary("Jeltz")), 3, 4);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 4, 5, 7);
assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Slartibartfast")), 0, 1, 2, 3, 4, 5, 7);
assertCorrectFiltering(columnIndex, notEq(col, null), 0, 3, 4, 6);
assertCorrectFiltering(columnIndex, gt(col, stringBinary("Marvin")), 4, 6);
assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Marvin")), 4, 6);
assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 0);
assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 0, 3, 4);
assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 0);
assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7);
// Round 3: non-increasing page ranges -> DESCENDING boundary order.
builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
sb = new StatsBuilder();
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, stringBinary("Slartibartfast")));
builder.add(sb.stats(type, null, null, null, null, null));
builder.add(sb.stats(type, stringBinary("Prefect"), stringBinary("Jeltz"), null));
builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Dent")));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, null, null));
builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Beeblebrox"), null, null));
assertEquals(8, builder.getPageCount());
assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
columnIndex = builder.build();
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
assertCorrectNullCounts(columnIndex, 2, 0, 5, 1, 0, 2, 2, 2);
assertCorrectNullPages(columnIndex, true, false, true, false, false, true, true, false);
assertCorrectValues(columnIndex.getMaxValues(), null, stringBinary("Slartibartfast"), null, stringBinary("Prefect"), stringBinary("Dent"), null, null, stringBinary("Dent"));
assertCorrectValues(columnIndex.getMinValues(), null, stringBinary("Slartibartfast"), null, stringBinary("Jeltz"), stringBinary("Dent"), null, null, stringBinary("Beeblebrox"));
assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 3);
assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6, 7);
assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 5, 6, 7);
assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 4, 7);
assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1);
assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 3);
assertCorrectFiltering(columnIndex, lt(col, stringBinary("Marvin")), 3, 4, 7);
assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Marvin")), 3, 4, 7);
assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 7);
assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7);
}
Aggregations