Search in sources :

Example 1 with GenericFixed

use of org.apache.avro.generic.GenericFixed in project parquet-mr by apache.

From the class TestReadWriteOldListBehavior, method testAllUsingDefaultAvroSchema.

@Test
public void testAllUsingDefaultAvroSchema() throws Exception {
    // Write a file with the low-level Parquet API (no Avro involved), then read it
    // back with AvroParquetReader and verify every field converts to the expected
    // Avro generic value under the default (inferred) Avro schema.
    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    // Delete immediately so the writer can create the file itself.
    tmp.delete();
    Path file = new Path(tmp.getPath());
    // write file using Parquet APIs
    ParquetWriter<Map<String, Object>> parquetWriter = new ParquetWriter<Map<String, Object>>(file, new WriteSupport<Map<String, Object>>() {

        private RecordConsumer recordConsumer;

        @Override
        public WriteContext init(Configuration configuration) {
            // Schema comes from the shared test fixture; no extra key/value metadata.
            return new WriteContext(MessageTypeParser.parseMessageType(TestAvroSchemaConverter.ALL_PARQUET_SCHEMA), new HashMap<String, String>());
        }

        @Override
        public void prepareForWrite(RecordConsumer recordConsumer) {
            this.recordConsumer = recordConsumer;
        }

        @Override
        public void write(Map<String, Object> record) {
            // Fields must be emitted in schema order; `index` is incremented as each
            // field is closed so start/end use the same position.
            recordConsumer.startMessage();
            int index = 0;
            recordConsumer.startField("myboolean", index);
            recordConsumer.addBoolean((Boolean) record.get("myboolean"));
            recordConsumer.endField("myboolean", index++);
            recordConsumer.startField("myint", index);
            recordConsumer.addInteger((Integer) record.get("myint"));
            recordConsumer.endField("myint", index++);
            recordConsumer.startField("mylong", index);
            recordConsumer.addLong((Long) record.get("mylong"));
            recordConsumer.endField("mylong", index++);
            recordConsumer.startField("myfloat", index);
            recordConsumer.addFloat((Float) record.get("myfloat"));
            recordConsumer.endField("myfloat", index++);
            recordConsumer.startField("mydouble", index);
            recordConsumer.addDouble((Double) record.get("mydouble"));
            recordConsumer.endField("mydouble", index++);
            recordConsumer.startField("mybytes", index);
            recordConsumer.addBinary(Binary.fromReusedByteBuffer((ByteBuffer) record.get("mybytes")));
            recordConsumer.endField("mybytes", index++);
            recordConsumer.startField("mystring", index);
            recordConsumer.addBinary(Binary.fromString((String) record.get("mystring")));
            recordConsumer.endField("mystring", index++);
            // Nested record: a group with a single int field.
            recordConsumer.startField("mynestedrecord", index);
            recordConsumer.startGroup();
            recordConsumer.startField("mynestedint", 0);
            recordConsumer.addInteger((Integer) record.get("mynestedint"));
            recordConsumer.endField("mynestedint", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("mynestedrecord", index++);
            // Enums are written as their string symbol.
            recordConsumer.startField("myenum", index);
            recordConsumer.addBinary(Binary.fromString((String) record.get("myenum")));
            recordConsumer.endField("myenum", index++);
            // 2-level (old-style) list: group wrapping a repeated "array" field.
            recordConsumer.startField("myarray", index);
            recordConsumer.startGroup();
            recordConsumer.startField("array", 0);
            for (int val : (int[]) record.get("myarray")) {
                recordConsumer.addInteger(val);
            }
            recordConsumer.endField("array", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("myarray", index++);
            recordConsumer.startField("myoptionalarray", index);
            recordConsumer.startGroup();
            recordConsumer.startField("array", 0);
            for (int val : (int[]) record.get("myoptionalarray")) {
                recordConsumer.addInteger(val);
            }
            recordConsumer.endField("array", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("myoptionalarray", index++);
            // 3-level list of optional elements: each "list" entry is a group whose
            // "element" field is omitted entirely when the value is null.
            recordConsumer.startField("myarrayofoptional", index);
            recordConsumer.startGroup();
            recordConsumer.startField("list", 0);
            for (Integer val : (Integer[]) record.get("myarrayofoptional")) {
                recordConsumer.startGroup();
                if (val != null) {
                    recordConsumer.startField("element", 0);
                    recordConsumer.addInteger(val);
                    recordConsumer.endField("element", 0);
                }
                recordConsumer.endGroup();
            }
            recordConsumer.endField("list", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("myarrayofoptional", index++);
            // Array of records, written column-style: all "a" values, then all "b"
            // values, inside a single repeated group.
            recordConsumer.startField("myrecordarray", index);
            recordConsumer.startGroup();
            recordConsumer.startField("array", 0);
            recordConsumer.startGroup();
            recordConsumer.startField("a", 0);
            for (int val : (int[]) record.get("myrecordarraya")) {
                recordConsumer.addInteger(val);
            }
            recordConsumer.endField("a", 0);
            recordConsumer.startField("b", 1);
            for (int val : (int[]) record.get("myrecordarrayb")) {
                recordConsumer.addInteger(val);
            }
            recordConsumer.endField("b", 1);
            recordConsumer.endGroup();
            recordConsumer.endField("array", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("myrecordarray", index++);
            // Map: all keys then all values inside the repeated "map" group.
            recordConsumer.startField("mymap", index);
            recordConsumer.startGroup();
            recordConsumer.startField("map", 0);
            recordConsumer.startGroup();
            Map<String, Integer> mymap = (Map<String, Integer>) record.get("mymap");
            recordConsumer.startField("key", 0);
            for (String key : mymap.keySet()) {
                recordConsumer.addBinary(Binary.fromString(key));
            }
            recordConsumer.endField("key", 0);
            recordConsumer.startField("value", 1);
            for (int val : mymap.values()) {
                recordConsumer.addInteger(val);
            }
            recordConsumer.endField("value", 1);
            recordConsumer.endGroup();
            recordConsumer.endField("map", 0);
            recordConsumer.endGroup();
            recordConsumer.endField("mymap", index++);
            recordConsumer.startField("myfixed", index);
            recordConsumer.addBinary(Binary.fromReusedByteArray((byte[]) record.get("myfixed")));
            recordConsumer.endField("myfixed", index++);
            recordConsumer.endMessage();
        }
    });
    // One record exercising every field type in the schema.
    Map<String, Object> record = new HashMap<String, Object>();
    record.put("myboolean", true);
    record.put("myint", 1);
    record.put("mylong", 2L);
    record.put("myfloat", 3.1f);
    record.put("mydouble", 4.1);
    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
    record.put("mystring", "hello");
    record.put("myenum", "a");
    record.put("mynestedint", 1);
    record.put("myarray", new int[] { 1, 2, 3 });
    record.put("myoptionalarray", new int[] { 1, 2, 3 });
    record.put("myarrayofoptional", new Integer[] { 1, null, 2, null, 3 });
    record.put("myrecordarraya", new int[] { 1, 2, 3 });
    record.put("myrecordarrayb", new int[] { 4, 5, 6 });
    record.put("mymap", ImmutableMap.of("a", 1, "b", 2));
    record.put("myfixed", new byte[] { (byte) 65 });
    parquetWriter.write(record);
    parquetWriter.close();
    // Build the Avro generic values we expect the reader to produce.
    Schema nestedRecordSchema = Schema.createRecord("mynestedrecord", null, null, false);
    nestedRecordSchema.setFields(Arrays.asList(new Schema.Field("mynestedint", Schema.create(Schema.Type.INT), null, null)));
    GenericData.Record nestedRecord = new GenericRecordBuilder(nestedRecordSchema).set("mynestedint", 1).build();
    List<Integer> integerArray = Arrays.asList(1, 2, 3);
    Schema recordArraySchema = Schema.createRecord("array", null, null, false);
    recordArraySchema.setFields(Arrays.asList(new Schema.Field("a", Schema.create(Schema.Type.INT), null, null), new Schema.Field("b", Schema.create(Schema.Type.INT), null, null)));
    GenericRecordBuilder builder = new GenericRecordBuilder(recordArraySchema);
    List<GenericData.Record> recordArray = new ArrayList<GenericData.Record>();
    recordArray.add(builder.set("a", 1).set("b", 4).build());
    recordArray.add(builder.set("a", 2).set("b", 5).build());
    recordArray.add(builder.set("a", 3).set("b", 6).build());
    GenericData.Array<GenericData.Record> genericRecordArray = new GenericData.Array<GenericData.Record>(Schema.createArray(recordArraySchema), recordArray);
    GenericFixed genericFixed = new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] { (byte) 65 });
    // 3-level lists are deserialized with the extra layer present
    Schema elementSchema = record("list", optionalField("element", primitive(Schema.Type.INT)));
    GenericRecordBuilder elementBuilder = new GenericRecordBuilder(elementSchema);
    GenericData.Array<GenericData.Record> genericRecordArrayWithNullIntegers = new GenericData.Array<GenericData.Record>(array(elementSchema), Arrays.asList(elementBuilder.set("element", 1).build(), elementBuilder.set("element", null).build(), elementBuilder.set("element", 2).build(), elementBuilder.set("element", null).build(), elementBuilder.set("element", 3).build()));
    // Read back and compare field by field.
    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    GenericRecord nextRecord = reader.read();
    assertNotNull(nextRecord);
    assertEquals(true, nextRecord.get("myboolean"));
    assertEquals(1, nextRecord.get("myint"));
    assertEquals(2L, nextRecord.get("mylong"));
    assertEquals(3.1f, nextRecord.get("myfloat"));
    assertEquals(4.1, nextRecord.get("mydouble"));
    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
    // str(...) is a test helper (defined elsewhere); presumably wraps a String as
    // the Avro string representation — confirm against the test utility class.
    assertEquals(str("hello"), nextRecord.get("mystring"));
    assertEquals(str("a"), nextRecord.get("myenum"));
    assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
    assertEquals(integerArray, nextRecord.get("myarray"));
    assertEquals(integerArray, nextRecord.get("myoptionalarray"));
    assertEquals(genericRecordArrayWithNullIntegers, nextRecord.get("myarrayofoptional"));
    assertEquals(genericRecordArray, nextRecord.get("myrecordarray"));
    assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap"));
    assertEquals(genericFixed, nextRecord.get("myfixed"));
}
Also used : GenericFixed(org.apache.avro.generic.GenericFixed) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) AvroTestUtil.optionalField(org.apache.parquet.avro.AvroTestUtil.optionalField) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) GenericFixed(org.apache.avro.generic.GenericFixed) Path(org.apache.hadoop.fs.Path) GenericData(org.apache.avro.generic.GenericData) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Example 2 with GenericFixed

use of org.apache.avro.generic.GenericFixed in project parquet-mr by apache.

From the class TestReadWriteOldListBehavior, method testArrayWithNullValues.

@Test
public void testArrayWithNullValues() throws Exception {
    // Writing a list containing nulls with the old (2-level) list structure is not
    // representable; the writer must fail with a message pointing at the config key.
    Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream());
    File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tempFile.deleteOnExit();
    // Remove the placeholder so the writer can create the file itself.
    tempFile.delete();
    Path outputPath = new Path(tempFile.getPath());
    // Values for the non-list fields of the record under test.
    GenericData.Record nested = new GenericRecordBuilder(schema.getField("mynestedrecord").schema()).set("mynestedint", 1).build();
    List<Integer> ints = Arrays.asList(1, 2, 3);
    GenericData.Array<Integer> intArray = new GenericData.Array<Integer>(Schema.createArray(Schema.create(Schema.Type.INT)), ints);
    GenericFixed fixedValue = new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] { (byte) 65 });
    List<Integer> noInts = new ArrayList<Integer>();
    ImmutableMap noEntries = new ImmutableMap.Builder<String, Integer>().build();
    // The offending value: an array of optional ints containing actual nulls.
    Schema optionalIntArraySchema = Schema.createArray(optional(Schema.create(Schema.Type.INT)));
    GenericData.Array<Integer> intArrayWithNulls = new GenericData.Array<Integer>(optionalIntArraySchema, Arrays.asList(1, null, 2, null, 3));
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(schema);
    recordBuilder.set("mynull", null);
    recordBuilder.set("myboolean", true);
    recordBuilder.set("myint", 1);
    recordBuilder.set("mylong", 2L);
    recordBuilder.set("myfloat", 3.1f);
    recordBuilder.set("mydouble", 4.1);
    recordBuilder.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
    recordBuilder.set("mystring", "hello");
    recordBuilder.set("mynestedrecord", nested);
    recordBuilder.set("myenum", "a");
    recordBuilder.set("myarray", intArray);
    recordBuilder.set("myemptyarray", noInts);
    recordBuilder.set("myoptionalarray", intArray);
    recordBuilder.set("myarrayofoptional", intArrayWithNulls);
    recordBuilder.set("mymap", ImmutableMap.of("a", 1, "b", 2));
    recordBuilder.set("myemptymap", noEntries);
    recordBuilder.set("myfixed", fixedValue);
    GenericData.Record record = recordBuilder.build();
    final AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(outputPath, schema);
    try {
        writer.write(record);
        fail("Should not succeed writing an array with null values");
    } catch (Exception e) {
        // The failure must tell the user which setting controls the list structure.
        Assert.assertTrue("Error message should provide context and help", e.getMessage().contains("parquet.avro.write-old-list-structure"));
    } finally {
        writer.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) GenericFixed(org.apache.avro.generic.GenericFixed) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) ImmutableMap(com.google.common.collect.ImmutableMap) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) GenericFixed(org.apache.avro.generic.GenericFixed) Test(org.junit.Test)

Example 3 with GenericFixed

use of org.apache.avro.generic.GenericFixed in project streamline by hortonworks.

From the class AvroStreamsSnapshotDeserializer, method convertValue.

/**
 * Recursively converts a deserialized Avro value into plain Java objects:
 * records and maps become immutable string-keyed maps, collections become
 * immutable lists, byte buffers and fixed values become byte arrays, enum
 * symbols and char sequences become strings, and anything else is returned
 * unchanged. Null entries inside records, maps, and collections are dropped
 * because the immutable builders reject nulls.
 */
private Object convertValue(Object deserializedObj) {
    // Avro record: map each field name to its converted value.
    if (deserializedObj instanceof IndexedRecord) {
        IndexedRecord record = (IndexedRecord) deserializedObj;
        ImmutableMap.Builder<String, Object> fieldMap = ImmutableMap.builder();
        for (Schema.Field field : record.getSchema().getFields()) {
            Object converted = convertValue(record.get(field.pos()));
            if (converted != null) {
                fieldMap.put(field.name(), converted);
            }
        }
        return fieldMap.build();
    }
    // Byte buffer: copy the remaining bytes into a fresh array.
    if (deserializedObj instanceof ByteBuffer) {
        ByteBuffer buffer = (ByteBuffer) deserializedObj;
        byte[] copy = new byte[buffer.remaining()];
        buffer.get(copy);
        return copy;
    }
    // Enum symbols and character sequences are both rendered as strings.
    if (deserializedObj instanceof GenericEnumSymbol) {
        return deserializedObj.toString();
    }
    if (deserializedObj instanceof CharSequence) {
        return deserializedObj.toString();
    }
    // Map: stringify keys and convert values recursively.
    if (deserializedObj instanceof Map) {
        Map<Object, Object> map = (Map<Object, Object>) deserializedObj;
        ImmutableMap.Builder<String, Object> entries = ImmutableMap.builder();
        for (Map.Entry<Object, Object> entry : map.entrySet()) {
            Object converted = convertValue(entry.getValue());
            if (converted != null) {
                entries.put(entry.getKey().toString(), converted);
            }
        }
        return entries.build();
    }
    // Array/collection: convert each element recursively.
    if (deserializedObj instanceof Collection) {
        Collection<Object> collection = (Collection<Object>) deserializedObj;
        ImmutableList.Builder<Object> elements = ImmutableList.builder();
        for (Object element : collection) {
            Object converted = convertValue(element);
            if (converted != null) {
                elements.add(converted);
            }
        }
        return elements.build();
    }
    // Fixed type: expose the raw bytes.
    if (deserializedObj instanceof GenericFixed) {
        return ((GenericFixed) deserializedObj).bytes();
    }
    // Remaining primitives pass through untouched.
    return deserializedObj;
}
Also used : GenericFixed(org.apache.avro.generic.GenericFixed) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) GenericEnumSymbol(org.apache.avro.generic.GenericEnumSymbol) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableMap(com.google.common.collect.ImmutableMap) Map(java.util.Map)

Example 4 with GenericFixed

use of org.apache.avro.generic.GenericFixed in project flink by apache.

From the class AvroRowDeserializationSchema, method convertAvroType.

/**
 * Converts a single deserialized Avro value to the Java representation expected
 * for the given Flink {@code TypeInformation}, dispatching on the Avro schema type.
 * Nulls pass through unchanged. Helper methods (convertAvroRecordToRow,
 * convertToObjectArray, convertToDecimal, convertToDate, convertToTime,
 * convertToTimestamp) are defined elsewhere in this class.
 */
private Object convertAvroType(Schema schema, TypeInformation<?> info, Object object) {
    if (object == null) {
        return null;
    }
    switch(schema.getType()) {
        case RECORD:
            if (object instanceof IndexedRecord) {
                return convertAvroRecordToRow(schema, (RowTypeInfo) info, (IndexedRecord) object);
            }
            throw new IllegalStateException("IndexedRecord expected but was: " + object.getClass());
        case ENUM:
        case STRING:
            // Enum symbols and Utf8/CharSequence values are both stringified.
            return object.toString();
        case ARRAY:
            // Element type comes from the Flink type info; the component info accessor
            // differs between basic and object array type infos.
            if (info instanceof BasicArrayTypeInfo) {
                final TypeInformation<?> elementInfo = ((BasicArrayTypeInfo<?, ?>) info).getComponentInfo();
                return convertToObjectArray(schema.getElementType(), elementInfo, object);
            } else {
                final TypeInformation<?> elementInfo = ((ObjectArrayTypeInfo<?, ?>) info).getComponentInfo();
                return convertToObjectArray(schema.getElementType(), elementInfo, object);
            }
        case MAP:
            // Keys are stringified; values are converted recursively against the
            // map's value schema/type info.
            final MapTypeInfo<?, ?> mapTypeInfo = (MapTypeInfo<?, ?>) info;
            final Map<String, Object> convertedMap = new HashMap<>();
            final Map<?, ?> map = (Map<?, ?>) object;
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                convertedMap.put(entry.getKey().toString(), convertAvroType(schema.getValueType(), mapTypeInfo.getValueTypeInfo(), entry.getValue()));
            }
            return convertedMap;
        case UNION:
            // Unwrap nullable unions ([null, T] or [T, null]) and single-member
            // unions, then convert against the actual member schema; any other
            // union shape is passed through as-is.
            final List<Schema> types = schema.getTypes();
            final int size = types.size();
            final Schema actualSchema;
            if (size == 2 && types.get(0).getType() == Schema.Type.NULL) {
                actualSchema = types.get(1);
            } else if (size == 2 && types.get(1).getType() == Schema.Type.NULL) {
                actualSchema = types.get(0);
            } else if (size == 1) {
                actualSchema = types.get(0);
            } else {
                // generic type
                return object;
            }
            return convertAvroType(actualSchema, info, object);
        case FIXED:
            // Fixed bytes may encode a decimal when the target type is BigDecimal.
            final byte[] fixedBytes = ((GenericFixed) object).bytes();
            if (info == Types.BIG_DEC) {
                return convertToDecimal(schema, fixedBytes);
            }
            return fixedBytes;
        case BYTES:
            // Copy the buffer's remaining bytes; may encode a decimal as above.
            final ByteBuffer byteBuffer = (ByteBuffer) object;
            final byte[] bytes = new byte[byteBuffer.remaining()];
            byteBuffer.get(bytes);
            if (info == Types.BIG_DEC) {
                return convertToDecimal(schema, bytes);
            }
            return bytes;
        case INT:
            // Presumably Avro date / time-millis logical types — confirm against the
            // convertToDate/convertToTime helpers.
            if (info == Types.SQL_DATE) {
                return convertToDate(object);
            } else if (info == Types.SQL_TIME) {
                return convertToTime(object);
            }
            return object;
        case LONG:
            // Timestamp precision (micros vs millis) is taken from the logical type.
            if (info == Types.SQL_TIMESTAMP) {
                return convertToTimestamp(object, schema.getLogicalType() == LogicalTypes.timestampMicros());
            } else if (info == Types.SQL_TIME) {
                return convertToTime(object);
            }
            return object;
        case FLOAT:
        case DOUBLE:
        case BOOLEAN:
            return object;
    }
    throw new RuntimeException("Unsupported Avro type:" + schema);
}
Also used : GenericFixed(org.apache.avro.generic.GenericFixed) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) AbstractDeserializationSchema(org.apache.flink.api.common.serialization.AbstractDeserializationSchema) Schema(org.apache.avro.Schema) DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema) ByteBuffer(java.nio.ByteBuffer) ObjectArrayTypeInfo(org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo) MapTypeInfo(org.apache.flink.api.java.typeutils.MapTypeInfo) HashMap(java.util.HashMap) Map(java.util.Map) BasicArrayTypeInfo(org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo)

Example 5 with GenericFixed

use of org.apache.avro.generic.GenericFixed in project flink by apache.

From the class AvroToRowDataConverters, method createDecimalConverter.

/**
 * Builds a converter that turns an Avro decimal value — arriving as a
 * {@link GenericFixed}, a {@link ByteBuffer}, or a raw {@code byte[]} of
 * unscaled big-endian bytes — into a {@code DecimalData} with the precision
 * and scale taken from the given logical type.
 */
private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) {
    // Capture precision/scale once; the lambda reuses them per value.
    final int targetPrecision = decimalType.getPrecision();
    final int targetScale = decimalType.getScale();
    return avroObject -> {
        byte[] unscaled;
        if (avroObject instanceof GenericFixed) {
            unscaled = ((GenericFixed) avroObject).bytes();
        } else if (avroObject instanceof ByteBuffer) {
            // Copy out the buffer's remaining bytes without assuming a backing array.
            final ByteBuffer buffer = (ByteBuffer) avroObject;
            unscaled = new byte[buffer.remaining()];
            buffer.get(unscaled);
        } else {
            unscaled = (byte[]) avroObject;
        }
        return DecimalData.fromUnscaledBytes(unscaled, targetPrecision, targetScale);
    };
}
Also used : ChronoField(java.time.temporal.ChronoField) Array(java.lang.reflect.Array) GenericArrayData(org.apache.flink.table.data.GenericArrayData) HashMap(java.util.HashMap) RowType(org.apache.flink.table.types.logical.RowType) ByteBuffer(java.nio.ByteBuffer) GenericRowData(org.apache.flink.table.data.GenericRowData) DecimalType(org.apache.flink.table.types.logical.DecimalType) GenericMapData(org.apache.flink.table.data.GenericMapData) Map(java.util.Map) LocalTime(java.time.LocalTime) LogicalTypeUtils(org.apache.flink.table.types.logical.utils.LogicalTypeUtils) IndexedRecord(org.apache.avro.generic.IndexedRecord) GenericRecord(org.apache.avro.generic.GenericRecord) RowData(org.apache.flink.table.data.RowData) GenericFixed(org.apache.avro.generic.GenericFixed) TimestampData(org.apache.flink.table.data.TimestampData) DataTypes(org.apache.flink.table.api.DataTypes) DecimalData(org.apache.flink.table.data.DecimalData) ArrayType(org.apache.flink.table.types.logical.ArrayType) Instant(java.time.Instant) AvroSchemaConverter.extractValueTypeToAvroMap(org.apache.flink.formats.avro.typeutils.AvroSchemaConverter.extractValueTypeToAvroMap) Serializable(java.io.Serializable) StringData(org.apache.flink.table.data.StringData) List(java.util.List) LogicalType(org.apache.flink.table.types.logical.LogicalType) LocalDate(java.time.LocalDate) Internal(org.apache.flink.annotation.Internal) GenericFixed(org.apache.avro.generic.GenericFixed) ByteBuffer(java.nio.ByteBuffer)

Aggregations

GenericFixed (org.apache.avro.generic.GenericFixed)10 Schema (org.apache.avro.Schema)8 GenericRecord (org.apache.avro.generic.GenericRecord)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 Map (java.util.Map)6 GenericData (org.apache.avro.generic.GenericData)6 File (java.io.File)5 ByteBuffer (java.nio.ByteBuffer)5 ArrayList (java.util.ArrayList)5 HashMap (java.util.HashMap)5 GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder)5 Path (org.apache.hadoop.fs.Path)5 Test (org.junit.Test)5 BigInteger (java.math.BigInteger)3 IndexedRecord (org.apache.avro.generic.IndexedRecord)3 Utf8 (org.apache.avro.util.Utf8)2 Serializable (java.io.Serializable)1 Array (java.lang.reflect.Array)1 BigDecimal (java.math.BigDecimal)1 IntBuffer (java.nio.IntBuffer)1