Example 1 with SpecificDatumReader

Use of org.apache.avro.specific.SpecificDatumReader in project flink by apache.

From class RollingSinkITCase, method testNonRollingAvroKeyValueWithCompressionWriter.

/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Type.INT);
    Schema valueSchema = Schema.create(Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath)
            .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
            .setBucketer(new NonRollingBucketer())
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");
    source.addSink(sink);
    env.execute("RollingSink Avro KeyValue Writer Test");
    GenericData.setStringType(valueSchema, StringType.String);
    Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), Schema (org.apache.avro.Schema), DataFileStream (org.apache.avro.file.DataFileStream), AvroKeyValue (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
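A minimal standalone sketch of the same verification outside the Flink/HDFS test harness, assuming a local copy of one bucket file (the file name part-0-0 is illustrative). Without the GenericData.setStringType call, string values come back as Utf8, so the value type is kept as CharSequence here:

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue;

public class ReadAvroKeyValueFile {
    public static void main(String[] args) throws Exception {
        Schema keySchema = Schema.create(Type.INT);
        Schema valueSchema = Schema.create(Type.STRING);
        // AvroKeyValue.getSchema builds the key/value wrapper record schema the sink wrote with
        Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
        SpecificDatumReader<GenericRecord> reader = new SpecificDatumReader<>(elementSchema);
        // "part-0-0" is an assumed local copy of one bucket file
        try (DataFileReader<GenericRecord> fileReader =
                new DataFileReader<>(new File("part-0-0"), reader)) {
            for (GenericRecord record : fileReader) {
                AvroKeyValue<Integer, CharSequence> kv = new AvroKeyValue<>(record);
                System.out.println(kv.getKey() + " -> " + kv.getValue());
            }
        }
    }
}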

Example 2 with SpecificDatumReader

Use of org.apache.avro.specific.SpecificDatumReader in project flink by apache.

From class AvroInputFormat, method initReader.

private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
    DatumReader<E> datumReader;
    if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
        datumReader = new GenericDatumReader<E>();
    } else {
        datumReader = org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)
                ? new SpecificDatumReader<E>(avroValueType)
                : new ReflectDatumReader<E>(avroValueType);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("Opening split {}", split);
    }
    SeekableInput in = new FSDataInputStreamWrapper(stream, split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
    DataFileReader<E> dataFileReader = (DataFileReader) DataFileReader.openReader(in, datumReader);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
    }
    end = split.getStart() + split.getLength();
    recordsReadSinceLastSync = 0;
    return dataFileReader;
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), SeekableInput (org.apache.avro.file.SeekableInput), FSDataInputStreamWrapper (org.apache.flink.api.avro.FSDataInputStreamWrapper), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader)
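The ternary above is the interesting part: it picks one of Avro's three reader flavors based on the target class. A minimal sketch of that selection logic in isolation (the helper name readerFor is made up for illustration):

import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificRecordBase;

public class DatumReaders {
    public static <E> DatumReader<E> readerFor(Class<E> type) {
        if (GenericRecord.class == type) {
            // no generated class available: read into generic records
            return new GenericDatumReader<>();
        }
        if (SpecificRecordBase.class.isAssignableFrom(type)) {
            // Avro-generated class: the schema is known at compile time
            return new SpecificDatumReader<>(type);
        }
        // plain POJO: fall back to reflection
        return new ReflectDatumReader<>(type);
    }
}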

Example 3 with SpecificDatumReader

Use of org.apache.avro.specific.SpecificDatumReader in project databus by linkedin.

From class DbusEventAvroDecoder, method getTypedValue.

@Override
public <T extends SpecificRecord> T getTypedValue(DbusEvent e, T reuse, Class<T> targetClass) {
    if (null == reuse) {
        try {
            reuse = targetClass.newInstance();
        } catch (InstantiationException e1) {
            LOG.error("getTypedValue class instantiation error (" + e1.getMessage() + ") for event " + e, e1);
            return null;
        } catch (IllegalAccessException e1) {
            LOG.error("getTypedValue access error (" + e1.getMessage() + ") for event " + e, e1);
            return null;
        }
    }
    byte[] md5 = new byte[16];
    e.schemaId(md5);
    SchemaId schemaId = new SchemaId(md5);
    VersionedSchema writerSchema = _schemaSet.getById(schemaId);
    if (null == writerSchema) {
        LOG.error("Unable to find schema for id " + schemaId + "; event = " + e);
        throw new DatabusRuntimeException("No schema available to decode event " + e);
    }
    ByteBuffer valueBuffer = e.value();
    byte[] valueBytes = new byte[valueBuffer.remaining()];
    valueBuffer.get(valueBytes);
    try {
        //JsonDecoder jsonDec = new JsonDecoder(sourceSchema.getSchema(),new ByteArrayInputStream(valueBytes));
        binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(valueBytes, binDecoder.get()));
        SpecificDatumReader<SpecificRecord> reader = new SpecificDatumReader<SpecificRecord>(writerSchema.getSchema(), reuse.getSchema());
        return targetClass.cast(reader.read(reuse, binDecoder.get()));
    } catch (IOException e1) {
        LOG.error("getTypedValue IO error (" + e1.getMessage() + ") for event " + e, e1);
    }
    return reuse;
}
Also used: SpecificRecord (org.apache.avro.specific.SpecificRecord), SchemaId (com.linkedin.databus2.schemas.SchemaId), IOException (java.io.IOException), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), VersionedSchema (com.linkedin.databus2.schemas.VersionedSchema), ByteBuffer (java.nio.ByteBuffer), DatabusRuntimeException (com.linkedin.databus.core.DatabusRuntimeException)
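What makes this decoder evolution-safe is the two-argument constructor: the writer schema (looked up by MD5 id) describes the bytes on the wire, while the reader schema describes the class being filled in. A minimal standalone sketch of that resolution step, where MyRecord is a hypothetical Avro-generated SpecificRecord class:

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;

public class SchemaResolvingDecode {
    // payload was encoded with writerSchema, possibly an older version of MyRecord's schema
    public static MyRecord decode(byte[] payload, Schema writerSchema) throws IOException {
        SpecificDatumReader<MyRecord> reader =
                new SpecificDatumReader<>(writerSchema, MyRecord.getClassSchema());
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(payload, null);
        return reader.read(null, decoder);
    }
}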

Example 4 with SpecificDatumReader

Use of org.apache.avro.specific.SpecificDatumReader in project gora by apache.

From class AccumuloStore, method populate.

public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
    ByteSequence row = null;
    Map<Utf8, Object> currentMap = null;
    List currentArray = null;
    Text currentFam = null;
    int currentPos = 0;
    Schema currentSchema = null;
    Field currentField = null;
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
    while (iter.hasNext()) {
        Entry<Key, Value> entry = iter.next();
        if (row == null) {
            row = entry.getKey().getRowData();
        }
        byte[] val = entry.getValue().get();
        Field field = fieldMap.get(getFieldName(entry));
        if (currentMap != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, currentMap);
                currentMap = null;
            }
        } else if (currentArray != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
                currentArray = null;
            }
        }
        switch (field.schema().getType()) {
            // first entry only. Next are handled above on the next loop
            case MAP:
                currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getValueType();
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                break;
            case ARRAY:
                currentArray = new DirtyListWrapper<>(new ArrayList<>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getElementType();
                currentField = field;
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                break;
            // default value of null acts like union with null
            case UNION:
                Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
                // map and array were coded without union index so need to be read the same way
                if (effectiveSchema.getType() == Type.ARRAY) {
                    currentArray = new DirtyListWrapper<>(new ArrayList<>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    currentSchema = field.schema().getElementType();
                    currentField = field;
                    currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                    break;
                } else if (effectiveSchema.getType() == Type.MAP) {
                    currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    currentSchema = effectiveSchema.getValueType();
                    currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                    break;
                }
            // continue like a regular top-level union
            case RECORD:
                SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
                persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
                break;
            default:
                persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
        }
    }
    if (currentMap != null) {
        persistent.put(currentPos, currentMap);
    } else if (currentArray != null) {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
    }
    persistent.clearDirty();
    return row;
}
Also used: DirtyMapWrapper (org.apache.gora.persistency.impl.DirtyMapWrapper), HashMap (java.util.HashMap), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), BinaryDecoder (org.apache.avro.io.BinaryDecoder), Field (org.apache.avro.Schema.Field), Value (org.apache.accumulo.core.data.Value), Utf8 (org.apache.avro.util.Utf8), List (java.util.List), NodeList (org.w3c.dom.NodeList), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), ByteSequence (org.apache.accumulo.core.data.ByteSequence), Key (org.apache.accumulo.core.data.Key)
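One detail worth pulling out of the RECORD branch: the same BinaryDecoder instance is passed back into binaryDecoder() on every entry, which lets the factory re-point it at the new bytes instead of allocating a fresh decoder. A minimal sketch of that reuse pattern on its own (schema and payloads are assumed inputs):

import java.io.IOException;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;

public class DecoderReuse {
    public static void readAll(Schema schema, List<byte[]> payloads) throws IOException {
        SpecificDatumReader<GenericRecord> reader = new SpecificDatumReader<>(schema);
        // seed decoder; passing it back as 'reuse' below avoids an allocation per entry
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
        for (byte[] payload : payloads) {
            decoder = DecoderFactory.get().binaryDecoder(payload, decoder);
            GenericRecord record = reader.read(null, decoder);
            System.out.println(record);
        }
    }
}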

Example 5 with SpecificDatumReader

Use of org.apache.avro.specific.SpecificDatumReader in project gora by apache.

From class AvroSerializerUtil, method deserializer.

public static Object deserializer(Object value, Schema schema) throws IOException {
    String schemaId = schema.getFullName();
    SpecificDatumReader<?> reader = readerMap.get(schemaId);
    if (reader == null) {
        reader = new SpecificDatumReader(schema); // ignore dirty bits
        // if another thread registered a reader for this schema first, use that one instead
        SpecificDatumReader localReader = null;
        if ((localReader = readerMap.putIfAbsent(schemaId, reader)) != null) {
            reader = localReader;
        }
    }
    // create a fresh decoder for this payload (the cached instance is not passed as the reuse argument)
    BinaryDecoder decoderFromCache = decoders.get();
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder((byte[]) value, null);
    // put it in the thread-local cache if the initial get came back empty
    if (decoderFromCache == null) {
        decoders.set(decoder);
    }
    Object result = reader.read(null, decoder);
    return result;
}
Also used: SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), BinaryDecoder (org.apache.avro.io.BinaryDecoder)
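For symmetry, the write path that produces the byte[] consumed above looks like this. A minimal sketch of the encoding counterpart, not taken from Gora (class and method names are illustrative):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.specific.SpecificDatumWriter;

public class AvroSerializeSketch {
    public static byte[] serialize(Object value, Schema schema) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        SpecificDatumWriter<Object> writer = new SpecificDatumWriter<>(schema);
        BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(value, encoder);
        // binaryEncoder buffers internally; flush before taking the bytes
        encoder.flush();
        return out.toByteArray();
    }
}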

Aggregations

SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 17
Schema (org.apache.avro.Schema): 6
HashMap (java.util.HashMap): 5
GenericRecord (org.apache.avro.generic.GenericRecord): 5
BinaryDecoder (org.apache.avro.io.BinaryDecoder): 5
Test (org.junit.Test): 5
DataFileStream (org.apache.avro.file.DataFileStream): 4
Utf8 (org.apache.avro.util.Utf8): 4
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 4
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4
Path (org.apache.hadoop.fs.Path): 4
DataFileReader (org.apache.avro.file.DataFileReader): 3
IOException (java.io.IOException): 2
ArrayList (java.util.ArrayList): 2
Type (org.apache.avro.Schema.Type): 2
ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader): 2
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint): 2
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 2
AvroKeyValueSinkWriter (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter): 2
AvroKeyValue (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue): 2