Search in sources :

Example 1 with BinaryDecoder

Use of BinaryDecoder in project beam by apache.

the class AvroSource method readMetadataFromFile.

  /**
   * Reads the {@link AvroMetadata} from the header of an Avro file.
   *
   * <p>This method parses the header of an Avro
   * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">
   * Object Container File</a>.
   *
   * @throws IOException if the file is an invalid format.
   */
// Walks the header of the Avro file identified by fileResource and builds an AvroMetadata from
// a sync-marker array, the codec name and the schema string found in the header's metadata map.
// NOTE(review): this excerpt has visibly lost text (closing braces and at least one call
// argument are missing) — confirm every detail against the upstream source before relying on it.
static AvroMetadata readMetadataFromFile(ResourceId fileResource) throws IOException {
    String codec = null;
    String schemaString = null;
    byte[] syncMarker;
    // NOTE(review): the argument to Channels.newInputStream is missing from this excerpt —
    // presumably a channel opened on fileResource; confirm.
    try (InputStream stream = Channels.newInputStream( {
        // No decoder instance is reused here (second argument is null).
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null);
        // The header of an object container file begins with a four-byte magic number, followed
        // by the file metadata (including the schema and codec), encoded as a map. Finally, the
        // header ends with the file's 16-byte sync marker.
        // See the Avro specification's "Object Container Files" section for details on
        // the encoding of container files.
        // Read the magic number.
        // NOTE(review): no call that fills `magic` from the decoder is visible in this excerpt, so
        // as shown the comparison below runs against a freshly allocated (zero-filled) array; a
        // read statement was probably dropped — confirm.
        byte[] magic = new byte[DataFileConstants.MAGIC.length];
        if (!Arrays.equals(magic, DataFileConstants.MAGIC)) {
            throw new IOException("Missing Avro file signature: " + fileResource);
        // Read the metadata to find the codec and schema.
        // Scratch buffer handed to readBytes(); the buffer it returns is kept for the next call.
        ByteBuffer valueBuffer = ByteBuffer.allocate(512);
        // The map is consumed block by block: numRecords is the count returned by
        // readMapStart()/mapNext(), and the outer loop stops once that count is not positive.
        long numRecords = decoder.readMapStart();
        while (numRecords > 0) {
            for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) {
                String key = decoder.readString();
                // readBytes() clears the buffer and returns a buffer where:
                // - position is the start of the bytes read
                // - limit is the end of the bytes read
                valueBuffer = decoder.readBytes(valueBuffer);
                // NOTE(review): `bytes` is sized to the value length, but no copy from valueBuffer
                // into it is visible in this excerpt — confirm a transfer line was not lost.
                byte[] bytes = new byte[valueBuffer.remaining()];
                // Only the codec and schema entries are kept; other metadata keys are ignored.
                if (key.equals(DataFileConstants.CODEC)) {
                    codec = new String(bytes, "UTF-8");
                } else if (key.equals(DataFileConstants.SCHEMA)) {
                    schemaString = new String(bytes, "UTF-8");
            numRecords = decoder.mapNext();
        // A file without a codec entry is treated as using the null codec.
        if (codec == null) {
            codec = DataFileConstants.NULL_CODEC;
        // Finally, read the sync marker.
        // NOTE(review): only the allocation is visible here; the statement that reads the marker
        // bytes from the decoder appears to be missing — confirm.
        syncMarker = new byte[DataFileConstants.SYNC_SIZE];
    // The schema is mandatory: fail if the metadata map had no schema entry.
    checkState(schemaString != null, "No schema present in Avro file metadata %s", fileResource);
    return new AvroMetadata(syncMarker, codec, schemaString);
Also used : PushbackInputStream( InflaterInputStream( SnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream) ByteArrayInputStream( XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) CountingInputStream(org.apache.commons.compress.utils.CountingInputStream) InputStream( IOException( ByteBuffer(java.nio.ByteBuffer) BinaryDecoder( VisibleForTesting(

Example 2 with BinaryDecoder

Use of BinaryDecoder in project gora by apache.

the class AccumuloStore method populate.

// Consumes the entries supplied by iter and stores decoded values into `persistent` by field
// position. Returns the row data taken from the first entry's key, or null when iter yields
// no entries.
// Map- and array-typed fields span several entries: values are collected into currentMap /
// currentArray while consecutive entries share the same column family, and the collection is
// written to `persistent` once the family changes or the iterator is exhausted.
// NOTE(review): this excerpt has visibly lost text (closing braces, some `case` labels and a few
// expressions are missing) — confirm every detail against the upstream source.
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
    ByteSequence row = null;
    // State of the map/array field currently being accumulated (at most one is non-null).
    Map<Utf8, Object> currentMap = null;
    List currentArray = null;
    Text currentFam = null;
    int currentPos = 0;
    Schema currentSchema = null;
    Field currentField = null;
    // Created over an empty array; it is passed as the second argument of
    // binaryDecoder(val, decoder) further down — presumably as the instance to reuse.
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
    while (iter.hasNext()) {
        // NOTE(review): the right-hand side is missing in this excerpt — presumably the next
        // element of iter; confirm.
        Entry<Key, Value> entry =;
        // The row is captured once, from the first entry seen.
        if (row == null) {
            row = entry.getKey().getRowData();
        byte[] val = entry.getValue().get();
        Field field = fieldMap.get(getFieldName(entry));
        if (currentMap != null) {
            // Same column family: another key/value of the map in progress (qualifier is the key).
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
            } else {
                // Family changed: the map is complete, store it and reset.
                persistent.put(currentPos, currentMap);
                currentMap = null;
        } else if (currentArray != null) {
            // Same column family: another element of the array in progress.
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
            } else {
                // Family changed: the array is complete, store it and reset.
                persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
                currentArray = null;
        // Dispatch on the Avro type of the field's schema.
        switch(field.schema().getType()) {
            case // first entry only. Next are handled above on the next loop
                // Start a new map and remember the position, family and value schema for the
                // entries that follow.
                currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getValueType();
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
            case ARRAY:
                // Start a new array and remember the position, family, element schema and field.
                currentArray = new DirtyListWrapper<>(new ArrayList<>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getElementType();
                currentField = field;
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
            case // default value of null acts like union with null
                // Use the union branch selected by firstNotNullSchemaTypeIndex as the effective type.
                Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
                // map and array were coded without union index so need to be read the same way
                if (effectiveSchema.getType() == Type.ARRAY) {
                    currentArray = new DirtyListWrapper<>(new ArrayList<>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    // NOTE(review): this takes the element type from field.schema() (the union),
                    // whereas the MAP branch below uses effectiveSchema — verify this is intended.
                    currentSchema = field.schema().getElementType();
                    currentField = field;
                    currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                } else if (effectiveSchema.getType() == Type.MAP) {
                    currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    currentSchema = effectiveSchema.getValueType();
                    currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
            // continue like a regular top-level union
            case RECORD:
                SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
                // NOTE(review): the second argument of put() is truncated in this excerpt —
                // presumably a read through `reader` using a decoder over `val`; confirm.
                persistent.put(field.pos(),, DecoderFactory.get().binaryDecoder(val, decoder)));
                // NOTE(review): likely belongs to a separate (default) case whose label was lost.
                persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
    // Store whichever map or array was still being accumulated when the iterator ran out.
    if (currentMap != null) {
        persistent.put(currentPos, currentMap);
    } else if (currentArray != null) {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
    return row;
Also used : DirtyMapWrapper(org.apache.gora.persistency.impl.DirtyMapWrapper) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Text( BinaryDecoder( Field(org.apache.avro.Schema.Field) Value( Utf8(org.apache.avro.util.Utf8) List(java.util.List) ArrayList(java.util.ArrayList) NodeList(org.w3c.dom.NodeList) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) ByteSequence( Key(

Example 3 with BinaryDecoder

Use of BinaryDecoder in project gora by apache.

the class AccumuloStore method fromBytes.

// Decodes `data` according to `schema`. For a UNION schema the leading union index is read
// first to pick the concrete branch: a NULL branch yields null, any other branch replaces
// `data` with the bytes that follow the index. The result is then delegated to the
// three-argument fromBytes overload.
// NOTE(review): closing braces are missing from this excerpt — confirm structure upstream.
public Object fromBytes(Schema schema, byte[] data) throws IOException {
    Schema fromSchema = null;
    if (schema.getType() == Type.UNION) {
        try {
            Decoder decoder = DecoderFactory.get().binaryDecoder(data, null);
            // The union index selects which of the union's types was written.
            int unionIndex = decoder.readIndex();
            List<Schema> possibleTypes = schema.getTypes();
            // fromSchema and effectiveSchema are assigned the same element of possibleTypes.
            fromSchema = possibleTypes.get(unionIndex);
            Schema effectiveSchema = possibleTypes.get(unionIndex);
            if (effectiveSchema.getType() == Type.NULL) {
                return null;
            } else {
                // NOTE(review): ByteBuffer.array() exposes the whole backing array regardless of
                // position/limit — confirm the buffer returned by readBytes(null) is exactly sized.
                data = decoder.readBytes(null).array();
        } catch (IOException e) {
            // Rethrown as GoraException with the original exception kept as the cause.
            throw new GoraException("Error decoding union type: ", e);
    } else {
        // Non-union schemas are decoded as-is.
        fromSchema = schema;
    // `encoder` is not declared in this excerpt — presumably a field of the enclosing class.
    return fromBytes(encoder, fromSchema, data);
Also used : GoraException(org.apache.gora.util.GoraException) Schema(org.apache.avro.Schema) IOException(java.io.IOException) Decoder(org.apache.avro.io.Decoder) BinaryDecoder(org.apache.avro.io.BinaryDecoder)

Example 4 with BinaryDecoder

Use of BinaryDecoder in project gora by apache.

the class AvroSerializerUtil method deserializer.

// Deserializes `value` (cast to byte[]) using a SpecificDatumReader for `schema`. Readers are
// looked up in readerMap under the schema's full name and created on first use.
// NOTE(review): this excerpt has lost text (closing braces, the body of the final `if`, and part
// of the read expression) — confirm against the upstream source.
public static Object deserializer(Object value, Schema schema) throws IOException {
    String schemaId = schema.getFullName();
    SpecificDatumReader<?> reader = readerMap.get(schemaId);
    if (reader == null) {
        // ignore dirty bits
        reader = new SpecificDatumReader(schema);
        SpecificDatumReader localReader = null;
        // putIfAbsent returns the existing reader if another caller registered one first; in
        // that case the already-registered reader is used instead of the new one.
        if ((localReader = readerMap.putIfAbsent(schemaId, reader)) != null) {
            reader = localReader;
    // initialize a decoder, possibly reusing previous one
    BinaryDecoder decoderFromCache = decoders.get();
    // NOTE(review): the reuse argument is null here, so decoderFromCache is not actually handed
    // to the factory; it is only consulted by the null check below — confirm this is intended.
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder((byte[]) value, null);
    // put in threadlocal cache if the initial get was empty
    // NOTE(review): the body of this `if` is missing from the excerpt — presumably it stores
    // `decoder` into `decoders`; confirm.
    if (decoderFromCache == null) {
    // NOTE(review): the expression is truncated — presumably a read through `reader` with `decoder`.
    Object result =, decoder);
    return result;
Also used : SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) BinaryDecoder(org.apache.avro.io.BinaryDecoder)

Example 5 with BinaryDecoder

Use of BinaryDecoder in project eiger by wlloyd.

the class SerDeUtils method deserializeWithSchema.

    /**
     * Deserializes a single object as stored along with its Schema by serialize(T). NB: See warnings on serialize(T).
     * @param ob An empty object to deserialize into (must not be null).
     * @param bytes Array to deserialize from
     * @throws IOException
     */
// Reads a payload that starts with its own writer schema: a decoder is created over the array
// obtained from `bytes`, the first string read from it is parsed as the Schema, and a
// SpecificDatumReader is built for that schema.
// NOTE(review): the return expression is truncated in this excerpt — presumably a read through
// `reader` into `ob` using `dec`; `ob` is not otherwise referenced in the visible lines. Confirm.
public static <T extends SpecificRecord> T deserializeWithSchema(ByteBuffer bytes, T ob) throws IOException {
    BinaryDecoder dec = DIRECT_DECODERS.createBinaryDecoder(ByteBufferUtil.getArray(bytes), null);
    // The schema text is the first value in the stream; the decoder is left positioned after it.
    Schema writer = Schema.parse(dec.readString(new Utf8()).toString());
    SpecificDatumReader<T> reader = new SpecificDatumReader<T>(writer);
    return, dec);
Also used : Schema(org.apache.avro.Schema) Utf8(org.apache.avro.util.Utf8) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) BinaryDecoder(org.apache.avro.io.BinaryDecoder)


BinaryDecoder ( SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader)5 Schema (org.apache.avro.Schema)4 IOException ( Utf8 (org.apache.avro.util.Utf8)3 ArrayList (java.util.ArrayList)2 VisibleForTesting ( ByteArrayInputStream ( InputStream ( PushbackInputStream ( ByteBuffer (java.nio.ByteBuffer)1 HashMap (java.util.HashMap)1 List (java.util.List)1 InflaterInputStream ( ByteSequence ( Key ( Value ( Field (org.apache.avro.Schema.Field)1 Type (org.apache.avro.Schema.Type)1 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)1