Search in sources:

Example 1 with BinaryDecoder

use of org.apache.avro.io.BinaryDecoder in project beam by apache.

the class AvroSource method readMetadataFromFile.

/**
   * Extracts the {@link AvroMetadata} stored in the header of an Avro file.
   *
   * <p>The header is parsed according to the layout of an Avro
   * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">
   * Object Container File</a>: a magic number, a metadata map (from which the codec and
   * schema entries are picked), and the sync marker. A missing codec entry is reported as
   * {@code DataFileConstants.NULL_CODEC}.
   *
   * @param fileResource the file whose header is read
   * @return the sync marker, codec name and schema string found in the header
   * @throws IOException if the file is an invalid format.
   */
@VisibleForTesting
static AvroMetadata readMetadataFromFile(ResourceId fileResource) throws IOException {
    String codecName = null;
    String schemaJson = null;
    final byte[] sync = new byte[DataFileConstants.SYNC_SIZE];
    try (InputStream in = Channels.newInputStream(FileSystems.open(fileResource))) {
        BinaryDecoder headerDecoder = DecoderFactory.get().binaryDecoder(in, null);
        // Header layout (see
        // https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files):
        //   1. four-byte magic number
        //   2. file metadata (schema, codec, ...) encoded as a map
        //   3. 16-byte sync marker

        // Step 1: the magic number must match exactly.
        byte[] signature = new byte[DataFileConstants.MAGIC.length];
        headerDecoder.readFixed(signature);
        if (!Arrays.equals(signature, DataFileConstants.MAGIC)) {
            throw new IOException("Missing Avro file signature: " + fileResource);
        }

        // Step 2: walk the metadata map block by block, keeping only codec and schema.
        ByteBuffer scratch = ByteBuffer.allocate(512);
        for (long blockCount = headerDecoder.readMapStart();
                blockCount > 0;
                blockCount = headerDecoder.mapNext()) {
            for (long remaining = blockCount; remaining > 0; remaining--) {
                String key = headerDecoder.readString();
                // readBytes() hands back a buffer (possibly a new one) whose
                // position..limit window covers exactly the bytes just read.
                scratch = headerDecoder.readBytes(scratch);
                byte[] raw = new byte[scratch.remaining()];
                scratch.get(raw);
                if (DataFileConstants.CODEC.equals(key)) {
                    codecName = new String(raw, "UTF-8");
                } else if (DataFileConstants.SCHEMA.equals(key)) {
                    schemaJson = new String(raw, "UTF-8");
                }
            }
        }

        // Step 3: the sync marker closes the header.
        headerDecoder.readFixed(sync);
    }
    checkState(schemaJson != null, "No schema present in Avro file metadata %s", fileResource);
    String effectiveCodec = (codecName == null) ? DataFileConstants.NULL_CODEC : codecName;
    return new AvroMetadata(sync, effectiveCodec, schemaJson);
}
Also used : PushbackInputStream(java.io.PushbackInputStream) InflaterInputStream(java.util.zip.InflaterInputStream) SnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) CountingInputStream(org.apache.commons.compress.utils.CountingInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) BinaryDecoder(org.apache.avro.io.BinaryDecoder) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with BinaryDecoder

use of org.apache.avro.io.BinaryDecoder in project gora by apache.

the class AccumuloStore method populate.

/**
 * Fills {@code persistent} from a sequence of key/value entries and returns the row they
 * belong to.
 *
 * <p>Entries whose field is a map or an array (directly, or as the first non-null branch of
 * a union) are accumulated across consecutive entries that share the same column family and
 * are stored into {@code persistent} once the column family changes or the iterator ends.
 * All other fields are decoded and stored entry by entry.
 *
 * @param iter entries to read; presumably all from one row, with the entries of a given
 *     map/array field adjacent to each other (TODO confirm against callers)
 * @param persistent the object to populate; its dirty state is cleared before returning
 * @return the row data of the first entry, or {@code null} if {@code iter} is empty
 * @throws IOException if a value cannot be decoded
 */
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
    ByteSequence row = null;
    Map<Utf8, Object> currentMap = null;
    List currentArray = null;
    Text currentFam = null;
    int currentPos = 0;
    // Value/element schema used to decode each entry of the map/array being accumulated.
    Schema currentSchema = null;
    // Schema of the array being accumulated. This is always an ARRAY schema, even when the
    // field itself is declared as a union, because GenericData.Array requires one.
    Schema currentArraySchema = null;
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
    while (iter.hasNext()) {
        Entry<Key, Value> entry = iter.next();
        if (row == null) {
            row = entry.getKey().getRowData();
        }
        byte[] val = entry.getValue().get();
        Field field = fieldMap.get(getFieldName(entry));
        if (currentMap != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                // Still inside the same map field: keep accumulating.
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, val));
                continue;
            } else {
                // Column family changed: the map is complete.
                persistent.put(currentPos, currentMap);
                currentMap = null;
            }
        } else if (currentArray != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                // Still inside the same array field: keep accumulating.
                currentArray.add(fromBytes(currentSchema, val));
                continue;
            } else {
                // Column family changed: the array is complete.
                persistent.put(currentPos, new GenericData.Array<T>(currentArraySchema, currentArray));
                currentArray = null;
            }
        }
        switch (field.schema().getType()) {
            case MAP:
                // first entry only. Next are handled above on the next loop
                currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getValueType();
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, val));
                break;
            case ARRAY:
                currentArray = new DirtyListWrapper<>(new ArrayList<>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentArraySchema = field.schema();
                currentSchema = currentArraySchema.getElementType();
                currentArray.add(fromBytes(currentSchema, val));
                break;
            case UNION:
                // default value of null acts like union with null
                Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
                // map and array were coded without union index so need to be read the same way
                if (effectiveSchema.getType() == Type.ARRAY) {
                    currentArray = new DirtyListWrapper<>(new ArrayList<>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    // The element type lives on the array branch, not on the union itself;
                    // asking the union schema for it fails.
                    currentArraySchema = effectiveSchema;
                    currentSchema = effectiveSchema.getElementType();
                    currentArray.add(fromBytes(currentSchema, val));
                    break;
                } else if (effectiveSchema.getType() == Type.MAP) {
                    currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    currentSchema = effectiveSchema.getValueType();
                    currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, val));
                    break;
                }
            // Intentional fall-through: continue like a regular top-level union
            case RECORD:
                SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
                persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
                break;
            default:
                persistent.put(field.pos(), fromBytes(field.schema(), val));
        }
    }
    // Flush whichever collection was still being accumulated when the entries ran out.
    if (currentMap != null) {
        persistent.put(currentPos, currentMap);
    } else if (currentArray != null) {
        persistent.put(currentPos, new GenericData.Array<T>(currentArraySchema, currentArray));
    }
    persistent.clearDirty();
    return row;
}
Also used : DirtyMapWrapper(org.apache.gora.persistency.impl.DirtyMapWrapper) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Field(org.apache.avro.Schema.Field) Value(org.apache.accumulo.core.data.Value) Utf8(org.apache.avro.util.Utf8) List(java.util.List) ArrayList(java.util.ArrayList) NodeList(org.w3c.dom.NodeList) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) ByteSequence(org.apache.accumulo.core.data.ByteSequence) Key(org.apache.accumulo.core.data.Key)

Example 3 with BinaryDecoder

use of org.apache.avro.io.BinaryDecoder in project gora by apache.

the class AccumuloStore method fromBytes.

/**
 * Decodes {@code data} according to {@code schema}.
 *
 * <p>For a non-union schema the bytes are handed straight to the three-argument
 * {@code fromBytes} overload. For a union schema the leading union index is read first to
 * select the branch: a null branch yields {@code null}; any other branch has its payload read
 * as a bytes value and decoded with the branch schema.
 *
 * @param schema schema describing the encoded value
 * @param data the encoded bytes
 * @return the decoded value, or {@code null} when the union's null branch was selected
 * @throws IOException if decoding fails; failures while reading the union header are logged
 *     and rethrown as {@code GoraException}
 */
public Object fromBytes(Schema schema, byte[] data) throws IOException {
    // Simple case first: nothing to unwrap.
    if (schema.getType() != Type.UNION) {
        return fromBytes(encoder, schema, data);
    }

    Schema branchSchema;
    byte[] payload;
    try {
        Decoder unionDecoder = DecoderFactory.get().binaryDecoder(data, null);
        int branchIndex = unionDecoder.readIndex();
        branchSchema = schema.getTypes().get(branchIndex);
        if (branchSchema.getType() == Type.NULL) {
            unionDecoder.readNull();
            return null;
        }
        payload = unionDecoder.readBytes(null).array();
    } catch (IOException ex) {
        LOG.error(ex.getMessage());
        throw new GoraException("Error decoding union type: ", ex);
    }
    return fromBytes(encoder, branchSchema, payload);
}
Also used : GoraException(org.apache.gora.util.GoraException) Schema(org.apache.avro.Schema) IOException(java.io.IOException) Decoder(org.apache.avro.io.Decoder) BinaryDecoder(org.apache.avro.io.BinaryDecoder)

Example 4 with BinaryDecoder

use of org.apache.avro.io.BinaryDecoder in project gora by apache.

the class AvroSerializerUtil method deserializer.

/**
 * Deserializes an Avro binary-encoded value with a reader for the given schema.
 *
 * <p>Readers are cached in {@code readerMap} under the schema's full name, and the
 * {@code BinaryDecoder} kept in {@code decoders} is offered to the decoder factory for reuse
 * so that repeated calls do not have to allocate a new decoder each time.
 *
 * @param value the encoded value; must be a {@code byte[]}
 * @param schema the schema to read the value with
 * @return the decoded object
 * @throws IOException if the bytes cannot be decoded
 */
public static Object deserializer(Object value, Schema schema) throws IOException {
    String schemaId = schema.getFullName();
    SpecificDatumReader<?> reader = readerMap.get(schemaId);
    if (reader == null) {
        // ignore dirty bits
        reader = new SpecificDatumReader<>(schema);
        // If a reader was registered in the meantime, use that one instead of ours.
        SpecificDatumReader<?> existing = readerMap.putIfAbsent(schemaId, reader);
        if (existing != null) {
            reader = existing;
        }
    }
    // Initialize a decoder, reusing the cached one when the factory allows it. Passing the
    // cached instance (rather than null) is what makes the cache effective.
    BinaryDecoder cachedDecoder = decoders.get();
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder((byte[]) value, cachedDecoder);
    // Remember the decoder whenever the factory handed back a different instance, which
    // includes the first call, when nothing was cached yet.
    if (decoder != cachedDecoder) {
        decoders.set(decoder);
    }
    return reader.read(null, decoder);
}
Also used : SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) BinaryDecoder(org.apache.avro.io.BinaryDecoder)

Example 5 with BinaryDecoder

use of org.apache.avro.io.BinaryDecoder in project eiger by wlloyd.

the class SerDeUtils method deserializeWithSchema.

/**
     * Reads back a single object that was stored together with its Schema by serialize(T).
     * The stored (writer) schema is read first and the object is then resolved against the
     * schema of {@code ob}. NB: See warnings on serialize(T).
     * @param bytes Buffer to deserialize from
     * @param ob An empty object to deserialize into (must not be null).
     * @return the deserialized object, as returned by the datum reader
     * @throws IOException if the schema string or the object cannot be decoded
     */
public static <T extends SpecificRecord> T deserializeWithSchema(ByteBuffer bytes, T ob) throws IOException {
    byte[] raw = ByteBufferUtil.getArray(bytes);
    BinaryDecoder decoder = DIRECT_DECODERS.createBinaryDecoder(raw, null);

    // The writer's schema comes first, encoded as a string.
    Utf8 schemaText = decoder.readString(new Utf8());
    Schema writerSchema = Schema.parse(schemaText.toString());

    // Read with the writer's schema, resolving to the schema the caller expects.
    SpecificDatumReader<T> datumReader = new SpecificDatumReader<T>(writerSchema);
    datumReader.setExpected(ob.getSchema());
    return datumReader.read(ob, decoder);
}
Also used : Schema(org.apache.avro.Schema) Utf8(org.apache.avro.util.Utf8) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) BinaryDecoder(org.apache.avro.io.BinaryDecoder)

Aggregations

BinaryDecoder (org.apache.avro.io.BinaryDecoder): 9, SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 5, Schema (org.apache.avro.Schema): 4, IOException (java.io.IOException): 3, Utf8 (org.apache.avro.util.Utf8): 3, ArrayList (java.util.ArrayList): 2, VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, InputStream (java.io.InputStream): 1, PushbackInputStream (java.io.PushbackInputStream): 1, ByteBuffer (java.nio.ByteBuffer): 1, HashMap (java.util.HashMap): 1, List (java.util.List): 1, InflaterInputStream (java.util.zip.InflaterInputStream): 1, ByteSequence (org.apache.accumulo.core.data.ByteSequence): 1, Key (org.apache.accumulo.core.data.Key): 1, Value (org.apache.accumulo.core.data.Value): 1, Field (org.apache.avro.Schema.Field): 1, Type (org.apache.avro.Schema.Type): 1, GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 1