use of org.apache.avro.io.BinaryDecoder in project beam by apache.
the class AvroSource method readMetadataFromFile.
/**
 * Parses the header of an Avro
 * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">
 * Object Container File</a> and returns the {@link AvroMetadata} it describes.
 *
 * <p>The header is consumed in order: the magic bytes, the metadata map (from which the codec
 * and schema entries are extracted), and finally the sync marker. When the metadata map carries
 * no codec entry, {@code DataFileConstants.NULL_CODEC} is used.
 *
 * @param fileResource the file whose header is read
 * @return the sync marker, codec name and schema string found in the header
 * @throws IOException if the file does not begin with the Avro magic bytes, or if reading fails
 */
@VisibleForTesting
static AvroMetadata readMetadataFromFile(ResourceId fileResource) throws IOException {
  String codecName = null;
  String schemaJson = null;
  byte[] syncMarker;
  try (InputStream in = Channels.newInputStream(FileSystems.open(fileResource))) {
    BinaryDecoder headerDecoder = DecoderFactory.get().binaryDecoder(in, null);

    // 1. Magic number: must match DataFileConstants.MAGIC byte for byte.
    byte[] signature = new byte[DataFileConstants.MAGIC.length];
    headerDecoder.readFixed(signature);
    if (!Arrays.equals(signature, DataFileConstants.MAGIC)) {
      throw new IOException("Missing Avro file signature: " + fileResource);
    }

    // 2. File metadata, encoded as an Avro map. A map is written as a sequence of blocks;
    //    readMapStart()/mapNext() yield the entry count of each block, and a count of zero
    //    terminates the map.
    ByteBuffer scratch = ByteBuffer.allocate(512);
    for (long blockSize = headerDecoder.readMapStart();
        blockSize > 0;
        blockSize = headerDecoder.mapNext()) {
      for (long remaining = blockSize; remaining > 0; remaining--) {
        String key = headerDecoder.readString();
        // readBytes() hands back a buffer (possibly a new, larger one) whose position..limit
        // window covers exactly the bytes of this value.
        scratch = headerDecoder.readBytes(scratch);
        byte[] value = new byte[scratch.remaining()];
        scratch.get(value);
        if (key.equals(DataFileConstants.CODEC)) {
          codecName = new String(value, "UTF-8");
        } else if (key.equals(DataFileConstants.SCHEMA)) {
          schemaJson = new String(value, "UTF-8");
        }
      }
    }

    // 3. Sync marker that closes the header.
    syncMarker = new byte[DataFileConstants.SYNC_SIZE];
    headerDecoder.readFixed(syncMarker);
  }
  checkState(schemaJson != null, "No schema present in Avro file metadata %s", fileResource);
  // A header without an explicit codec entry is treated as using the null codec.
  String effectiveCodec = (codecName == null) ? DataFileConstants.NULL_CODEC : codecName;
  return new AvroMetadata(syncMarker, effectiveCodec, schemaJson);
}
use of org.apache.avro.io.BinaryDecoder in project gora by apache.
the class AccumuloStore method populate.
/**
 * Fills {@code persistent} from a sequence of Accumulo entries and returns the row they belong to.
 *
 * <p>Each entry is mapped to an Avro field through {@code fieldMap}. Map and array fields are
 * spread over several consecutive entries that share a column family: the first entry opens an
 * accumulator, following entries with the same family are appended to it, and the accumulator is
 * stored on {@code persistent} once an entry of another family (or the end of the iterator) is
 * reached. All other field types are decoded from a single entry.
 *
 * @param iter entries to consume
 * @param persistent object whose fields are populated; its dirty state is cleared before returning
 * @return the row data of the first entry, or {@code null} when {@code iter} is empty
 * @throws IOException if a value cannot be decoded
 */
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
  ByteSequence row = null;
  Map<Utf8, Object> currentMap = null;
  List currentArray = null;
  // Schema of the array being accumulated. Tracked separately from the field schema because a
  // nullable array field has a UNION schema, which GenericData.Array does not accept.
  Schema currentArraySchema = null;
  Text currentFam = null;
  int currentPos = 0;
  Schema currentSchema = null;
  // Reused across RECORD/UNION values to avoid allocating a decoder per entry.
  BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);

  while (iter.hasNext()) {
    Entry<Key, Value> entry = iter.next();
    if (row == null) {
      row = entry.getKey().getRowData();
    }
    byte[] val = entry.getValue().get();
    // NOTE(review): fieldMap.get() presumably returns null for a column that is not mapped,
    // which would fail at the switch below - confirm callers only pass mapped columns.
    Field field = fieldMap.get(getFieldName(entry));

    if (currentMap != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        // Still inside the same map: the column qualifier is the map key.
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
            fromBytes(currentSchema, val));
        continue;
      } else {
        // Family changed: the map is complete, store it and handle this entry normally.
        persistent.put(currentPos, currentMap);
        currentMap = null;
      }
    } else if (currentArray != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        currentArray.add(fromBytes(currentSchema, val));
        continue;
      } else {
        persistent.put(currentPos, new GenericData.Array<T>(currentArraySchema, currentArray));
        currentArray = null;
      }
    }

    switch (field.schema().getType()) {
      case MAP:
        // First entry only. Following entries are handled above on the next iterations.
        currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentSchema = field.schema().getValueType();
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
            fromBytes(currentSchema, val));
        break;
      case ARRAY:
        currentArray = new DirtyListWrapper<>(new ArrayList<>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentArraySchema = field.schema();
        currentSchema = currentArraySchema.getElementType();
        currentArray.add(fromBytes(currentSchema, val));
        break;
      case UNION:
        // Default value of null acts like union with null.
        Schema effectiveSchema =
            field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
        // Map and array were coded without union index so need to be read the same way.
        if (effectiveSchema.getType() == Type.ARRAY) {
          currentArray = new DirtyListWrapper<>(new ArrayList<>());
          currentPos = field.pos();
          currentFam = entry.getKey().getColumnFamily();
          // Element type must come from the array branch of the union; asking the union
          // schema itself for an element type is not valid.
          currentArraySchema = effectiveSchema;
          currentSchema = effectiveSchema.getElementType();
          currentArray.add(fromBytes(currentSchema, val));
          break;
        } else if (effectiveSchema.getType() == Type.MAP) {
          currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
          currentPos = field.pos();
          currentFam = entry.getKey().getColumnFamily();
          currentSchema = effectiveSchema.getValueType();
          currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
              fromBytes(currentSchema, val));
          break;
        }
        // Deliberate fall-through: continue like a regular top-level union.
      case RECORD:
        SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
        persistent.put(field.pos(),
            reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
        break;
      default:
        persistent.put(field.pos(), fromBytes(field.schema(), val));
    }
  }

  // Store whichever accumulator was still open when the entries ran out.
  if (currentMap != null) {
    persistent.put(currentPos, currentMap);
  } else if (currentArray != null) {
    persistent.put(currentPos, new GenericData.Array<T>(currentArraySchema, currentArray));
  }
  persistent.clearDirty();
  return row;
}
use of org.apache.avro.io.BinaryDecoder in project gora by apache.
the class AccumuloStore method fromBytes.
/**
 * Decodes {@code data} into an object according to {@code schema}.
 *
 * <p>For a non-union schema the bytes are handed straight to the
 * {@code fromBytes(encoder, schema, data)} overload. For a union schema the bytes are expected to
 * start with the union branch index: a {@code NULL} branch yields {@code null}; any other branch
 * is followed by a byte sequence that is decoded with that branch's schema.
 *
 * @param schema schema describing the encoded value
 * @param data encoded bytes
 * @return the decoded value, or {@code null} for the null branch of a union
 * @throws IOException if decoding fails; failures while reading the union envelope are logged and
 *     rethrown as {@code GoraException}
 */
public Object fromBytes(Schema schema, byte[] data) throws IOException {
  // Plain (non-union) schemas need no unwrapping.
  if (schema.getType() != Type.UNION) {
    return fromBytes(encoder, schema, data);
  }

  Schema branchSchema;
  byte[] payload;
  try {
    Decoder unionDecoder = DecoderFactory.get().binaryDecoder(data, null);
    int branchIndex = unionDecoder.readIndex();
    List<Schema> branches = schema.getTypes();
    branchSchema = branches.get(branchIndex);
    if (branchSchema.getType() == Type.NULL) {
      unionDecoder.readNull();
      return null;
    }
    // Passing null makes the decoder allocate a fresh buffer for the value bytes.
    payload = unionDecoder.readBytes(null).array();
  } catch (IOException e) {
    LOG.error(e.getMessage());
    throw new GoraException("Error decoding union type: ", e);
  }
  return fromBytes(encoder, branchSchema, payload);
}
use of org.apache.avro.io.BinaryDecoder in project gora by apache.
the class AvroSerializerUtil method deserializer.
/**
 * Deserializes an Avro-encoded byte array using a reader for {@code schema}.
 *
 * <p>Readers are cached in {@code readerMap}, keyed by the schema's full name. The binary decoder
 * kept in {@code decoders} is offered to the decoder factory for reuse, so repeated calls do not
 * have to allocate a new decoder each time.
 *
 * @param value the encoded bytes; must be a {@code byte[]}
 * @param schema schema the bytes were written with
 * @return the decoded object
 * @throws IOException if the bytes cannot be decoded
 * @throws ClassCastException if {@code value} is not a {@code byte[]}
 */
public static Object deserializer(Object value, Schema schema) throws IOException {
  String schemaId = schema.getFullName();
  SpecificDatumReader<?> reader = readerMap.get(schemaId);
  if (reader == null) {
    // ignore dirty bits
    reader = new SpecificDatumReader<Object>(schema);
    // Another caller may have registered a reader in the meantime; prefer the registered one.
    SpecificDatumReader<?> existing = readerMap.putIfAbsent(schemaId, reader);
    if (existing != null) {
      reader = existing;
    }
  }
  // Initialize a decoder, handing the cached one (if any) to the factory so it can be
  // reconfigured for the new bytes instead of allocating a fresh instance.
  BinaryDecoder cachedDecoder = decoders.get();
  BinaryDecoder decoder = DecoderFactory.get().binaryDecoder((byte[]) value, cachedDecoder);
  // Remember the decoder when the cache was empty or the factory could not reuse the cached one.
  if (decoder != cachedDecoder) {
    decoders.set(decoder);
  }
  return reader.read(null, decoder);
}
use of org.apache.avro.io.BinaryDecoder in project eiger by wlloyd.
the class SerDeUtils method deserializeWithSchema.
/**
 * Deserializes a single object that was stored together with its Schema by serialize(T).
 * NB: See warnings on serialize(T).
 *
 * <p>The writer's schema is read first, as a string, from the start of the data; the record that
 * follows is then decoded with that schema as the writer schema and {@code ob}'s schema as the
 * expected one.
 *
 * @param bytes Buffer to deserialize from
 * @param ob An empty object to deserialize into (must not be null).
 * @return the object produced by the reader when reading into {@code ob}
 * @throws IOException if the data cannot be read or decoded
 */
public static <T extends SpecificRecord> T deserializeWithSchema(ByteBuffer bytes, T ob) throws IOException {
  byte[] raw = ByteBufferUtil.getArray(bytes);
  BinaryDecoder decoder = DIRECT_DECODERS.createBinaryDecoder(raw, null);

  // The serialized form starts with the writer's schema as a string.
  String writerSchemaJson = decoder.readString(new Utf8()).toString();
  Schema writerSchema = Schema.parse(writerSchemaJson);

  SpecificDatumReader<T> datumReader = new SpecificDatumReader<T>(writerSchema);
  datumReader.setExpected(ob.getSchema());
  return datumReader.read(ob, decoder);
}
Aggregations