Usage of org.apache.avro.util.Utf8 in the Apache Flink project:
class DataInputDecoder, method readString().
// --------------------------------------------------------------------------------------------
// strings
// --------------------------------------------------------------------------------------------
@Override
public Utf8 readString(Utf8 old) throws IOException {
// Length prefix first, then the raw UTF-8 bytes.
final int length = readInt();
// Reuse the caller-supplied instance when present to avoid an allocation.
final Utf8 utf8 = (old == null) ? new Utf8() : old;
// Ensure the backing byte array can hold exactly `length` bytes.
utf8.setByteLength(length);
if (length > 0) {
in.readFully(utf8.getBytes(), 0, length);
}
return utf8;
}
Usage of org.apache.avro.util.Utf8 in the h2o-3 project by h2oai:
class AvroParser, method write2frame().
/**
 * The main method transforming an Avro record into a row in an H2O frame.
 *
 * <p>Each output column {@code cIdx} is filled from the Avro field at
 * {@code inSchema[cIdx]}; a null Avro value becomes a missing (invalid) cell.
 *
 * @param gr Avro generic record
 * @param columnNames Column names prepared by parser setup
 * @param inSchema Flattened Avro schema which corresponds to passed column names
 * @param columnTypes Target H2O types
 * @param dout Parser writer
 */
private static void write2frame(GenericRecord gr, String[] columnNames, Schema.Field[] inSchema, byte[] columnTypes, ParseWriter dout) {
assert inSchema.length == columnTypes.length : "AVRO field flatenized schema has to match to parser setup";
// Reused scratch buffer for string cells — avoids a per-cell allocation.
BufferedString bs = new BufferedString();
for (int cIdx = 0; cIdx < columnNames.length; cIdx++) {
int inputFieldIdx = inSchema[cIdx].pos();
// Collapse unions etc. down to the underlying primitive Avro type.
Schema.Type inputType = toPrimitiveType(inSchema[cIdx].schema());
// FIXME: support target conversions
byte targetType = columnTypes[cIdx];
Object value = gr.get(inputFieldIdx);
if (value == null) {
// Null Avro value -> missing cell.
dout.addInvalidCol(cIdx);
} else {
// Dispatch on the source Avro type; each case writes one cell.
switch(inputType) {
case BOOLEAN:
dout.addNumCol(cIdx, ((Boolean) value) ? 1 : 0);
break;
case INT:
dout.addNumCol(cIdx, ((Integer) value), 0);
break;
case LONG:
dout.addNumCol(cIdx, ((Long) value), 0);
break;
case FLOAT:
dout.addNumCol(cIdx, (Float) value);
break;
case DOUBLE:
dout.addNumCol(cIdx, (Double) value);
break;
case ENUM:
// Note: this code expects ordering of categoricals provided by Avro remain same
// as in H2O!!!
GenericData.EnumSymbol es = (GenericData.EnumSymbol) value;
dout.addNumCol(cIdx, es.getSchema().getEnumOrdinal(es.toString()));
break;
case BYTES:
// NOTE(review): ByteBuffer.array() ignores position/limit and throws for
// read-only or direct buffers — assumes heap-backed, fully-used buffers;
// confirm against the Avro reader that produces these values.
dout.addStrCol(cIdx, bs.set(((ByteBuffer) value).array()));
break;
case STRING:
dout.addStrCol(cIdx, bs.set(((Utf8) value).getBytes()));
break;
case NULL:
dout.addInvalidCol(cIdx);
break;
}
}
}
}
Usage of org.apache.avro.util.Utf8 in the Apache Beam project:
class BigQueryAvroUtilsTest, method testConvertGenericRecordToTableRow().
@Test
public void testConvertGenericRecordToTableRow() throws Exception {
  // Verifies convertGenericRecordToTableRow for nullable, typed, and repeated fields.
  TableSchema tableSchema = new TableSchema();
  tableSchema.setFields(fields);
  Schema avroSchema = AvroCoder.of(Bird.class).getSchema();
  {
    // Test nullable fields: only "number" is set; unset scalars must be absent
    // from the converted row (and "associates" defaults to an empty list).
    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("number", 5L);
    TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
    TableRow row = new TableRow().set("number", "5").set("associates", new ArrayList<TableRow>());
    assertEquals(row, convertedRow);
  }
  {
    // Test type conversion for:
    // INTEGER, FLOAT, TIMESTAMP, BOOLEAN, BYTES, DATE, DATETIME, TIME.
    GenericRecord record = new GenericData.Record(avroSchema);
    // NOTE(review): getBytes() uses the platform default charset; harmless for
    // this ASCII literal, but getBytes(StandardCharsets.UTF_8) would be explicit.
    byte[] soundBytes = "chirp,chirp".getBytes();
    ByteBuffer soundByteBuffer = ByteBuffer.wrap(soundBytes);
    soundByteBuffer.rewind();
    record.put("number", 5L);
    record.put("quality", 5.0);
    record.put("birthday", 5L);
    record.put("flighted", Boolean.TRUE);
    record.put("sound", soundByteBuffer);
    // Mix Utf8 and String values to exercise both CharSequence representations.
    record.put("anniversaryDate", new Utf8("2000-01-01"));
    // Fixed: was `new String("...")` — the redundant String constructor creates
    // a needless copy; a plain literal is equivalent.
    record.put("anniversaryDatetime", "2000-01-01 00:00:00.000005");
    record.put("anniversaryTime", new Utf8("00:00:00.000005"));
    TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
    TableRow row = new TableRow().set("number", "5").set("birthday", "1970-01-01 00:00:00.000005 UTC").set("quality", 5.0).set("associates", new ArrayList<TableRow>()).set("flighted", Boolean.TRUE).set("sound", BaseEncoding.base64().encode(soundBytes)).set("anniversaryDate", "2000-01-01").set("anniversaryDatetime", "2000-01-01 00:00:00.000005").set("anniversaryTime", "00:00:00.000005");
    assertEquals(row, convertedRow);
  }
  {
    // Test repeated fields: a nested record list becomes a list of TableRows.
    Schema subBirdSchema = AvroCoder.of(Bird.SubBird.class).getSchema();
    GenericRecord nestedRecord = new GenericData.Record(subBirdSchema);
    nestedRecord.put("species", "other");
    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("number", 5L);
    record.put("associates", Lists.<GenericRecord>newArrayList(nestedRecord));
    TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
    TableRow row = new TableRow().set("associates", Lists.<TableRow>newArrayList(new TableRow().set("species", "other"))).set("number", "5");
    assertEquals(row, convertedRow);
  }
}
Usage of org.apache.avro.util.Utf8 in the Apache Gora project:
class AccumuloStore, method populate().
/**
 * Consumes all Accumulo entries belonging to one row and fills the fields of
 * {@code persistent} from them.
 *
 * <p>Map- and array-typed fields are stored as multiple entries sharing one
 * column family, so consecutive entries with the same family are accumulated
 * into a single map/list before being written into the persistent object.
 *
 * @param iter entries of a single row, map/array entries grouped by column family
 * @param persistent Gora bean to populate
 * @return the row id of the consumed entries, or null if the iterator was empty
 * @throws IOException if a serialized value cannot be decoded
 */
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
  ByteSequence row = null;
  // Accumulation state for the map/array field currently being read (null when none).
  Map<Utf8, Object> currentMap = null;
  List currentArray = null;
  Text currentFam = null;
  int currentPos = 0;
  Schema currentSchema = null;
  Field currentField = null;
  BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
  while (iter.hasNext()) {
    Entry<Key, Value> entry = iter.next();
    if (row == null) {
      row = entry.getKey().getRowData();
    }
    byte[] val = entry.getValue().get();
    Field field = fieldMap.get(getFieldName(entry));
    if (currentMap != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        // Still inside the same map field: qualifier is the key, value the payload.
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
        continue;
      } else {
        // Column family changed: the map field is complete, flush it.
        persistent.put(currentPos, currentMap);
        currentMap = null;
      }
    } else if (currentArray != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
        continue;
      } else {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
        currentArray = null;
      }
    }
    switch (field.schema().getType()) {
      case MAP:
        // First entry of a map field only; subsequent entries are handled above.
        currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentSchema = field.schema().getValueType();
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
        break;
      case ARRAY:
        currentArray = new DirtyListWrapper<>(new ArrayList<>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentSchema = field.schema().getElementType();
        currentField = field;
        currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
        break;
      case UNION:
        // A default value of null acts like a union with null.
        Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
        // Map and array were coded without the union index, so they must be read the same way.
        if (effectiveSchema.getType() == Type.ARRAY) {
          currentArray = new DirtyListWrapper<>(new ArrayList<>());
          currentPos = field.pos();
          currentFam = entry.getKey().getColumnFamily();
          // FIX: take the element type from the union's non-null branch.
          // Calling getElementType() on the union schema itself throws
          // AvroRuntimeException; the MAP branch below already uses
          // effectiveSchema consistently.
          currentSchema = effectiveSchema.getElementType();
          currentField = field;
          currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
          break;
        } else if (effectiveSchema.getType() == Type.MAP) {
          currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
          currentPos = field.pos();
          currentFam = entry.getKey().getColumnFamily();
          currentSchema = effectiveSchema.getValueType();
          currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
          break;
        }
        // Other unions fall through and continue like a regular top-level record.
      case RECORD:
        SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
        persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
        break;
      default:
        persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
    }
  }
  // Flush a map/array that was still being accumulated when the row ended.
  if (currentMap != null) {
    persistent.put(currentPos, currentMap);
  } else if (currentArray != null) {
    persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
  }
  persistent.clearDirty();
  return row;
}
Usage of org.apache.avro.util.Utf8 in the Apache Gora project:
class GoraSerializerTypeInferer, method getSerializer().
/**
 * Chooses a Hector serializer matching the runtime type of {@code value}.
 *
 * <p>Checks go from most specific Avro/Gora types to generic containers and
 * finally delegate to Hector's own inference for anything unrecognized.
 *
 * @param value the value to serialize; null maps to a ByteBuffer serializer
 * @return a serializer able to handle {@code value}'s type
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static <T> Serializer<T> getSerializer(Object value) {
  Serializer serializer = null;
  if (value == null) {
    serializer = ByteBufferSerializer.get();
  } else if (value instanceof CharSequence) {
    // Also covers String and Avro's Utf8 — both implement CharSequence, so the
    // former `instanceof Utf8` and `instanceof String` branches were unreachable
    // dead code and have been removed (behavior unchanged).
    // NOTE(review): if String was *meant* to get StringSerializer, its check
    // would have to precede this one — that would change behavior, not done here.
    serializer = CharSequenceSerializer.get();
  } else if (value instanceof Boolean) {
    serializer = BooleanSerializer.get();
  } else if (value instanceof ByteBuffer) {
    serializer = ByteBufferSerializer.get();
  } else if (value instanceof byte[]) {
    serializer = BytesArraySerializer.get();
  } else if (value instanceof Double) {
    serializer = DoubleSerializer.get();
  } else if (value instanceof Float) {
    serializer = FloatSerializer.get();
  } else if (value instanceof Integer) {
    serializer = IntegerSerializer.get();
  } else if (value instanceof Long) {
    serializer = LongSerializer.get();
  } else if (value instanceof SpecificFixed) {
    serializer = SpecificFixedSerializer.get(value.getClass());
  } else if (value instanceof GenericArray) {
    // Unwrap the array schema to its element type for the list serializer.
    Schema schema = ((GenericArray) value).getSchema();
    if (schema.getType() == Type.ARRAY) {
      schema = schema.getElementType();
    }
    serializer = ListSerializer.get(schema);
  } else if (value instanceof Map) {
    Map map = (Map) value;
    if (map.size() == 0) {
      // No sample value to infer the schema from; fall back to raw bytes.
      serializer = ByteBufferSerializer.get();
    } else {
      // Infer the value schema from an arbitrary entry of the map.
      Object value0 = map.values().iterator().next();
      Schema schema = TypeUtils.getSchema(value0);
      serializer = MapSerializer.get(schema);
    }
  } else if (value instanceof Persistent) {
    serializer = ObjectSerializer.get();
  } else {
    serializer = SerializerTypeInferer.getSerializer(value);
  }
  return serializer;
}
Aggregations