Search in sources :

Example 21 with Utf8

use of org.apache.avro.util.Utf8 in project flink by apache.

From the class DataInputDecoder — method readString.

// --------------------------------------------------------------------------------------------
// strings
// --------------------------------------------------------------------------------------------
@Override
public Utf8 readString(Utf8 old) throws IOException {
    // Reuse the caller-supplied Utf8 instance when one is given, to avoid an allocation.
    final Utf8 target = (old == null) ? new Utf8() : old;
    final int byteLength = readInt();
    // Size the Utf8's backing buffer first, then fill it directly from the stream.
    target.setByteLength(byteLength);
    if (byteLength > 0) {
        in.readFully(target.getBytes(), 0, byteLength);
    }
    return target;
}
Also used : Utf8(org.apache.avro.util.Utf8)

Example 22 with Utf8

use of org.apache.avro.util.Utf8 in project h2o-3 by h2oai.

From the class AvroParser — method write2frame.

/**
   * The main method transforming an Avro record into a row in an H2O frame.
   *
   * @param gr  Avro generic record
   * @param columnNames Column names prepared by parser setup
   * @param inSchema  Flattened Avro schema which corresponds to passed column names
   * @param columnTypes  Target H2O types
   * @param dout  Parser writer
   */
private static void write2frame(GenericRecord gr, String[] columnNames, Schema.Field[] inSchema, byte[] columnTypes, ParseWriter dout) {
    assert inSchema.length == columnTypes.length : "AVRO field flatenized schema has to match to parser setup";
    // Scratch buffer reused for all string cells in this row to avoid per-cell allocation.
    BufferedString bs = new BufferedString();
    for (int cIdx = 0; cIdx < columnNames.length; cIdx++) {
        int inputFieldIdx = inSchema[cIdx].pos();
        Schema.Type inputType = toPrimitiveType(inSchema[cIdx].schema());
        // FIXME: support target conversions
        byte targetType = columnTypes[cIdx];
        Object value = gr.get(inputFieldIdx);
        if (value == null) {
            // A missing Avro value becomes an NA cell.
            dout.addInvalidCol(cIdx);
        } else {
            switch(inputType) {
                case BOOLEAN:
                    dout.addNumCol(cIdx, ((Boolean) value) ? 1 : 0);
                    break;
                case INT:
                    dout.addNumCol(cIdx, ((Integer) value), 0);
                    break;
                case LONG:
                    dout.addNumCol(cIdx, ((Long) value), 0);
                    break;
                case FLOAT:
                    dout.addNumCol(cIdx, (Float) value);
                    break;
                case DOUBLE:
                    dout.addNumCol(cIdx, (Double) value);
                    break;
                case ENUM:
                    // Note: this code expects ordering of categoricals provided by Avro remain same
                    // as in H2O!!!
                    GenericData.EnumSymbol es = (GenericData.EnumSymbol) value;
                    dout.addNumCol(cIdx, es.getSchema().getEnumOrdinal(es.toString()));
                    break;
                case BYTES:
                    // BUGFIX: honor the buffer's position/limit instead of handing over the whole
                    // backing array — ByteBuffer.array() ignores position/arrayOffset and can
                    // include bytes that are not part of this value.
                    // NOTE(review): assumes a heap buffer; array() throws for direct/read-only
                    // buffers — confirm Avro always supplies heap buffers here.
                    ByteBuffer bb = (ByteBuffer) value;
                    dout.addStrCol(cIdx, bs.set(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()));
                    break;
                case STRING:
                    // BUGFIX: Utf8.getBytes() exposes an internal array that may be longer than
                    // the logical string, so the explicit byte length must be passed along.
                    Utf8 utf8 = (Utf8) value;
                    dout.addStrCol(cIdx, bs.set(utf8.getBytes(), 0, utf8.getByteLength()));
                    break;
                case NULL:
                    dout.addInvalidCol(cIdx);
                    break;
            }
        }
    }
}
Also used : Schema(org.apache.avro.Schema) GenericData(org.apache.avro.generic.GenericData) ByteBuffer(java.nio.ByteBuffer) Utf8(org.apache.avro.util.Utf8) BufferedString(water.parser.BufferedString)

Example 23 with Utf8

use of org.apache.avro.util.Utf8 in project beam by apache.

From the class BigQueryAvroUtilsTest — method testConvertGenericRecordToTableRow.

@Test
public void testConvertGenericRecordToTableRow() throws Exception {
    TableSchema tableSchema = new TableSchema();
    tableSchema.setFields(fields);
    Schema avroSchema = AvroCoder.of(Bird.class).getSchema();
    {
        // Test nullable fields.
        GenericRecord record = new GenericData.Record(avroSchema);
        record.put("number", 5L);
        TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
        TableRow row = new TableRow().set("number", "5").set("associates", new ArrayList<TableRow>());
        assertEquals(row, convertedRow);
    }
    {
        // Test type conversion for:
        // INTEGER, FLOAT, TIMESTAMP, BOOLEAN, BYTES, DATE, DATETIME, TIME.
        GenericRecord record = new GenericData.Record(avroSchema);
        // FIX: specify the charset explicitly — the no-arg getBytes() uses the platform
        // default charset, which makes the test environment-dependent.
        byte[] soundBytes = "chirp,chirp".getBytes(java.nio.charset.StandardCharsets.UTF_8);
        ByteBuffer soundByteBuffer = ByteBuffer.wrap(soundBytes);
        soundByteBuffer.rewind();
        record.put("number", 5L);
        record.put("quality", 5.0);
        record.put("birthday", 5L);
        record.put("flighted", Boolean.TRUE);
        record.put("sound", soundByteBuffer);
        record.put("anniversaryDate", new Utf8("2000-01-01"));
        // FIX: the redundant `new String(...)` constructor was dropped; a literal suffices.
        record.put("anniversaryDatetime", "2000-01-01 00:00:00.000005");
        record.put("anniversaryTime", new Utf8("00:00:00.000005"));
        TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
        TableRow row = new TableRow().set("number", "5").set("birthday", "1970-01-01 00:00:00.000005 UTC").set("quality", 5.0).set("associates", new ArrayList<TableRow>()).set("flighted", Boolean.TRUE).set("sound", BaseEncoding.base64().encode(soundBytes)).set("anniversaryDate", "2000-01-01").set("anniversaryDatetime", "2000-01-01 00:00:00.000005").set("anniversaryTime", "00:00:00.000005");
        assertEquals(row, convertedRow);
    }
    {
        // Test repeated fields.
        Schema subBirdSchema = AvroCoder.of(Bird.SubBird.class).getSchema();
        GenericRecord nestedRecord = new GenericData.Record(subBirdSchema);
        nestedRecord.put("species", "other");
        GenericRecord record = new GenericData.Record(avroSchema);
        record.put("number", 5L);
        record.put("associates", Lists.<GenericRecord>newArrayList(nestedRecord));
        TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
        TableRow row = new TableRow().set("associates", Lists.<TableRow>newArrayList(new TableRow().set("species", "other"))).set("number", "5");
        assertEquals(row, convertedRow);
    }
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Schema(org.apache.avro.Schema) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) ArrayList(java.util.ArrayList) Utf8(org.apache.avro.util.Utf8) GenericRecord(org.apache.avro.generic.GenericRecord) GenericData(org.apache.avro.generic.GenericData) ByteBuffer(java.nio.ByteBuffer) Test(org.junit.Test)

Example 24 with Utf8

use of org.apache.avro.util.Utf8 in project gora by apache.

From the class AccumuloStore — method populate.

/**
 * Reassembles a persistent object from a stream of Accumulo key/value entries.
 * Map and array fields are stored one entry per element (grouped by column family),
 * so the loop buffers them in {@code currentMap}/{@code currentArray} until the
 * family changes, then flushes into {@code persistent}.
 *
 * @param iter entries for (at most) one row, in key order
 * @param persistent target object to populate
 * @return the row id of the entries consumed, or null if the iterator was empty
 * @throws IOException if a value cannot be deserialized
 */
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
    ByteSequence row = null;
    Map<Utf8, Object> currentMap = null;
    List currentArray = null;
    Text currentFam = null;
    int currentPos = 0;
    Schema currentSchema = null;
    Field currentField = null;
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);
    while (iter.hasNext()) {
        Entry<Key, Value> entry = iter.next();
        if (row == null) {
            row = entry.getKey().getRowData();
        }
        byte[] val = entry.getValue().get();
        Field field = fieldMap.get(getFieldName(entry));
        // If we are mid-way through collecting a map/array, keep appending while the
        // column family is unchanged; otherwise flush the collection and fall through
        // to start processing this entry as a new field.
        if (currentMap != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, currentMap);
                currentMap = null;
            }
        } else if (currentArray != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
                currentArray = null;
            }
        }
        switch(field.schema().getType()) {
            case // first entry only. Next are handled above on the next loop
            MAP:
                currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getValueType();
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                break;
            case ARRAY:
                currentArray = new DirtyListWrapper<>(new ArrayList<>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getElementType();
                currentField = field;
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                break;
            case // default value of null acts like union with null
            UNION:
                Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
                // map and array were coded without union index so need to be read the same way
                if (effectiveSchema.getType() == Type.ARRAY) {
                    currentArray = new DirtyListWrapper<>(new ArrayList<>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    // BUGFIX: must ask the effective (array) schema for its element type;
                    // calling getElementType() on the UNION schema itself throws
                    // AvroRuntimeException ("Not an array"). Mirrors the MAP branch below,
                    // which correctly uses effectiveSchema.getValueType().
                    currentSchema = effectiveSchema.getElementType();
                    currentField = field;
                    currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                    break;
                } else if (effectiveSchema.getType() == Type.MAP) {
                    currentMap = new DirtyMapWrapper<>(new HashMap<Utf8, Object>());
                    currentPos = field.pos();
                    currentFam = entry.getKey().getColumnFamily();
                    currentSchema = effectiveSchema.getValueType();
                    currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()), fromBytes(currentSchema, entry.getValue().get()));
                    break;
                }
            // intentional fall-through: a non-map/non-array union is read like a record
            // continue like a regular top-level union
            case RECORD:
                SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
                persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
                break;
            default:
                persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
        }
    }
    // Flush a map/array that was still being collected when the iterator ran out.
    if (currentMap != null) {
        persistent.put(currentPos, currentMap);
    } else if (currentArray != null) {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
    }
    persistent.clearDirty();
    return row;
}
Also used : DirtyMapWrapper(org.apache.gora.persistency.impl.DirtyMapWrapper) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Field(org.apache.avro.Schema.Field) Value(org.apache.accumulo.core.data.Value) Utf8(org.apache.avro.util.Utf8) List(java.util.List) ArrayList(java.util.ArrayList) NodeList(org.w3c.dom.NodeList) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) ByteSequence(org.apache.accumulo.core.data.ByteSequence) Key(org.apache.accumulo.core.data.Key)

Example 25 with Utf8

use of org.apache.avro.util.Utf8 in project gora by apache.

From the class GoraSerializerTypeInferer — method getSerializer.

/**
 * Infers a Hector {@code Serializer} for the given value based on its runtime type.
 * A null value falls back to {@code ByteBufferSerializer}; unknown types are delegated
 * to {@code SerializerTypeInferer}.
 *
 * @param value the value to serialize (may be null)
 * @return a serializer suited to the value's runtime type
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static <T> Serializer<T> getSerializer(Object value) {
    Serializer serializer = null;
    if (value == null) {
        serializer = ByteBufferSerializer.get();
    } else if (value instanceof CharSequence) {
        // NOTE: this branch also captures Utf8 and String (both implement CharSequence).
        // The original code had dedicated `instanceof Utf8` and `instanceof String`
        // branches after this one, but they were unreachable dead code and have been
        // removed; observable behavior is unchanged.
        serializer = CharSequenceSerializer.get();
    } else if (value instanceof Boolean) {
        serializer = BooleanSerializer.get();
    } else if (value instanceof ByteBuffer) {
        serializer = ByteBufferSerializer.get();
    } else if (value instanceof byte[]) {
        serializer = BytesArraySerializer.get();
    } else if (value instanceof Double) {
        serializer = DoubleSerializer.get();
    } else if (value instanceof Float) {
        serializer = FloatSerializer.get();
    } else if (value instanceof Integer) {
        serializer = IntegerSerializer.get();
    } else if (value instanceof Long) {
        serializer = LongSerializer.get();
    } else if (value instanceof SpecificFixed) {
        serializer = SpecificFixedSerializer.get(value.getClass());
    } else if (value instanceof GenericArray) {
        // Unwrap the array schema to its element schema for the list serializer.
        Schema schema = ((GenericArray) value).getSchema();
        if (schema.getType() == Type.ARRAY) {
            schema = schema.getElementType();
        }
        serializer = ListSerializer.get(schema);
    } else if (value instanceof Map) {
        Map map = (Map) value;
        if (map.isEmpty()) {
            // No values to inspect — cannot infer the value schema, fall back to raw bytes.
            serializer = ByteBufferSerializer.get();
        } else {
            // Infer the value schema from an arbitrary entry of the map.
            Object value0 = map.values().iterator().next();
            Schema schema = TypeUtils.getSchema(value0);
            serializer = MapSerializer.get(schema);
        }
    } else if (value instanceof Persistent) {
        serializer = ObjectSerializer.get();
    } else {
        serializer = SerializerTypeInferer.getSerializer(value);
    }
    return serializer;
}
Also used : Schema(org.apache.avro.Schema) SpecificFixed(org.apache.avro.specific.SpecificFixed) Persistent(org.apache.gora.persistency.Persistent) ByteBuffer(java.nio.ByteBuffer) Utf8(org.apache.avro.util.Utf8) GenericArray(org.apache.avro.generic.GenericArray) Map(java.util.Map) Serializer(me.prettyprint.hector.api.Serializer) DoubleSerializer(me.prettyprint.cassandra.serializers.DoubleSerializer) BytesArraySerializer(me.prettyprint.cassandra.serializers.BytesArraySerializer) ByteBufferSerializer(me.prettyprint.cassandra.serializers.ByteBufferSerializer) FloatSerializer(me.prettyprint.cassandra.serializers.FloatSerializer) ObjectSerializer(me.prettyprint.cassandra.serializers.ObjectSerializer) LongSerializer(me.prettyprint.cassandra.serializers.LongSerializer) StringSerializer(me.prettyprint.cassandra.serializers.StringSerializer) BooleanSerializer(me.prettyprint.cassandra.serializers.BooleanSerializer) IntegerSerializer(me.prettyprint.cassandra.serializers.IntegerSerializer)

Aggregations

Utf8 (org.apache.avro.util.Utf8)123 Test (org.junit.Test)34 WebPage (org.apache.gora.examples.generated.WebPage)32 GenericRecord (org.apache.avro.generic.GenericRecord)17 Schema (org.apache.avro.Schema)14 GenericData (org.apache.avro.generic.GenericData)13 ByteBuffer (java.nio.ByteBuffer)12 HashMap (java.util.HashMap)12 Map (java.util.Map)12 Employee (org.apache.gora.examples.generated.Employee)11 IOException (java.io.IOException)7 ArrayList (java.util.ArrayList)7 Field (org.apache.avro.Schema.Field)6 Record (org.apache.avro.generic.GenericData.Record)5 File (java.io.File)4 SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader)4 Metadata (org.apache.gora.examples.generated.Metadata)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 Iterator (java.util.Iterator)3 List (java.util.List)3