Example 1 with Array

Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.

From class AvroRecordToPinotRowGenerator, the transform method:

public GenericRow transform(GenericData.Record record, org.apache.avro.Schema schema, GenericRow destination) {
    for (String column : indexingSchema.getColumnNames()) {
        Object entry = record.get(column);
        FieldSpec fieldSpec = indexingSchema.getFieldSpecFor(column);
        if (entry != null) {
            if (entry instanceof Array) {
                entry = AvroRecordReader.transformAvroArrayToObjectArray((Array) entry, fieldSpec);
                if (fieldSpec.getDataType() == DataType.STRING || fieldSpec.getDataType() == DataType.STRING_ARRAY) {
                    for (int i = 0; i < ((Object[]) entry).length; ++i) {
                        if (((Object[]) entry)[i] != null) {
                            ((Object[]) entry)[i] = ((Object[]) entry)[i].toString();
                        }
                    }
                }
            } else {
                if (entry instanceof Utf8) {
                    entry = ((Utf8) entry).toString();
                }
                if (fieldSpec.getDataType() == DataType.STRING) {
                    entry = entry.toString();
                }
            }
        } else {
            // entry was null.
            if (fieldSpec.isSingleValueField()) {
                entry = AvroRecordReader.getDefaultNullValue(fieldSpec);
            } else {
                // A multi-value field that is null: none of the instanceof checks above will match, so we repeat
                // some of the array-handling logic from above here.
                entry = AvroRecordReader.transformAvroArrayToObjectArray((Array) entry, fieldSpec);
                if (fieldSpec.getDataType() == DataType.STRING || fieldSpec.getDataType() == DataType.STRING_ARRAY) {
                    for (int i = 0; i < ((Object[]) entry).length; ++i) {
                        if (((Object[]) entry)[i] != null) {
                            ((Object[]) entry)[i] = ((Object[]) entry)[i].toString();
                        }
                    }
                }
            }
        }
        destination.putField(column, entry);
    }
    return destination;
}
Also used : Array(org.apache.avro.generic.GenericData.Array) Utf8(org.apache.avro.util.Utf8) FieldSpec(com.linkedin.pinot.common.data.FieldSpec)

Example 2 with Array

Use of org.apache.avro.generic.GenericData.Array in project gora by apache.

From class GoraStorage, the writeField method:

/**
 * Converts one Pig field's data to PersistentBase data.
 *
 * @param avroSchema PersistentBase schema used to create new nested records
 * @param pigField Pig schema of the field being converted
 * @param pigData Pig data relative to the schema
 * @return PersistentBase data
 * @throws IOException
 */
private Object writeField(Schema avroSchema, ResourceFieldSchema pigField, Object pigData) throws IOException {
    // If data is null, return null (after checking that the Avro schema permits null)
    if (pigData == null) {
        if (avroSchema.getType() != Type.UNION && avroSchema.getType() != Type.NULL) {
            throw new IOException("Tuple field " + pigField.getName() + " is null, but Avro Schema is not union nor null");
        } else {
            return null;
        }
    }
    // NOTE: only two-element unions are supported!
    if (avroSchema.getType() == Type.UNION) {
        if (avroSchema.getTypes().get(0).getType() == Schema.Type.NULL) {
            avroSchema = avroSchema.getTypes().get(1);
        } else {
            avroSchema = avroSchema.getTypes().get(0);
        }
    }
    switch(pigField.getType()) {
        case DataType.DOUBLE:
        case DataType.FLOAT:
        case DataType.LONG:
        case DataType.BOOLEAN:
        case DataType.NULL:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing double, float, long, boolean or null.");
            return (Object) pigData;
        case DataType.CHARARRAY:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing chararray.");
            return pigData.toString();
        case DataType.INTEGER:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing integer/enum.");
            if (avroSchema.getType() == Type.ENUM) {
                return AvroUtils.getEnumValue(avroSchema, ((Number) pigData).intValue());
            } else {
                return ((Number) pigData).intValue();
            }
        case DataType.BYTEARRAY:
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing bytearray.");
            return ByteBuffer.wrap(((DataByteArray) pigData).get());
        case DataType.MAP: // Pig Map -> Avro Map
            if (LOG.isTraceEnabled())
                LOG.trace("   Writing map.");
            @SuppressWarnings("unchecked") Map<String, Object> pigMap = (Map<String, Object>) pigData;
            Map<String, Object> goraMap = new HashMap<String, Object>(pigMap.size());
            if (pigField.getSchema() == null) {
                throw new IOException("The map being written does not have schema.");
            }
            for (Entry<String, Object> pigEntry : pigMap.entrySet()) {
                goraMap.put(pigEntry.getKey(), this.writeField(avroSchema.getValueType(), pigField.getSchema().getFields()[0], pigEntry.getValue()));
            }
            return goraMap;
        case DataType.BAG: // Pig Bag -> Avro Array
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing bag.");
            Array<Object> persistentArray = new Array<Object>((int) ((DataBag) pigData).size(), avroSchema);
            for (Object pigArrayElement : (DataBag) pigData) {
                if (avroSchema.getElementType().getType() == Type.RECORD) {
                    // If element type is record, the mapping Persistent->PigType deletes one nested tuple:
                    // We want the map as: map((a1,a2,a3), (b1,b2,b3),...) instead of map(((a1,a2,a3)), ((b1,b2,b3)), ...)
                    persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0], pigArrayElement));
                } else {
                    // Every bag has a tuple as element type. Since this is not a record, that "tuple" container must be ignored
                    persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0].getSchema().getFields()[0], ((Tuple) pigArrayElement).get(0)));
                }
            }
            return persistentArray;
        case DataType.TUPLE: // Pig Tuple -> Avro Record
            if (LOG.isTraceEnabled())
                LOG.trace("    Writing tuple.");
            try {
                PersistentBase persistentRecord = (PersistentBase) ClassLoadingUtils.loadClass(avroSchema.getFullName()).newInstance();
                ResourceFieldSchema[] tupleFieldSchemas = pigField.getSchema().getFields();
                for (int i = 0; i < tupleFieldSchemas.length; i++) {
                    persistentRecord.put(tupleFieldSchemas[i].getName(), this.writeField(avroSchema.getField(tupleFieldSchemas[i].getName()).schema(), tupleFieldSchemas[i], ((Tuple) pigData).get(i)));
                }
                return persistentRecord;
            } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
                throw new IOException(e);
            }
        default:
            throw new IOException("Unexpected field " + pigField.getName() + " with Pig type " + DataType.genTypeToNameMap().get(pigField.getType()));
    }
}
Also used : PersistentBase(org.apache.gora.persistency.impl.PersistentBase) DataBag(org.apache.pig.data.DataBag) HashMap(java.util.HashMap) IOException(java.io.IOException) Array(org.apache.avro.generic.GenericData.Array) DataByteArray(org.apache.pig.data.DataByteArray) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) Map(java.util.Map) Tuple(org.apache.pig.data.Tuple)

Example 3 with Array

Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.

From class AvroQueryGenerator, the generateSimpleAggregationOnSingleColumnFilters method:

public void generateSimpleAggregationOnSingleColumnFilters() throws IOException {
    final Map<String, Map<Object, Integer>> cardinalityCountsMap = new HashMap<String, Map<Object, Integer>>();
    final Map<String, Map<Object, Map<String, Double>>> sumMap = new HashMap<String, Map<Object, Map<String, Double>>>();
    // the String key has the form columnName:columnValue:metricName:groupByColumnName
    final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>();
    aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>();
    groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>();
    for (final Field f : schema.getFields()) {
        final String fieldName = f.name();
        if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) {
            isSingleValueMap.put(fieldName, isSingleValueField(f));
            dataTypeMap.put(fieldName, getColumnType(f));
            if (!metrics.contains(fieldName)) {
                cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>());
            }
        }
    }
    for (final String column : cardinalityCountsMap.keySet()) {
        sumMap.put(column, new HashMap<Object, Map<String, Double>>());
    }
    while (dataStream.hasNext()) {
        final GenericRecord record = dataStream.next();
        for (final String column : cardinalityCountsMap.keySet()) {
            Object value = record.get(column);
            if (value == null) {
                switch(schema.getField(column).schema().getType()) {
                    case INT:
                        value = 0;
                        break;
                    case FLOAT:
                        value = 0F;
                        break;
                    case LONG:
                        value = 0L;
                        break;
                    case DOUBLE:
                        value = 0D;
                        break;
                    case STRING:
                    case BOOLEAN:
                        value = "null";
                        break;
                }
            }
            if (value instanceof Utf8) {
                value = ((Utf8) value).toString();
            }
            if (value instanceof Array) {
                continue;
            }
            for (final String metricName : metrics) {
                final String groupbyKeyBase = column + ":" + record.get(column) + ":" + metricName;
                int dimCounter = 1;
                for (final String dim : cardinalityCountsMap.keySet()) {
                    if (!dim.equals(column)) {
                        dimCounter++;
                        final String groupbyKey = groupbyKeyBase + ":" + dim;
                        if (sumGroupBy.containsKey(groupbyKey)) {
                            if (sumGroupBy.get(groupbyKey).containsKey(record.get(dim))) {
                                sumGroupBy.get(groupbyKey).put(record.get(dim), getAppropriateNumberType(metricName, record.get(metricName), sumGroupBy.get(groupbyKey).get(record.get(dim))));
                            } else {
                                sumGroupBy.get(groupbyKey).put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
                            }
                        } else {
                            sumGroupBy.put(groupbyKey, new HashMap<Object, Double>());
                            sumGroupBy.get(groupbyKey).put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
                        }
                    }
                    if (dimCounter == 4) {
                        break;
                    }
                }
            }
            if (cardinalityCountsMap.get(column).containsKey(value)) {
                cardinalityCountsMap.get(column).put(value, cardinalityCountsMap.get(column).get(value) + 1);
            } else {
                cardinalityCountsMap.get(column).put(value, 1);
            }
            if (!sumMap.get(column).containsKey(value)) {
                sumMap.get(column).put(value, new HashMap<String, Double>());
            }
            for (final String metric : metrics) {
                if (!sumMap.get(column).get(value).containsKey(metric)) {
                    sumMap.get(column).get(value).put(metric, getAppropriateNumberType(metric, record.get(metric), 0D));
                } else {
                    sumMap.get(column).get(value).put(metric, getAppropriateNumberType(metric, record.get(metric), sumMap.get(column).get(value).get(metric)));
                }
            }
        }
    }
    dataStream.close();
    if (!isRealtimeSegment) {
        for (final String column : cardinalityCountsMap.keySet()) {
            for (final Object entry : cardinalityCountsMap.get(column).keySet()) {
                final StringBuilder bld = new StringBuilder();
                bld.append("select count(*) from ");
                bld.append(resourceName);
                bld.append(" where ");
                bld.append(column);
                bld.append("=");
                bld.append("'");
                bld.append(entry);
                bld.append("'");
                bld.append(" ");
                bld.append("limit 0");
                String queryString = bld.toString();
                if (!queryString.contains("null")) {
                    aggregationQueries.add(new TestSimpleAggreationQuery(queryString, Double.valueOf(cardinalityCountsMap.get(column).get(entry))));
                }
            }
        }
    }
    for (final String column : sumMap.keySet()) {
        for (final Object value : sumMap.get(column).keySet()) {
            for (final String metric : sumMap.get(column).get(value).keySet()) {
                final StringBuilder bld = new StringBuilder();
                bld.append("select sum('" + metric + "') from ");
                bld.append(resourceName);
                bld.append(" where ");
                bld.append(column);
                bld.append("=");
                bld.append("'");
                bld.append(value);
                bld.append("'");
                bld.append(" ");
                bld.append("limit 0");
                String queryString = bld.toString();
                if (!queryString.contains("null")) {
                    aggregationQueries.add(new TestSimpleAggreationQuery(queryString, sumMap.get(column).get(value).get(metric)));
                }
            }
        }
    }
    for (final String groupKey : sumGroupBy.keySet()) {
        final String columnName = groupKey.split(":")[0];
        final String columnValue = groupKey.split(":")[1];
        final String metricColumn = groupKey.split(":")[2];
        final String groupByColumnName = groupKey.split(":")[3];
        final StringBuilder bld = new StringBuilder();
        bld.append("select sum('" + metricColumn + "') from ");
        bld.append(resourceName);
        bld.append(" where ");
        bld.append(columnName);
        bld.append("=");
        bld.append("'");
        bld.append(columnValue);
        bld.append("'");
        bld.append(" ");
        bld.append(" group by ");
        bld.append(groupByColumnName);
        bld.append(" top 10 ");
        bld.append("limit 0");
        String queryString = bld.toString();
        if (!queryString.contains("null")) {
            groupByQueries.add(new TestGroupByAggreationQuery(queryString, sumGroupBy.get(groupKey)));
        }
    }
}
Also used : HashMap(java.util.HashMap) Array(org.apache.avro.generic.GenericData.Array) Field(org.apache.avro.Schema.Field) Utf8(org.apache.avro.util.Utf8) GenericRecord(org.apache.avro.generic.GenericRecord) Map(java.util.Map)

Example 4 with Array

Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.

From class AvroRecordReader, the getGenericRow method:

private GenericRow getGenericRow(GenericRecord rawRecord, GenericRow row) {
    for (final Field field : _dataStream.getSchema().getFields()) {
        FieldSpec spec = _schemaExtractor.getSchema().getFieldSpecFor(field.name());
        if (spec == null) {
            continue;
        }
        Object value = rawRecord.get(field.name());
        if (value == null) {
            incrementNullCountFor(field.name());
            if (spec.isSingleValueField()) {
                value = spec.getDefaultNullValue();
            } else {
                value = transformAvroArrayToObjectArray((Array) value, spec);
            }
        } else {
            if (value instanceof Utf8) {
                value = ((Utf8) value).toString();
            }
            if (value instanceof Array) {
                value = transformAvroArrayToObjectArray((Array) value, spec);
            }
        }
        row.putField(field.name(), value);
    }
    return row;
}
Also used : Array(org.apache.avro.generic.GenericData.Array) Field(org.apache.avro.Schema.Field) Utf8(org.apache.avro.util.Utf8) FieldSpec(com.linkedin.pinot.common.data.FieldSpec)

Example 5 with Array

Use of org.apache.avro.generic.GenericData.Array in project gora by apache.

From class CassandraStore, the addOrUpdateField method:

/**
   * Add a field to Cassandra according to its type.
   * @param key     the key of the row where the field should be added
   * @param field   the Avro field representing a datum
   * @param schema  the schema belonging to the particular Avro field
   * @param value   the field value
   */
@SuppressWarnings({ "unchecked", "rawtypes" })
private void addOrUpdateField(K key, Field field, Schema schema, Object value) {
    Type type = schema.getType();
    // skip pseudo-fields whose only purpose is to store a union's schema index
    if (!field.name().contains(CassandraStore.UNION_COL_SUFIX)) {
        switch(type) {
            case STRING:
            case BOOLEAN:
            case INT:
            case LONG:
            case BYTES:
            case FLOAT:
            case DOUBLE:
            case FIXED:
                this.cassandraClient.addColumn(key, field.name(), value);
                break;
            case RECORD:
                if (value != null) {
                    if (value instanceof PersistentBase) {
                        PersistentBase persistentBase = (PersistentBase) value;
                        try {
                            byte[] byteValue = AvroSerializerUtil.serializer(persistentBase, schema);
                            this.cassandraClient.addColumn(key, field.name(), byteValue);
                        } catch (IOException e) {
                            LOG.warn(field.name() + " named record could not be serialized.");
                        }
                    } else {
                        LOG.warn("Record with value: " + value.toString() + " not supported for field: " + field.name());
                    }
                } else {
                    LOG.warn("Setting content of: " + field.name() + " to null.");
                    String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
                    this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
                }
                break;
            case MAP:
                if (value != null) {
                    if (value instanceof Map<?, ?>) {
                        Map<CharSequence, Object> map = (Map<CharSequence, Object>) value;
                        Schema valueSchema = schema.getValueType();
                        Type valueType = valueSchema.getType();
                        if (Type.UNION.equals(valueType)) {
                            Map<CharSequence, Object> valueMap = new HashMap<>();
                            for (CharSequence mapKey : map.keySet()) {
                                Object mapValue = map.get(mapKey);
                                int valueUnionIndex = getUnionSchema(mapValue, valueSchema);
                                valueMap.put((mapKey + UNION_COL_SUFIX), valueUnionIndex);
                                valueMap.put(mapKey, mapValue);
                            }
                            map = valueMap;
                        }
                        String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
                        // If the map is not a super column, serialize it with the Avro serializer.
                        if (!this.cassandraClient.isSuper(familyName)) {
                            try {
                                byte[] byteValue = AvroSerializerUtil.serializer(map, schema);
                                this.cassandraClient.addColumn(key, field.name(), byteValue);
                            } catch (IOException e) {
                                LOG.warn(field.name() + " named map could not be serialized.");
                            }
                        } else {
                            this.cassandraClient.addStatefulHashMap(key, field.name(), map);
                        }
                    } else {
                        LOG.warn("Map with value: " + value.toString() + " not supported for field: " + field.name());
                    }
                } else {
                    // delete map
                    LOG.warn("Setting content of: " + field.name() + " to null.");
                    this.cassandraClient.deleteStatefulHashMap(key, field.name());
                }
                break;
            case ARRAY:
                if (value != null) {
                    if (value instanceof DirtyListWrapper<?>) {
                        DirtyListWrapper fieldValue = (DirtyListWrapper<?>) value;
                        GenericArray valueArray = new Array(fieldValue.size(), schema);
                        for (int i = 0; i < fieldValue.size(); i++) {
                            valueArray.add(i, fieldValue.get(i));
                        }
                        this.cassandraClient.addGenericArray(key, field.name(), (GenericArray<?>) valueArray);
                    } else {
                        LOG.warn("Array with value: " + value.toString() + " not supported for field: " + field.name());
                    }
                } else {
                    LOG.warn("Setting content of: " + field.name() + " to null.");
                    this.cassandraClient.deleteGenericArray(key, field.name());
                }
                break;
            case UNION:
                // adding union schema index
                String columnName = field.name() + UNION_COL_SUFIX;
                String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
                if (value != null) {
                    int schemaPos = getUnionSchema(value, schema);
                    LOG.debug("Union with value: " + value.toString() + " at index: " + schemaPos + " supported for field: " + field.name());
                    this.cassandraClient.getCassandraMapping().addColumn(familyName, columnName, columnName);
                    if (this.cassandraClient.isSuper(familyName)) {
                        this.cassandraClient.addSubColumn(key, columnName, columnName, schemaPos);
                    } else {
                        this.cassandraClient.addColumn(key, columnName, schemaPos);
                    }
                    //this.cassandraClient.getCassandraMapping().addColumn(familyName, columnName, columnName);
                    // adding union value
                    Schema unionSchema = schema.getTypes().get(schemaPos);
                    addOrUpdateField(key, field, unionSchema, value);
                //this.cassandraClient.addColumn(key, field.name(), value);
                } else {
                    LOG.warn("Setting content of: " + field.name() + " to null.");
                    if (this.cassandraClient.isSuper(familyName)) {
                        this.cassandraClient.deleteSubColumn(key, field.name());
                    } else {
                        this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
                    }
                }
                break;
            default:
                LOG.warn("Type: " + type.name() + " not considered for field: " + field.name() + ". Please report this to dev@gora.apache.org");
        }
    }
}
Also used : PersistentBase(org.apache.gora.persistency.impl.PersistentBase) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Schema(org.apache.avro.Schema) IOException(java.io.IOException) GenericArray(org.apache.avro.generic.GenericArray) Array(org.apache.avro.generic.GenericData.Array) Type(org.apache.avro.Schema.Type) DirtyListWrapper(org.apache.gora.persistency.impl.DirtyListWrapper) Map(java.util.Map)

Aggregations

Array (org.apache.avro.generic.GenericData.Array): 7 usages
HashMap (java.util.HashMap): 5 usages
Map (java.util.Map): 5 usages
Utf8 (org.apache.avro.util.Utf8): 4 usages
Field (org.apache.avro.Schema.Field): 3 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 3 usages
FieldSpec (com.linkedin.pinot.common.data.FieldSpec): 2 usages
IOException (java.io.IOException): 2 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
Schema (org.apache.avro.Schema): 2 usages
PersistentBase (org.apache.gora.persistency.impl.PersistentBase): 2 usages
MapRecord (org.apache.nifi.serialization.record.MapRecord): 2 usages
Record (org.apache.nifi.serialization.record.Record): 2 usages
Time (java.sql.Time): 1 usage
Timestamp (java.sql.Timestamp): 1 usage
LinkedHashMap (java.util.LinkedHashMap): 1 usage
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 1 usage
Conversions (org.apache.avro.Conversions): 1 usage
LogicalType (org.apache.avro.LogicalType): 1 usage
Type (org.apache.avro.Schema.Type): 1 usage