Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.
The class AvroRecordToPinotRowGenerator, method transform.
public GenericRow transform(GenericData.Record record, org.apache.avro.Schema schema, GenericRow destination) {
for (String column : indexingSchema.getColumnNames()) {
Object entry = record.get(column);
FieldSpec fieldSpec = indexingSchema.getFieldSpecFor(column);
if (entry != null) {
if (entry instanceof Array) {
entry = AvroRecordReader.transformAvroArrayToObjectArray((Array) entry, fieldSpec);
if (fieldSpec.getDataType() == DataType.STRING || fieldSpec.getDataType() == DataType.STRING_ARRAY) {
for (int i = 0; i < ((Object[]) entry).length; ++i) {
if (((Object[]) entry)[i] != null) {
((Object[]) entry)[i] = ((Object[]) entry)[i].toString();
}
}
}
} else {
if (entry instanceof Utf8) {
entry = ((Utf8) entry).toString();
}
if (fieldSpec.getDataType() == DataType.STRING) {
entry = entry.toString();
}
}
} else {
// entry was null.
if (fieldSpec.isSingleValueField()) {
entry = AvroRecordReader.getDefaultNullValue(fieldSpec);
} else {
// The entry is null and the field is multi-value. None of the instanceof checks above match a null entry, so the
// array conversion is repeated here; transformAvroArrayToObjectArray is expected to handle a null input.
entry = AvroRecordReader.transformAvroArrayToObjectArray((Array) entry, fieldSpec);
if (fieldSpec.getDataType() == DataType.STRING || fieldSpec.getDataType() == DataType.STRING_ARRAY) {
for (int i = 0; i < ((Object[]) entry).length; ++i) {
if (((Object[]) entry)[i] != null) {
((Object[]) entry)[i] = ((Object[]) entry)[i].toString();
}
}
}
}
}
destination.putField(column, entry);
}
return destination;
}
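The multi-value branch above delegates to AvroRecordReader.transformAvroArrayToObjectArray, which is not shown on this page. A minimal sketch of what such a conversion could look like, assuming the helper collapses a null or empty Avro array to the field's default null value and otherwise copies elements through unchanged (the caller above then normalizes string elements itself):

private static Object[] transformAvroArrayToObjectArraySketch(GenericData.Array<?> avroArray, FieldSpec fieldSpec) {
    // Assumption: a null or empty Avro array becomes a single default value, so callers never see an empty multi-value entry.
    if (avroArray == null || avroArray.size() == 0) {
        return new Object[] { fieldSpec.getDefaultNullValue() };
    }
    Object[] objectArray = new Object[avroArray.size()];
    int index = 0;
    for (Object element : avroArray) {
        // Elements are copied as-is; the transform above converts Utf8 elements to java.lang.String afterwards.
        objectArray[index++] = element;
    }
    return objectArray;
}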
Use of org.apache.avro.generic.GenericData.Array in project gora by apache.
The class GoraStorage, method writeField.
/**
* Converts one pig field data to PersistentBase Data.
*
* @param avroSchema PersistentBase schema used to create new nested records
* @param pigField Pig schema of the field being converted
* @param pigData Pig data relative to the schema
* @return PersistentBase data
* @throws IOException
*/
private Object writeField(Schema avroSchema, ResourceFieldSchema pigField, Object pigData) throws IOException {
// If the data is null, return null (after checking that the Avro schema actually allows a null value)
if (pigData == null) {
if (avroSchema.getType() != Type.UNION && avroSchema.getType() != Type.NULL) {
throw new IOException("Tuple field " + pigField.getName() + " is null, but Avro Schema is not union nor null");
} else {
return null;
}
}
// Only two-element unions ([null, T] or [T, null]) are supported.
if (avroSchema.getType() == Type.UNION) {
if (avroSchema.getTypes().get(0).getType() == Schema.Type.NULL) {
avroSchema = avroSchema.getTypes().get(1);
} else {
avroSchema = avroSchema.getTypes().get(0);
}
}
switch(pigField.getType()) {
case DataType.DOUBLE:
case DataType.FLOAT:
case DataType.LONG:
case DataType.BOOLEAN:
case DataType.NULL:
if (LOG.isTraceEnabled())
LOG.trace(" Writing double, float, long, boolean or null.");
return (Object) pigData;
case DataType.CHARARRAY:
if (LOG.isTraceEnabled())
LOG.trace(" Writing chararray.");
return pigData.toString();
case DataType.INTEGER:
if (LOG.isTraceEnabled())
LOG.trace(" Writing integer/enum.");
if (avroSchema.getType() == Type.ENUM) {
return AvroUtils.getEnumValue(avroSchema, ((Number) pigData).intValue());
} else {
return ((Number) pigData).intValue();
}
case DataType.BYTEARRAY:
if (LOG.isTraceEnabled())
LOG.trace(" Writing bytearray.");
return ByteBuffer.wrap(((DataByteArray) pigData).get());
case DataType.MAP: // Pig Map -> Avro Map
if (LOG.isTraceEnabled())
LOG.trace(" Writing map.");
@SuppressWarnings("unchecked") Map<String, Object> pigMap = (Map<String, Object>) pigData;
Map<String, Object> goraMap = new HashMap<String, Object>(pigMap.size());
if (pigField.getSchema() == null) {
throw new IOException("The map being written does not have schema.");
}
for (Entry<String, Object> pigEntry : pigMap.entrySet()) {
goraMap.put(pigEntry.getKey(), this.writeField(avroSchema.getValueType(), pigField.getSchema().getFields()[0], pigEntry.getValue()));
}
return goraMap;
case DataType.BAG: // Pig Bag -> Avro Array
if (LOG.isTraceEnabled())
LOG.trace(" Writing bag.");
Array<Object> persistentArray = new Array<Object>((int) ((DataBag) pigData).size(), avroSchema);
for (Object pigArrayElement : (DataBag) pigData) {
if (avroSchema.getElementType().getType() == Type.RECORD) {
// If the element type is a record, the Persistent -> Pig mapping drops one level of tuple nesting:
// we want bag((a1,a2,a3), (b1,b2,b3), ...) instead of bag(((a1,a2,a3)), ((b1,b2,b3)), ...)
persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0], pigArrayElement));
} else {
// Every bag has a tuple as its element type. Since the element is not a record, that tuple wrapper is unwrapped and only its single field is written.
persistentArray.add(this.writeField(avroSchema.getElementType(), pigField.getSchema().getFields()[0].getSchema().getFields()[0], ((Tuple) pigArrayElement).get(0)));
}
}
return persistentArray;
case DataType.TUPLE: // Pig Tuple -> Avro Record
if (LOG.isTraceEnabled())
LOG.trace(" Writing tuple.");
try {
PersistentBase persistentRecord = (PersistentBase) ClassLoadingUtils.loadClass(avroSchema.getFullName()).newInstance();
ResourceFieldSchema[] tupleFieldSchemas = pigField.getSchema().getFields();
for (int i = 0; i < tupleFieldSchemas.length; i++) {
persistentRecord.put(tupleFieldSchemas[i].getName(), this.writeField(avroSchema.getField(tupleFieldSchemas[i].getName()).schema(), tupleFieldSchemas[i], ((Tuple) pigData).get(i)));
}
return persistentRecord;
} catch (InstantiationException e) {
throw new IOException(e);
} catch (IllegalAccessException e) {
throw new IOException(e);
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
default:
throw new IOException("Unexpected field " + pigField.getName() + " with Pig type " + DataType.genTypeToNameMap().get(pigField.getType()));
}
}
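The union handling near the top of writeField only supports two-element unions of the form [null, T] or [T, null]. A sketch of how that unwrapping step could be factored into its own helper, with an explicit size check added as an assumption (writeField itself does not perform one):

private static Schema unwrapTwoBranchUnion(Schema avroSchema) throws IOException {
    if (avroSchema.getType() != Schema.Type.UNION) {
        return avroSchema;
    }
    List<Schema> branches = avroSchema.getTypes();
    if (branches.size() != 2) {
        throw new IOException("Only two-branch unions are supported, found " + branches.size());
    }
    // Return whichever branch is not NULL, mirroring the [null, T] / [T, null] handling above.
    return branches.get(0).getType() == Schema.Type.NULL ? branches.get(1) : branches.get(0);
}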
Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.
The class AvroQueryGenerator, method generateSimpleAggregationOnSingleColumnFilters.
public void generateSimpleAggregationOnSingleColumnFilters() throws IOException {
final Map<String, Map<Object, Integer>> cardinalityCountsMap = new HashMap<String, Map<Object, Integer>>();
final Map<String, Map<Object, Map<String, Double>>> sumMap = new HashMap<String, Map<Object, Map<String, Double>>>();
// The string key has the form columnName:columnValue:metricName:groupByColumnName; the nested map goes from group key to the summed metric value.
final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>();
aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>();
groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>();
for (final Field f : schema.getFields()) {
final String fieldName = f.name();
if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) {
isSingleValueMap.put(fieldName, isSingleValueField(f));
dataTypeMap.put(fieldName, getColumnType(f));
if (!metrics.contains(fieldName)) {
cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>());
}
}
}
for (final String column : cardinalityCountsMap.keySet()) {
sumMap.put(column, new HashMap<Object, Map<String, Double>>());
}
while (dataStream.hasNext()) {
final GenericRecord record = dataStream.next();
for (final String column : cardinalityCountsMap.keySet()) {
Object value = record.get(column);
if (value == null) {
switch(schema.getField(column).schema().getType()) {
case INT:
value = 0;
break;
case FLOAT:
value = 0F;
break;
case LONG:
value = 0L;
break;
case DOUBLE:
value = 0D;
break;
case STRING:
case BOOLEAN:
value = "null";
break;
}
}
if (value instanceof Utf8) {
value = ((Utf8) value).toString();
}
if (value instanceof Array) {
continue;
}
for (final String metricName : metrics) {
final String groupbyKeyBase = column + ":" + record.get(column) + ":" + metricName;
int dimCounter = 1;
for (final String dim : cardinalityCountsMap.keySet()) {
if (!dim.equals(column)) {
dimCounter++;
final String groupbyKey = groupbyKeyBase + ":" + dim;
if (sumGroupBy.containsKey(groupbyKey)) {
if (sumGroupBy.get(groupbyKey).containsKey(record.get(dim))) {
sumGroupBy.get(groupbyKey).put(record.get(dim), getAppropriateNumberType(metricName, record.get(metricName), sumGroupBy.get(groupbyKey).get(record.get(dim))));
} else {
sumGroupBy.get(groupbyKey).put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
}
} else {
sumGroupBy.put(groupbyKey, new HashMap<Object, Double>());
sumGroupBy.get(groupbyKey).put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
}
}
if (dimCounter == 4) {
break;
}
}
}
if (cardinalityCountsMap.get(column).containsKey(value)) {
cardinalityCountsMap.get(column).put(value, cardinalityCountsMap.get(column).get(value) + 1);
} else {
cardinalityCountsMap.get(column).put(value, 1);
}
if (!sumMap.get(column).containsKey(value)) {
sumMap.get(column).put(value, new HashMap<String, Double>());
}
for (final String metric : metrics) {
if (!sumMap.get(column).get(value).containsKey(metric)) {
sumMap.get(column).get(value).put(metric, getAppropriateNumberType(metric, record.get(metric), 0D));
} else {
sumMap.get(column).get(value).put(metric, getAppropriateNumberType(metric, record.get(metric), sumMap.get(column).get(value).get(metric)));
}
}
// Reminder: the sumGroupBy key is columnName:columnValue:metricName:groupByColumnName, and its nested map goes from group key to the summed metric value.
}
}
dataStream.close();
if (!isRealtimeSegment) {
for (final String column : cardinalityCountsMap.keySet()) {
for (final Object entry : cardinalityCountsMap.get(column).keySet()) {
final StringBuilder bld = new StringBuilder();
bld.append("select count(*) from ");
bld.append(resourceName);
bld.append(" where ");
bld.append(column);
bld.append("=");
bld.append("'");
bld.append(entry);
bld.append("'");
bld.append(" ");
bld.append("limit 0");
String queryString = bld.toString();
if (!queryString.contains("null")) {
aggregationQueries.add(new TestSimpleAggreationQuery(queryString, new Double(cardinalityCountsMap.get(column).get(entry))));
}
}
}
}
for (final String column : sumMap.keySet()) {
for (final Object value : sumMap.get(column).keySet()) {
for (final String metric : sumMap.get(column).get(value).keySet()) {
final StringBuilder bld = new StringBuilder();
bld.append("select sum('" + metric + "') from ");
bld.append(resourceName);
bld.append(" where ");
bld.append(column);
bld.append("=");
bld.append("'");
bld.append(value);
bld.append("'");
bld.append(" ");
bld.append("limit 0");
String queryString = bld.toString();
if (!queryString.contains("null")) {
aggregationQueries.add(new TestSimpleAggreationQuery(bld.toString(), sumMap.get(column).get(value).get(metric)));
}
}
}
}
for (final String groupKey : sumGroupBy.keySet()) {
final String columnName = groupKey.split(":")[0];
final String columnValue = groupKey.split(":")[1];
final String metricColumn = groupKey.split(":")[2];
final String groupByColumnName = groupKey.split(":")[3];
final StringBuilder bld = new StringBuilder();
bld.append("select sum('" + metricColumn + "') from ");
bld.append(resourceName);
bld.append(" where ");
bld.append(columnName);
bld.append("=");
bld.append("'");
bld.append(columnValue);
bld.append("'");
bld.append(" ");
bld.append(" group by ");
bld.append(groupByColumnName);
bld.append(" top 10 ");
bld.append("limit 0");
String queryString = bld.toString();
if (!queryString.contains("null")) {
groupByQueries.add(new TestGroupByAggreationQuery(bld.toString(), sumGroupBy.get(groupKey)));
}
}
}
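The sumGroupBy keys built above are plain colon-separated strings that are later split apart to assemble the group-by queries. An illustrative snippet of the key layout (the column and metric names here are hypothetical, not taken from the test schema):

// Hypothetical names, shown only to illustrate the layout columnName:columnValue:metricName:groupByColumnName.
String groupbyKey = "country" + ":" + "US" + ":" + "impressions" + ":" + "browser";
String[] parts = groupbyKey.split(":");
String columnName = parts[0];           // "country"
String columnValue = parts[1];          // "US"
String metricColumn = parts[2];         // "impressions"
String groupByColumnName = parts[3];    // "browser"

Splitting on ':' assumes none of the values themselves contain a colon; the generator does not guard against that.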
Use of org.apache.avro.generic.GenericData.Array in project pinot by linkedin.
The class AvroRecordReader, method getGenericRow.
private GenericRow getGenericRow(GenericRecord rawRecord, GenericRow row) {
for (final Field field : _dataStream.getSchema().getFields()) {
FieldSpec spec = _schemaExtractor.getSchema().getFieldSpecFor(field.name());
if (spec == null) {
continue;
}
Object value = rawRecord.get(field.name());
if (value == null) {
incrementNullCountFor(field.name());
if (spec.isSingleValueField()) {
value = spec.getDefaultNullValue();
} else {
// value is null here; transformAvroArrayToObjectArray is expected to turn a null array into the field's default values.
value = transformAvroArrayToObjectArray((Array) value, spec);
}
} else {
if (value instanceof Utf8) {
value = ((Utf8) value).toString();
}
if (value instanceof Array) {
value = transformAvroArrayToObjectArray((Array) value, spec);
}
}
row.putField(field.name(), value);
}
return row;
}
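A reader's next() method could drive getGenericRow along the lines of the sketch below; the _dataStream.next() call and the reusable-row signature are assumptions about code not shown on this page:

public GenericRow next(GenericRow reuse) {
    // Assumption: the underlying Avro stream hands back the next raw GenericRecord.
    GenericRecord rawRecord = _dataStream.next();
    // Fill the reusable destination row in place and return it.
    return getGenericRow(rawRecord, reuse);
}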
Use of org.apache.avro.generic.GenericData.Array in project gora by apache.
The class CassandraStore, method addOrUpdateField.
/**
* Add a field to Cassandra according to its type.
* @param key the key of the row where the field should be added
* @param field the Avro field representing a datum
* @param schema the schema belonging to the particular Avro field
* @param value the field value
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private void addOrUpdateField(K key, Field field, Schema schema, Object value) {
Type type = schema.getType();
// Skip the synthetic columns that only store a union branch index (their names contain CassandraStore.UNION_COL_SUFIX).
if (!field.name().contains(CassandraStore.UNION_COL_SUFIX)) {
switch(type) {
case STRING:
case BOOLEAN:
case INT:
case LONG:
case BYTES:
case FLOAT:
case DOUBLE:
case FIXED:
this.cassandraClient.addColumn(key, field.name(), value);
break;
case RECORD:
if (value != null) {
if (value instanceof PersistentBase) {
PersistentBase persistentBase = (PersistentBase) value;
try {
byte[] byteValue = AvroSerializerUtil.serializer(persistentBase, schema);
this.cassandraClient.addColumn(key, field.name(), byteValue);
} catch (IOException e) {
LOG.warn(field.name() + " named record could not be serialized.");
}
} else {
LOG.warn("Record with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
LOG.warn("Setting content of: " + field.name() + " to null.");
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
}
break;
case MAP:
if (value != null) {
if (value instanceof Map<?, ?>) {
Map<CharSequence, Object> map = (Map<CharSequence, Object>) value;
Schema valueSchema = schema.getValueType();
Type valueType = valueSchema.getType();
if (Type.UNION.equals(valueType)) {
Map<CharSequence, Object> valueMap = new HashMap<>();
for (CharSequence mapKey : map.keySet()) {
Object mapValue = map.get(mapKey);
int valueUnionIndex = getUnionSchema(mapValue, valueSchema);
valueMap.put((mapKey + UNION_COL_SUFIX), valueUnionIndex);
valueMap.put(mapKey, mapValue);
}
map = valueMap;
}
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
// If the map is not stored as a super column, serialize it with Avro.
if (!this.cassandraClient.isSuper(familyName)) {
try {
byte[] byteValue = AvroSerializerUtil.serializer(map, schema);
this.cassandraClient.addColumn(key, field.name(), byteValue);
} catch (IOException e) {
LOG.warn(field.name() + " named map could not be serialized.");
}
} else {
this.cassandraClient.addStatefulHashMap(key, field.name(), map);
}
} else {
LOG.warn("Map with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
// delete map
LOG.warn("Setting content of: " + field.name() + " to null.");
this.cassandraClient.deleteStatefulHashMap(key, field.name());
}
break;
case ARRAY:
if (value != null) {
if (value instanceof DirtyListWrapper<?>) {
DirtyListWrapper fieldValue = (DirtyListWrapper<?>) value;
GenericArray valueArray = new Array(fieldValue.size(), schema);
for (int i = 0; i < fieldValue.size(); i++) {
valueArray.add(i, fieldValue.get(i));
}
this.cassandraClient.addGenericArray(key, field.name(), (GenericArray<?>) valueArray);
} else {
LOG.warn("Array with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
LOG.warn("Setting content of: " + field.name() + " to null.");
this.cassandraClient.deleteGenericArray(key, field.name());
}
break;
case UNION:
// adding union schema index
String columnName = field.name() + UNION_COL_SUFIX;
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
if (value != null) {
int schemaPos = getUnionSchema(value, schema);
LOG.debug("Union with value: " + value.toString() + " at index: " + schemaPos + " supported for field: " + field.name());
this.cassandraClient.getCassandraMapping().addColumn(familyName, columnName, columnName);
if (this.cassandraClient.isSuper(familyName)) {
this.cassandraClient.addSubColumn(key, columnName, columnName, schemaPos);
} else {
this.cassandraClient.addColumn(key, columnName, schemaPos);
}
//this.cassandraClient.getCassandraMapping().addColumn(familyName, columnName, columnName);
// adding union value
Schema unionSchema = schema.getTypes().get(schemaPos);
addOrUpdateField(key, field, unionSchema, value);
//this.cassandraClient.addColumn(key, field.name(), value);
} else {
LOG.warn("Setting content of: " + field.name() + " to null.");
if (this.cassandraClient.isSuper(familyName)) {
this.cassandraClient.deleteSubColumn(key, field.name());
} else {
this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
}
}
break;
default:
LOG.warn("Type: " + type.name() + " not considered for field: " + field.name() + ". Please report this to dev@gora.apache.org");
}
}
}
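The UNION branch relies on getUnionSchema(value, schema) to find the index of the union branch that matches the runtime value; that helper is not shown here. A rough sketch of how such a lookup could work, with matching rules that are assumptions rather than the project's actual logic:

private static int getUnionSchemaSketch(Object value, Schema unionSchema) {
    List<Schema> branches = unionSchema.getTypes();
    for (int i = 0; i < branches.size(); i++) {
        Schema.Type branchType = branches.get(i).getType();
        if (value == null && branchType == Schema.Type.NULL) {
            return i;
        }
        if (value instanceof CharSequence && branchType == Schema.Type.STRING) {
            return i;
        }
        if (value instanceof Map && branchType == Schema.Type.MAP) {
            return i;
        }
        if (value instanceof PersistentBase && branchType == Schema.Type.RECORD) {
            return i;
        }
    }
    // Fall back to the first branch when nothing matches; the real helper may behave differently.
    return 0;
}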