
Example 1 with VisibleForTesting

Use of org.apache.arrow.util.VisibleForTesting in project aws-athena-query-federation by awslabs.

From class ElasticsearchSchemaUtils, method mappingsEqual.

/**
 * Checks that two Schema objects are equal using the following criteria:
 * 1) The Schemas must have the same number of fields.
 * 2) The corresponding fields in the two Schema objects must also be the same irrespective of ordering within
 *    the Schema object using the following criteria:
 *    a) The fields' names must match.
 *    b) The fields' Arrow types must match.
 *    c) The fields' children lists (used for complex fields, e.g. LIST and STRUCT) must match irrespective of
 *       field ordering within the lists.
 *    d) The fields' metadata maps must match. Currently that's only applicable for scaled_float data types that
 *       use the field's metadata map to store the scaling factor associated with the data type.
 * @param mapping1 is a mapping to be compared.
 * @param mapping2 is a mapping to be compared.
 * @return true if the mappings are equal, false otherwise.
 */
@VisibleForTesting
protected static final boolean mappingsEqual(Schema mapping1, Schema mapping2) {
    logger.info("mappingsEqual - Enter - Mapping1: {}, Mapping2: {}", mapping1, mapping2);
    // Schemas must have the same number of elements.
    if (mapping1.getFields().size() != mapping2.getFields().size()) {
        logger.warn("Mappings are different sizes - Mapping1: {}, Mapping2: {}", mapping1.getFields().size(), mapping2.getFields().size());
        return false;
    }
    // Mappings must have the same fields (irrespective of internal ordering).
    for (Field field1 : mapping1.getFields()) {
        Field field2 = mapping2.findField(field1.getName());
        // Corresponding fields must have the same Arrow types or the Schemas are deemed not equal.
        if (field2 == null || !field1.getType().equals(field2.getType())) {
            logger.warn("Fields' types do not match - Field1: {}, Field2: {}", field1.getType(), field2 == null ? "null" : field2.getType());
            return false;
        }
        logger.info("Field1 Name: {}, Field1 Type: {}, Field1 Metadata: {}", field1.getName(), field1.getType(), field1.getMetadata());
        logger.info("Field2 Name: {}, Field2 Type: {}, Field2 Metadata: {}", field2.getName(), field2.getType(), field2.getMetadata());
        // The corresponding fields' children and metadata maps must also match or the Schemas are deemed not equal.
        if (!childrenEqual(field1.getChildren(), field2.getChildren()) || !field1.getMetadata().equals(field2.getMetadata())) {
            return false;
        }
    }
    return true;
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) VisibleForTesting(org.apache.arrow.util.VisibleForTesting)
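
For context, here is a minimal usage sketch (not part of the project) showing why field ordering does not affect the comparison. It assumes the sketch class lives in the same package as ElasticsearchSchemaUtils, since mappingsEqual is protected; the field names and types are arbitrary.

import java.util.Arrays;

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

public class MappingsEqualSketch {
    public static void main(String[] args) {
        // The same two fields, declared in a different order in each Schema.
        Field name = Field.nullable("name", new ArrowType.Utf8());
        Field age = Field.nullable("age", new ArrowType.Int(32, true));

        Schema mapping1 = new Schema(Arrays.asList(name, age));
        Schema mapping2 = new Schema(Arrays.asList(age, name));

        // mappingsEqual looks fields up by name rather than by position, so this prints: true
        System.out.println(ElasticsearchSchemaUtils.mappingsEqual(mapping1, mapping2));
    }
}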

Example 2 with VisibleForTesting

Use of org.apache.arrow.util.VisibleForTesting in project aws-athena-query-federation by awslabs.

From class BlockUtils, method writeStruct.

/**
 * Used to write a Struct value.
 *
 * @param allocator The BufferAllocator which can be used to generate Apache Arrow Buffers for types
 * which require conversion to an Arrow Buffer before they can be written using the FieldWriter.
 * @param writer The FieldWriter for the Struct field we'd like to write into.
 * @param field The Schema details of the Struct Field we are writing into.
 * @param pos The position (row) in the Apache Arrow batch we are writing to.
 * @param value The value we'd like to write as a struct.
 * @param resolver The field resolver that can be used to extract individual Struct fields from the value.
 */
@VisibleForTesting
protected static void writeStruct(BufferAllocator allocator, StructWriter writer, Field field, int pos, Object value, FieldResolver resolver) {
    // We expect null writes to have been handled earlier so this is a no-op.
    if (value == null) {
        return;
    }
    // Indicate the beginning of the struct value; this is how Apache Arrow handles the variable length of Struct types.
    writer.start();
    for (Field nextChild : field.getChildren()) {
        // For each child field that comprises the struct, attempt to extract and write the corresponding value
        // using the FieldResolver.
        Object childValue = resolver.getFieldValue(nextChild, value);
        switch(Types.getMinorTypeForArrowType(nextChild.getType())) {
            case LIST:
                writeList(allocator, (FieldWriter) writer.list(nextChild.getName()), nextChild, pos, ((List) childValue), resolver);
                break;
            case STRUCT:
                writeStruct(allocator, writer.struct(nextChild.getName()), nextChild, pos, childValue, resolver);
                break;
            default:
                writeStructValue(writer, nextChild, allocator, childValue);
                break;
        }
    }
    writer.end();
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) List(java.util.List) VisibleForTesting(org.apache.arrow.util.VisibleForTesting)
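
To illustrate the resolver contract writeStruct relies on (one getFieldValue call per child of the Struct field), here is a hypothetical map-backed FieldResolver. The class name, the Map-based value representation, and the SDK package in the import are assumptions for the sketch; only the getFieldValue(Field, Object) signature is taken from the code above.

import java.util.Map;

import org.apache.arrow.vector.types.pojo.Field;

import com.amazonaws.athena.connector.lambda.data.FieldResolver;

// Hypothetical resolver that treats each struct value as a Map keyed by child field name.
public class MapBackedFieldResolver implements FieldResolver {
    @Override
    @SuppressWarnings("unchecked")
    public Object getFieldValue(Field field, Object value) {
        // 'value' is the raw struct value handed to writeStruct; look the child up by its name.
        return ((Map<String, Object>) value).get(field.getName());
    }
}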

Example 3 with VisibleForTesting

Use of org.apache.arrow.util.VisibleForTesting in project aws-athena-query-federation by awslabs.

From class BlockUtils, method writeMap.

/**
 * Used to write a Map value.
 *
 * @param allocator The BufferAllocator which can be used to generate Apache Arrow Buffers for types
 * which require conversion to an Arrow Buffer before they can be written using the FieldWriter.
 * @param writer The FieldWriter for the Map field we'd like to write into.
 * @param field The Schema details of the Map Field we are writing into.
 * @param pos The position (row) in the Apache Arrow batch we are writing to.
 * @param value The value we'd like to write as a Map.
 * @param resolver The field resolver that can be used to extract individual Map entries from the value.
 */
@VisibleForTesting
protected static void writeMap(BufferAllocator allocator, UnionMapWriter writer, Field field, int pos, Object value, FieldResolver resolver) {
    // We expect null writes to have been handled earlier so this is a no-op.
    if (value == null) {
        return;
    }
    // Indicate the beginning of the Map value; this is how Apache Arrow handles the variable length of Map types.
    writer.startMap();
    List<Field> children = field.getChildren();
    Field keyValueStructField;
    if (children.size() != 1) {
        throw new IllegalStateException("Invalid Arrow Map schema: " + field);
    } else {
        keyValueStructField = children.get(0);
        if (!ENTRIES.equals(keyValueStructField.getName()) || !(keyValueStructField.getType() instanceof ArrowType.Struct)) {
            throw new IllegalStateException("Invalid Arrow Map schema: " + field);
        }
    }
    List<Field> keyValueChildren = keyValueStructField.getChildren();
    Field keyField;
    Field valueField;
    if (keyValueChildren.size() != 2) {
        throw new IllegalStateException("Invalid Arrow Map schema: " + field);
    } else {
        keyField = keyValueChildren.get(0);
        valueField = keyValueChildren.get(1);
        if (!KEY.equals(keyField.getName()) || !VALUE.equals(valueField.getName())) {
            throw new IllegalStateException("Invalid Arrow Map schema: " + field);
        }
    }
    for (Field nextChild : keyValueChildren) {
        // For each child field that comprises the Map, attempt to extract and write the corresponding value
        // using the FieldResolver.
        Object childValue = resolver.getFieldValue(nextChild, value);
        switch(Types.getMinorTypeForArrowType(nextChild.getType())) {
            case LIST:
                writeList(allocator, (FieldWriter) writer.list(nextChild.getName()), nextChild, pos, ((List) childValue), resolver);
                break;
            case STRUCT:
                writeStruct(allocator, writer.struct(nextChild.getName()), nextChild, pos, childValue, resolver);
                break;
            default:
                writeMapValue(writer, nextChild, allocator, childValue);
                break;
        }
    }
    writer.endEntry();
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) List(java.util.List) VisibleForTesting(org.apache.arrow.util.VisibleForTesting)
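
The validation above expects the conventional Arrow Map layout: a single non-nullable Struct child holding a key child and a value child. Below is a minimal sketch of declaring such a field, assuming the ENTRIES, KEY, and VALUE constants resolve to the conventional names "entries", "key", and "value"; the field name and key/value types are illustrative.

import java.util.Arrays;
import java.util.Collections;

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;

public class MapFieldSketch {
    public static Field stringToIntMapField(String name) {
        // Arrow's Map layout requires a non-nullable key; the value may be nullable.
        Field key = new Field("key", FieldType.notNullable(new ArrowType.Utf8()), null);
        Field value = new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null);

        // The Map field's single child is a non-nullable Struct named "entries" wrapping key and value.
        Field entries = new Field("entries", FieldType.notNullable(new ArrowType.Struct()),
                Arrays.asList(key, value));

        // keysSorted = false: entries are not guaranteed to be ordered by key.
        return new Field(name, FieldType.nullable(new ArrowType.Map(false)),
                Collections.singletonList(entries));
    }
}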

Example 4 with VisibleForTesting

Use of org.apache.arrow.util.VisibleForTesting in project aws-athena-query-federation by awslabs.

From class BlockUtils, method writeList.

/**
 * Used to write a List value.
 *
 * @param allocator The BufferAllocator which can be used to generate Apache Arrow Buffers for types
 * which require conversion to an Arrow Buffer before they can be written using the FieldWriter.
 * @param writer The FieldWriter for the List field we'd like to write into.
 * @param field The Schema details of the List Field we are writing into.
 * @param pos The position (row) in the Apache Arrow batch we are writing to.
 * @param value An iterator to the collection of values we want to write into the row.
 * @param resolver The field resolver that can be used to extract individual values from the value iterator.
 */
@VisibleForTesting
protected static void writeList(BufferAllocator allocator, FieldWriter writer, Field field, int pos, Iterable value, FieldResolver resolver) {
    if (value == null) {
        return;
    }
    // Apache Arrow List types have a single 'special' child field which gives us the concrete type of the values
    // stored in the list.
    Field child = null;
    if (field.getChildren() != null && !field.getChildren().isEmpty()) {
        child = field.getChildren().get(0);
    }
    // Mark the beginning of the list; this is essentially how Apache Arrow handles the variable-length
    // nature of lists.
    writer.startList();
    Iterator itr = value.iterator();
    while (itr.hasNext()) {
        // For each item in the iterator, attempt to write it to the list.
        Object val = itr.next();
        if (val != null) {
            switch(Types.getMinorTypeForArrowType(child.getType())) {
                case LIST:
                    writeList(allocator, (FieldWriter) writer.list(), child, pos, ((List) val), resolver);
                    break;
                case STRUCT:
                    writeStruct(allocator, writer.struct(), child, pos, val, resolver);
                    break;
                default:
                    writeListValue(writer, child.getType(), allocator, val);
                    break;
            }
        }
    }
    writer.endList();
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) Iterator(java.util.Iterator) List(java.util.List) VisibleForTesting(org.apache.arrow.util.VisibleForTesting)
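
As a companion sketch, this is the shape of List field the method expects: a single child that carries the element type, which writeList reads via field.getChildren().get(0). The field and element names are illustrative.

import java.util.Collections;

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;

public class ListFieldSketch {
    public static Field stringListField(String name) {
        // The single child describes the element type stored in the list.
        Field element = Field.nullable("item", new ArrowType.Utf8());
        return new Field(name, FieldType.nullable(new ArrowType.List()),
                Collections.singletonList(element));
    }
}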

Example 5 with VisibleForTesting

Use of org.apache.arrow.util.VisibleForTesting in project aws-athena-query-federation by awslabs.

From class BlockUtils, method writeStructValue.

/**
 * Used to write a value into a specific child field within a Struct. Multiple calls to this method per-cell are
 * expected in order to write to all N fields of a Struct.
 *
 * @param writer The FieldWriter (already positioned at the row and list entry number) that we want to write into.
 * @param field The child field we are attempting to write into.
 * @param allocator The BufferAllocator that can be used for allocating Arrow Buffers for fields which require conversion
 * to an Arrow Buffer before being written.
 * @param value The value to write.
 * @note This method and its List complement violate the DRY mantra because ListWriter and StructWriter don't share
 * a meaningful ancestor despite having identical methods. This requires us to either further wrap and abstract the writer
 * or duplicate some code. In a future release we hope to have contributed a better option to Apache Arrow which allows
 * us to simplify this method.
 */
@VisibleForTesting
protected static void writeStructValue(StructWriter writer, Field field, BufferAllocator allocator, Object value) {
    if (value == null) {
        return;
    }
    ArrowType type = field.getType();
    try {
        switch(Types.getMinorTypeForArrowType(type)) {
            case TIMESTAMPMILLITZ:
                long dateTimeWithZone;
                if (value instanceof ZonedDateTime) {
                    dateTimeWithZone = DateTimeFormatterUtil.packDateTimeWithZone((ZonedDateTime) value);
                } else if (value instanceof LocalDateTime) {
                    dateTimeWithZone = DateTimeFormatterUtil.packDateTimeWithZone(((LocalDateTime) value).atZone(UTC_ZONE_ID).toInstant().toEpochMilli(), UTC_ZONE_ID.getId());
                } else if (value instanceof Date) {
                    long ldtInLong = Instant.ofEpochMilli(((Date) value).getTime()).atZone(UTC_ZONE_ID).toInstant().toEpochMilli();
                    dateTimeWithZone = DateTimeFormatterUtil.packDateTimeWithZone(ldtInLong, UTC_ZONE_ID.getId());
                } else {
                    dateTimeWithZone = (long) value;
                }
                writer.timeStampMilliTZ(field.getName()).writeTimeStampMilliTZ(dateTimeWithZone);
                break;
            case DATEMILLI:
                if (value instanceof Date) {
                    writer.dateMilli(field.getName()).writeDateMilli(((Date) value).getTime());
                } else {
                    writer.dateMilli(field.getName()).writeDateMilli((long) value);
                }
                break;
            case DATEDAY:
                if (value instanceof Date) {
                    org.joda.time.Days days = org.joda.time.Days.daysBetween(EPOCH, new org.joda.time.DateTime(((Date) value).getTime()));
                    writer.dateDay(field.getName()).writeDateDay(days.getDays());
                } else if (value instanceof LocalDate) {
                    int days = (int) ((LocalDate) value).toEpochDay();
                    writer.dateDay(field.getName()).writeDateDay(days);
                } else if (value instanceof Long) {
                    writer.dateDay(field.getName()).writeDateDay(((Long) value).intValue());
                } else {
                    writer.dateDay(field.getName()).writeDateDay((int) value);
                }
                break;
            case FLOAT8:
                writer.float8(field.getName()).writeFloat8((double) value);
                break;
            case FLOAT4:
                writer.float4(field.getName()).writeFloat4((float) value);
                break;
            case INT:
                if (value instanceof Long) {
                    // This may seem odd at first but many frameworks (like Presto) use long as the preferred
                    // native java type for representing integers. We do this to keep type conversions simple.
                    writer.integer(field.getName()).writeInt(((Long) value).intValue());
                } else {
                    writer.integer(field.getName()).writeInt((int) value);
                }
                break;
            case TINYINT:
                writer.tinyInt(field.getName()).writeTinyInt((byte) value);
                break;
            case SMALLINT:
                writer.smallInt(field.getName()).writeSmallInt((short) value);
                break;
            case UINT1:
                writer.uInt1(field.getName()).writeUInt1((byte) value);
                break;
            case UINT2:
                writer.uInt2(field.getName()).writeUInt2((char) value);
                break;
            case UINT4:
                writer.uInt4(field.getName()).writeUInt4((int) value);
                break;
            case UINT8:
                writer.uInt8(field.getName()).writeUInt8((long) value);
                break;
            case BIGINT:
                writer.bigInt(field.getName()).writeBigInt((long) value);
                break;
            case VARBINARY:
                if (value instanceof ArrowBuf) {
                    ArrowBuf buf = (ArrowBuf) value;
                    writer.varBinary(field.getName()).writeVarBinary(0, (int) (buf.capacity()), buf);
                } else if (value instanceof byte[]) {
                    byte[] bytes = (byte[]) value;
                    try (ArrowBuf buf = allocator.buffer(bytes.length)) {
                        buf.writeBytes(bytes);
                        writer.varBinary(field.getName()).writeVarBinary(0, (int) (buf.readableBytes()), buf);
                    }
                }
                break;
            case DECIMAL:
                int scale = ((ArrowType.Decimal) type).getScale();
                int precision = ((ArrowType.Decimal) type).getPrecision();
                if (value instanceof Double) {
                    BigDecimal bdVal = new BigDecimal((double) value);
                    bdVal = bdVal.setScale(scale, RoundingMode.HALF_UP);
                    writer.decimal(field.getName(), scale, precision).writeDecimal(bdVal);
                } else {
                    BigDecimal scaledValue = ((BigDecimal) value).setScale(scale, RoundingMode.HALF_UP);
                    writer.decimal(field.getName(), scale, precision).writeDecimal(scaledValue);
                }
                break;
            case VARCHAR:
                if (value instanceof String) {
                    byte[] bytes = ((String) value).getBytes(Charsets.UTF_8);
                    try (ArrowBuf buf = allocator.buffer(bytes.length)) {
                        buf.writeBytes(bytes);
                        writer.varChar(field.getName()).writeVarChar(0, (int) (buf.readableBytes()), buf);
                    }
                } else if (value instanceof ArrowBuf) {
                    ArrowBuf buf = (ArrowBuf) value;
                    writer.varChar(field.getName()).writeVarChar(0, (int) (buf.readableBytes()), buf);
                } else if (value instanceof byte[]) {
                    byte[] bytes = (byte[]) value;
                    try (ArrowBuf buf = allocator.buffer(bytes.length)) {
                        buf.writeBytes(bytes);
                        writer.varChar(field.getName()).writeVarChar(0, (int) (buf.readableBytes()), buf);
                    }
                }
                break;
            case BIT:
                if (value instanceof Integer && (int) value > 0) {
                    writer.bit(field.getName()).writeBit(1);
                } else if (value instanceof Boolean && (boolean) value) {
                    writer.bit(field.getName()).writeBit(1);
                } else {
                    writer.bit(field.getName()).writeBit(0);
                }
                break;
            default:
                throw new IllegalArgumentException("Unknown type " + type);
        }
    } catch (RuntimeException ex) {
        throw new RuntimeException("Unable to write value for field " + field.getName() + " using value " + value, ex);
    }
}
Also used : LocalDateTime(java.time.LocalDateTime) ArrowBuf(org.apache.arrow.memory.ArrowBuf) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) LocalDate(java.time.LocalDate) Date(java.util.Date) LocalDate(java.time.LocalDate) BigDecimal(java.math.BigDecimal) BigDecimal(java.math.BigDecimal) ZonedDateTime(java.time.ZonedDateTime) VisibleForTesting(org.apache.arrow.util.VisibleForTesting)
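
To make the DATEDAY branch above concrete, here is a small standalone sketch (the date is arbitrary) showing that a LocalDate and a java.util.Date pinned to UTC midnight reduce to the same epoch-day count that gets written to the vector. Note that writeStructValue uses Joda-Time for the java.util.Date case; this sketch uses an equivalent UTC-based computation for brevity.

import java.time.LocalDate;
import java.time.ZoneOffset;
import java.util.Date;
import java.util.concurrent.TimeUnit;

public class DateDaySketch {
    public static void main(String[] args) {
        LocalDate localDate = LocalDate.of(2021, 6, 15);
        Date utilDate = Date.from(localDate.atStartOfDay(ZoneOffset.UTC).toInstant());

        // LocalDate path used by writeStructValue: whole days since 1970-01-01.
        int daysFromLocalDate = (int) localDate.toEpochDay();

        // Equivalent computation for a java.util.Date fixed at UTC midnight.
        int daysFromUtilDate = (int) TimeUnit.MILLISECONDS.toDays(utilDate.getTime());

        // Both print 18793 for 2021-06-15.
        System.out.println(daysFromLocalDate + " " + daysFromUtilDate);
    }
}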

Aggregations

VisibleForTesting (org.apache.arrow.util.VisibleForTesting): 9
Field (org.apache.arrow.vector.types.pojo.Field): 4
List (java.util.List): 3
ArrowBuf (org.apache.arrow.memory.ArrowBuf): 3
ArrowType (org.apache.arrow.vector.types.pojo.ArrowType): 3
BigDecimal (java.math.BigDecimal): 2
LocalDate (java.time.LocalDate): 2
LocalDateTime (java.time.LocalDateTime): 2
ZonedDateTime (java.time.ZonedDateTime): 2
Date (java.util.Date): 2
Iterator (java.util.Iterator): 1
Cipher (javax.crypto.Cipher): 1
SecretKeySpec (javax.crypto.spec.SecretKeySpec): 1
ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch): 1