Example 1 with DataType

Use of org.apache.spark.sql.types.DataType in project Gaffer by gchq.

The class SchemaToStructTypeConverter, method buildSchema:

private void buildSchema() {
    LOGGER.info("Building Spark SQL schema for groups {}", StringUtils.join(groups, ','));
    for (final String group : groups) {
        final SchemaElementDefinition elementDefn = schema.getElement(group);
        final List<StructField> structFieldList = new ArrayList<>();
        if (elementDefn instanceof SchemaEntityDefinition) {
            entityOrEdgeByGroup.put(group, EntityOrEdge.ENTITY);
            final SchemaEntityDefinition entityDefinition = (SchemaEntityDefinition) elementDefn;
            final String vertexClass = schema.getType(entityDefinition.getVertex()).getClassString();
            final DataType vertexType = getType(vertexClass);
            if (vertexType == null) {
                throw new RuntimeException("Vertex must be a recognised type: found " + vertexClass);
            }
            LOGGER.info("Group {} is an entity group - {} is of type {}", group, VERTEX_COL_NAME, vertexType);
            structFieldList.add(new StructField(VERTEX_COL_NAME, vertexType, true, Metadata.empty()));
        } else {
            entityOrEdgeByGroup.put(group, EntityOrEdge.EDGE);
            final SchemaEdgeDefinition edgeDefinition = (SchemaEdgeDefinition) elementDefn;
            final String srcClass = schema.getType(edgeDefinition.getSource()).getClassString();
            final String dstClass = schema.getType(edgeDefinition.getDestination()).getClassString();
            final DataType srcType = getType(srcClass);
            final DataType dstType = getType(dstClass);
            if (srcType == null || dstType == null) {
                throw new RuntimeException("Both source and destination must be recognised types: source was " + srcClass + " destination was " + dstClass);
            }
            LOGGER.info("Group {} is an edge group - {} is of type {}, {} is of type {}", group, SRC_COL_NAME, srcType, DST_COL_NAME, dstType);
            structFieldList.add(new StructField(SRC_COL_NAME, srcType, true, Metadata.empty()));
            structFieldList.add(new StructField(DST_COL_NAME, dstType, true, Metadata.empty()));
        }
        final Set<String> properties = elementDefn.getProperties();
        for (final String property : properties) {
            // Check if property is of a known type that can be handled by default
            final String propertyClass = elementDefn.getPropertyClass(property).getCanonicalName();
            DataType propertyType = getType(propertyClass);
            if (propertyType != null) {
                propertyNeedsConversion.put(property, needsConversion(propertyClass));
                structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                LOGGER.info("Property {} is of type {}", property, propertyType);
            } else {
                // Check if any of the provided converters can handle it
                if (converters != null) {
                    for (final Converter converter : converters) {
                        if (converter.canHandle(elementDefn.getPropertyClass(property))) {
                            propertyNeedsConversion.put(property, true);
                            propertyType = converter.convertedType();
                            converterByProperty.put(property, converter);
                            structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                            LOGGER.info("Property {} of type {} will be converted by {} to {}", property, propertyClass, converter.getClass().getName(), propertyType);
                            break;
                        }
                    }
                    if (propertyType == null) {
                        LOGGER.warn("Ignoring property {} as it is not a recognised type and none of the provided " + "converters can handle it", property);
                    }
                }
            }
        }
        structTypeByGroup.put(group, new StructType(structFieldList.toArray(new StructField[structFieldList.size()])));
    }
    // Create reverse map of field name to StructField
    final Map<String, Set<StructField>> fieldToStructs = new HashMap<>();
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            if (fieldToStructs.get(field) == null) {
                fieldToStructs.put(field, new HashSet<StructField>());
            }
            fieldToStructs.get(field).add(groupSchema.apply(field));
        }
    }
    // Check consistency, i.e. if the same field appears in multiple groups then the types are consistent
    for (final Entry<String, Set<StructField>> entry : fieldToStructs.entrySet()) {
        final Set<StructField> schemas = entry.getValue();
        if (schemas.size() > 1) {
            throw new IllegalArgumentException("Inconsistent fields: the field " + entry.getKey() + " has more than one definition: " + StringUtils.join(schemas, ','));
        }
    }
    // Merge schemas for groups together - fields should appear in the order the groups were provided
    final LinkedHashSet<StructField> fields = new LinkedHashSet<>();
    fields.add(new StructField(GROUP, DataTypes.StringType, false, Metadata.empty()));
    usedProperties.add(GROUP);
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            final StructField struct = groupSchema.apply(field);
            // Add struct to fields unless it has already been added
            if (!fields.contains(struct)) {
                fields.add(struct);
                usedProperties.add(field);
            }
        }
    }
    structType = new StructType(fields.toArray(new StructField[fields.size()]));
    LOGGER.info("Schema is {}", structType);
    LOGGER.debug("properties -> conversion: {}", StringUtils.join(propertyNeedsConversion.entrySet(), ','));
}
Also used: LinkedHashSet(java.util.LinkedHashSet) HashSet(java.util.HashSet) Set(java.util.Set) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SchemaEntityDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEntityDefinition) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) SchemaEdgeDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEdgeDefinition) Converter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter) HyperLogLogPlusConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.HyperLogLogPlusConverter) FreqMapConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.FreqMapConverter) UnionConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.datasketches.theta.UnionConverter) SchemaElementDefinition(uk.gov.gchq.gaffer.store.schema.SchemaElementDefinition)
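
Note that the getType(String) helper referenced above is not shown in this snippet. A minimal sketch of what such a class-name-to-DataType lookup could look like is below; the set of supported types is an illustrative assumption, not Gaffer's actual mapping.

// Hypothetical stand-in for the getType(String) helper used above; returns
// null for unrecognised classes so the caller can fall back to the converters.
private static DataType getType(final String className) {
    switch (className) {
        case "java.lang.String":  return DataTypes.StringType;
        case "java.lang.Integer": return DataTypes.IntegerType;
        case "java.lang.Long":    return DataTypes.LongType;
        case "java.lang.Double":  return DataTypes.DoubleType;
        case "java.lang.Boolean": return DataTypes.BooleanType;
        default:                  return null;
    }
}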

Example 2 with DataType

Use of org.apache.spark.sql.types.DataType in project carbondata by apache.

The class SafeVariableLengthDimensionDataChunkStore, method fillRow:

@Override
public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) {
    // if the column was explicitly sorted, map the row id through the inverted index
    if (isExplictSorted) {
        rowId = invertedIndexReverse[rowId];
    }
    // To read a row from the memory block:
    // 1. get the current row's offset;
    // 2. if this is not the last row, the data length is the next row's offset minus
    //    (the current offset + 2 bytes that store the data length);
    // 3. for the last row, the data length is the total data length minus the current offset.
    int currentDataOffset = dataOffsets[rowId];
    short length = 0;
    // calculating the length of data
    if (rowId < numberOfRows - 1) {
        length = (short) (dataOffsets[rowId + 1] - (currentDataOffset + CarbonCommonConstants.SHORT_SIZE_IN_BYTE));
    } else {
        // for last record
        length = (short) (this.data.length - currentDataOffset);
    }
    if (ByteUtil.UnsafeComparer.INSTANCE.equals(CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY, 0, CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY.length, data, currentDataOffset, length)) {
        vector.putNull(vectorRow);
    } else {
        DataType dt = vector.getType();
        if (dt instanceof StringType) {
            vector.putBytes(vectorRow, currentDataOffset, length, data);
        } else if (dt instanceof BooleanType) {
            vector.putBoolean(vectorRow, ByteUtil.toBoolean(data[currentDataOffset]));
        } else if (dt instanceof ShortType) {
            vector.putShort(vectorRow, ByteUtil.toShort(data, currentDataOffset, length));
        } else if (dt instanceof IntegerType) {
            vector.putInt(vectorRow, ByteUtil.toInt(data, currentDataOffset, length));
        } else if (dt instanceof FloatType) {
            vector.putFloat(vectorRow, ByteUtil.toFloat(data, currentDataOffset));
        } else if (dt instanceof DoubleType) {
            vector.putDouble(vectorRow, ByteUtil.toDouble(data, currentDataOffset));
        } else if (dt instanceof LongType) {
            vector.putLong(vectorRow, ByteUtil.toLong(data, currentDataOffset, length));
        } else if (dt instanceof DecimalType) {
            vector.putDecimal(vectorRow, Decimal.apply(ByteUtil.toBigDecimal(data, currentDataOffset, length)), DecimalType.MAX_PRECISION());
        }
    }
}
Also used: IntegerType(org.apache.spark.sql.types.IntegerType) LongType(org.apache.spark.sql.types.LongType) StringType(org.apache.spark.sql.types.StringType) DoubleType(org.apache.spark.sql.types.DoubleType) ShortType(org.apache.spark.sql.types.ShortType) BooleanType(org.apache.spark.sql.types.BooleanType) DataType(org.apache.spark.sql.types.DataType) DecimalType(org.apache.spark.sql.types.DecimalType) FloatType(org.apache.spark.sql.types.FloatType)
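
The ByteUtil conversions above are CarbonData-internal helpers. Below is a self-contained sketch of the same DataType dispatch pattern with java.nio.ByteBuffer standing in for ByteUtil; it is an approximation for illustration, not CarbonData's actual decoding.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// Decode a value from a byte range according to the column's Spark DataType.
static Object decode(DataType dt, byte[] data, int offset, int length) {
    if (dt instanceof StringType) {
        return new String(data, offset, length, StandardCharsets.UTF_8);
    }
    ByteBuffer buf = ByteBuffer.wrap(data, offset, length);
    if (dt instanceof ShortType) {
        return buf.getShort();
    } else if (dt instanceof IntegerType) {
        return buf.getInt();
    } else if (dt instanceof LongType) {
        return buf.getLong();
    } else if (dt instanceof DoubleType) {
        return buf.getDouble();
    }
    return null; // remaining types are not covered by this sketch
}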

Example 3 with DataType

Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.

The class DataFrameVectorFrameConversionTest, method createDataFrame:

@SuppressWarnings("resource")
private Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException {
    // create an in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                double[] tmp = DataConverter.convertToDoubleVector(mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    // create the data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create the RDD and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used: MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)
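
Stripped of the SystemML test scaffolding, the core pattern above is building a DataFrame whose schema mixes primitive columns with a VectorUDT column. A minimal sketch, assuming the imports listed above plus java.util.Arrays, RowFactory, and org.apache.spark.sql.types.DataTypes; the column names and values are illustrative.

List<Row> rows = Arrays.asList(
        RowFactory.create(1.0, new DenseVector(new double[] { 0.1, 0.2 })),
        RowFactory.create(2.0, new DenseVector(new double[] { 0.3, 0.4 })));
StructType schema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("id", DataTypes.DoubleType, true),
        DataTypes.createStructField("C1", new VectorUDT(), true)));
Dataset<Row> df = sparkSession.createDataFrame(rows, schema);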

Example 4 with DataType

Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.

The class DataFrameVectorScriptTest, method createDataFrame:

@SuppressWarnings("resource")
private Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException {
    // create an in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                double[] tmp = DataConverter.convertToDoubleVector(mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    // create the data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create the RDD and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used: MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)
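
Since the method returns the assembled Dataset&lt;Row&gt;, the VectorUDT column reads back as an org.apache.spark.ml.linalg.Vector. A brief usage sketch, where df stands for the return value of createDataFrame and the column positions follow the schema built above:

Row first = df.first();
double id = first.getDouble(0);                          // ID column (containsID == true)
org.apache.spark.ml.linalg.Vector vec = first.getAs(1);  // a VectorUDT-typed column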

Example 5 with DataType

Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.

The class FrameRDDConverterUtils, method convertFrameSchemaToDFSchema:

/**
 * Converts a SystemML frame schema into a Spark DataFrame schema.
 *
 * @param fschema frame schema (one ValueType per column)
 * @param containsID true if the frame contains an ID column
 * @return Spark StructType of StructFields representing the schema
 */
public static StructType convertFrameSchemaToDFSchema(ValueType[] fschema, boolean containsID) {
    // generate the DataFrame schema from the frame's value types
    List<StructField> fields = new ArrayList<StructField>();
    // add id column type
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    // add remaining types
    int col = 1;
    for (ValueType schema : fschema) {
        DataType dt = null;
        switch(schema) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case BOOLEAN:
                dt = DataTypes.BooleanType;
                break;
            default:
                dt = DataTypes.StringType;
                LOG.warn("Using default type String for " + schema.toString());
        }
        fields.add(DataTypes.createStructField("C" + col++, dt, true));
    }
    return DataTypes.createStructType(fields);
}
Also used: StructField(org.apache.spark.sql.types.StructField) ValueType(org.apache.sysml.parser.Expression.ValueType) ArrayList(java.util.ArrayList) DataType(org.apache.spark.sql.types.DataType)
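
A usage sketch for this helper; ValueType is org.apache.sysml.parser.Expression.ValueType, and the resulting column names follow the "C" + index convention above.

ValueType[] fschema = { ValueType.STRING, ValueType.DOUBLE, ValueType.INT };
StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(fschema, false);
// yields columns C1 (string), C2 (double), C3 (bigint, since INT maps to LongType)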

Aggregations

DataType (org.apache.spark.sql.types.DataType): 6 usages
StructField (org.apache.spark.sql.types.StructField): 5 usages
ArrayList (java.util.ArrayList): 4 usages
StructType (org.apache.spark.sql.types.StructType): 4 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2 usages
DenseVector (org.apache.spark.ml.linalg.DenseVector): 2 usages
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 2 usages
Row (org.apache.spark.sql.Row): 2 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 2 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 2 usages
HashMap (java.util.HashMap): 1 usage
HashSet (java.util.HashSet): 1 usage
LinkedHashSet (java.util.LinkedHashSet): 1 usage
Set (java.util.Set): 1 usage
BooleanType (org.apache.spark.sql.types.BooleanType): 1 usage
DecimalType (org.apache.spark.sql.types.DecimalType): 1 usage
DoubleType (org.apache.spark.sql.types.DoubleType): 1 usage
FloatType (org.apache.spark.sql.types.FloatType): 1 usage
IntegerType (org.apache.spark.sql.types.IntegerType): 1 usage
LongType (org.apache.spark.sql.types.LongType): 1 usage