Use of org.apache.spark.sql.types.DataType in project Gaffer by gchq.
The class SchemaToStructTypeConverter, method buildSchema:
private void buildSchema() {
    LOGGER.info("Building Spark SQL schema for groups {}", StringUtils.join(groups, ','));
    for (final String group : groups) {
        final SchemaElementDefinition elementDefn = schema.getElement(group);
        final List<StructField> structFieldList = new ArrayList<>();
        if (elementDefn instanceof SchemaEntityDefinition) {
            entityOrEdgeByGroup.put(group, EntityOrEdge.ENTITY);
            final SchemaEntityDefinition entityDefinition = (SchemaEntityDefinition) elementDefn;
            final String vertexClass = schema.getType(entityDefinition.getVertex()).getClassString();
            final DataType vertexType = getType(vertexClass);
            if (vertexType == null) {
                throw new RuntimeException("Vertex must be a recognised type: found " + vertexClass);
            }
            LOGGER.info("Group {} is an entity group - {} is of type {}", group, VERTEX_COL_NAME, vertexType);
            structFieldList.add(new StructField(VERTEX_COL_NAME, vertexType, true, Metadata.empty()));
        } else {
            entityOrEdgeByGroup.put(group, EntityOrEdge.EDGE);
            final SchemaEdgeDefinition edgeDefinition = (SchemaEdgeDefinition) elementDefn;
            final String srcClass = schema.getType(edgeDefinition.getSource()).getClassString();
            final String dstClass = schema.getType(edgeDefinition.getDestination()).getClassString();
            final DataType srcType = getType(srcClass);
            final DataType dstType = getType(dstClass);
            if (srcType == null || dstType == null) {
                throw new RuntimeException("Both source and destination must be recognised types: source was " + srcClass + " destination was " + dstClass);
            }
            LOGGER.info("Group {} is an edge group - {} is of type {}, {} is of type {}", group, SRC_COL_NAME, srcType, DST_COL_NAME, dstType);
            structFieldList.add(new StructField(SRC_COL_NAME, srcType, true, Metadata.empty()));
            structFieldList.add(new StructField(DST_COL_NAME, dstType, true, Metadata.empty()));
        }
        final Set<String> properties = elementDefn.getProperties();
        for (final String property : properties) {
            // Check if property is of a known type that can be handled by default
            final String propertyClass = elementDefn.getPropertyClass(property).getCanonicalName();
            DataType propertyType = getType(propertyClass);
            if (propertyType != null) {
                propertyNeedsConversion.put(property, needsConversion(propertyClass));
                structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                LOGGER.info("Property {} is of type {}", property, propertyType);
            } else {
                // Check if any of the provided converters can handle it
                if (converters != null) {
                    for (final Converter converter : converters) {
                        if (converter.canHandle(elementDefn.getPropertyClass(property))) {
                            propertyNeedsConversion.put(property, true);
                            propertyType = converter.convertedType();
                            converterByProperty.put(property, converter);
                            structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                            LOGGER.info("Property {} of type {} will be converted by {} to {}", property, propertyClass, converter.getClass().getName(), propertyType);
                            break;
                        }
                    }
                    if (propertyType == null) {
                        LOGGER.warn("Ignoring property {} as it is not a recognised type and none of the provided converters can handle it", property);
                    }
                }
            }
        }
        structTypeByGroup.put(group, new StructType(structFieldList.toArray(new StructField[structFieldList.size()])));
    }
    // Create reverse map of field name to StructField
    final Map<String, Set<StructField>> fieldToStructs = new HashMap<>();
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            if (fieldToStructs.get(field) == null) {
                fieldToStructs.put(field, new HashSet<StructField>());
            }
            fieldToStructs.get(field).add(groupSchema.apply(field));
        }
    }
    // Check consistency, i.e. if the same field appears in multiple groups then the types are consistent
    for (final Entry<String, Set<StructField>> entry : fieldToStructs.entrySet()) {
        final Set<StructField> schemas = entry.getValue();
        if (schemas.size() > 1) {
            throw new IllegalArgumentException("Inconsistent fields: the field " + entry.getKey() + " has more than one definition: " + StringUtils.join(schemas, ','));
        }
    }
    // Merge schemas for groups together - fields should appear in the order the groups were provided
    final LinkedHashSet<StructField> fields = new LinkedHashSet<>();
    fields.add(new StructField(GROUP, DataTypes.StringType, false, Metadata.empty()));
    usedProperties.add(GROUP);
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            final StructField struct = groupSchema.apply(field);
            // Add struct to fields unless it has already been added
            if (!fields.contains(struct)) {
                fields.add(struct);
                usedProperties.add(field);
            }
        }
    }
    structType = new StructType(fields.toArray(new StructField[fields.size()]));
    LOGGER.info("Schema is {}", structType);
    LOGGER.debug("properties -> conversion: {}", StringUtils.join(propertyNeedsConversion.entrySet(), ','));
}
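For orientation, below is a minimal hand-built sketch (not Gaffer code) of the kind of merged StructType this method produces for a schema with one entity group and one edge group. The literal column names "group", "vertex", "src" and "dst" and the property types are assumptions for illustration; the real names come from the converter's GROUP, VERTEX_COL_NAME, SRC_COL_NAME and DST_COL_NAME constants.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MergedSchemaSketch {
    public static void main(String[] args) {
        StructType merged = new StructType(new StructField[] {
            // the group discriminator column is the only non-nullable field
            new StructField("group", DataTypes.StringType, false, Metadata.empty()),
            // from an entity group whose vertex type maps to LongType
            new StructField("vertex", DataTypes.LongType, true, Metadata.empty()),
            // a "count" property shared by both groups - it must map to the same DataType in each,
            // otherwise the consistency check above throws IllegalArgumentException
            new StructField("count", DataTypes.IntegerType, true, Metadata.empty()),
            // from an edge group whose source and destination map to LongType
            new StructField("src", DataTypes.LongType, true, Metadata.empty()),
            new StructField("dst", DataTypes.LongType, true, Metadata.empty())
        });
        System.out.println(merged.treeString());
    }
}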
Use of org.apache.spark.sql.types.DataType in project carbondata by apache.
The class SafeVariableLengthDimensionDataChunkStore, method fillRow:
@Override
public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) {
    // if the column was explicitly sorted, map the row id through the inverted index reverse lookup
    if (isExplictSorted) {
        rowId = invertedIndexReverse[rowId];
    }
    // To read the row from the memory block:
    // 1. get the current row's offset
    // 2. if this is not the last row, the length is the next row's offset minus
    //    (the current offset + 2 bytes that store the data length);
    //    otherwise it is the total data length minus the current offset
    int currentDataOffset = dataOffsets[rowId];
    short length = 0;
    // calculating the length of data
    if (rowId < numberOfRows - 1) {
        length = (short) (dataOffsets[rowId + 1] - (currentDataOffset + CarbonCommonConstants.SHORT_SIZE_IN_BYTE));
    } else {
        // for the last record
        length = (short) (this.data.length - currentDataOffset);
    }
    if (ByteUtil.UnsafeComparer.INSTANCE.equals(CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY, 0, CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY.length, data, currentDataOffset, length)) {
        vector.putNull(vectorRow);
    } else {
        DataType dt = vector.getType();
        if (dt instanceof StringType) {
            vector.putBytes(vectorRow, currentDataOffset, length, data);
        } else if (dt instanceof BooleanType) {
            vector.putBoolean(vectorRow, ByteUtil.toBoolean(data[currentDataOffset]));
        } else if (dt instanceof ShortType) {
            vector.putShort(vectorRow, ByteUtil.toShort(data, currentDataOffset, length));
        } else if (dt instanceof IntegerType) {
            vector.putInt(vectorRow, ByteUtil.toInt(data, currentDataOffset, length));
        } else if (dt instanceof FloatType) {
            vector.putFloat(vectorRow, ByteUtil.toFloat(data, currentDataOffset));
        } else if (dt instanceof DoubleType) {
            vector.putDouble(vectorRow, ByteUtil.toDouble(data, currentDataOffset));
        } else if (dt instanceof LongType) {
            vector.putLong(vectorRow, ByteUtil.toLong(data, currentDataOffset, length));
        } else if (dt instanceof DecimalType) {
            vector.putDecimal(vectorRow, Decimal.apply(ByteUtil.toBigDecimal(data, currentDataOffset, length)), DecimalType.MAX_PRECISION());
        }
    }
}
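The length calculation is the subtle part, so here is a small self-contained sketch (not CarbonData code) of the same arithmetic, assuming the chunk lays each value out as a 2-byte length followed by its bytes and that dataOffsets[i] points at the first data byte of row i.

public class VariableLengthOffsetSketch {
    public static void main(String[] args) {
        // three values: "ab", "xyz", "q" laid out as [2-byte length][bytes] pairs
        byte[] data = {0, 2, 'a', 'b', 0, 3, 'x', 'y', 'z', 0, 1, 'q'};
        int[] dataOffsets = {2, 6, 11};           // first data byte of each row
        int numberOfRows = dataOffsets.length;
        final int SHORT_SIZE_IN_BYTE = 2;         // mirrors CarbonCommonConstants.SHORT_SIZE_IN_BYTE
        for (int rowId = 0; rowId < numberOfRows; rowId++) {
            int currentDataOffset = dataOffsets[rowId];
            short length;
            if (rowId < numberOfRows - 1) {
                // next row's offset minus (our offset + its 2-byte length prefix)
                length = (short) (dataOffsets[rowId + 1] - (currentDataOffset + SHORT_SIZE_IN_BYTE));
            } else {
                // last row: everything up to the end of the block
                length = (short) (data.length - currentDataOffset);
            }
            System.out.println(new String(data, currentDataOffset, length)); // prints ab, xyz, q
        }
    }
}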
Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.
The class DataFrameVectorFrameConversionTest, method createDataFrame:
@SuppressWarnings("resource")
private Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException {
    //create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                double[] tmp = DataConverter.convertToDoubleVector(mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    //create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch (schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    //create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
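As a rough illustration of the schema half of this method, the sketch below builds by hand the StructType the switch would produce for schema = {DOUBLE, OBJECT, STRING} with containsID = true. The ID column name "__INDEX" and the org.apache.spark.ml.linalg.VectorUDT import are assumptions for illustration; the test itself uses RDDConverterUtils.DF_ID_COLUMN and whichever VectorUDT it imports.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class ExpectedSchemaSketch {
    public static void main(String[] args) {
        // Mirrors the switch above: DOUBLE -> DoubleType, STRING -> StringType,
        // INT -> LongType, OBJECT -> VectorUDT (one vector column per OBJECT entry)
        List<StructField> fields = Arrays.asList(
            DataTypes.createStructField("__INDEX", DataTypes.DoubleType, true), // hypothetical ID column name
            DataTypes.createStructField("C1", DataTypes.DoubleType, true),
            DataTypes.createStructField("C2", new VectorUDT(), true),
            DataTypes.createStructField("C3", DataTypes.StringType, true));
        StructType dfSchema = DataTypes.createStructType(fields);
        System.out.println(dfSchema.treeString());
    }
}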
Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.
The class DataFrameVectorScriptTest, method createDataFrame:
@SuppressWarnings("resource")
private Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException {
    //create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                double[] tmp = DataConverter.convertToDoubleVector(mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    //create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch (schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    //create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
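This is the same helper as in DataFrameVectorFrameConversionTest above; the only non-obvious line is the column count, so here is a small worked example of that arithmetic under assumed sizes (7 matrix columns, an ID column, and 5 matrix columns packed into one vector column).

public class ColumnLayoutSketch {
    public static void main(String[] args) {
        int matrixCols = 7;      // mb.getNumColumns()
        int colsVector = 5;      // width of the slice packed into a single DenseVector
        int off = 1;             // containsID == true
        // one ID column + (7 - 5 + 1) = 3 data columns -> 4 Row fields in total,
        // e.g. [ID, scalar C1, vector C2 covering 5 matrix columns, scalar C3]
        int clen = matrixCols + off - colsVector + 1;
        System.out.println(clen); // 4
    }
}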
Use of org.apache.spark.sql.types.DataType in project incubator-systemml by apache.
The class FrameRDDConverterUtils, method convertFrameSchemaToDFSchema:
/**
 * This function will convert Frame schema into DataFrame schema
 *
 * @param fschema frame schema
 * @param containsID true if contains ID column
 * @return Spark StructType of StructFields representing schema
 */
public static StructType convertFrameSchemaToDFSchema(ValueType[] fschema, boolean containsID) {
    // generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<StructField>();
    // add id column type
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    // add remaining types
    int col = 1;
    for (ValueType schema : fschema) {
        DataType dt = null;
        switch (schema) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case BOOLEAN:
                dt = DataTypes.BooleanType;
                break;
            default:
                dt = DataTypes.StringType;
                LOG.warn("Using default type String for " + schema.toString());
        }
        fields.add(DataTypes.createStructField("C" + col++, dt, true));
    }
    return DataTypes.createStructType(fields);
}
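A hedged usage sketch of this utility follows; the import paths are assumed from incubator-systemml conventions, and the exact ID column name is whatever RDDConverterUtils.DF_ID_COLUMN resolves to.

import org.apache.spark.sql.types.StructType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils;

public class FrameSchemaSketch {
    public static void main(String[] args) {
        // A frame with a string key, a numeric value and a boolean flag, plus the ID column.
        ValueType[] fschema = {ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN};
        StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(fschema, true);
        // Expected: [<id column>: double, C1: string, C2: double, C3: boolean], all data columns nullable
        System.out.println(dfSchema.treeString());
    }
}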