use of org.apache.spark.sql.types.IntegerType$ in project iceberg by apache.
the class Spark3Util method findWidth.
/**
 * Finds the width argument of a transform.
 *
 * <p>The first argument that is a {@code Literal} of integer or long type is used. The value must
 * be strictly positive and must fit in an {@code int}.
 *
 * @param transform the transform whose arguments are searched
 * @return the width as an {@code int}
 * @throws IllegalArgumentException if the literal is not positive, does not fit in an
 *     {@code int}, or no integer/long literal argument is present
 */
@SuppressWarnings("unchecked")
private static int findWidth(Transform transform) {
  for (Expression expr : transform.arguments()) {
    if (!(expr instanceof Literal)) {
      continue;
    }

    if (((Literal) expr).dataType() instanceof IntegerType) {
      // Safe cast: the literal's declared data type is IntegerType.
      int width = ((Literal<Integer>) expr).value();
      Preconditions.checkArgument(width > 0, "Unsupported width for transform: %s", transform.describe());
      return width;
    } else if (((Literal) expr).dataType() instanceof LongType) {
      // Safe cast: the literal's declared data type is LongType.
      long width = ((Literal<Long>) expr).value();
      // Inclusive upper bound so a long literal accepts the same range as an int literal.
      // The range check makes the narrowing cast below lossless, so no second check is needed.
      Preconditions.checkArgument(width > 0 && width <= Integer.MAX_VALUE, "Unsupported width for transform: %s", transform.describe());
      return (int) width;
    }
  }
  throw new IllegalArgumentException("Cannot find width for transform: " + transform.describe());
}
use of org.apache.spark.sql.types.IntegerType$ in project OpenLineage by OpenLineage.
the class SparkReadWriteIntegTest method testWithLogicalRdd.
/**
 * Writes a small CSV data set, reads it back through a Hadoop RDD, converts it to a DataFrame and
 * writes it out as JSON, then checks the lineage event emitted for that job.
 *
 * <p>The third captured event is expected to be a COMPLETE event with a single input (the CSV
 * path) and a single output (the JSON path), both in the {@code file} namespace.
 */
@Test
public void testWithLogicalRdd(@TempDir Path tmpDir, SparkSession spark) throws InterruptedException, TimeoutException {
  StructType schema = new StructType(new StructField[] {
      new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
      new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>())) });
  String csvPath = tmpDir.toAbsolutePath() + "/csv_data";
  String csvUri = "file://" + csvPath;
  spark.createDataFrame(
      Arrays.asList(
          new GenericRow(new Object[] { 1, "seven" }),
          new GenericRow(new Object[] { 6, "one" }),
          new GenericRow(new Object[] { 72, "fourteen" }),
          new GenericRow(new Object[] { 99, "sixteen" })),
      schema).write().csv(csvUri);
  StaticExecutionContextFactory.waitForExecutionEnd();

  // reset to start counting now
  reset(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT);
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getJobNamespace()).thenReturn("theNamespace");
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentJobName()).thenReturn("theParentJob");
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentRunId()).thenReturn(Optional.of(UUID.randomUUID()));

  JobConf conf = new JobConf();
  FileInputFormat.addInputPath(conf, new org.apache.hadoop.fs.Path(csvUri));
  JavaRDD<Tuple2<LongWritable, Text>> csvRdd = spark.sparkContext().hadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class, 1).toJavaRDD();
  // Text.getBytes() exposes the whole backing array, of which only the first getLength() bytes
  // are valid; Text.toString() decodes exactly the valid bytes.
  JavaRDD<Row> splitDf = csvRdd
      .map(t -> t._2.toString().split(","))
      .map(arr -> new GenericRow(new Object[] { Integer.parseInt(arr[0]), arr[1] }));
  Dataset<Row> df = spark.createDataFrame(splitDf, schema);
  String outputPath = tmpDir.toAbsolutePath() + "/output_data";
  String jsonPath = "file://" + outputPath;
  df.write().json(jsonPath);

  // wait for event processing to complete
  StaticExecutionContextFactory.waitForExecutionEnd();
  ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
  Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4)).emit(lineageEvent.capture());
  OpenLineage.RunEvent completeEvent = lineageEvent.getAllValues().get(2);
  assertThat(completeEvent).hasFieldOrPropertyWithValue("eventType", RunEvent.EventType.COMPLETE);
  assertThat(completeEvent.getInputs()).singleElement().hasFieldOrPropertyWithValue("name", csvPath).hasFieldOrPropertyWithValue("namespace", "file");
  assertThat(completeEvent.getOutputs()).singleElement().hasFieldOrPropertyWithValue("name", outputPath).hasFieldOrPropertyWithValue("namespace", "file");
}
use of org.apache.spark.sql.types.IntegerType$ in project OpenLineage by OpenLineage.
the class LogicalRDDVisitorTest method testApply.
/**
 * Builds a {@code LogicalRDD} on top of a {@code HadoopRDD} that reads from the temporary
 * directory and checks that the visitor accepts it and reports exactly one dataset, named after
 * that directory, in the {@code file} namespace.
 */
@Test
public void testApply(@TempDir Path tmpDir) {
  SparkSession session = SparkSession.builder().master("local").getOrCreate();
  LogicalRDDVisitor visitor = new LogicalRDDVisitor(
      SparkAgentTestExtension.newContext(session),
      DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));

  // Two-column schema: a non-nullable int and a non-nullable string.
  StructField[] columns = {
      new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
      new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
  };
  StructType schema = new StructType(columns);

  // Register the temporary directory as the Hadoop input path.
  String inputUri = "file://" + tmpDir;
  jobConf = new JobConf();
  FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path(inputUri));

  RDD<InternalRow> rowRdd =
      new HadoopRDD<>(session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1)
          .toJavaRDD()
          .map(record -> (InternalRow) new GenericInternalRow(new Object[] { record._2.toString() }))
          .rdd();

  LogicalRDD plan = new LogicalRDD(
      ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
          .map(AttributeReference::toAttribute)
          .collect(ScalaConversionUtils.toSeq()),
      rowRdd,
      SinglePartition$.MODULE$,
      Seq$.MODULE$.<SortOrder>empty(),
      false,
      session);

  assertThat(visitor.isDefinedAt(plan)).isTrue();
  List<OpenLineage.Dataset> found = visitor.apply(plan);
  assertThat(found)
      .singleElement()
      .hasFieldOrPropertyWithValue("name", tmpDir.toString())
      .hasFieldOrPropertyWithValue("namespace", "file");
}
use of org.apache.spark.sql.types.IntegerType$ in project spark-bigquery-connector by GoogleCloudDataproc.
the class AvroSchemaConverter method createConverterFor.
/**
 * Creates a converter that reads a value of the given Catalyst type from a getter at an ordinal
 * and returns the object to be written for the given Avro schema.
 *
 * <p>Each supported (Catalyst type, Avro type) pair is tested in turn; a user-defined type is
 * resolved through its underlying SQL type.
 *
 * @param sparkType the Catalyst data type of the value being read
 * @param avroType the Avro schema the value is converted for
 * @return a converter for the type pair
 * @throws IllegalArgumentException if the pair of types is not supported
 */
static Converter createConverterFor(DataType sparkType, Schema avroType) {
  if (sparkType instanceof NullType && avroType.getType() == Schema.Type.NULL) {
    return (row, pos) -> null;
  }
  if (sparkType instanceof BooleanType && avroType.getType() == Schema.Type.BOOLEAN) {
    return (row, pos) -> row.getBoolean(pos);
  }

  // Every integral Catalyst type is widened to an Avro LONG.
  if (sparkType instanceof ByteType && avroType.getType() == Schema.Type.LONG) {
    return (row, pos) -> Long.valueOf(row.getByte(pos));
  }
  if (sparkType instanceof ShortType && avroType.getType() == Schema.Type.LONG) {
    return (row, pos) -> Long.valueOf(row.getShort(pos));
  }
  if (sparkType instanceof IntegerType && avroType.getType() == Schema.Type.LONG) {
    return (row, pos) -> Long.valueOf(row.getInt(pos));
  }
  if (sparkType instanceof LongType && avroType.getType() == Schema.Type.LONG) {
    return (row, pos) -> row.getLong(pos);
  }

  // Both floating-point Catalyst types map to an Avro DOUBLE.
  if (sparkType instanceof FloatType && avroType.getType() == Schema.Type.DOUBLE) {
    return (row, pos) -> Double.valueOf(row.getFloat(pos));
  }
  if (sparkType instanceof DoubleType && avroType.getType() == Schema.Type.DOUBLE) {
    return (row, pos) -> row.getDouble(pos);
  }

  if (sparkType instanceof DecimalType && avroType.getType() == Schema.Type.BYTES) {
    return decimalConverter((DecimalType) sparkType, avroType);
  }
  if (sparkType instanceof StringType && avroType.getType() == Schema.Type.STRING) {
    return (row, pos) -> new Utf8(row.getUTF8String(pos).getBytes());
  }
  if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.FIXED) {
    return fixedConverter(avroType);
  }
  if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.BYTES) {
    return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos));
  }
  if (sparkType instanceof DateType && avroType.getType() == Schema.Type.INT) {
    return (row, pos) -> row.getInt(pos);
  }
  if (sparkType instanceof TimestampType && avroType.getType() == Schema.Type.LONG) {
    return (row, pos) -> row.getLong(pos);
  }
  if (sparkType instanceof ArrayType && avroType.getType() == Schema.Type.ARRAY) {
    return arrayConverter((ArrayType) sparkType, avroType);
  }
  if (sparkType instanceof StructType && avroType.getType() == Schema.Type.RECORD) {
    StructType struct = (StructType) sparkType;
    StructConverter nested = new StructConverter(struct, avroType);
    int fieldCount = struct.length();
    return (row, pos) -> nested.convert(row.getStruct(pos, fieldCount));
  }
  if (sparkType instanceof UserDefinedType) {
    // Convert a user-defined type through its underlying SQL type.
    DataType underlying = ((UserDefinedType) sparkType).sqlType();
    return createConverterFor(underlying, avroType);
  }
  throw new IllegalArgumentException(String.format("Cannot convert Catalyst type %s to Avro type %s", sparkType, avroType));
}

/** Converter that serializes a Catalyst decimal to Avro bytes with the decimal logical type. */
private static Converter decimalConverter(DecimalType decimalType, Schema avroType) {
  return (row, pos) -> {
    Decimal value = row.getDecimal(pos, decimalType.precision(), decimalType.scale());
    return DECIMAL_CONVERSIONS.toBytes(value.toJavaBigDecimal(), avroType, LogicalTypes.decimal(decimalType.precision(), decimalType.scale()));
  };
}

/** Converter that wraps binary data in an Avro FIXED value, rejecting data of the wrong length. */
private static Converter fixedConverter(Schema avroType) {
  int expectedSize = avroType.getFixedSize();
  return (row, pos) -> {
    byte[] bytes = row.getBinary(pos);
    if (bytes.length != expectedSize) {
      throw new IllegalArgumentException(String.format("Cannot write %s bytes of binary data into FIXED Type with size of %s bytes", bytes.length, expectedSize));
    }
    return new GenericData.Fixed(avroType, bytes);
  };
}

/** Converter that converts each element of a Catalyst array and returns the results as a list. */
private static Converter arrayConverter(ArrayType arrayType, Schema avroType) {
  DataType elementType = arrayType.elementType();
  boolean containsNull = arrayType.containsNull();
  Converter perElement = createConverterFor(elementType, resolveNullableType(avroType.getElementType(), containsNull));
  return (row, pos) -> {
    ArrayData source = row.getArray(pos);
    int count = source.numElements();
    Object[] converted = new Object[count];
    for (int idx = 0; idx < count; idx++) {
      converted[idx] = (containsNull && source.isNullAt(idx)) ? null : perElement.convert(source, idx);
    }
    // Arrays.asList wraps the array in a fixed-size list without copying it.
    return java.util.Arrays.asList(converted);
  };
}
use of org.apache.spark.sql.types.IntegerType$ in project spark-bigquery-connector by GoogleCloudDataproc.
the class AvroSchemaConverter method sparkTypeToRawAvroType.
/**
 * Maps a Catalyst data type to the corresponding Avro schema, using the supplied type builder.
 *
 * @param dataType the Catalyst type to translate
 * @param recordName the name given to the Avro record when {@code dataType} is a struct
 * @param builder the Avro type builder used to produce the schema
 * @return the Avro schema for {@code dataType}
 * @throws IllegalArgumentException if the type is a map, a decimal exceeding the BigQuery
 *     numeric precision/scale limits, or any other unsupported type
 */
static Schema sparkTypeToRawAvroType(DataType dataType, String recordName, SchemaBuilder.TypeBuilder<Schema> builder) {
  if (dataType instanceof BinaryType) {
    return builder.bytesType();
  }

  // All integral types share the Avro long representation.
  boolean integral = dataType instanceof ByteType
      || dataType instanceof ShortType
      || dataType instanceof IntegerType
      || dataType instanceof LongType;
  if (integral) {
    return builder.longType();
  }

  if (dataType instanceof BooleanType) {
    return builder.booleanType();
  }

  // Both floating-point types share the Avro double representation.
  if (dataType instanceof FloatType || dataType instanceof DoubleType) {
    return builder.doubleType();
  }

  if (dataType instanceof DecimalType) {
    DecimalType decimal = (DecimalType) dataType;
    if (decimal.precision() > SchemaConverters.BQ_NUMERIC_PRECISION || decimal.scale() > SchemaConverters.BQ_NUMERIC_SCALE) {
      throw new IllegalArgumentException("Decimal type is too wide to fit in BigQuery Numeric format");
    }
    return LogicalTypes.decimal(decimal.precision(), decimal.scale()).addToSchema(builder.bytesType());
  }

  if (dataType instanceof StringType) {
    return builder.stringType();
  }

  if (dataType instanceof TimestampType) {
    // Timestamps are written as an Avro long tagged with the timestamp-micros logical type.
    return LogicalTypes.timestampMicros().addToSchema(builder.longType());
  }

  if (dataType instanceof DateType) {
    return LogicalTypes.date().addToSchema(builder.intType());
  }

  if (dataType instanceof ArrayType) {
    ArrayType array = (ArrayType) dataType;
    return builder.array().items(sparkTypeToRawAvroType(array.elementType(), array.containsNull(), recordName));
  }

  if (dataType instanceof StructType) {
    SchemaBuilder.FieldAssembler<Schema> assembler = builder.record(recordName).fields();
    for (StructField column : ((StructType) dataType).fields()) {
      Schema columnSchema = sparkTypeToRawAvroType(column.dataType(), column.nullable(), column.name());
      assembler.name(column.name()).type(columnSchema).noDefault();
    }
    return assembler.endRecord();
  }

  if (dataType instanceof UserDefinedType) {
    // Translate a user-defined type through its underlying SQL type.
    DataType underlying = ((UserDefinedType) dataType).sqlType();
    return sparkTypeToRawAvroType(underlying, recordName, builder);
  }

  if (dataType instanceof MapType) {
    throw new IllegalArgumentException(SchemaConverters.MAPTYPE_ERROR_MESSAGE);
  }
  throw new IllegalArgumentException("Data type not supported: " + dataType.simpleString());
}
Aggregations