Example use of org.apache.spark.sql.types.DataTypes.StringType in the project hudi (by apache):
from the class TestChainedTransformer, method testChainedTransformation.
/**
 * Verifies that a ChainedTransformer applies its transformers in order:
 * the first renames column "foo" to "bar", the second casts "bar" to integer.
 * Asserts row count, resulting column names, and the cast values.
 */
@Test
public void testChainedTransformation() {
  StructType schema = DataTypes.createStructType(new StructField[] { createStructField("foo", StringType, false) });
  List<Row> input = Arrays.asList(RowFactory.create("100"), RowFactory.create("200"));
  Dataset<Row> source = spark().sqlContext().createDataFrame(input, schema);

  Transformer rename = (jsc, sparkSession, ds, props) -> ds.withColumnRenamed("foo", "bar");
  Transformer castToInt = (jsc, sparkSession, ds, props) -> ds.withColumn("bar", ds.col("bar").cast(IntegerType));
  ChainedTransformer chained = new ChainedTransformer(Arrays.asList(rename, castToInt));
  Dataset<Row> result = chained.apply(jsc(), spark(), source, null);

  // Two input rows must survive both transformations.
  assertEquals(2, result.count());
  // Only the renamed column remains.
  assertArrayEquals(new String[] { "bar" }, result.columns());
  List<Row> rows = result.collectAsList();
  assertEquals(100, rows.get(0).getInt(0));
  assertEquals(200, rows.get(1).getInt(0));
}
Example use of org.apache.spark.sql.types.DataTypes.StringType in the project gor-spark (by gorpipe):
from the class GorBatchTable, method inferSchema.
/**
 * Infers and assigns the {@code schema} field for this table.
 * <p>
 * Falls back to a single-string schema, then:
 * <ul>
 *   <li>if {@code path} is set, opens a data stream (directly, via a Hadoop
 *       listing when {@code hadoopInfer} is set, or by locating the first
 *       {@code .gorz} file under a directory) and infers the schema from it;</li>
 *   <li>otherwise, if {@code commands} is set, derives the schema from the
 *       gor command header, appending a "Tag" column when {@code tag} is set.</li>
 * </ul>
 *
 * @throws RuntimeException wrapping any IOException/DataFormatException raised
 *                          while reading or parsing the input
 */
void inferSchema() {
  schema = Encoders.STRING().schema();
  SparkSessionFactory sessionFactory = new SparkSessionFactory(null, projectRoot, cacheDir, configFile, aliasFile, securityContext, null);
  GorSparkSession gorPipeSession = (GorSparkSession) sessionFactory.create();
  if (path != null) {
    // Guard against extension-less paths; the unguarded substring(lastIndexOf("."))
    // would throw StringIndexOutOfBoundsException when no '.' is present.
    int dotIndex = path.lastIndexOf(".");
    String endingLowercase = dotIndex >= 0 ? path.substring(dotIndex).toLowerCase() : "";
    boolean isGorz = endingLowercase.equals(".gorz");
    // try-with-resources: the input stream was previously never closed.
    try (InputStream is = openSchemaStream(gorPipeSession, endingLowercase)) {
      schema = SparkRowUtilities.inferSchema(is, path, false, isGorz);
    } catch (IOException | DataFormatException e) {
      throw new RuntimeException("Unable to infer schema from " + ppath, e);
    }
  } else if (commands != null) {
    GorDataType gdt = SparkRowUtilities.gorCmdSchema(commands, gorPipeSession);
    String[] headerArray = gdt.header;
    DataType[] dataTypes = new DataType[headerArray.length];
    // Columns without an inferred type default to StringType.
    for (int i = 0; i < dataTypes.length; i++) {
      dataTypes[i] = gdt.dataTypeMap.getOrDefault(i, StringType);
    }
    Stream<StructField> fieldStream = IntStream.range(0, headerArray.length).mapToObj(i -> new StructField(headerArray[i], dataTypes[i], true, Metadata.empty()));
    StructField[] fields = (tag ? Stream.concat(fieldStream, Stream.of(new StructField("Tag", StringType, true, Metadata.empty()))) : fieldStream).toArray(StructField[]::new);
    schema = new StructType(fields);
  }
}

/**
 * Opens the input stream used for schema inference from {@code path}.
 * <p>
 * With {@code hadoopInfer}, scans the Hadoop listing for the first regular file
 * matching {@code endingLowercase} (updating the {@code ppath} field) and opens
 * it. Otherwise, for a directory, walks it for the first {@code .gorz} file
 * (empty stream if none found); for a plain file, opens it directly.
 *
 * @param gorPipeSession  session providing the project file reader
 * @param endingLowercase lower-cased file extension to match (including the dot)
 * @return an open stream the caller is responsible for closing
 * @throws IOException if listing, walking or opening the input fails
 */
private InputStream openSchemaStream(GorSparkSession gorPipeSession, String endingLowercase) throws IOException {
  if (hadoopInfer) {
    var ri = fs.listFiles(ppath, true);
    while (ri.hasNext()) {
      var lfs = ri.next();
      if (!lfs.isDirectory() && lfs.getPath().getName().toLowerCase().endsWith(endingLowercase)) {
        ppath = lfs.getPath();
        break;
      }
    }
    return fs.open(ppath);
  }
  if (gorPipeSession.getProjectContext().getFileReader().isDirectory(path)) {
    // Renamed from 'ppath' — the original local shadowed the field of the same name.
    var dirPath = Paths.get(path);
    if (!dirPath.isAbsolute()) {
      dirPath = Paths.get(projectRoot).resolve(dirPath);
    }
    // Files.walk returns a lazily-populated Stream holding open directory
    // handles; it must be closed (the original leaked it).
    try (var walk = Files.walk(dirPath)) {
      var ogorz = walk.filter(p -> !Files.isDirectory(p)).filter(p -> p.toString().toLowerCase().endsWith(".gorz")).findFirst();
      return ogorz.isPresent()
          ? gorPipeSession.getProjectContext().getFileReader().getInputStream(ogorz.get().toString())
          : InputStream.nullInputStream();
    }
  }
  return gorPipeSession.getProjectContext().getFileReader().getInputStream(path);
}
Aggregations