Search in sources :

Example 1 with StringType

use of org.apache.spark.sql.types.DataTypes.StringType in project hudi by apache.

Example from class TestChainedTransformer, method testChainedTransformation:

@Test
public void testChainedTransformation() {
    // Build a two-row dataset with a single non-nullable string column "foo".
    StructField[] fields = { createStructField("foo", StringType, false) };
    StructType schema = DataTypes.createStructType(fields);
    List<Row> inputRows = Arrays.asList(RowFactory.create("100"), RowFactory.create("200"));
    Dataset<Row> original = spark().sqlContext().createDataFrame(inputRows, schema);

    // First transformer renames the column; the second casts it to an integer.
    Transformer renameStep = (jsc, sparkSession, dataset, properties) -> dataset.withColumnRenamed("foo", "bar");
    Transformer castStep = (jsc, sparkSession, dataset, properties) -> dataset.withColumn("bar", dataset.col("bar").cast(IntegerType));

    // Chaining must apply the transformers in order: rename first, then cast.
    ChainedTransformer chained = new ChainedTransformer(Arrays.asList(renameStep, castStep));
    Dataset<Row> transformed = chained.apply(jsc(), spark(), original, null);

    // Both rows survive, only the renamed column remains, and values are ints now.
    assertEquals(2, transformed.count());
    assertArrayEquals(new String[] { "bar" }, transformed.columns());
    List<Row> resultRows = transformed.collectAsList();
    assertEquals(100, resultRows.get(0).getInt(0));
    assertEquals(200, resultRows.get(1).getInt(0));
}
Also used : DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) RowFactory(org.apache.spark.sql.RowFactory) StringType(org.apache.spark.sql.types.DataTypes.StringType) Row(org.apache.spark.sql.Row) Test(org.junit.jupiter.api.Test) Assertions.assertArrayEquals(org.junit.jupiter.api.Assertions.assertArrayEquals) List(java.util.List) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) IntegerType(org.apache.spark.sql.types.DataTypes.IntegerType) ChainedTransformer(org.apache.hudi.utilities.transform.ChainedTransformer) Transformer(org.apache.hudi.utilities.transform.Transformer) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) DataTypes.createStructField(org.apache.spark.sql.types.DataTypes.createStructField) ChainedTransformer(org.apache.hudi.utilities.transform.ChainedTransformer) Transformer(org.apache.hudi.utilities.transform.Transformer) StructType(org.apache.spark.sql.types.StructType) ChainedTransformer(org.apache.hudi.utilities.transform.ChainedTransformer) Row(org.apache.spark.sql.Row) Test(org.junit.jupiter.api.Test)

Example 2 with StringType

use of org.apache.spark.sql.types.DataTypes.StringType in project gor-spark by gorpipe.

Example from class GorBatchTable, method inferSchema:

/**
 * Populates {@code schema} either from a data file/directory at {@code path} or,
 * failing that, from the header and column types of a GOR command pipeline in
 * {@code commands}. When neither source applies, the default single string
 * column schema ({@code Encoders.STRING().schema()}) is kept.
 */
void inferSchema() {
    // Default: one string column; overwritten below when inference succeeds.
    schema = Encoders.STRING().schema();
    SparkSessionFactory sessionFactory = new SparkSessionFactory(null, projectRoot, cacheDir, configFile, aliasFile, securityContext, null);
    GorSparkSession gorPipeSession = (GorSparkSession) sessionFactory.create();
    if (path != null) {
        // File extension drives decompression handling during inference (.gorz).
        String endingLowercase = path.substring(path.lastIndexOf(".")).toLowerCase();
        boolean isGorz = endingLowercase.equals(".gorz");
        try {
            InputStream is;
            if (hadoopInfer) {
                // Hadoop mode: recursively scan under the ppath field and advance it
                // to the first regular file with a matching extension, then open it.
                // NOTE: this mutates the ppath field as a side effect.
                var ri = fs.listFiles(ppath, true);
                while (ri.hasNext()) {
                    var lfs = ri.next();
                    if (!lfs.isDirectory() && lfs.getPath().getName().toLowerCase().endsWith(endingLowercase)) {
                        ppath = lfs.getPath();
                        break;
                    }
                }
                is = fs.open(ppath);
            } else {
                if (gorPipeSession.getProjectContext().getFileReader().isDirectory(path)) {
                    // NOTE: this local ppath (java.nio.file.Path) shadows the
                    // Hadoop-path field of the same name used in the branch above.
                    var ppath = Paths.get(path);
                    if (!ppath.isAbsolute()) {
                        var root = Paths.get(projectRoot);
                        ppath = root.resolve(ppath);
                    }
                    // Infer from the first .gorz file found under the directory;
                    // fall back to an empty stream when none exists.
                    // NOTE(review): the Files.walk stream is never closed — it should be
                    // wrapped in try-with-resources to release the directory handle.
                    var ogorz = Files.walk(ppath).filter(p -> !Files.isDirectory(p)).filter(p -> p.toString().toLowerCase().endsWith(".gorz")).findFirst();
                    is = ogorz.isPresent() ? gorPipeSession.getProjectContext().getFileReader().getInputStream(ogorz.get().toString()) : InputStream.nullInputStream();
                } else {
                    is = gorPipeSession.getProjectContext().getFileReader().getInputStream(path);
                }
            }
            // NOTE(review): `is` is not closed here — confirm SparkRowUtilities.inferSchema
            // closes its input stream; otherwise this leaks a file handle per call.
            schema = SparkRowUtilities.inferSchema(is, path, false, isGorz);
        } catch (IOException | DataFormatException e) {
            throw new RuntimeException("Unable to infer schema from " + ppath, e);
        }
    } else if (commands != null) {
        // Command mode: derive column names/types from the GOR command pipeline;
        // any column without an inferred type defaults to StringType.
        GorDataType gdt = SparkRowUtilities.gorCmdSchema(commands, gorPipeSession);
        String[] headerArray = gdt.header;
        DataType[] dataTypes = new DataType[headerArray.length];
        int start = 0;
        for (int i = start; i < dataTypes.length; i++) {
            dataTypes[i] = gdt.dataTypeMap.getOrDefault(i, StringType);
        }
        Stream<StructField> fieldStream = IntStream.range(0, headerArray.length).mapToObj(i -> new StructField(headerArray[i], dataTypes[i], true, Metadata.empty()));
        // When tagging is enabled, append a nullable string column named "Tag".
        StructField[] fields = (tag ? Stream.concat(fieldStream, Stream.of(new StructField("Tag", StringType, true, Metadata.empty()))) : fieldStream).toArray(StructField[]::new);
        schema = new StructType(fields);
    }
}
Also used : DataType(org.apache.spark.sql.types.DataType) IntStream(java.util.stream.IntStream) java.util(java.util) GorSparkSession(org.gorpipe.spark.GorSparkSession) TableCapability(org.apache.spark.sql.connector.catalog.TableCapability) WriteBuilder(org.apache.spark.sql.connector.write.WriteBuilder) ReferenceBuildDefaults(org.gorpipe.gor.reference.ReferenceBuildDefaults) SparkSessionFactory(org.gorpipe.spark.SparkSessionFactory) SupportsPushDownFilters(org.apache.spark.sql.connector.read.SupportsPushDownFilters) GorDataType(gorsat.process.GorDataType) ScriptEngineFactory(gorsat.Script.ScriptEngineFactory) SupportsRead(org.apache.spark.sql.connector.catalog.SupportsRead) Configuration(org.apache.hadoop.conf.Configuration) DataFormatException(java.util.zip.DataFormatException) GeneralSparkQueryHandler(org.gorpipe.spark.GeneralSparkQueryHandler) InputPartition(org.apache.spark.sql.connector.read.InputPartition) Metadata(org.apache.spark.sql.types.Metadata) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) LogicalWriteInfo(org.apache.spark.sql.connector.write.LogicalWriteInfo) org.apache.spark.sql.sources(org.apache.spark.sql.sources) Files(java.nio.file.Files) StringType(org.apache.spark.sql.types.DataTypes.StringType) SupportsWrite(org.apache.spark.sql.connector.catalog.SupportsWrite) IOException(java.io.IOException) SparkRowUtilities(gorsat.process.SparkRowUtilities) Collectors(java.util.stream.Collectors) Encoders(org.apache.spark.sql.Encoders) Stream(java.util.stream.Stream) Paths(java.nio.file.Paths) Expressions(org.apache.spark.sql.connector.expressions.Expressions) ScanBuilder(org.apache.spark.sql.connector.read.ScanBuilder) Transform(org.apache.spark.sql.connector.expressions.Transform) org.apache.hadoop.fs(org.apache.hadoop.fs) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) Scan(org.apache.spark.sql.connector.read.Scan) 
ScriptExecutionEngine(gorsat.Script.ScriptExecutionEngine) Table(org.apache.spark.sql.connector.catalog.Table) InputStream(java.io.InputStream) StructType(org.apache.spark.sql.types.StructType) InputStream(java.io.InputStream) IOException(java.io.IOException) GorDataType(gorsat.process.GorDataType) DataFormatException(java.util.zip.DataFormatException) StructField(org.apache.spark.sql.types.StructField) IntStream(java.util.stream.IntStream) Stream(java.util.stream.Stream) InputStream(java.io.InputStream) GorSparkSession(org.gorpipe.spark.GorSparkSession) SparkSessionFactory(org.gorpipe.spark.SparkSessionFactory)

Aggregations

StringType (org.apache.spark.sql.types.DataTypes.StringType)2 StructField (org.apache.spark.sql.types.StructField)2 StructType (org.apache.spark.sql.types.StructType)2 ScriptEngineFactory (gorsat.Script.ScriptEngineFactory)1 ScriptExecutionEngine (gorsat.Script.ScriptExecutionEngine)1 GorDataType (gorsat.process.GorDataType)1 SparkRowUtilities (gorsat.process.SparkRowUtilities)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 Files (java.nio.file.Files)1 Paths (java.nio.file.Paths)1 java.util (java.util)1 Arrays (java.util.Arrays)1 List (java.util.List)1 Collectors (java.util.stream.Collectors)1 IntStream (java.util.stream.IntStream)1 Stream (java.util.stream.Stream)1 DataFormatException (java.util.zip.DataFormatException)1 Configuration (org.apache.hadoop.conf.Configuration)1 org.apache.hadoop.fs (org.apache.hadoop.fs)1