
Example 1 with MetadataBuilder

use of org.apache.spark.sql.types.MetadataBuilder in project net.jgp.labs.spark by jgperrin.

the class AddMetadataApp method start.

// Requires: import static org.apache.spark.sql.functions.col;
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Modifying metadata")
        .master("local[*]")
        .getOrCreate();
    String format = "csv";
    String filename = "data/books.csv";
    Dataset<Row> df = spark.read()
        .format(format)
        .option("inferSchema", true)
        .option("header", true)
        .load(filename);
    // Step #1 - Flat read out
    System.out.println("-------");
    System.out.println("Step #1 - Flat read out");
    System.out.println("-------");
    df.show();
    df.printSchema();
    System.out.println("Full read-out of metadata");
    for (StructField field : df.schema().fields()) {
        System.out.println(FieldUtils.explain(field));
    }
    // Step 2 - Add custom metadata
    System.out.println("-------");
    System.out.println("Step #2 - Add custom metadata");
    System.out.println("-------");
    // Adding x-source, x-format, and x-order to every column
    long i = 0;
    for (String colName : df.columns()) {
        Column col = col(colName);
        Metadata metadata = new MetadataBuilder()
            .putString("x-source", filename)
            .putString("x-format", format)
            .putLong("x-order", i++)
            .build();
        System.out.println("Metadata added to column: " + col);
        // Replace the column with itself to attach the new metadata
        df = df.withColumn(colName, col, metadata);
    }
    df.printSchema();
    System.out.println("Full read-out of metadata");
    for (StructField field : df.schema().fields()) {
        System.out.println(FieldUtils.explain(field));
    }
    // Adding x-process-date while preserving the metadata already on each column
    for (String colName : df.columns()) {
        Column col = col(colName);
        Metadata metadata = new MetadataBuilder()
            .withMetadata(ColumnUtils.getMetadata(df, colName))
            .putString("x-process-date", new Date().toString())
            .build();
        System.out.println("Metadata added to column: " + col);
        df = df.withColumn(colName, col, metadata);
    }
    df.printSchema();
    // Step #3 - Adding more metadata
    System.out.println("-------");
    System.out.println("Step #3 - Adding more metadata");
    System.out.println("-------");
    // Adding x-user
    for (String colName : df.columns()) {
        df = DataframeUtils.addMetadata(df, colName, "x-user", "jgp");
    }
    System.out.println("Full read-out of metadata");
    for (StructField field : df.schema().fields()) {
        System.out.println(FieldUtils.explain(field));
    }
    df.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructField(org.apache.spark.sql.types.StructField) MetadataBuilder(org.apache.spark.sql.types.MetadataBuilder) Column(org.apache.spark.sql.Column) Metadata(org.apache.spark.sql.types.Metadata) Row(org.apache.spark.sql.Row) Date(java.util.Date)
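
The listing relies on two project helpers whose bodies are not shown here: ColumnUtils.getMetadata and FieldUtils.explain. A minimal sketch of what they might look like, using only the public Spark schema API (the real implementations in net.jgp.labs.spark may differ):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class SchemaHelpers {

    // Returns the metadata attached to a column, looked up through the schema.
    public static Metadata getMetadata(Dataset<Row> df, String colName) {
        return df.schema().apply(colName).metadata();
    }

    // Renders a field as "name (type, nullable) -> metadata as JSON".
    public static String explain(StructField field) {
        return field.name()
            + " (" + field.dataType().simpleString()
            + ", nullable=" + field.nullable() + ") -> "
            + field.metadata().json();
    }
}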

Example 2 with MetadataBuilder

use of org.apache.spark.sql.types.MetadataBuilder in project kylo by Teradata.

the class SqlTransformStage method extractSchema.

/**
 * Builds the Spark SQL schema from the specified result set.
 */
@Nonnull
private StructType extractSchema(@Nonnull final ResultSetMetaData rsmd, @Nonnull final TransformResult result) throws SQLException {
    final int columnCount = rsmd.getColumnCount();
    final List<QueryResultColumn> columns = new ArrayList<>(columnCount);
    final Map<String, Integer> displayNameMap = new HashMap<>();
    final StructField[] fields = new StructField[columnCount];
    for (int i = 0; i < columnCount; ++i) {
        final String columnLabel = rsmd.getColumnLabel(i + 1);
        final String columnName = rsmd.getColumnName(i + 1);
        final int columnType = rsmd.getColumnType(i + 1);
        final String columnTypeName = rsmd.getColumnTypeName(i + 1);
        final int precision = rsmd.getPrecision(i + 1);
        final int scale = rsmd.getScale(i + 1);
        final boolean isNullable = rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls;
        final boolean isSigned = rsmd.isSigned(i + 1);
        final DefaultQueryResultColumn column = new DefaultQueryResultColumn();
        column.setField(columnName);
        column.setHiveColumnLabel(columnLabel);
        final String displayName = StringUtils.contains(columnLabel, ".") ? StringUtils.substringAfterLast(columnLabel, ".") : columnLabel;
        // Disambiguate duplicate display names by suffixing a counter
        Integer count = 0;
        if (displayNameMap.containsKey(displayName)) {
            count = displayNameMap.get(displayName);
            count++;
        }
        displayNameMap.put(displayName, count);
        column.setDisplayName(displayName + (count > 0 ? count : ""));
        column.setTableName(StringUtils.substringAfterLast(columnName, "."));
        column.setDataType(ParserHelper.sqlTypeToHiveType(columnType));
        column.setNativeDataType(columnTypeName);
        if (scale != 0) {
            column.setPrecisionScale(precision + "," + scale);
        } else if (precision != 0) {
            column.setPrecisionScale(Integer.toString(precision));
        }
        columns.add(column);
        // Ask the JDBC dialect for a Catalyst type first; the dialect may also
        // record extra information in the MetadataBuilder it receives. Fall back
        // to a generic JDBC-to-Catalyst mapping if the dialect declines.
        final MetadataBuilder metadata = new MetadataBuilder();
        final Option<DataType> oct = dialect.getCatalystType(columnType, columnTypeName, precision, metadata);
        DataType catalystType;
        if (oct.isDefined()) {
            catalystType = oct.get();
        } else {
            catalystType = getCatalystType(columnType, precision, scale, isSigned);
        }
        fields[i] = new StructField(columnLabel, catalystType, isNullable, metadata.build());
    }
    result.setColumns(columns);
    return new StructType(fields);
}
Also used : MetadataBuilder(org.apache.spark.sql.types.MetadataBuilder) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DefaultQueryResultColumn(com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) QueryResultColumn(com.thinkbiganalytics.discovery.schema.QueryResultColumn) Nonnull(javax.annotation.Nonnull)
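
Here dialect follows Spark's JdbcDialect contract: getCatalystType(int, String, int, MetadataBuilder) may return a Catalyst type and may also write extra information into the MetadataBuilder it is handed, which extractSchema then bakes into the resulting StructField. A minimal sketch of a custom dialect using that hook (the class name, URL prefix, and CHAR mapping are illustrative assumptions, not Kylo code):

import java.sql.Types;

import org.apache.spark.sql.jdbc.JdbcDialect;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.MetadataBuilder;

import scala.Option;

// Hypothetical dialect: maps CHAR columns to StringType and keeps the
// declared width in the field metadata.
public class ExampleDialect extends JdbcDialect {

    @Override
    public boolean canHandle(String url) {
        return url.startsWith("jdbc:example:"); // assumed URL prefix
    }

    @Override
    public Option<DataType> getCatalystType(int sqlType, String typeName, int size, MetadataBuilder md) {
        if (sqlType == Types.CHAR) {
            md.putLong("char-width", size); // ends up in StructField.metadata()
            return Option.apply(DataTypes.StringType);
        }
        return Option.empty();
    }
}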

Example 3 with MetadataBuilder

use of org.apache.spark.sql.types.MetadataBuilder in project net.jgp.labs.spark by jgperrin.

the class DataframeUtils method addMetadata.

public static Dataset<Row> addMetadata(Dataset<Row> df, String colName, String key, String value) {
    // Merge the new key/value pair into the column's existing metadata
    Metadata metadata = new MetadataBuilder()
        .withMetadata(ColumnUtils.getMetadata(df, colName))
        .putString(key, value)
        .build();
    Column col = col(colName);
    return df.withColumn(colName, col, metadata);
}
Also used : MetadataBuilder(org.apache.spark.sql.types.MetadataBuilder) Column(org.apache.spark.sql.Column) Metadata(org.apache.spark.sql.types.Metadata)
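
A quick usage sketch, assuming a Dataset<Row> named df with a column "title" (the column name, key, and value are illustrative):

Dataset<Row> tagged = DataframeUtils.addMetadata(df, "title", "x-user", "jgp");

// Read the value back from the schema to confirm it was attached.
Metadata m = tagged.schema().apply("title").metadata();
// prints: jgp
System.out.println(m.getString("x-user"));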

Aggregations

MetadataBuilder (org.apache.spark.sql.types.MetadataBuilder): 3
Column (org.apache.spark.sql.Column): 2
Metadata (org.apache.spark.sql.types.Metadata): 2
StructField (org.apache.spark.sql.types.StructField): 2
DefaultQueryResultColumn (com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn): 1
QueryResultColumn (com.thinkbiganalytics.discovery.schema.QueryResultColumn): 1
ArrayList (java.util.ArrayList): 1
Date (java.util.Date): 1
HashMap (java.util.HashMap): 1
Nonnull (javax.annotation.Nonnull): 1
Row (org.apache.spark.sql.Row): 1
SparkSession (org.apache.spark.sql.SparkSession): 1
DataType (org.apache.spark.sql.types.DataType): 1
StructType (org.apache.spark.sql.types.StructType): 1