Use of org.apache.spark.sql.types.MetadataBuilder in project net.jgp.labs.spark by jgperrin.
From the class AddMetadataApp, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Modifying metadata")
      .master("local[*]")
      .getOrCreate();

  String format = "csv";
  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format(format)
      .option("inferSchema", true)
      .option("header", true)
      .load(filename);

  // Step #1 - Flat read-out
  System.out.println("-------");
  System.out.println("Step #1 - Flat read-out");
  System.out.println("-------");
  df.show();
  df.printSchema();
  System.out.println("Full read-out of metadata");
  for (StructField field : df.schema().fields()) {
    System.out.println(FieldUtils.explain(field));
  }

  // Step #2 - Add custom metadata
  System.out.println("-------");
  System.out.println("Step #2 - Add custom metadata");
  System.out.println("-------");

  // Adding x-source, x-format, and x-order to every column
  long i = 0;
  for (String colName : df.columns()) {
    Column col = col(colName);
    Metadata metadata = new MetadataBuilder()
        .putString("x-source", filename)
        .putString("x-format", format)
        .putLong("x-order", i++)
        .build();
    System.out.println("Metadata added to column: " + col);
    df = df.withColumn(colName, col, metadata);
  }
  df.printSchema();
  System.out.println("Full read-out of metadata");
  for (StructField field : df.schema().fields()) {
    System.out.println(FieldUtils.explain(field));
  }

  // Adding x-process-date while preserving the metadata set above
  for (String colName : df.columns()) {
    Column col = col(colName);
    Metadata metadata = new MetadataBuilder()
        .withMetadata(ColumnUtils.getMetadata(df, colName))
        .putString("x-process-date", new Date().toString())
        .build();
    System.out.println("Metadata added to column: " + col);
    df = df.withColumn(colName, col, metadata);
  }
  df.printSchema();

  // Step #3 - Adding more metadata
  System.out.println("-------");
  System.out.println("Step #3 - Adding more metadata");
  System.out.println("-------");

  // Adding x-user via the DataframeUtils helper (shown further below)
  for (String colName : df.columns()) {
    df = DataframeUtils.addMetadata(df, colName, "x-user", "jgp");
  }
  System.out.println("Full read-out of metadata");
  for (StructField field : df.schema().fields()) {
    System.out.println(FieldUtils.explain(field));
  }
  df.printSchema();
}
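The snippet leans on two small helpers from the same repository, FieldUtils.explain and ColumnUtils.getMetadata, whose bodies are not shown on this page. A minimal sketch of what such helpers could look like (the implementations below are assumptions for illustration, not the repository's actual code):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

// Hypothetical reconstructions of the helpers used above.
public class ColumnUtils {
  // Returns the metadata attached to a column, or empty metadata if none.
  public static Metadata getMetadata(Dataset<Row> df, String colName) {
    for (StructField field : df.schema().fields()) {
      if (field.name().equals(colName)) {
        return field.metadata();
      }
    }
    return Metadata.empty();
  }
}

public class FieldUtils {
  // Renders a field's name, type, nullability, and metadata on one line.
  public static String explain(StructField field) {
    return field.name() + " (" + field.dataType().simpleString()
        + ", nullable=" + field.nullable() + "): " + field.metadata().json();
  }
}

Note that df.withColumn(colName, col, metadata) uses a three-argument withColumn overload that accepts Metadata; a fully public alternative is df.withColumn(colName, col(colName).as(colName, metadata)).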
Use of org.apache.spark.sql.types.MetadataBuilder in project kylo by Teradata.
From the class SqlTransformStage, method extractSchema().
/**
 * Builds the Spark SQL schema from the specified result set metadata.
 */
@Nonnull
private StructType extractSchema(@Nonnull final ResultSetMetaData rsmd, @Nonnull final TransformResult result) throws SQLException {
  final int columnCount = rsmd.getColumnCount();
  final List<QueryResultColumn> columns = new ArrayList<>(columnCount);
  final Map<String, Integer> displayNameMap = new HashMap<>();
  final StructField[] fields = new StructField[columnCount];

  for (int i = 0; i < columnCount; ++i) {
    final String columnLabel = rsmd.getColumnLabel(i + 1);
    final String columnName = rsmd.getColumnName(i + 1);
    final int columnType = rsmd.getColumnType(i + 1);
    final String columnTypeName = rsmd.getColumnTypeName(i + 1);
    final int precision = rsmd.getPrecision(i + 1);
    final int scale = rsmd.getScale(i + 1);
    final boolean isNullable = rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls;
    final boolean isSigned = rsmd.isSigned(i + 1);

    final DefaultQueryResultColumn column = new DefaultQueryResultColumn();
    column.setField(columnName);
    column.setHiveColumnLabel(columnLabel);

    // Disambiguate duplicate display names by appending a counter.
    final String displayName = StringUtils.contains(columnLabel, ".") ? StringUtils.substringAfterLast(columnLabel, ".") : columnLabel;
    Integer count = 0;
    if (displayNameMap.containsKey(displayName)) {
      count = displayNameMap.get(displayName);
      count++;
    }
    displayNameMap.put(displayName, count);
    column.setDisplayName(displayName + (count > 0 ? count : ""));

    column.setTableName(StringUtils.substringAfterLast(columnName, "."));
    column.setDataType(ParserHelper.sqlTypeToHiveType(columnType));
    column.setNativeDataType(columnTypeName);
    if (scale != 0) {
      column.setPrecisionScale(precision + "," + scale);
    } else if (precision != 0) {
      column.setPrecisionScale(Integer.toString(precision));
    }
    columns.add(column);

    // Let the JDBC dialect map the type first; fall back to a local mapping.
    final MetadataBuilder metadata = new MetadataBuilder();
    final Option<DataType> oct = dialect.getCatalystType(columnType, columnTypeName, precision, metadata);
    DataType catalystType;
    if (oct.isDefined()) {
      catalystType = oct.get();
    } else {
      catalystType = getCatalystType(columnType, precision, scale, isSigned);
    }
    fields[i] = new StructField(columnLabel, catalystType, isNullable, metadata.build());
  }
  result.setColumns(columns);
  return new StructType(fields);
}
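The fallback getCatalystType(columnType, precision, scale, isSigned) is defined elsewhere in SqlTransformStage. A minimal sketch of such a JDBC-to-Catalyst fallback mapping (an assumption for illustration; Kylo's actual method covers many more java.sql.Types cases):

import java.sql.Types;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;

// Hypothetical fallback; the real Kylo implementation is more complete.
private DataType getCatalystType(int sqlType, int precision, int scale, boolean signed) {
  switch (sqlType) {
    case Types.INTEGER:
      // An unsigned INTEGER needs the wider LongType to hold its full range.
      return signed ? DataTypes.IntegerType : DataTypes.LongType;
    case Types.BIGINT:
      // An unsigned BIGINT does not fit in a long; use a 20-digit decimal.
      return signed ? DataTypes.LongType : DataTypes.createDecimalType(20, 0);
    case Types.FLOAT:
    case Types.DOUBLE:
      return DataTypes.DoubleType;
    case Types.DECIMAL:
    case Types.NUMERIC:
      return DataTypes.createDecimalType(precision, scale);
    case Types.TIMESTAMP:
      return DataTypes.TimestampType;
    case Types.BOOLEAN:
      return DataTypes.BooleanType;
    default:
      return DataTypes.StringType;
  }
}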
Use of org.apache.spark.sql.types.MetadataBuilder in project net.jgp.labs.spark by jgperrin.
From the class DataframeUtils, method addMetadata().
public static Dataset<Row> addMetadata(Dataset<Row> df, String colName, String key, String value) {
  // Merge the column's existing metadata with the new key/value pair,
  // then reattach it by replacing the column with itself.
  Metadata metadata = new MetadataBuilder()
      .withMetadata(ColumnUtils.getMetadata(df, colName))
      .putString(key, value)
      .build();
  Column col = col(colName);
  return df.withColumn(colName, col, metadata);
}
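A short usage sketch of the helper, assuming a books Dataset like the one in the first example (the column name "title" is an assumption about the CSV):

Dataset<Row> df = spark.read().format("csv")
    .option("header", true)
    .load("data/books.csv");
df = DataframeUtils.addMetadata(df, "title", "x-user", "jgp");
// The metadata now travels with the column through the schema.
System.out.println(df.schema().apply("title").metadata().json());
// Prints something like: {"x-user":"jgp"}

Because the helper merges the existing metadata through withMetadata(...) before adding the new pair, repeated calls accumulate keys instead of overwriting the column's whole metadata map.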