
Example 11 with Column

use of org.apache.spark.sql.Column in project kylo by Teradata.

the class DataSetProviderUtilV1 method map.

/**
 * Applies the specified function to the specified field of the data set.
 */
@Nonnull
@SuppressWarnings("unchecked")
static DataFrame map(@Nonnull final DataFrame dataSet, @Nonnull final String fieldName, @Nonnull final Function1 function, @Nonnull final DataType returnType) {
    final Seq<Column> inputs = Seq$.MODULE$.<Column>newBuilder().$plus$eq(dataSet.col(fieldName)).result();
    final UserDefinedFunction udf = new UserDefinedFunction(function, returnType, (Seq<DataType>) Seq$.MODULE$.<DataType>empty());
    return dataSet.withColumn(fieldName, udf.apply(inputs));
}
Also used : UserDefinedFunction(org.apache.spark.sql.UserDefinedFunction) Column(org.apache.spark.sql.Column) DataType(org.apache.spark.sql.types.DataType) Nonnull(javax.annotation.Nonnull)
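
A minimal caller sketch for the map helper above, assuming a DataFrame named users with a String column "name"; the ToUpperCase class is hypothetical. Since map is package-private, the caller has to live in the same package, and the Function1 must be serializable so Spark can ship the UDF to executors.

/**
 * Hypothetical Function1 implementation. AbstractFunction1 is not Serializable on its own,
 * so the subclass adds the marker interface that Spark needs when the UDF is evaluated.
 */
class ToUpperCase extends scala.runtime.AbstractFunction1<String, String> implements java.io.Serializable {

    @Override
    public String apply(final String value) {
        return (value != null) ? value.toUpperCase() : null;
    }
}

// From code in the same package as DataSetProviderUtilV1,
// where "users" is an existing DataFrame with a String column "name":
final DataFrame upperCased = DataSetProviderUtilV1.map(users, "name", new ToUpperCase(), DataTypes.StringType);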

Example 12 with Column

use of org.apache.spark.sql.Column in project kylo by Teradata.

the class StandardDataValidator method toSelectColumns.

private Column[] toSelectColumns(FieldPolicy[] policies1) {
    List<Column> columns = new ArrayList<>();
    log.info("Building select statement for # of policies {}", policies1.length);
    for (int i = 0; i < policies1.length; i++) {
        if (policies1[i].getField() != null) {
            log.info("policy [{}] name {} feedName {}", i, policies1[i].getField(), policies1[i].getFeedField());
            String feedField = StringUtils.defaultIfEmpty(policies1[i].getFeedField(), policies1[i].getField());
            columns.add(new Column(feedField).as(policies1[i].getField()));
        }
    }
    columns.add(new Column("processing_dttm"));
    return columns.toArray(new Column[columns.size()]);
}
Also used : Column(org.apache.spark.sql.Column) ArrayList(java.util.ArrayList)
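
For context, a sketch of roughly what the returned array produces when handed to DataFrame.select; the feed and policy field names below are made up. Each feed field is aliased back to its policy field name, and processing_dttm is always carried along.

// Hypothetical projection built from two field policies plus the processing timestamp,
// where "dataFrame" is the source DataFrame being validated:
final DataFrame validated = dataFrame.select(
    new Column("feed_customer_id").as("customer_id"),
    new Column("feed_customer_name").as("customer_name"),
    new Column("processing_dttm"));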

Example 13 with Column

use of org.apache.spark.sql.Column in project kylo by Teradata.

the class AbstractJdbcDataSetProviderTest method filterByDateTimeWithOverlap.

/**
 * Verify filtering for new rows.
 */
@Test
public void filterByDateTimeWithOverlap() {
    DateTimeUtils.setCurrentMillisFixed(1524960000000L);
    // Mock data set
    final DataFrame dataSet = Mockito.mock(DataFrame.class);
    final DataFrame filterDataSet = Mockito.mock(DataFrame.class);
    Mockito.when(dataSet.filter(Mockito.any(Column.class))).thenReturn(filterDataSet);
    // Test filtering by date time
    final MockJdbcDataSetProvider provider = new MockJdbcDataSetProvider();
    final DataFrame newDataSet = provider.filterByDateTime(dataSet, "mockField", 1524873600000L, 60000L);
    Assert.assertEquals(filterDataSet, newDataSet);
    // Test condition
    final ArgumentCaptor<Column> conditionCaptor = ArgumentCaptor.forClass(Column.class);
    Mockito.verify(dataSet).filter(conditionCaptor.capture());
    Assert.assertEquals("((mockField > 1524873540000000) && (mockField < 1524959940000000))", conditionCaptor.getValue().toString());
}
Also used : Column(org.apache.spark.sql.Column) DataFrame(org.apache.spark.sql.DataFrame) Test(org.junit.Test)
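
The expected bounds in the assertion follow directly from the fixed clock and the overlap; a short sketch of the arithmetic, using the values from this test:

final long now = 1524960000000L;            // fixed "current" time, in milliseconds
final long highWaterMark = 1524873600000L;  // value passed to filterByDateTime, in milliseconds
final long overlap = 60000L;                // one minute, in milliseconds

final long startMs = highWaterMark - overlap;   // 1524873540000
final long endMs = now - overlap;               // 1524959940000

// The captured condition prints its timestamp literals in microseconds, hence the assertion:
// ((mockField > 1524873540000000) && (mockField < 1524959940000000))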

Example 14 with Column

use of org.apache.spark.sql.Column in project kylo by Teradata.

the class AbstractJdbcDataSetProviderTest method updateHighWaterMarkWithTimestamp.

/**
 * Verify updating a high water mark for a timestamp column.
 */
@Test
public void updateHighWaterMarkWithTimestamp() {
    // Mock data set
    final DataFrame dataSet = Mockito.mock(DataFrame.class);
    Mockito.when(dataSet.col("mockField")).thenReturn(new Column("mockField"));
    final StructField field = DataTypes.createStructField("mockField", DataTypes.TimestampType, true);
    Mockito.when(dataSet.schema()).thenReturn(DataTypes.createStructType(Collections.singletonList(field)));
    final DataFrame mapDataSet = Mockito.mock(DataFrame.class);
    Mockito.when(dataSet.withColumn(Mockito.eq("mockField"), Mockito.any(Column.class))).thenReturn(mapDataSet);
    // Test updating high water mark
    final KyloCatalogClient client = Mockito.mock(KyloCatalogClient.class);
    final JdbcHighWaterMark highWaterMark = new JdbcHighWaterMark("mockWaterMark", client);
    final MockJdbcDataSetProvider provider = new MockJdbcDataSetProvider();
    final DataFrame newDataSet = provider.updateHighWaterMark(dataSet, "mockField", highWaterMark, client);
    Assert.assertEquals(mapDataSet, newDataSet);
    // Test replaced column
    final ArgumentCaptor<Column> newColumn = ArgumentCaptor.forClass(Column.class);
    Mockito.verify(dataSet).withColumn(Mockito.eq("mockField"), newColumn.capture());
    Assert.assertTrue("Expected new column to be a UDF", newColumn.getValue().expr() instanceof ScalaUDF);
}
Also used : StructField(org.apache.spark.sql.types.StructField) JdbcHighWaterMark(com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMark) Column(org.apache.spark.sql.Column) KyloCatalogClient(com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClient) DataFrame(org.apache.spark.sql.DataFrame) ScalaUDF(org.apache.spark.sql.catalyst.expressions.ScalaUDF) Test(org.junit.Test)
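
For orientation, a hedged sketch of how a caller might exercise the same path without mocks; the data set, column, and water mark names are invented. The test above only verifies that the replaced column is backed by a ScalaUDF; the intent is that the UDF records the latest timestamp it observes into the JdbcHighWaterMark.

// Hypothetical usage, assuming a DataFrame "ordersDf" with a Timestamp column "last_modified":
final JdbcHighWaterMark waterMark = new JdbcHighWaterMark("orders.last_modified", client);
final DataFrame tracked = provider.updateHighWaterMark(ordersDf, "last_modified", waterMark, client);
// Evaluating "tracked" routes every "last_modified" value through the injected UDF,
// which is expected to keep waterMark at the greatest value seen so far.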

Example 15 with Column

use of org.apache.spark.sql.Column in project kylo by Teradata.

the class AbstractJdbcDataSetProvider method filterByDateTime.

/**
 * Filters the specified data set using the specified date field.
 */
@Nonnull
@VisibleForTesting
T filterByDateTime(@Nonnull final T dataSet, @Nonnull final String fieldName, @Nullable final Long value, @Nullable final Long overlap) {
    long startTime = 0;
    long endTime = DateTimeUtils.currentTimeMillis();
    // Parse high water mark
    if (value != null) {
        if (value < endTime) {
            startTime = value;
        } else {
            log.warn("Value for high water mark is the future: {}", value);
            startTime = endTime;
        }
    }
    // Parse overlap
    if (overlap != null) {
        startTime = Math.max(startTime - overlap, 0);
        endTime -= overlap;
    }
    // Return filter
    final Column dateColumn = new Column(fieldName);
    final Column startFilter = (startTime > 0) ? dateColumn.gt(functions.lit(new Timestamp(startTime))) : null;
    final Column endFilter = dateColumn.lt(functions.lit(new Timestamp(endTime)));
    return filter(dataSet, (startFilter != null) ? startFilter.and(endFilter) : endFilter);
}
Also used : Column(org.apache.spark.sql.Column) Timestamp(java.sql.Timestamp) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Nonnull(javax.annotation.Nonnull)
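
A minimal sketch of the condition this method builds when both value and overlap are supplied and the high water mark lies in the past; the column name is illustrative.

// Equivalent hand-built filter for a high water mark v (ms) and overlap o (ms):
final long startTime = Math.max(v - o, 0);
final long endTime = DateTimeUtils.currentTimeMillis() - o;

final Column dateColumn = new Column("last_modified");
final Column condition = dateColumn.gt(functions.lit(new Timestamp(startTime)))
        .and(dateColumn.lt(functions.lit(new Timestamp(endTime))));
// filter(dataSet, condition) is the provider-specific hook; for a DataFrame-backed provider
// it amounts to dataSet.filter(condition), as the mock in Example 13 verifies.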

Aggregations

Column (org.apache.spark.sql.Column) 18
Test (org.junit.Test) 8
DataFrame (org.apache.spark.sql.DataFrame) 6
Nonnull (javax.annotation.Nonnull) 5
StructField (org.apache.spark.sql.types.StructField) 4
KyloCatalogClient (com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClient) 3
Row (org.apache.spark.sql.Row) 3
ScalaUDF (org.apache.spark.sql.catalyst.expressions.ScalaUDF) 3
DataType (org.apache.spark.sql.types.DataType) 3
JdbcHighWaterMark (com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMark) 2
DataSet (com.thinkbiganalytics.spark.DataSet) 2
GridCommonAbstractTest (org.apache.ignite.testframework.junits.common.GridCommonAbstractTest) 2
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 2
UserDefinedFunction (org.apache.spark.sql.UserDefinedFunction) 2
Metadata (org.apache.spark.sql.types.Metadata) 2
MetadataBuilder (org.apache.spark.sql.types.MetadataBuilder) 2
StructType (org.apache.spark.sql.types.StructType) 2
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 1
DefaultQueryResultColumn (com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn) 1
DataSetOptions (com.thinkbiganalytics.kylo.catalog.spi.DataSetOptions) 1