
Example 1 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From the class ProfilerTest, method setUp:

@Before
@SuppressWarnings("unchecked")
public void setUp() {
    if (columnStatsMap == null) {
        StructField[] schemaFields = new StructField[15];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
        schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
        schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
        schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
        schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
        schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
        schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
        schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
        schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
        schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
        schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);
        schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Cat"));
        rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null, Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(8.223)), "alligator"));
        rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Alpaca"));
        rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99, null, "Cat"));
        rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null, new BigDecimal(String.valueOf(4.343)), "Zebra"));
        rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null, Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null, new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
        rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40, new BigDecimal(String.valueOf(4.343)), null));
        rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null, null, "albatross"));
        rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE, Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99, new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
        rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE, Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2, new BigDecimal(String.valueOf(4.343)), "Cat"));
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        /* Enable to debug contents of test data */
        /*
        for (Row r : dataRDD.collect()) {
            System.out.println(r.toString());
        }
        */
        StatisticsModel statsModel = profiler.profile(dataDF, new ProfilerConfiguration());
        columnStatsMap = (statsModel != null)
                         ? (Map) statsModel.getColumnStatisticsMap()
                         : Collections.<Integer, StandardColumnStatistics>emptyMap();
    }
}
Also used: StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel), StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), ArrayList (java.util.ArrayList), BigDecimal (java.math.BigDecimal), StructField (org.apache.spark.sql.types.StructField), ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration), Row (org.apache.spark.sql.Row), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Map (java.util.Map), Before (org.junit.Before)
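The commented-out block above shows the intended debugging technique. Below is a minimal, self-contained sketch of the same parallelize/createDataFrame pattern, assuming Spark 1.x with a local master; the class name and the reduced two-column schema are illustrative and not part of the Kylo test.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryDebugSketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("debug-sketch").setMaster("local[2]"));
        SQLContext sqlContext = new SQLContext(jsc);

        // Two-column schema mirroring the first two fields of the test schema.
        StructType schema = DataTypes.createStructType(new StructField[]{
            DataTypes.createStructField("id", DataTypes.IntegerType, true),
            DataTypes.createStructField("firstname", DataTypes.StringType, true)
        });

        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon"));
        rows.add(RowFactory.create(2, "Rachael"));

        JavaRDD<Row> dataRDD = jsc.parallelize(rows);
        DataFrame dataDF = sqlContext.createDataFrame(dataRDD, schema);

        // Same debug technique as the commented-out block in the test.
        for (Row r : dataDF.collectAsList()) {
            System.out.println(r.toString());
        }
        jsc.stop();
    }
}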

Example 2 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From the class SaveDataSetStage, method getDataSet:

/**
 * Gets the data set for the specified transformation result.
 */
private DataSet getDataSet(@Nonnull final TransformResult transform) {
    DataSet dataset = transform.getDataSet();
    if ("orc".equals(request.getFormat())) {
        // Ensure that column names comply with ORC standards
        final StructType schema = dataset.schema();
        final Column[] columns = new Column[schema.size()];
        final DefaultQueryResultColumn[] queryColumns = new QueryResultRowTransform(schema, "orc").columns();
        for (int i = 0; i < schema.size(); ++i) {
            if (!queryColumns[i].getField().equals(schema.apply(i).name())) {
                columns[i] = new Column(schema.apply(i).name()).as(queryColumns[i].getField());
            } else {
                columns[i] = new Column(schema.apply(i).name());
            }
        }
        dataset = dataset.select(columns);
    }
    return dataset;
}
Also used: StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), DefaultQueryResultColumn (com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn), Column (org.apache.spark.sql.Column)
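For reference, here is a hedged sketch of the same aliasing idea on a plain Spark 1.x DataFrame. The sanitize() rule is hypothetical and only stands in for Kylo's QueryResultRowTransform; the Column/select mechanics are the same as in getDataSet() above.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.types.StructType;

public final class OrcColumnSanitizer {

    private OrcColumnSanitizer() {
    }

    // Hypothetical rule: replace characters that ORC column names may reject.
    static String sanitize(String name) {
        return name.replaceAll("[^A-Za-z0-9_]", "_");
    }

    // Returns a DataFrame whose columns are aliased to sanitized names,
    // aliasing only when the name actually changes, as in getDataSet().
    static DataFrame sanitizeColumns(DataFrame df) {
        final StructType schema = df.schema();
        final Column[] columns = new Column[schema.size()];
        for (int i = 0; i < schema.size(); ++i) {
            final String name = schema.apply(i).name();
            final String clean = sanitize(name);
            columns[i] = clean.equals(name) ? new Column(name) : new Column(name).as(clean);
        }
        return df.select(columns);
    }
}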

Example 3 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From the class TransformServiceTest, method execute:

/**
 * Verify executing a transformation request.
 */
@Test
@SuppressWarnings("unchecked")
public void execute() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService());
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    final String expectedScript;
    try (InputStream stream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        if (stream == null) {
            throw new IllegalStateException("transform-service-script1.scala failed to load");
        }
        expectedScript = IOUtils.toString(stream, "UTF-8");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(1, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
}
Also used: StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), InputStream (java.io.InputStream), NamedParam (scala.tools.nsc.interpreter.NamedParam), TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest), SparkContext (org.apache.spark.SparkContext), SparkContextService (com.thinkbiganalytics.spark.SparkContextService), TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse), List (java.util.List), SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine), StorageLevel (org.apache.spark.storage.StorageLevel), Test (org.junit.Test)
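The verify-and-capture step is the heart of this test. The following is a minimal, self-contained sketch of the same Mockito ArgumentCaptor pattern, assuming Mockito and JUnit 4 on the classpath; the Greeter interface is hypothetical and exists only to keep the example compact.

import java.util.Collections;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;

public class CaptorSketchTest {

    interface Greeter {
        void greet(String name, List<String> tags);
    }

    @Test
    @SuppressWarnings("unchecked")
    public void capturesArguments() {
        final Greeter greeter = Mockito.mock(Greeter.class);
        greeter.greet("kylo", Collections.singletonList("spark"));

        // Capture both arguments of the recorded call, just as the test
        // above captures the eval() script and its bindings.
        final ArgumentCaptor<String> name = ArgumentCaptor.forClass(String.class);
        final ArgumentCaptor<List> tags = ArgumentCaptor.forClass(List.class);
        Mockito.verify(greeter).greet(name.capture(), tags.capture());

        Assert.assertEquals("kylo", name.getValue());
        Assert.assertEquals(1, tags.getValue().size());
    }
}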

Example 4 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From the class TransformServiceTest, method executeWithDatasourceProviderFactory:

/**
 * Verify executing a transformation request with a data source provider factory.
 */
@Test
@SuppressWarnings("unchecked")
public void executeWithDatasourceProviderFactory() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Mock data source provider factory
    final DatasourceProvider datasourceProvider = Mockito.mock(DatasourceProvider.class);
    final DatasourceProviderFactory datasourceProviderFactory = Mockito.mock(DatasourceProviderFactory.class);
    Mockito.when(datasourceProviderFactory.getDatasourceProvider(Mockito.anyCollectionOf(Datasource.class))).thenReturn(datasourceProvider);
    // Mock profiler
    final Profiler profiler = Mockito.mock(Profiler.class);
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDatasources(Collections.singletonList(Mockito.mock(Datasource.class)));
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService());
    service.setDatasourceProviderFactory(datasourceProviderFactory);
    service.setProfiler(profiler);
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    final String expectedScript;
    try (InputStream stream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        if (stream == null) {
            throw new IllegalStateException("transform-service-script1.scala failed to load");
        }
        expectedScript = IOUtils.toString(stream, "UTF-8");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(2, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
    Assert.assertEquals("datasourceProvider", bindings.get(1).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.shell.DatasourceProvider[org.apache.spark.sql.DataFrame]", bindings.get(1).tpe());
    Assert.assertEquals(datasourceProvider, bindings.get(1).value());
}
Also used: Datasource (com.thinkbiganalytics.spark.rest.model.Datasource), DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider), StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), InputStream (java.io.InputStream), DatasourceProviderFactory (com.thinkbiganalytics.spark.shell.DatasourceProviderFactory), NamedParam (scala.tools.nsc.interpreter.NamedParam), TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest), SparkContext (org.apache.spark.SparkContext), Profiler (com.thinkbiganalytics.spark.dataprofiler.Profiler), SparkContextService (com.thinkbiganalytics.spark.SparkContextService), TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse), List (java.util.List), SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine), StorageLevel (org.apache.spark.storage.StorageLevel), Test (org.junit.Test)
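Both tests read transform-service-script1.scala from the classpath, and the original one-liner risked an unhelpful NullPointerException when the resource is missing. A small helper like the hedged sketch below centralizes the corrected pattern; the TestResources name is illustrative, and commons-io's IOUtils is assumed, as in the tests themselves.

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.IOUtils;

final class TestResources {

    private TestResources() {
    }

    // Reads a classpath resource as UTF-8, failing fast when it is missing.
    static String readResource(Class<?> owner, String name) throws IOException {
        try (InputStream stream = owner.getResourceAsStream(name)) {
            if (stream == null) {
                throw new IOException(name + " not found on classpath");
            }
            return IOUtils.toString(stream, "UTF-8");
        }
    }
}

With this helper, both tests reduce to a single call such as TestResources.readResource(getClass(), "transform-service-script1.scala").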

Example 5 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From the class StandardDataValidator, method saveInvalidToTable:

@Override
public void saveInvalidToTable(@Nonnull final String databaseName, @Nonnull final String tableName, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    // Return a new RDD containing only the invalid results
    // noinspection serial
    JavaRDD<CleansedRowResult> invalidResultRDD = result.getCleansedRowResultRDD().filter(new Function<CleansedRowResult, Boolean>() {

        @Override
        public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
            return !cleansedRowResult.isRowValid();
        }
    });
    final StructType invalidSchema = new StructType(resolveSchema(databaseName, tableName, hiveContext));
    DataSet invalidDataFrame = getRows(invalidResultRDD, invalidSchema, hiveContext);
    writeToTargetTable(invalidDataFrame, databaseName, tableName, hiveContext);
    log.info("Wrote values to the invalid table {}", tableName);
}
Also used: StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet)
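The anonymous Function passed to filter() is the standard Spark 1.x Java idiom for row-level predicates. Here is a self-contained sketch of that idiom, assuming a local Spark 1.x context; the Record class is hypothetical and merely stands in for Kylo's CleansedRowResult.

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FilterSketch {

    // Stand-in for CleansedRowResult: a row with a validity flag.
    static class Record implements Serializable {
        final boolean valid;

        Record(boolean valid) {
            this.valid = valid;
        }

        boolean isRowValid() {
            return valid;
        }
    }

    public static void main(String[] args) {
        final JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("filter-sketch").setMaster("local[2]"));

        final JavaRDD<Record> records = jsc.parallelize(
            Arrays.asList(new Record(true), new Record(false), new Record(true)));

        // Same anonymous-Function idiom as saveInvalidToTable: keep only
        // rows whose validity flag is false.
        final JavaRDD<Record> invalid = records.filter(new Function<Record, Boolean>() {

            @Override
            public Boolean call(Record r) {
                return !r.isRowValid();
            }
        });

        System.out.println("invalid rows: " + invalid.count());
        jsc.stop();
    }
}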

Aggregations

DataSet (com.thinkbiganalytics.spark.DataSet): 14 usages
StructType (org.apache.spark.sql.types.StructType): 7 usages
Nonnull (javax.annotation.Nonnull): 4 usages
SparkContext (org.apache.spark.SparkContext): 4 usages
TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse): 3 usages
Row (org.apache.spark.sql.Row): 3 usages
NamedParam (scala.tools.nsc.interpreter.NamedParam): 3 usages
SparkContextService (com.thinkbiganalytics.spark.SparkContextService): 2 usages
StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 2 usages
ShellTransformStage (com.thinkbiganalytics.spark.metadata.ShellTransformStage): 2 usages
SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine): 2 usages
TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest): 2 usages
DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider): 2 usages
InputStream (java.io.InputStream): 2 usages
ArrayList (java.util.ArrayList): 2 usages
List (java.util.List): 2 usages
HiveContext (org.apache.spark.sql.hive.HiveContext): 2 usages
StructField (org.apache.spark.sql.types.StructField): 2 usages
StorageLevel (org.apache.spark.storage.StorageLevel): 2 usages
Test (org.junit.Test): 2 usages