
Example 1 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

In class ModifiedSchema, method getValidTableSchema:

@Nonnull
public static StructType getValidTableSchema(@Nonnull final StructField[] feedFields, @Nonnull final StructField[] validFields, @Nonnull final FieldPolicy[] policies) {
    // Map of the lowercase valid field name to its StructField
    final Map<String, StructField> validFieldsMap = new HashMap<>();
    for (StructField validField : validFields) {
        String lowerFieldName = validField.name().toLowerCase();
        validFieldsMap.put(lowerFieldName, validField);
    }
    // List of all the feedFieldNames that are part of the policyMap
    final List<String> policyMapFeedFieldNames = new ArrayList<>();
    // A map of the feedFieldName to validFieldName
    final Map<String, String> validFieldToFeedFieldMap = new HashMap<>();
    // List of all those validFieldNames that have a standardizer on them
    final List<String> validFieldsWithStandardizers = new ArrayList<>();
    for (FieldPolicy policy : policies) {
        if (policy.getField() != null) {
            String feedFieldName = policy.getFeedField().toLowerCase();
            String fieldName = policy.getField().toLowerCase();
            policyMapFeedFieldNames.add(feedFieldName);
            validFieldToFeedFieldMap.put(fieldName, feedFieldName);
            if (policy.hasStandardizationPolicies()) {
                validFieldsWithStandardizers.add(fieldName);
            }
        }
    }
    List<StructField> fieldsList = new ArrayList<>(feedFields.length);
    for (StructField feedField : feedFields) {
        String lowerFeedFieldName = feedField.name().toLowerCase();
        if (policyMapFeedFieldNames.contains(lowerFeedFieldName)) {
            StructField field = feedField;
            // get the corresponding valid table field name
            String lowerFieldName = validFieldToFeedFieldMap.get(lowerFeedFieldName);
            // if we are standardizing then use the field type matching the _valid table
            if (validFieldsWithStandardizers.contains(lowerFieldName)) {
                // use the corresponding field definition from the _valid table
                field = validFieldsMap.get(lowerFieldName);
                HCatDataType dataType = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
                if (dataType != null && dataType.isDateOrTimestamp()) {
                    field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
                }
            }
            fieldsList.add(field);
        } else {
            log.warn("Valid table field {} is not present in policy map", lowerFeedFieldName);
        }
    }
    // Append the processing partition column, then insert the reject reason column just before it
    fieldsList.add(new StructField(CleanseAndValidateRow.PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
    fieldsList.add(fieldsList.size() - 1, new StructField(CleanseAndValidateRow.REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
    return new StructType(fieldsList.toArray(new StructField[0]));
}
Also used : StructField(org.apache.spark.sql.types.StructField) FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) Nonnull(javax.annotation.Nonnull)
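
The two trailing add calls determine the final column order of the valid table schema. A minimal sketch of that ordering, using made-up column names in place of the CleanseAndValidateRow constants:

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class ValidSchemaOrderingSketch {

    public static void main(String[] args) {
        // One ordinary data column, as the policy loop above would produce.
        final List<StructField> fields = new ArrayList<>();
        fields.add(new StructField("customer_name", DataTypes.StringType, true, Metadata.empty()));

        // Append the processing partition column, then insert the reject reason
        // column immediately before it (same add / add(index, ...) pattern as above).
        fields.add(new StructField("processing_dttm", DataTypes.StringType, true, Metadata.empty()));
        fields.add(fields.size() - 1, new StructField("reject_reason", DataTypes.StringType, true, Metadata.empty()));

        final StructType schema = new StructType(fields.toArray(new StructField[0]));
        // Prints: customer_name, reject_reason, processing_dttm
        System.out.println(String.join(", ", schema.fieldNames()));
    }
}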

Example 2 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

In class TransformServiceTest, method executeWithDatasourceProviderFactory:

/**
 * Verify executing a transformation request with a data source provider factory.
 */
@Test
@SuppressWarnings("unchecked")
public void executeWithDatasourceProviderFactory() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Mock data source provider factory
    final DatasourceProvider datasourceProvider = Mockito.mock(DatasourceProvider.class);
    final DatasourceProviderFactory datasourceProviderFactory = Mockito.mock(DatasourceProviderFactory.class);
    Mockito.when(datasourceProviderFactory.getDatasourceProvider(Mockito.anyCollectionOf(Datasource.class), Mockito.anyCollectionOf(DataSource.class))).thenReturn(datasourceProvider);
    // Mock profiler
    final Profiler profiler = Mockito.mock(Profiler.class);
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDoProfile(true);
    request.setDatasources(Collections.singletonList(Mockito.mock(Datasource.class)));
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
    service.setDatasourceProviderFactory(datasourceProviderFactory);
    service.setProfiler(profiler);
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    final String expectedScript;
    try (InputStream inputStream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        expectedScript = IOUtils.toString(inputStream, "UTF-8");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(2, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
    Assert.assertEquals("datasourceProvider", bindings.get(1).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.shell.DatasourceProvider[org.apache.spark.sql.DataFrame]", bindings.get(1).tpe());
    Assert.assertEquals(datasourceProvider, bindings.get(1).value());
}
Also used : Datasource(com.thinkbiganalytics.spark.rest.model.Datasource) DatasourceProvider(com.thinkbiganalytics.spark.shell.DatasourceProvider) StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) InputStream(java.io.InputStream) DatasourceProviderFactory(com.thinkbiganalytics.spark.shell.DatasourceProviderFactory) NamedParam(scala.tools.nsc.interpreter.NamedParam) TransformRequest(com.thinkbiganalytics.spark.rest.model.TransformRequest) DataSource(com.thinkbiganalytics.kylo.catalog.rest.model.DataSource) SparkContext(org.apache.spark.SparkContext) Profiler(com.thinkbiganalytics.spark.dataprofiler.Profiler) KyloCatalogClientBuilder(com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClientBuilder) SparkContextService(com.thinkbiganalytics.spark.SparkContextService) TransformResponse(com.thinkbiganalytics.spark.rest.model.TransformResponse) List(java.util.List) SparkScriptEngine(com.thinkbiganalytics.spark.repl.SparkScriptEngine) StorageLevel(org.apache.spark.storage.StorageLevel) Test(org.junit.Test)
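
Note that new StructType() in the dataSet.schema() stub above is a schema with zero fields, which is enough here because the test never inspects columns. If a test did need real field names, a small helper could build a typed schema for the stub to return; a minimal sketch with made-up column names:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * Builds a two-column schema that a mocked DataSet could return from schema()
 * when a test needs named, typed fields instead of an empty StructType.
 */
private static StructType idValueSchema() {
    return new StructType(new StructField[]{
        new StructField("id", DataTypes.LongType, false, Metadata.empty()),
        new StructField("value", DataTypes.StringType, true, Metadata.empty())
    });
}

The stub would then read Mockito.when(dataSet.schema()).thenReturn(idValueSchema()); instead of returning new StructType().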

Example 3 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

In class TransformServiceTest, method execute:

/**
 * Verify executing a transformation request.
 */
@Test
@SuppressWarnings("unchecked")
public void execute() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDoProfile(true);
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    final String expectedScript;
    try (InputStream stream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        // Fail fast if the expected script is missing from the test classpath
        Assert.assertNotNull("transform-service-script1.scala not found on classpath", stream);
        expectedScript = IOUtils.toString(stream, "UTF-8");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(1, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
}
Also used : StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) InputStream(java.io.InputStream) NamedParam(scala.tools.nsc.interpreter.NamedParam) TransformRequest(com.thinkbiganalytics.spark.rest.model.TransformRequest) SparkContext(org.apache.spark.SparkContext) KyloCatalogClientBuilder(com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClientBuilder) SparkContextService(com.thinkbiganalytics.spark.SparkContextService) TransformResponse(com.thinkbiganalytics.spark.rest.model.TransformResponse) List(java.util.List) SparkScriptEngine(com.thinkbiganalytics.spark.repl.SparkScriptEngine) StorageLevel(org.apache.spark.storage.StorageLevel) Test(org.junit.Test)
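
The raw ArgumentCaptor<List> above is part of what @SuppressWarnings("unchecked") covers; Mockito's @Captor annotation can produce a typed captor instead. A minimal sketch of that alternative (a test-class field plus a setup method), not how the Kylo test is actually written:

import java.util.List;

import org.junit.Before;
import org.mockito.ArgumentCaptor;
import org.mockito.Captor;
import org.mockito.MockitoAnnotations;

import scala.tools.nsc.interpreter.NamedParam;

// Typed captor for the eval(...) bindings, replacing the raw List captor above.
@Captor
private ArgumentCaptor<List<NamedParam>> evalBindings;

@Before
public void initCaptors() {
    MockitoAnnotations.initMocks(this);
}

The verify call and evalBindings.getValue() stay exactly as in the test above, but the bindings list no longer needs an unchecked assignment.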

Example 4 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

In class StandardDataValidator, method saveInvalidToTable:

@Override
public void saveInvalidToTable(@Nonnull final String databaseName, @Nonnull final String feedTableName, @Nonnull final String invalidTableName, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    // Build an RDD containing only the invalid rows
    // noinspection serial
    JavaRDD<CleansedRowResult> invalidResultRDD = result.getCleansedRowResultRDD().filter(new Function<CleansedRowResult, Boolean>() {

        @Override
        public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
            return !cleansedRowResult.isRowValid();
        }
    });
    final StructType invalidSchema = new StructType(resolveSchema(databaseName, invalidTableName, hiveContext));
    final StructType feedSchema = new StructType(resolveSchema(databaseName, feedTableName, hiveContext));
    final StructType mergedSchema = cloneSchemaWithNewTypes(invalidSchema, feedSchema);
    DataSet invalidDataFrame = getRows(invalidResultRDD, mergedSchema, hiveContext);
    writeToTargetTable(invalidDataFrame, databaseName, invalidTableName, hiveContext);
    log.info("wrote values to the invalid Table  {}", invalidTableName);
}
Also used : StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet)
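
On a Java 8 build the anonymous Function above can be written as a lambda with identical behavior, since org.apache.spark.api.java.function.Function has a single abstract method. A minimal sketch of the same filter, reusing the result parameter of the method above:

// Keep only the rows that failed validation (same predicate as the anonymous class).
JavaRDD<CleansedRowResult> invalidResultRDD =
    result.getCleansedRowResultRDD().filter(cleansedRowResult -> !cleansedRowResult.isRowValid());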

Example 5 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

In class StandardDataValidator, method cloneSchemaWithNewTypes:

/**
 * Creates a new schema (StructType) by taking the fields (with their types) from the typed schema
 * and any remaining fields from the original schema.
 *
 * @param originalSchema  a schema with reference columns
 * @param typedSchema     a schema whose field types are used for columns that also appear in the original schema
 *
 * @return a new schema
 */
private StructType cloneSchemaWithNewTypes(StructType originalSchema, StructType typedSchema) {
    Map<String, StructField> typesMap = new HashMap<>(typedSchema.fields().length);
    for (StructField sf : typedSchema.fields()) {
        typesMap.put(sf.name(), sf);
    }
    StructType st = new StructType();
    for (StructField sf : originalSchema.fields()) {
        if (typesMap.containsKey(sf.name())) {
            st = st.add(typesMap.get(sf.name()));
        } else {
            st = st.add(sf);
        }
    }
    return st;
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap)
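
As a concrete illustration with made-up column names: if the _invalid table schema is all strings and the feed table schema carries the real types, the merge keeps the invalid table's column order but takes the feed types wherever a name matches. A minimal sketch:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

private static void schemaMergeIllustration() {
    // originalSchema: the reference (_invalid table) columns, loosely typed.
    StructType originalSchema = new StructType()
            .add("id", DataTypes.StringType)
            .add("amount", DataTypes.StringType)
            .add("reject_reason", DataTypes.StringType);

    // typedSchema: the feed table columns with their real types.
    StructType typedSchema = new StructType()
            .add("id", DataTypes.LongType)
            .add("amount", DataTypes.DoubleType);

    // cloneSchemaWithNewTypes(originalSchema, typedSchema) keeps the original
    // column order but takes the feed types where the names match:
    //   id LONG, amount DOUBLE, reject_reason STRING
}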

Aggregations

StructType (org.apache.spark.sql.types.StructType): 418
StructField (org.apache.spark.sql.types.StructField): 228
Row (org.apache.spark.sql.Row): 200
ArrayList (java.util.ArrayList): 152
Test (org.junit.Test): 131
Script (org.apache.sysml.api.mlcontext.Script): 68
SparkSession (org.apache.spark.sql.SparkSession): 61
List (java.util.List): 41
DataType (org.apache.spark.sql.types.DataType): 40
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 36
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 34
DenseVector (org.apache.spark.ml.linalg.DenseVector): 33
Map (java.util.Map): 31
ArrayType (org.apache.spark.sql.types.ArrayType): 30
Dataset (org.apache.spark.sql.Dataset): 28
Tuple2 (scala.Tuple2): 28
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 27
Vector (org.apache.spark.ml.linalg.Vector): 27
IOException (java.io.IOException): 26
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 25