Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.
From the class ModifiedSchema, method getValidTableSchema:
@Nonnull
public static StructType getValidTableSchema(@Nonnull final StructField[] feedFields, @Nonnull final StructField[] validFields, @Nonnull final FieldPolicy[] policies) {
    // Map of the lowercase valid field name to the valid field
    final Map<String, StructField> validFieldsMap = new HashMap<>();
    for (StructField validField : validFields) {
        String lowerFieldName = validField.name().toLowerCase();
        validFieldsMap.put(lowerFieldName, validField);
    }
    // List of all the feedFieldNames that are part of the policyMap
    final List<String> policyMapFeedFieldNames = new ArrayList<>();
    // A map of the validFieldName to the feedFieldName
    final Map<String, String> validFieldToFeedFieldMap = new HashMap<>();
    // List of all those validFieldNames that have a standardizer on them
    final List<String> validFieldsWithStandardizers = new ArrayList<>();
    for (FieldPolicy policy : policies) {
        if (policy.getField() != null) {
            String feedFieldName = policy.getFeedField().toLowerCase();
            String fieldName = policy.getField().toLowerCase();
            policyMapFeedFieldNames.add(feedFieldName);
            validFieldToFeedFieldMap.put(fieldName, feedFieldName);
            if (policy.hasStandardizationPolicies()) {
                validFieldsWithStandardizers.add(fieldName);
            }
        }
    }
    List<StructField> fieldsList = new ArrayList<>(feedFields.length);
    for (StructField feedField : feedFields) {
        String lowerFeedFieldName = feedField.name().toLowerCase();
        if (policyMapFeedFieldNames.contains(lowerFeedFieldName)) {
            StructField field = feedField;
            // get the corresponding valid table field name
            String lowerFieldName = validFieldToFeedFieldMap.get(lowerFeedFieldName);
            // if we are standardizing then use the field type matching the _valid table
            if (validFieldsWithStandardizers.contains(lowerFieldName)) {
                // get the valid table field
                field = validFieldsMap.get(lowerFieldName);
                HCatDataType dataType = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
                if (dataType != null && dataType.isDateOrTimestamp()) {
                    field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
                }
            }
            fieldsList.add(field);
        } else {
            log.warn("Valid table field {} is not present in policy map", lowerFeedFieldName);
        }
    }
    // Append the processing partition column and insert the reject reason column just before it
    fieldsList.add(new StructField(CleanseAndValidateRow.PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
    fieldsList.add(fieldsList.size() - 1, new StructField(CleanseAndValidateRow.REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
    return new StructType(fieldsList.toArray(new StructField[0]));
}
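A minimal sketch of the StructType mechanics this method relies on: a date/timestamp column from the valid table is carried as a string, and two extra string columns are appended. The column names used here (including the reject-reason and processing columns) are illustrative assumptions, not taken from CleanseAndValidateRow.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class ValidSchemaSketch {

    public static void main(String[] args) {
        // Illustrative valid-table field whose declared type is a timestamp
        StructField created = new StructField("created_at", DataTypes.TimestampType, true, Metadata.empty());

        // Dates and timestamps are carried as strings, mirroring getValidTableSchema
        StructField createdAsString = new StructField(created.name(), DataTypes.StringType, created.nullable(), created.metadata());

        List<StructField> fields = new ArrayList<>();
        fields.add(new StructField("id", DataTypes.LongType, false, Metadata.empty()));
        fields.add(createdAsString);

        // Append the reject-reason and processing columns (names assumed for illustration)
        fields.add(new StructField("dlp_reject_reason", DataTypes.StringType, true, Metadata.empty()));
        fields.add(new StructField("processing_dttm", DataTypes.StringType, true, Metadata.empty()));

        StructType schema = new StructType(fields.toArray(new StructField[0]));
        System.out.println(schema.treeString());
    }
}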
Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.
From the class TransformServiceTest, method executeWithDatasourceProviderFactory:
/**
 * Verify executing a transformation request with a data source provider factory.
 */
@Test
@SuppressWarnings("unchecked")
public void executeWithDatasourceProviderFactory() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Mock data source provider factory
    final DatasourceProvider datasourceProvider = Mockito.mock(DatasourceProvider.class);
    final DatasourceProviderFactory datasourceProviderFactory = Mockito.mock(DatasourceProviderFactory.class);
    Mockito.when(datasourceProviderFactory.getDatasourceProvider(Mockito.anyCollectionOf(Datasource.class), Mockito.anyCollectionOf(DataSource.class))).thenReturn(datasourceProvider);
    // Mock profiler
    final Profiler profiler = Mockito.mock(Profiler.class);
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDoProfile(true);
    request.setDatasources(Collections.singletonList(Mockito.mock(Datasource.class)));
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
    service.setDatasourceProviderFactory(datasourceProviderFactory);
    service.setProfiler(profiler);
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    InputStream inputStream = getClass().getResourceAsStream("transform-service-script1.scala");
    final String expectedScript = IOUtils.toString(inputStream, "UTF-8");
    inputStream.close();
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(2, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
    Assert.assertEquals("datasourceProvider", bindings.get(1).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.shell.DatasourceProvider[org.apache.spark.sql.DataFrame]", bindings.get(1).tpe());
    Assert.assertEquals(datasourceProvider, bindings.get(1).value());
}
Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.
From the class TransformServiceTest, method execute:
/**
 * Verify executing a transformation request.
 */
@Test
@SuppressWarnings("unchecked")
public void execute() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDoProfile(true);
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    String expectedScript = null;
    try (InputStream stream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        expectedScript = IOUtils.toString(stream, "UTF-8");
    }
    if (expectedScript == null) {
        throw new Exception("transform-service-script1.scala failed to load");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(1, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
}
Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.
From the class StandardDataValidator, method saveInvalidToTable:
@Override
public void saveInvalidToTable(@Nonnull final String databaseName, @Nonnull final String feedTableName, @Nonnull final String invalidTableName, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    // Filter the cleansed rows down to a new RDD containing only the invalid results
    // noinspection serial
    JavaRDD<CleansedRowResult> invalidResultRDD = result.getCleansedRowResultRDD().filter(new Function<CleansedRowResult, Boolean>() {
        @Override
        public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
            return !cleansedRowResult.isRowValid();
        }
    });
    final StructType invalidSchema = new StructType(resolveSchema(databaseName, invalidTableName, hiveContext));
    final StructType feedSchema = new StructType(resolveSchema(databaseName, feedTableName, hiveContext));
    final StructType mergedSchema = cloneSchemaWithNewTypes(invalidSchema, feedSchema);
    DataSet invalidDataFrame = getRows(invalidResultRDD, mergedSchema, hiveContext);
    writeToTargetTable(invalidDataFrame, databaseName, invalidTableName, hiveContext);
    log.info("wrote values to the invalid Table {}", invalidTableName);
}
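A minimal, self-contained sketch of the same filter-then-convert pattern, assuming a simple validity marker instead of Kylo's CleansedRowResult; resolveSchema, getRows and writeToTargetTable are Kylo helpers and are not reproduced here, and the column names are illustrative.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class InvalidRowsSketch {

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setMaster("local[1]").setAppName("invalid-rows-sketch"));
        SQLContext sqlContext = new SQLContext(sc);

        // Rows of (id, rejectReason); a null reject reason marks a valid row
        JavaRDD<Row> rows = sc.parallelize(Arrays.asList(
            RowFactory.create(1L, null),
            RowFactory.create(2L, "Value out of range")));

        // Keep only the invalid rows, mirroring the filter in saveInvalidToTable
        JavaRDD<Row> invalid = rows.filter(row -> row.get(1) != null);

        StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.LongType, false, Metadata.empty()),
            new StructField("dlp_reject_reason", DataTypes.StringType, true, Metadata.empty())});

        // Convert to a DataFrame with the target schema; a real implementation would write it to the invalid table
        sqlContext.createDataFrame(invalid, schema).show();

        sc.stop();
    }
}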
Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.
From the class StandardDataValidator, method cloneSchemaWithNewTypes:
/**
 * Creates a new schema (StructType) by taking the fields (with their types) from the typed schema
 * and any remaining fields from the original schema.
 *
 * @param originalSchema a schema with reference columns
 * @param typedSchema    a schema that will be used to define columns found in the originalSchema
 * @return a new schema
 */
private StructType cloneSchemaWithNewTypes(StructType originalSchema, StructType typedSchema) {
    Map<String, StructField> typesMap = new HashMap<>(typedSchema.fields().length);
    for (StructField sf : typedSchema.fields()) {
        typesMap.put(sf.name(), sf);
    }
    StructType st = new StructType();
    for (StructField sf : originalSchema.fields()) {
        if (typesMap.containsKey(sf.name())) {
            st = st.add(typesMap.get(sf.name()));
        } else {
            st = st.add(sf);
        }
    }
    return st;
}
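A small standalone sketch of the same merge idea: the column order of the original schema is preserved, but any column that also appears in the typed schema takes its type from there. The column names and types below are illustrative, not taken from Kylo.

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MergeSchemaSketch {

    public static void main(String[] args) {
        // Original schema: everything read as strings (e.g. from a raw feed table)
        StructType original = new StructType()
            .add("id", DataTypes.StringType)
            .add("amount", DataTypes.StringType)
            .add("comment", DataTypes.StringType);

        // Typed schema: the columns whose types should win
        StructType typed = new StructType()
            .add("id", DataTypes.LongType)
            .add("amount", DataTypes.DoubleType);

        // Same approach as cloneSchemaWithNewTypes: index the typed fields by name,
        // then rebuild the original schema, preferring the typed definition when present
        Map<String, StructField> typesByName = new HashMap<>();
        for (StructField field : typed.fields()) {
            typesByName.put(field.name(), field);
        }
        StructType merged = new StructType();
        for (StructField field : original.fields()) {
            merged = merged.add(typesByName.containsKey(field.name()) ? typesByName.get(field.name()) : field);
        }

        // id and amount take their types from the typed schema; comment keeps its original string type
        System.out.println(merged.treeString());
    }
}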