Use of com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner in project kylo by Teradata.
From class FileMetadataCompletionTask, method run:
@Override
public void run() {
    List<String> exceptions = new ArrayList<>();
    // aggregate any execution exceptions collected by the individual tasks
    if (this.tasks != null) {
        exceptions.addAll(this.tasks.stream()
                              .filter(task -> task.hasExecutionException())
                              .map(task -> task.getExecutionException())
                              .collect(Collectors.toList()));
    }
    try {
        for (Map.Entry<SparkShellScriptRunner, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> e : this.fileSchemaScriptRunnerMap.entrySet()) {
            SparkShellScriptRunner runner = e.getKey();
            TransformResponse response = runner.getFinalResponse();
            if (response != null) {
                TransformResponseUtil responseTransformer = new TransformResponseUtil(response);
                // index each returned schema row by its file path
                Map<String, FileMetadataSchema> schemaMap = response.getResults().getRows().stream().map(row -> {
                    String mimeType = (String) responseTransformer.getRowValue("mimeType", row);
                    String filePath = (String) responseTransformer.getRowValue("resource", row);
                    Map<String, String> schema = responseTransformer.getRowValue("schema", row, Map.class);
                    return new FileMetadataSchema(mimeType, filePath, schema);
                }).collect(Collectors.toMap(key -> key.getFilePath(), key -> key));
                // merge the discovered header schema back into the original file metadata
                e.getValue().stream().forEach(originalData -> {
                    FileMetadataSchema updatedSchema = schemaMap.get(originalData.getFilePath());
                    if (updatedSchema != null) {
                        String headers = ObjectMapperSerializer.serialize(updatedSchema.getSchema());
                        originalData.getProperties().put("headers", headers);
                        originalData.getProperties().put("headerCount", updatedSchema.getSchema().size() + "");
                    }
                });
            }
        }
    } catch (Exception e) {
        exceptions.add(e.getMessage());
    }
    if (!exceptions.isEmpty()) {
        String message = exceptions.stream().collect(Collectors.joining("\n"));
        this.modifiedTransformResponse.setMessage(message);
    }
    this.finalResult.complete(this.modifiedTransformResponse);
    this.complete = true;
}
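The merge step above relies on Collectors.toMap to index the parsed rows by file path, so each original metadata entry can be matched in constant time. A minimal, self-contained sketch of that indexing pattern follows; the Parsed class and file paths here are illustrative stand-ins, not Kylo types:

import java.util.*;
import java.util.stream.Collectors;

public class SchemaMergeSketch {
    // Stand-in for FileMetadataSchema: just a path plus a column-name-to-type map
    static class Parsed {
        final String filePath;
        final Map<String, String> schema;
        Parsed(String filePath, Map<String, String> schema) {
            this.filePath = filePath;
            this.schema = schema;
        }
    }

    public static void main(String[] args) {
        List<Parsed> rows = Arrays.asList(
            new Parsed("/data/a.csv", Collections.singletonMap("col1", "string")),
            new Parsed("/data/b.csv", Collections.singletonMap("id", "int")));
        // Index rows by file path, mirroring the schemaMap built in run()
        Map<String, Parsed> byPath = rows.stream()
            .collect(Collectors.toMap(p -> p.filePath, p -> p));
        System.out.println(byPath.get("/data/b.csv").schema.size()); // prints 1
    }
}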
Use of com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner in project kylo by Teradata.
From class FileMetadataTaskService, method findFileMetadataSchemas:
/**
 * Group the files by their respective mime type.
 * For each mime type that Spark can parse, create a task to determine the header information.
 */
public void findFileMetadataSchemas(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse, FileMetadataTransformResponseModifier resultModifier) {
    FileMetadataCompletionTask result = new FileMetadataCompletionTask(modifiedTransformResponse, resultModifier);
    metadataResultCache.put(result.getTableId(), result);
    Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> mimeTypeGroup = resultModifier.groupByMimeType();
    // keep only the mime types Spark is able to parse
    List<String> mimeTypes = Lists.newArrayList(mimeTypeGroup.keySet());
    mimeTypes.removeIf(type -> !PARSABLE_MIME_TYPES.contains(type));
    List<SparkShellScriptRunner> tasks = new ArrayList<>();
    for (String mimeType : mimeTypes) {
        List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata> data = mimeTypeGroup.get(mimeType);
        if ("application/xml".equals(mimeType)) {
            // XML files must additionally be grouped by row tag, one Spark task per tag
            Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rowTags =
                data.stream().collect(Collectors.groupingBy(row -> row.getRowTag()));
            for (Map.Entry<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rows : rowTags.entrySet()) {
                List<String> files = rows.getValue().stream().map(r -> r.getFilePath()).collect(Collectors.toList());
                SparkShellScriptRunner shellScriptRunner =
                    new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(),
                                               FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, rows.getKey(), files), mimeType);
                tasks.add(shellScriptRunner);
                result.addTask(shellScriptRunner, rows.getValue());
            }
        } else {
            // every other parsable mime type gets a single task covering all of its files
            List<String> files = data.stream().map(r -> r.getFilePath()).collect(Collectors.toList());
            SparkShellScriptRunner shellScriptRunner =
                new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(),
                                           FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, null, files), mimeType);
            tasks.add(shellScriptRunner);
            result.addTask(shellScriptRunner, data);
        }
    }
    submitTasks(result, tasks);
}
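The XML branch fans the work out with Collectors.groupingBy, producing one bucket (and therefore one Spark task) per row tag. A self-contained sketch of that grouping step follows; the Row class and file paths are illustrative stand-ins for Kylo's ParsedFileMetadata, not real Kylo APIs:

import java.util.*;
import java.util.stream.Collectors;

public class RowTagGroupingSketch {
    // Stand-in for ParsedFileMetadata: only the two fields the grouping needs
    static class Row {
        final String rowTag;
        final String filePath;
        Row(String rowTag, String filePath) {
            this.rowTag = rowTag;
            this.filePath = filePath;
        }
    }

    public static void main(String[] args) {
        List<Row> xmlFiles = Arrays.asList(
            new Row("order", "/in/orders1.xml"),
            new Row("order", "/in/orders2.xml"),
            new Row("customer", "/in/customers.xml"));
        // One bucket per row tag; each bucket would become its own Spark task
        Map<String, List<Row>> byTag = xmlFiles.stream()
            .collect(Collectors.groupingBy(r -> r.rowTag));
        byTag.forEach((tag, rows) ->
            System.out.println(tag + " -> " + rows.size() + " file(s)"));
        // prints (key order unspecified): order -> 2 file(s), customer -> 1 file(s)
    }
}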