Use of com.thinkbiganalytics.spark.rest.controller.SparkShellUserProcessService in the Kylo project by Teradata:
the findFileMetadataSchemas method of the FileMetadataTaskService class.
/**
 * Groups the parsed files by their respective MIME type and, for each MIME type that Spark
 * can process, builds a {@link SparkShellScriptRunner} task that determines the header
 * (schema) information for those files. XML files are additionally sub-grouped by row tag,
 * since each row tag requires its own schema-detection script. All tasks are registered on a
 * {@link FileMetadataCompletionTask}, cached by table id, and then submitted for execution.
 *
 * @param modifiedTransformResponse the transform response to complete once all schema tasks finish
 * @param resultModifier            provides the parsed file metadata grouped by MIME type
 */
public void findFileMetadataSchemas(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse, FileMetadataTransformResponseModifier resultModifier) {
    FileMetadataCompletionTask result = new FileMetadataCompletionTask(modifiedTransformResponse, resultModifier);
    // Cache the completion task by table id so callbacks can locate it when results arrive.
    metadataResultCache.put(result.getTableId(), result);
    Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> mimeTypeGroup = resultModifier.groupByMimeType();
    // Only keep the MIME types Spark knows how to parse.
    List<String> mimeTypes = Lists.newArrayList(mimeTypeGroup.keySet());
    mimeTypes.removeIf(type -> !PARSABLE_MIME_TYPES.contains(type));
    List<SparkShellScriptRunner> tasks = new ArrayList<>();
    for (String mimeType : mimeTypes) {
        List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata> data = mimeTypeGroup.get(mimeType);
        // BUG FIX: was 'mimeType == "application/xml"' — String identity comparison only
        // matched by accident of interning; use null-safe equals() for value comparison.
        if ("application/xml".equals(mimeType)) {
            // XML files must be grouped by row tag: each distinct row tag needs its own
            // schema-detection script.
            Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rowTags = data.stream().collect(Collectors.groupingBy(row -> row.getRowTag()));
            for (Map.Entry<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rows : rowTags.entrySet()) {
                List<String> files = rows.getValue().stream().map(r -> r.getFilePath()).collect(Collectors.toList());
                SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, rows.getKey(), files), mimeType);
                tasks.add(shellScriptRunner);
                result.addTask(shellScriptRunner, rows.getValue());
            }
        } else {
            // Non-XML types: one schema-detection task covering all files of this MIME type.
            List<String> files = data.stream().map(r -> r.getFilePath()).collect(Collectors.toList());
            SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, null, files), mimeType);
            tasks.add(shellScriptRunner);
            result.addTask(shellScriptRunner, data);
        }
    }
    submitTasks(result, tasks);
}
Aggregations