
Example 1 with SparkShellScriptRunner

Use of com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner in project kylo by Teradata.

From the class FileMetadataCompletionTask, the method run:

@Override
public void run() {
    List<String> exceptions = new ArrayList<>();
    // aggregate any execution exceptions reported by the individual tasks
    if (this.tasks != null) {
        exceptions.addAll(this.tasks.stream()
                              .filter(task -> task.hasExecutionException())
                              .map(task -> task.getExecutionException())
                              .collect(Collectors.toList()));
    }
    try {
        for (Map.Entry<SparkShellScriptRunner, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> e : this.fileSchemaScriptRunnerMap.entrySet()) {
            SparkShellScriptRunner runner = e.getKey();
            TransformResponse response = runner.getFinalResponse();
            if (response != null) {
                TransformResponseUtil responseTransformer = new TransformResponseUtil(response);
                Map<String, FileMetadataSchema> schemaMap = response.getResults().getRows().stream().map(row -> {
                    String mimeType = (String) responseTransformer.getRowValue("mimeType", row);
                    String filePath = (String) responseTransformer.getRowValue("resource", row);
                    Map<String, String> schema = responseTransformer.getRowValue("schema", row, Map.class);
                    return new FileMetadataSchema(mimeType, filePath, schema);
                }).collect(Collectors.toMap(key -> key.getFilePath(), key -> key));
                e.getValue().forEach(originalData -> {
                    FileMetadataSchema updatedSchema = schemaMap.get(originalData.getFilePath());
                    if (updatedSchema != null) {
                        String headers = ObjectMapperSerializer.serialize(updatedSchema.getSchema());
                        originalData.getProperties().put("headers", headers);
                        originalData.getProperties().put("headerCount", updatedSchema.getSchema().size() + "");
                    }
                });
            }
        }
    } catch (Exception e) {
        exceptions.add(e.getMessage());
    }
    if (!exceptions.isEmpty()) {
        String message = exceptions.stream().collect(Collectors.joining("\n"));
        this.modifiedTransformResponse.setMessage(message);
    }
    this.finalResult.complete(this.modifiedTransformResponse);
    this.complete = true;
}
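
A minimal sketch of how a caller might consume the task's result once run() completes it. The getFinalResult() accessor, the CompletableFuture type of finalResult, and the 30-second timeout are assumptions for illustration (getMessage() simply mirrors the setMessage(...) call above); the actual Kylo API may differ.

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

import com.thinkbiganalytics.spark.rest.model.FileMetadataResponse;
import com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse;

public class FileMetadataResultConsumer {

    public static ModifiedTransformResponse<FileMetadataResponse> awaitResult(FileMetadataCompletionTask task) throws Exception {
        // hypothetical accessor for the future that run() completes
        CompletableFuture<ModifiedTransformResponse<FileMetadataResponse>> future = task.getFinalResult();
        // blocks until run() calls finalResult.complete(this.modifiedTransformResponse)
        ModifiedTransformResponse<FileMetadataResponse> response = future.get(30, TimeUnit.SECONDS);
        if (response.getMessage() != null) {
            // run() joins any task exceptions with '\n' into the message field
            System.err.println("File metadata completed with errors:\n" + response.getMessage());
        }
        return response;
    }
}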
Also used : FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) ObjectMapperSerializer(com.thinkbiganalytics.json.ObjectMapperSerializer) TransformResponse(com.thinkbiganalytics.spark.rest.model.TransformResponse) List(java.util.List) TransformResponseUtil(com.thinkbiganalytics.spark.rest.controller.TransformResponseUtil) FileMetadataTransformResponseModifier(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier) Map(java.util.Map) SparkShellScriptRunner(com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) HashMap(java.util.HashMap) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList)

Example 2 with SparkShellScriptRunner

Use of com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner in project kylo by Teradata.

From the class FileMetadataTaskService, the method findFileMetadataSchemas:

/**
 * Groups the files by their respective mime type.
 * For each mime type that Spark can process, creates a task to determine the header information.
 */
public void findFileMetadataSchemas(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse, FileMetadataTransformResponseModifier resultModifier) {
    FileMetadataCompletionTask result = new FileMetadataCompletionTask(modifiedTransformResponse, resultModifier);
    metadataResultCache.put(result.getTableId(), result);
    Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> mimeTypeGroup = resultModifier.groupByMimeType();
    List<String> mimeTypes = Lists.newArrayList(mimeTypeGroup.keySet());
    mimeTypes.removeIf(type -> !PARSABLE_MIME_TYPES.contains(type));
    List<SparkShellScriptRunner> tasks = new ArrayList<>();
    for (String mimeType : mimeTypes) {
        List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata> data = mimeTypeGroup.get(mimeType);
        if ("application/xml".equals(mimeType)) {
            // XML files must additionally be grouped by their row tag
            Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rowTags = data.stream().collect(Collectors.groupingBy(row -> row.getRowTag()));
            for (Map.Entry<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rows : rowTags.entrySet()) {
                List<String> files = rows.getValue().stream().map(r -> r.getFilePath()).collect(Collectors.toList());
                SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, rows.getKey(), files), mimeType);
                tasks.add(shellScriptRunner);
                result.addTask(shellScriptRunner, rows.getValue());
            }
        } else {
            List<String> files = data.stream().map(r -> r.getFilePath()).collect(Collectors.toList());
            SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, null, files), mimeType);
            tasks.add(shellScriptRunner);
            result.addTask(shellScriptRunner, data);
        }
    }
    submitTasks(result, tasks);
}
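
The grouping that drives this method is plain java.util.stream bucketing. Below is a self-contained sketch of the same two-level strategy, with a hypothetical FileInfo record standing in for ParsedFileMetadata; the record, sample paths, and row tags are illustrative only.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class MimeTypeGroupingSketch {

    // stand-in for FileMetadataResponse.ParsedFileMetadata
    record FileInfo(String filePath, String mimeType, String rowTag) {}

    public static void main(String[] args) {
        List<FileInfo> files = List.of(
                new FileInfo("/data/a.csv", "text/csv", null),
                new FileInfo("/data/b.csv", "text/csv", null),
                new FileInfo("/data/c.xml", "application/xml", "record"),
                new FileInfo("/data/d.xml", "application/xml", "row"));

        // first level: bucket by mime type, as resultModifier.groupByMimeType() does
        Map<String, List<FileInfo>> byMime = files.stream()
                .collect(Collectors.groupingBy(FileInfo::mimeType));

        // second level: XML files only, bucket by row tag so each group
        // can share one generated Spark script, as in the branch above
        Map<String, List<FileInfo>> xmlByRowTag = byMime
                .getOrDefault("application/xml", List.of()).stream()
                .collect(Collectors.groupingBy(FileInfo::rowTag));

        byMime.forEach((mime, group) -> System.out.println(mime + " -> " + group.size() + " file(s)"));
        xmlByRowTag.forEach((tag, group) -> System.out.println("xml row tag '" + tag + "' -> " + group.size() + " file(s)"));
    }
}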
Also used : ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) SparkShellUserProcessService(com.thinkbiganalytics.spark.rest.controller.SparkShellUserProcessService) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) User(org.springframework.security.core.userdetails.User) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) Future(java.util.concurrent.Future) Lists(com.google.common.collect.Lists) FileMetadataSchemaScriptBuilder(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataSchemaScriptBuilder) Map(java.util.Map) SecurityContextHolder(org.springframework.security.core.context.SecurityContextHolder) Nonnull(javax.annotation.Nonnull) ExecutorService(java.util.concurrent.ExecutorService) SparkShellRestClient(com.thinkbiganalytics.spark.shell.SparkShellRestClient) CyclicBarrier(java.util.concurrent.CyclicBarrier) Logger(org.slf4j.Logger) FileMetadataTransformResponseModifier(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) BlockingQueue(java.util.concurrent.BlockingQueue) UUID(java.util.UUID) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) TimeUnit(java.util.concurrent.TimeUnit) SparkShellProxyController(com.thinkbiganalytics.spark.rest.controller.SparkShellProxyController) Component(org.springframework.stereotype.Component) List(java.util.List) SparkShellScriptRunner(com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner) CacheBuilder(com.google.common.cache.CacheBuilder) Cache(com.google.common.cache.Cache) Authentication(org.springframework.security.core.Authentication)

Aggregations

SparkShellScriptRunner (com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner)2 FileMetadataTransformResponseModifier (com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier)2 FileMetadataResponse (com.thinkbiganalytics.spark.rest.model.FileMetadataResponse)2 ModifiedTransformResponse (com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2 Cache (com.google.common.cache.Cache)1 CacheBuilder (com.google.common.cache.CacheBuilder)1 Lists (com.google.common.collect.Lists)1 ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder)1 ObjectMapperSerializer (com.thinkbiganalytics.json.ObjectMapperSerializer)1 SparkShellProxyController (com.thinkbiganalytics.spark.rest.controller.SparkShellProxyController)1 SparkShellUserProcessService (com.thinkbiganalytics.spark.rest.controller.SparkShellUserProcessService)1 TransformResponseUtil (com.thinkbiganalytics.spark.rest.controller.TransformResponseUtil)1 FileMetadataSchemaScriptBuilder (com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataSchemaScriptBuilder)1 TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse)1 SparkShellRestClient (com.thinkbiganalytics.spark.shell.SparkShellRestClient)1 HashMap (java.util.HashMap)1