Search in sources :

Example 1 with FileMetadataResponse

use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.

the class FileMetadataResultModifier method modifySuccessfulResults.

/**
 * Converts the rows of the transform result into a {@link FileMetadataResponse} and stores it
 * as the results of the supplied response.
 *
 * <p>Each row is read into a {@code ParsedFileMetadata} (mime type, delimiter, encoding and file path),
 * the parsed files are grouped by {@code ParsedFileMetadata#getKey}, and every group becomes one
 * {@code FileDataSet}. When a schema parser is found for the group's mime type it is attached to the
 * data set, and when that parser is a {@link SparkFileSchemaParser} its spark script for the group's
 * file paths is attached as well.</p>
 *
 * @param modifiedTransformResponse the response that receives the generated {@link FileMetadataResponse}
 */
@Override
public void modifySuccessfulResults(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    // Parse each result row and group the parsed files by their grouping key.
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream().map(objects -> {
        FileMetadataResponse.ParsedFileMetadata metadata = new FileMetadataResponse.ParsedFileMetadata();
        metadata.setMimeType((String) getRowValue("mimeType", objects));
        metadata.setDelimiter((String) getRowValue("delimiter", objects));
        metadata.setFilePath((String) getRowValue("resource", objects));
        metadata.setEncoding((String) getRowValue("encoding", objects));
        return metadata;
    }).collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // create user friendly names from the groupings
    // the key in the resulting map is the file path of the first file in each group
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.values().stream().collect(Collectors.toMap(files -> files.get(0).getFilePath(), files -> {
        FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
        FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
        dataSet.setFiles(files);
        dataSet.setMimeType(firstFile.getMimeType());
        findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
            dataSet.setSchemaParser(schemaParserDescriptor);
            // if the parser has a delimiter property set it
            // (constant-first equals so a property without an object property name is skipped instead of throwing)
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "separatorChar".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getDelimiter()));
            // only spark based parsers can supply a sample script for the grouped files
            fileSchemaParser(schemaParserDescriptor)
                .filter(SparkFileSchemaParser.class::isInstance)
                .map(SparkFileSchemaParser.class::cast)
                .ifPresent(sparkParser -> {
                    List<String> paths = files.stream().map(FileMetadataResponse.ParsedFileMetadata::getFilePath).collect(Collectors.toList());
                    SampleFileSparkScript sparkScript = sparkParser.getSparkScript(paths);
                    dataSet.setSparkScript(sparkScript);
                });
        });
        return dataSet;
    }));

    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();
    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
}
Also used : SampleFileSparkScript(com.thinkbiganalytics.discovery.parser.SampleFileSparkScript) Arrays(java.util.Arrays) SchemaParserDescriptor(com.thinkbiganalytics.discovery.model.SchemaParserDescriptor) AbstractTransformResponseModifier(com.thinkbiganalytics.spark.rest.controller.AbstractTransformResponseModifier) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) FileParserFactory(com.thinkbiganalytics.discovery.FileParserFactory) FileMetadata(com.thinkbiganalytics.kylo.metadata.file.FileMetadata) StringUtils(org.apache.commons.lang3.StringUtils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SparkFileSchemaParser(com.thinkbiganalytics.discovery.parser.SparkFileSchemaParser) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) FileSchemaParser(com.thinkbiganalytics.discovery.parser.FileSchemaParser) List(java.util.List) Map(java.util.Map) Optional(java.util.Optional) SchemaParserAnnotationTransformer(com.thinkbiganalytics.discovery.rest.controller.SchemaParserAnnotationTransformer) SparkFileSchemaParser(com.thinkbiganalytics.discovery.parser.SparkFileSchemaParser) Optional(java.util.Optional) FileMetadata(com.thinkbiganalytics.kylo.metadata.file.FileMetadata) SampleFileSparkScript(com.thinkbiganalytics.discovery.parser.SampleFileSparkScript) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with FileMetadataResponse

use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.

the class FileMetadataTaskService method findFileMetadataSchemas.

/**
 * Group the files by their respective mime type.
 * For each mime type that spark can process create a task to determine the header information.
 *
 * <p>A {@code FileMetadataCompletionTask} is created for the response and cached under its table id.
 * Mime types that are not contained in {@code PARSABLE_MIME_TYPES} are ignored. Files of type
 * {@code application/xml} are further grouped by their row tag so that one script runner is created
 * per row tag; every other mime type gets a single script runner for all of its files.
 * All runners are then handed to {@code submitTasks}.</p>
 *
 * @param modifiedTransformResponse the response the completion task is created for
 * @param resultModifier            the modifier that supplies the files grouped by mime type
 */
public void findFileMetadataSchemas(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse, FileMetadataTransformResponseModifier resultModifier) {
    FileMetadataCompletionTask result = new FileMetadataCompletionTask(modifiedTransformResponse, resultModifier);
    metadataResultCache.put(result.getTableId(), result);
    Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> mimeTypeGroup = resultModifier.groupByMimeType();
    // only keep the mime types that can be parsed
    List<String> mimeTypes = Lists.newArrayList(mimeTypeGroup.keySet());
    mimeTypes.removeIf(type -> !PARSABLE_MIME_TYPES.contains(type));
    List<SparkShellScriptRunner> tasks = new ArrayList<>();
    for (String mimeType : mimeTypes) {
        List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata> data = mimeTypeGroup.get(mimeType);
        // compare by value: the map key is not guaranteed to be the same String instance as the literal
        if ("application/xml".equals(mimeType)) {
            // need to group by rowtag
            Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rowTags = data.stream().collect(Collectors.groupingBy(row -> row.getRowTag()));
            for (Map.Entry<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rows : rowTags.entrySet()) {
                List<String> files = rows.getValue().stream().map(r -> r.getFilePath()).collect(Collectors.toList());
                SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, rows.getKey(), files), mimeType);
                tasks.add(shellScriptRunner);
                result.addTask(shellScriptRunner, rows.getValue());
            }
        } else {
            // no row tag applies to the other mime types
            List<String> files = data.stream().map(r -> r.getFilePath()).collect(Collectors.toList());
            SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, null, files), mimeType);
            tasks.add(shellScriptRunner);
            result.addTask(shellScriptRunner, data);
        }
    }
    submitTasks(result, tasks);
}
Also used : ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) SparkShellUserProcessService(com.thinkbiganalytics.spark.rest.controller.SparkShellUserProcessService) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) User(org.springframework.security.core.userdetails.User) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) Future(java.util.concurrent.Future) Lists(com.google.common.collect.Lists) FileMetadataSchemaScriptBuilder(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataSchemaScriptBuilder) Map(java.util.Map) SecurityContextHolder(org.springframework.security.core.context.SecurityContextHolder) Nonnull(javax.annotation.Nonnull) ExecutorService(java.util.concurrent.ExecutorService) SparkShellRestClient(com.thinkbiganalytics.spark.shell.SparkShellRestClient) CyclicBarrier(java.util.concurrent.CyclicBarrier) Logger(org.slf4j.Logger) FileMetadataTransformResponseModifier(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) BlockingQueue(java.util.concurrent.BlockingQueue) UUID(java.util.UUID) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) TimeUnit(java.util.concurrent.TimeUnit) SparkShellProxyController(com.thinkbiganalytics.spark.rest.controller.SparkShellProxyController) Component(org.springframework.stereotype.Component) List(java.util.List) SparkShellScriptRunner(com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner) CacheBuilder(com.google.common.cache.CacheBuilder) Cache(com.google.common.cache.Cache) Authentication(org.springframework.security.core.Authentication) ArrayList(java.util.ArrayList) 
SparkShellScriptRunner(com.thinkbiganalytics.spark.rest.controller.SparkShellScriptRunner) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map)

Example 3 with FileMetadataResponse

use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.

the class FileMetadataTest method testChainedResult.

/**
 * Feeds a successful transform response containing three parquet, two avro and two csv files
 * through {@link FileMetadataTransformResponseModifier#modify} and verifies that the resulting
 * {@link FileMetadataResponse} contains three data sets, each keyed by the first file of its group.
 *
 * <p>The results may not be available immediately, so the test polls once per second
 * (at most 41 attempts) until {@code getResults()} returns a value.</p>
 */
@Test
public void testChainedResult() {
    setup();
    TransformResponse initialResponse = new TransformResponse();
    initialResponse.setStatus(TransformResponse.Status.SUCCESS);
    TransformQueryResult result = new TransformQueryResult();
    List<QueryResultColumn> columnList = new ArrayList<>();
    columnList.add(newColumn("mimeType"));
    columnList.add(newColumn("delimiter"));
    columnList.add(newColumn("headerCount"));
    columnList.add(newColumn("resource"));
    columnList.add(newColumn("encoding"));
    result.setColumns(columnList);
    List<List<Object>> rows = new ArrayList<>();
    rows.add(newtRow("application/parquet", "file://my/parquet001.parquet"));
    rows.add(newtRow("application/parquet", "file://my/parquet002.parquet"));
    rows.add(newtRow("application/parquet", "file://my/parquet003.parquet"));
    rows.add(newtRow("application/avro", "file://my/avro001.avro"));
    rows.add(newtRow("application/avro", "file://my/avro002.avro"));
    rows.add(newtRow("text/csv", "file://my/test001.csv"));
    rows.add(newtRow("text/csv", "file://my/test002.csv"));
    result.setRows(rows);
    initialResponse.setResults(result);
    initialResponse.setTable(UUID.randomUUID().toString());
    FileMetadataTransformResponseModifier fileMetadataResult = new FileMetadataTransformResponseModifier(trackerService);
    ModifiedTransformResponse<FileMetadataResponse> m = fileMetadataResult.modify(initialResponse);
    FileMetadataResponse response = m.getResults();
    int retryCount = 0;
    long start = System.currentTimeMillis();
    // poll until the chained result shows up or the retry budget is used up
    while (response == null && retryCount <= 40) {
        Uninterruptibles.sleepUninterruptibly(1000, TimeUnit.MILLISECONDS);
        response = m.getResults();
        retryCount += 1;
    }
    long stop = System.currentTimeMillis();
    log.info("Time to get chained response {} ms, Retry Attempts: {}", (stop - start), retryCount);
    // fail with a readable message instead of a NullPointerException when the result never arrived
    Assert.assertNotNull("No chained response was returned after " + retryCount + " retry attempts", response);
    Assert.assertEquals(3, response.getDatasets().size());
    Assert.assertEquals(2, response.getDatasets().get("file://my/test001.csv").getFiles().size());
    Assert.assertEquals(3, response.getDatasets().get("file://my/parquet001.parquet").getFiles().size());
    Assert.assertEquals(2, response.getDatasets().get("file://my/avro001.avro").getFiles().size());
}
Also used : ArrayList(java.util.ArrayList) FileMetadataTransformResponseModifier(com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier) TransformQueryResult(com.thinkbiganalytics.spark.rest.model.TransformQueryResult) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) TransformResponse(com.thinkbiganalytics.spark.rest.model.TransformResponse) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) DefaultQueryResultColumn(com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn) QueryResultColumn(com.thinkbiganalytics.discovery.schema.QueryResultColumn) Test(org.junit.Test)

Example 4 with FileMetadataResponse

use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.

the class FileMetadataTransformResponseModifier method complete.

/**
 * Builds the final {@link FileMetadataResponse} from the rows of the transform result, stores it on
 * the supplied response and marks that response as {@code SUCCESS}.
 *
 * <p>Every row is parsed with {@code parseRow}, the parsed files are grouped by
 * {@code ParsedFileMetadata#getKey}, and each group becomes one {@code FileDataSet}. When a schema
 * parser is found for the group's mime type it is attached to the data set and its
 * {@code separatorChar} and {@code rowTag} properties, if present, are set from the first file of the
 * group. When that parser is a {@link SparkFileSchemaParser} its spark script for the group's file
 * paths is attached as well.</p>
 *
 * @param modifiedTransformResponse the response that receives the results and the success status
 */
public void complete(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    // Parse the rows once and group them by their grouping key.
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream().map(r -> parseRow(r)).collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // create user friendly names from the groupings
    // the key in this new map will be the first file found
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.values().stream().collect(Collectors.toMap(files -> files.get(0).getFilePath(), files -> {
        FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
        FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
        dataSet.setFiles(files);
        dataSet.setMimeType(firstFile.getMimeType());
        findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
            dataSet.setSchemaParser(schemaParserDescriptor);
            // if the parser has a delimiter property set it
            // (constant-first equals so a property without an object property name is skipped instead of throwing)
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "separatorChar".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getDelimiter()));
            // if the parser has a row tag property set it
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "rowTag".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getRowTag()));
            // only spark based parsers can supply a sample script for the grouped files
            fileSchemaParser(schemaParserDescriptor)
                .filter(SparkFileSchemaParser.class::isInstance)
                .map(SparkFileSchemaParser.class::cast)
                .ifPresent(sparkParser -> {
                    List<String> paths = files.stream().map(FileMetadataResponse.ParsedFileMetadata::getFilePath).collect(Collectors.toList());
                    SampleFileSparkScript sparkScript = sparkParser.getSparkScript(paths);
                    dataSet.setSparkScript(sparkScript);
                });
        });
        return dataSet;
    }));

    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();
    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
    modifiedTransformResponse.setStatus(TransformResponse.Status.SUCCESS);
}
Also used : SampleFileSparkScript(com.thinkbiganalytics.discovery.parser.SampleFileSparkScript) Arrays(java.util.Arrays) FileMetadataTaskService(com.thinkbiganalytics.spark.rest.filemetadata.tasks.FileMetadataTaskService) SchemaParserDescriptor(com.thinkbiganalytics.discovery.model.SchemaParserDescriptor) AbstractTransformResponseModifier(com.thinkbiganalytics.spark.rest.controller.AbstractTransformResponseModifier) ModifiedTransformResponse(com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse) FileParserFactory(com.thinkbiganalytics.discovery.FileParserFactory) StringUtils(org.apache.commons.lang3.StringUtils) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SparkFileSchemaParser(com.thinkbiganalytics.discovery.parser.SparkFileSchemaParser) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) FileSchemaParser(com.thinkbiganalytics.discovery.parser.FileSchemaParser) TransformResponse(com.thinkbiganalytics.spark.rest.model.TransformResponse) List(java.util.List) Map(java.util.Map) Optional(java.util.Optional) SchemaParserAnnotationTransformer(com.thinkbiganalytics.discovery.rest.controller.SchemaParserAnnotationTransformer) SampleFileSparkScript(com.thinkbiganalytics.discovery.parser.SampleFileSparkScript) SparkFileSchemaParser(com.thinkbiganalytics.discovery.parser.SparkFileSchemaParser) Optional(java.util.Optional) FileMetadataResponse(com.thinkbiganalytics.spark.rest.model.FileMetadataResponse) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

FileMetadataResponse (com.thinkbiganalytics.spark.rest.model.FileMetadataResponse)4 ModifiedTransformResponse (com.thinkbiganalytics.spark.rest.model.ModifiedTransformResponse)4 ArrayList (java.util.ArrayList)4 List (java.util.List)4 Map (java.util.Map)3 Collectors (java.util.stream.Collectors)3 FileParserFactory (com.thinkbiganalytics.discovery.FileParserFactory)2 SchemaParserDescriptor (com.thinkbiganalytics.discovery.model.SchemaParserDescriptor)2 FileSchemaParser (com.thinkbiganalytics.discovery.parser.FileSchemaParser)2 SampleFileSparkScript (com.thinkbiganalytics.discovery.parser.SampleFileSparkScript)2 SparkFileSchemaParser (com.thinkbiganalytics.discovery.parser.SparkFileSchemaParser)2 SchemaParserAnnotationTransformer (com.thinkbiganalytics.discovery.rest.controller.SchemaParserAnnotationTransformer)2 AbstractTransformResponseModifier (com.thinkbiganalytics.spark.rest.controller.AbstractTransformResponseModifier)2 FileMetadataTransformResponseModifier (com.thinkbiganalytics.spark.rest.filemetadata.FileMetadataTransformResponseModifier)2 TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse)2 Arrays (java.util.Arrays)2 Optional (java.util.Optional)2 Cache (com.google.common.cache.Cache)1 CacheBuilder (com.google.common.cache.CacheBuilder)1 Lists (com.google.common.collect.Lists)1