use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.
the class FileMetadataResultModifier method modifySuccessfulResults.
/**
 * Converts the rows of the successful transform result into a {@link FileMetadataResponse} and stores it as
 * the results of the supplied response.
 *
 * <p>Each row is parsed into a {@code ParsedFileMetadata}, the parsed files are grouped by
 * {@code ParsedFileMetadata.getKey()}, and one {@code FileDataSet} is produced per group. The resulting map is
 * keyed by the path of the first file in each group.</p>
 *
 * @param modifiedTransformResponse the response that receives the generated file metadata
 */
@Override
public void modifySuccessfulResults(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();

    // NOTE(review): the "headerCount" column is not read here because nothing consumes it;
    // confirm whether ParsedFileMetadata is expected to carry it.
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream().map(objects -> {
        FileMetadataResponse.ParsedFileMetadata metadata = new FileMetadataResponse.ParsedFileMetadata();
        metadata.setMimeType((String) getRowValue("mimeType", objects));
        metadata.setDelimiter((String) getRowValue("delimiter", objects));
        metadata.setEncoding((String) getRowValue("encoding", objects));
        metadata.setFilePath((String) getRowValue("resource", objects));
        return metadata;
    }).collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // create user friendly names from the groupings
    // the key in this new map will be the path of the first file found in each group
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.values().stream().collect(Collectors.toMap(files -> files.get(0).getFilePath(), files -> {
        FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
        dataSet.setFiles(files);
        FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
        dataSet.setMimeType(firstFile.getMimeType());
        findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
            dataSet.setSchemaParser(schemaParserDescriptor);
            // if the parser has a delimiter property set it
            // (constant-first equals so a property without an object property name is simply skipped)
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "separatorChar".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getDelimiter()));
            // only Spark-backed parsers can supply a sample script for the grouped files
            fileSchemaParser(schemaParserDescriptor)
                .filter(parser -> parser instanceof SparkFileSchemaParser)
                .ifPresent(parser -> {
                    List<String> paths = files.stream().map(parsedFileMetadata -> parsedFileMetadata.getFilePath()).collect(Collectors.toList());
                    SampleFileSparkScript sparkScript = ((SparkFileSchemaParser) parser).getSparkScript(paths);
                    dataSet.setSparkScript(sparkScript);
                });
        });
        return dataSet;
    }));

    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
}
use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.
the class FileMetadataTaskService method findFileMetadataSchemas.
/**
 * Group the files by their respective mime type.
 * For each mime type that spark can process create a task to determine the header information.
 *
 * <p>XML files are further split by row tag so that one script is built per row tag; every other
 * parsable mime type gets a single script covering all of its files. The created tasks are registered
 * on the completion task, which is cached under its table id, and are then submitted together.</p>
 *
 * @param modifiedTransformResponse the response handed to the completion task
 * @param resultModifier            supplies the files grouped by mime type
 */
public void findFileMetadataSchemas(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse, FileMetadataTransformResponseModifier resultModifier) {
    FileMetadataCompletionTask result = new FileMetadataCompletionTask(modifiedTransformResponse, resultModifier);
    metadataResultCache.put(result.getTableId(), result);
    Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> mimeTypeGroup = resultModifier.groupByMimeType();
    List<String> mimeTypes = Lists.newArrayList(mimeTypeGroup.keySet());
    mimeTypes.removeIf(type -> !PARSABLE_MIME_TYPES.contains(type));
    List<SparkShellScriptRunner> tasks = new ArrayList<>();
    for (String mimeType : mimeTypes) {
        List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata> data = mimeTypeGroup.get(mimeType);
        // Compare by value: the mime types come from parsed data, so a reference comparison (==)
        // against the literal would not match and the row tag grouping would be skipped.
        if ("application/xml".equals(mimeType)) {
            // need to group by rowtag
            // A HashMap is filled by hand because Collectors.groupingBy rejects null keys,
            // while a file without a row tag must still be handed to the script builder.
            Map<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rowTags = new java.util.HashMap<>();
            for (com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata row : data) {
                rowTags.computeIfAbsent(row.getRowTag(), tag -> new ArrayList<>()).add(row);
            }
            for (Map.Entry<String, List<com.thinkbiganalytics.spark.rest.model.FileMetadataResponse.ParsedFileMetadata>> rows : rowTags.entrySet()) {
                List<String> files = rows.getValue().stream().map(r -> r.getFilePath()).collect(Collectors.toList());
                SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, rows.getKey(), files), mimeType);
                tasks.add(shellScriptRunner);
                result.addTask(shellScriptRunner, rows.getValue());
            }
        } else {
            // no row tag applies to non-XML files, so a single script handles the whole group
            List<String> files = data.stream().map(r -> r.getFilePath()).collect(Collectors.toList());
            SparkShellScriptRunner shellScriptRunner = new SparkShellScriptRunner(sparkShellUserProcessService, restClient, getUsername(), FileMetadataSchemaScriptBuilder.getSparkScript(mimeType, null, files), mimeType);
            tasks.add(shellScriptRunner);
            result.addTask(shellScriptRunner, data);
        }
    }
    submitTasks(result, tasks);
}
use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.
the class FileMetadataTest method testChainedResult.
/**
 * Feeds a transform result containing parquet, avro and csv files through the
 * {@code FileMetadataTransformResponseModifier} and verifies that the files end up in three datasets,
 * each keyed by the path of its first file.
 *
 * <p>The modified response is polled once per second until its results are available; the test fails
 * with an explicit message if they never arrive.</p>
 */
@Test
public void testChainedResult() {
    setup();

    TransformResponse initialResponse = new TransformResponse();
    initialResponse.setStatus(TransformResponse.Status.SUCCESS);

    TransformQueryResult result = new TransformQueryResult();
    List<QueryResultColumn> columnList = new ArrayList<>();
    columnList.add(newColumn("mimeType"));
    columnList.add(newColumn("delimiter"));
    columnList.add(newColumn("headerCount"));
    columnList.add(newColumn("resource"));
    columnList.add(newColumn("encoding"));
    result.setColumns(columnList);

    List<List<Object>> rows = new ArrayList<>();
    rows.add(newtRow("application/parquet", "file://my/parquet001.parquet"));
    rows.add(newtRow("application/parquet", "file://my/parquet002.parquet"));
    rows.add(newtRow("application/parquet", "file://my/parquet003.parquet"));
    rows.add(newtRow("application/avro", "file://my/avro001.avro"));
    rows.add(newtRow("application/avro", "file://my/avro002.avro"));
    rows.add(newtRow("text/csv", "file://my/test001.csv"));
    rows.add(newtRow("text/csv", "file://my/test002.csv"));
    result.setRows(rows);

    initialResponse.setResults(result);
    initialResponse.setTable(UUID.randomUUID().toString());

    FileMetadataTransformResponseModifier fileMetadataResult = new FileMetadataTransformResponseModifier(trackerService);
    ModifiedTransformResponse<FileMetadataResponse> m = fileMetadataResult.modify(initialResponse);

    // The results may not be set yet when modify() returns: poll once a second, at most 41 times.
    FileMetadataResponse response = m.getResults();
    int retryCount = 0;
    long start = System.currentTimeMillis();
    while (response == null && retryCount <= 40) {
        Uninterruptibles.sleepUninterruptibly(1000, TimeUnit.MILLISECONDS);
        response = m.getResults();
        retryCount += 1;
    }
    long stop = System.currentTimeMillis();
    log.info("Time to get chained response {} ms, Retry Attempts: {}", (stop - start), retryCount);

    // Report a timeout as a readable assertion failure rather than a NullPointerException below.
    Assert.assertNotNull("No file metadata response was produced after " + retryCount + " retries", response);
    Assert.assertEquals(3, response.getDatasets().size());
    Assert.assertEquals(2, response.getDatasets().get("file://my/test001.csv").getFiles().size());
    Assert.assertEquals(3, response.getDatasets().get("file://my/parquet001.parquet").getFiles().size());
    Assert.assertEquals(2, response.getDatasets().get("file://my/avro001.avro").getFiles().size());
}
use of com.thinkbiganalytics.spark.rest.model.FileMetadataResponse in project kylo by Teradata.
the class FileMetadataTransformResponseModifier method complete.
/**
 * Builds the final {@link FileMetadataResponse} from the transform result rows, stores it on the supplied
 * response and marks that response as {@code SUCCESS}.
 *
 * <p>Rows are parsed with {@code parseRow} and grouped by {@code ParsedFileMetadata.getKey()}; one
 * {@code FileDataSet} is created per group and the resulting map is keyed by the path of the first file
 * in each group.</p>
 *
 * @param modifiedTransformResponse the response that receives the datasets and the success status
 */
public void complete(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();

    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream()
        .map(r -> parseRow(r))
        .collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // create user friendly names from the groupings
    // the key in this new map will be the first file found
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.values().stream().collect(Collectors.toMap(files -> files.get(0).getFilePath(), files -> {
        FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
        dataSet.setFiles(files);
        FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
        dataSet.setMimeType(firstFile.getMimeType());
        findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
            dataSet.setSchemaParser(schemaParserDescriptor);
            // if the parser has a delimiter property set it
            // (constant-first equals so a property without an object property name is simply skipped)
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "separatorChar".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getDelimiter()));
            // likewise pass the row tag of the first file on to a parser that has a rowTag property
            schemaParserDescriptor.getProperties().stream()
                .filter(property -> "rowTag".equals(property.getObjectProperty()))
                .findFirst()
                .ifPresent(property -> property.setValue(firstFile.getRowTag()));
            // only Spark-backed parsers can supply a sample script for the grouped files
            fileSchemaParser(schemaParserDescriptor)
                .filter(parser -> parser instanceof SparkFileSchemaParser)
                .ifPresent(parser -> {
                    List<String> paths = files.stream().map(parsedFileMetadata -> parsedFileMetadata.getFilePath()).collect(Collectors.toList());
                    SampleFileSparkScript sparkScript = ((SparkFileSchemaParser) parser).getSparkScript(paths);
                    dataSet.setSparkScript(sparkScript);
                });
        });
        return dataSet;
    }));

    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
    modifiedTransformResponse.setStatus(TransformResponse.Status.SUCCESS);
}
Aggregations