Use of com.thinkbiganalytics.discovery.model.SchemaParserDescriptor in project kylo by Teradata:
the class FileMetadataTransformResponseModifier, method complete.
/**
 * Completes the transform response by converting the raw transform rows into a
 * {@link FileMetadataResponse}: rows are parsed, grouped by their logical key, and each
 * group becomes a {@link FileMetadataResponse.FileDataSet} (keyed by the first file's path)
 * with a matching schema parser and, when a Spark-capable parser is found, a sample Spark script.
 *
 * @param modifiedTransformResponse the response to populate with the grouped file metadata results
 */
public void complete(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();

    // Parse every result row exactly once. The original streamed and parsed the rows twice
    // (once per grouping) and built an additional mime-type grouping that was never used.
    List<FileMetadataResponse.ParsedFileMetadata> parsedRows =
        response.getResults().getRows().stream()
            .map(r -> parseRow(r))
            .collect(Collectors.toList());

    // Group the parsed files by their logical key.
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata =
        parsedRows.stream()
            .collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // Create user-friendly data sets from the groupings.
    // The key in this new map is the path of the first file found in each group.
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets =
        groupedMetadata.keySet().stream().collect(Collectors.toMap(
            key -> groupedMetadata.get(key).get(0).getFilePath(),
            key -> {
                FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
                List<FileMetadataResponse.ParsedFileMetadata> files = groupedMetadata.get(key);
                dataSet.setFiles(files);

                // The first file in the group is treated as representative for mime type,
                // delimiter, and row tag.
                FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
                dataSet.setMimeType(firstFile.getMimeType());

                findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
                    dataSet.setSchemaParser(schemaParserDescriptor);

                    // If the parser exposes a delimiter property, seed it from the detected delimiter.
                    schemaParserDescriptor.getProperties().stream()
                        .filter(property -> property.getObjectProperty().equals("separatorChar"))
                        .findFirst()
                        .ifPresent(property -> property.setValue(firstFile.getDelimiter()));

                    // Likewise for the XML row tag, when the parser supports one.
                    schemaParserDescriptor.getProperties().stream()
                        .filter(property -> property.getObjectProperty().equals("rowTag"))
                        .findFirst()
                        .ifPresent(property -> property.setValue(firstFile.getRowTag()));

                    // For Spark-capable parsers, attach a sample script covering all files in the group.
                    Optional<FileSchemaParser> fileSchemaParser = fileSchemaParser(schemaParserDescriptor);
                    if (fileSchemaParser.isPresent() && fileSchemaParser.get() instanceof SparkFileSchemaParser) {
                        List<String> paths = files.stream()
                            .map(parsedFileMetadata -> parsedFileMetadata.getFilePath())
                            .collect(Collectors.toList());
                        SampleFileSparkScript sparkScript =
                            ((SparkFileSchemaParser) fileSchemaParser.get()).getSparkScript(paths);
                        dataSet.setSparkScript(sparkScript);
                    }
                });
                return dataSet;
            }));

    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
    modifiedTransformResponse.setStatus(TransformResponse.Status.SUCCESS);
}
Use of com.thinkbiganalytics.discovery.model.SchemaParserDescriptor in project kylo by Teradata:
the class SparkShellProxyController, method getTextSchemaParserDescriptor.
/**
 * Builds the {@link SchemaParserDescriptor} used for plain-text files.
 *
 * @return a primary, Spark-backed descriptor configured for the "text" format
 */
private SchemaParserDescriptor getTextSchemaParserDescriptor() {
    SchemaParserDescriptor descriptor = new SchemaParserDescriptor();
    descriptor.setName("Text");
    descriptor.setDisplayName("Text");
    descriptor.setAllowSkipHeader(true);
    descriptor.setPrimary(true);
    descriptor.setUsesSpark(true);
    descriptor.setSparkFormat("text");
    descriptor.setMimeTypes(new String[]{"text"});
    return descriptor;
}
Use of com.thinkbiganalytics.discovery.model.SchemaParserDescriptor in project kylo by Teradata:
the class KyloCatalogReaderUtil, method toKyloCatalogRequest.
/**
 * Converts a preview request into a {@link KyloCatalogReadRequest}, merging the data source's
 * template with any schema-parser-supplied Spark options and preview-specific properties.
 *
 * @param previewRequest the preview request to convert
 * @return the catalog read request describing files, format, options, and paging
 */
public static KyloCatalogReadRequest toKyloCatalogRequest(PreviewDataSetRequest previewRequest) {
    DataSource dataSource = previewRequest.getDataSource();

    // Merge the connector template into the data source template.
    DataSetTemplate dataSetTemplate = DataSourceUtil.mergeTemplates(dataSource);

    // Pull the pieces of the merged template that drive the read.
    List<String> jars = dataSetTemplate.getJars();
    List<String> files = dataSetTemplate.getFiles();
    String format = dataSetTemplate.getFormat();
    Map<String, String> options = dataSetTemplate.getOptions();
    if (options == null) {
        options = new HashMap<>();
    }

    // Parse the SchemaParser if it exists: translate its properties into Spark options
    // and use its format.
    if (previewRequest.getSchemaParser() != null) {
        SchemaParserDescriptor schemaParser = previewRequest.getSchemaParser();

        // A property is a Spark option only when it carries a "spark.option" label.
        // The original collected with Collectors.toMap keyed on that label (defaulting to ""),
        // which threw IllegalStateException as soon as two properties lacked the label
        // (duplicate "" keys) and NPE'd on a null property value. Filtering before inserting
        // avoids both crashes; for duplicate option keys the last property wins.
        Map<String, String> sparkOptions = new HashMap<>();
        schemaParser.getProperties().forEach(property -> {
            String optionKey = property.getAdditionalProperties().stream()
                .filter(labelValue -> "spark.option".equalsIgnoreCase(labelValue.getLabel()))
                .map(labelValue -> labelValue.getValue())
                .findFirst()
                .orElse("");
            if (!optionKey.isEmpty()) {
                sparkOptions.put(optionKey, property.getValue());
            }
        });

        // Options supplied by the schema parser take precedence over the template options.
        options.putAll(sparkOptions);
        format = schemaParser.getSparkFormat();
    }

    // Add in additional preview options; these take precedence over everything above.
    if (previewRequest.getProperties() != null && !previewRequest.getProperties().isEmpty()) {
        options.putAll(previewRequest.getProperties());
    }

    KyloCatalogReadRequest request = new KyloCatalogReadRequest();
    request.setFiles(files);
    request.setJars(jars);
    request.setFormat(format);
    request.setOptions(options);
    if (previewRequest.getPreviewItem() != null && previewRequest.isAddPreviewItemToPath()) {
        request.addPath(previewRequest.getPreviewItem());
    }

    // Always supply a page spec so downstream consumers never see null paging.
    PageSpec pageSpec = previewRequest.getPageSpec();
    if (pageSpec == null) {
        pageSpec = new PageSpec();
    }
    request.setPageSpec(pageSpec);
    return request;
}
Aggregations