use of com.thinkbiganalytics.discovery.parser.SampleFileSparkScript in project kylo by Teradata.
the class SchemaDiscoveryRestController method uploadFileSpark.
/**
 * Generates the Spark script that can parse the uploaded file using the supplied "parserDescriptor".
 *
 * @param parserDescriptor  metadata describing how the file should be parsed
 * @param dataFrameVariable the name of the dataframe variable in the generated Spark code
 * @param limit             the maximum number of rows the script should output (defaults to -1)
 * @param fileInputStream   the file contents
 * @param fileMetaData      metadata about the file
 * @return an object including the name of the file on disk and the generated Spark script
 */
@POST
@Path("/spark/sample-file")
@Consumes(MediaType.MULTIPART_FORM_DATA)
@Produces(MediaType.APPLICATION_JSON)
@ApiOperation("Determines the schema of the provided file.")
@ApiResponses({ @ApiResponse(code = 200, message = "Returns the spark script that parses the sample file.", response = Schema.class),
                @ApiResponse(code = 500, message = "The schema could not be determined.", response = RestResponseStatus.class) })
public Response uploadFileSpark(@FormDataParam("parser") String parserDescriptor,
                                @FormDataParam("dataFrameVariable") @DefaultValue("df") String dataFrameVariable,
                                @FormDataParam("limit") @DefaultValue("-1") Integer limit,
                                @FormDataParam("file") InputStream fileInputStream,
                                @FormDataParam("file") FormDataContentDisposition fileMetaData) throws Exception {
    SampleFileSparkScript sampleFileSparkScript = null;
    SchemaParserAnnotationTransformer transformer = new SchemaParserAnnotationTransformer();
    try {
        SchemaParserDescriptor descriptor = ObjectMapperSerializer.deserialize(parserDescriptor, SchemaParserDescriptor.class);
        FileSchemaParser p = transformer.fromUiModel(descriptor);
        SparkFileSchemaParser sparkFileSchemaParser = (SparkFileSchemaParser) p;
        sparkFileSchemaParser.setDataFrameVariable(dataFrameVariable);
        sparkFileSchemaParser.setLimit(limit);
        sampleFileSparkScript = sparkFileSchemaParser.getSparkScript(fileInputStream);
    } catch (IOException e) {
        throw new WebApplicationException(e.getMessage());
    } catch (PolicyTransformException e) {
        log.warn("Failed to convert parser", e);
        throw new InternalServerErrorException(STRINGS.getString("discovery.transformError"), e);
    }
    if (sampleFileSparkScript == null) {
        log.warn("Failed to convert parser");
        throw new InternalServerErrorException(STRINGS.getString("discovery.transformError"));
    }
    return Response.ok(sampleFileSparkScript).build();
}
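For reference, the endpoint can be exercised with a multipart POST. Below is a minimal client sketch using the Jersey 2 client and its multipart extension; the endpoint URL, parser JSON, and sample file name are placeholder assumptions, not values from the Kylo source.

import java.io.File;

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.Entity;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import org.glassfish.jersey.media.multipart.FormDataMultiPart;
import org.glassfish.jersey.media.multipart.MultiPartFeature;
import org.glassfish.jersey.media.multipart.file.FileDataBodyPart;

public class SampleFileSparkScriptClient {

    public static void main(String[] args) {
        // Hypothetical endpoint URL; adjust to your Kylo deployment.
        String endpoint = "http://localhost:8400/api/v1/schema-discovery/spark/sample-file";
        // Hypothetical parser descriptor JSON, as produced by the Kylo UI.
        String parserJson = "{\"objectClassType\": \"...\"}";

        Client client = ClientBuilder.newBuilder().register(MultiPartFeature.class).build();
        try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
            multiPart.field("parser", parserJson)
                     .field("dataFrameVariable", "df")
                     .field("limit", "10");
            multiPart.bodyPart(new FileDataBodyPart("file", new File("userdata1.csv")));

            Response response = client.target(endpoint)
                .request(MediaType.APPLICATION_JSON)
                .post(Entity.entity(multiPart, multiPart.getMediaType()));
            // The response body carries the SampleFileSparkScript (file location plus script).
            System.out.println(response.readEntity(String.class));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            client.close();
        }
    }
}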
use of com.thinkbiganalytics.discovery.parser.SampleFileSparkScript in project kylo by Teradata.
the class FileMetadataResultModifier method modifySuccessfulResults.
@Override
public void modifySuccessfulResults(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream().map(objects -> {
        FileMetadata fileMetadata = new FileMetadata();
        String mimeType = (String) getRowValue("mimeType", objects);
        String delimiter = (String) getRowValue("delimiter", objects);
        Integer headerCount = getRowValue("headerCount", objects, Integer.class, 0);
        String filePath = (String) getRowValue("resource", objects);
        String encoding = (String) getRowValue("encoding", objects);
        FileMetadataResponse.ParsedFileMetadata metadata = new FileMetadataResponse.ParsedFileMetadata();
        metadata.setMimeType(mimeType);
        metadata.setDelimiter(delimiter);
        metadata.setEncoding(encoding);
        metadata.setFilePath(filePath);
        return metadata;
    }).collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));
    // create user friendly names from the groupings;
    // the map key is the file path of the first file in each group
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.keySet().stream().collect(Collectors.toMap(key -> groupedMetadata.get(key).get(0).getFilePath(), key -> {
        FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
        List<FileMetadataResponse.ParsedFileMetadata> files = groupedMetadata.get(key);
        dataSet.setFiles(files);
        FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
        dataSet.setMimeType(firstFile.getMimeType());
        findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
            dataSet.setSchemaParser(schemaParserDescriptor);
            // if the parser has a delimiter property, set it from the detected delimiter
            schemaParserDescriptor.getProperties().stream().filter(property -> property.getObjectProperty().equals("separatorChar")).findFirst().ifPresent(property -> {
                property.setValue(firstFile.getDelimiter());
            });
            // for Spark-capable parsers, pre-generate the script over all files in the group
            Optional<FileSchemaParser> fileSchemaParser = fileSchemaParser(schemaParserDescriptor);
            if (fileSchemaParser.isPresent() && fileSchemaParser.get() instanceof SparkFileSchemaParser) {
                List<String> paths = files.stream().map(parsedFileMetadata -> parsedFileMetadata.getFilePath()).collect(Collectors.toList());
                SampleFileSparkScript sparkScript = ((SparkFileSchemaParser) fileSchemaParser.get()).getSparkScript(paths);
                dataSet.setSparkScript(sparkScript);
            }
        });
        return dataSet;
    }));
    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
}
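The heart of this modifier is the two-step stream: groupingBy on a composite key, then toMap re-keyed by the first member's file path. A stripped-down, self-contained sketch of the same pattern follows, with a hypothetical FileInfo type standing in for Kylo's ParsedFileMetadata.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupingSketch {

    // Hypothetical stand-in for ParsedFileMetadata; getKey() combines the attributes
    // that decide whether two files can share one parser configuration.
    static class FileInfo {
        final String mimeType;
        final String delimiter;
        final String path;

        FileInfo(String mimeType, String delimiter, String path) {
            this.mimeType = mimeType;
            this.delimiter = delimiter;
            this.path = path;
        }

        String getKey() {
            return mimeType + "|" + delimiter;
        }
    }

    public static void main(String[] args) {
        List<FileInfo> files = Arrays.asList(
            new FileInfo("text/csv", ",", "/data/a.csv"),
            new FileInfo("text/csv", ",", "/data/b.csv"),
            new FileInfo("text/csv", "|", "/data/c.csv"));

        // Step 1: group files by their composite key, as modifySuccessfulResults does.
        Map<String, List<FileInfo>> grouped = files.stream().collect(Collectors.groupingBy(FileInfo::getKey));

        // Step 2: re-key each group by the path of its first member, mirroring the toMap call above.
        Map<String, List<FileInfo>> datasets = grouped.keySet().stream()
            .collect(Collectors.toMap(key -> grouped.get(key).get(0).path, grouped::get));

        // Prints two datasets: /data/a.csv (holding a.csv and b.csv) and /data/c.csv.
        datasets.forEach((name, members) -> System.out.println(name + " -> " + members.size() + " file(s)"));
    }
}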
use of com.thinkbiganalytics.discovery.parser.SampleFileSparkScript in project kylo by Teradata.
the class SparkFileSchemaParserService method getSparkScript.
public SampleFileSparkScript getSparkScript(File tempFile, SparkFileType fileType, SparkCommandBuilder commandBuilder) throws IOException {
    String script = toScript(tempFile, fileType, commandBuilder);
    // pair the generated script with the on-disk location of the sample file
    return new SampleFileSparkScript(tempFile.getPath(), script);
}
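Judging from the constructor call above and the endpoint javadoc ("the name of the file on disk and the generated spark script"), SampleFileSparkScript is a small value object. A minimal sketch of its assumed shape follows; only getScript() is confirmed by the test further down, the rest is inferred.

// Minimal sketch; the real class lives in com.thinkbiganalytics.discovery.parser.
// Field and accessor names here are assumptions inferred from the constructor arguments above.
public class SampleFileSparkScript {

    private String fileLocation; // path of the sample file written to disk
    private String script;       // generated Spark code that parses that file

    public SampleFileSparkScript(String fileLocation, String script) {
        this.fileLocation = fileLocation;
        this.script = script;
    }

    public String getFileLocation() { return fileLocation; }

    public String getScript() { return script; }
}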
use of com.thinkbiganalytics.discovery.parser.SampleFileSparkScript in project kylo by Teradata.
the class SchemaDiscoveryRestController method sparkScriptForFilesList.
/**
 * Generates the Spark script that can parse the list of files referenced in the passed in {@link SparkFilesScript},
 * using its embedded parser descriptor.
 *
 * @param sparkFilesScript the parser descriptor, dataframe variable name, and list of file paths
 * @return an object including the generated Spark script
 */
@POST
@Path("/spark/files-list")
@Consumes(MediaType.APPLICATION_JSON)
@Produces(MediaType.APPLICATION_JSON)
@ApiOperation("Determines the schema of the provided file.")
@ApiResponses({ @ApiResponse(code = 200, message = "Returns the spark script that parses the sample file.", response = Schema.class),
                @ApiResponse(code = 500, message = "The schema could not be determined.", response = RestResponseStatus.class) })
public Response sparkScriptForFilesList(SparkFilesScript sparkFilesScript) throws Exception {
    SampleFileSparkScript sampleFileSparkScript = null;
    SchemaParserAnnotationTransformer transformer = new SchemaParserAnnotationTransformer();
    try {
        SchemaParserDescriptor descriptor = ObjectMapperSerializer.deserialize(sparkFilesScript.getParserDescriptor(), SchemaParserDescriptor.class);
        FileSchemaParser p = transformer.fromUiModel(descriptor);
        SparkFileSchemaParser sparkFileSchemaParser = (SparkFileSchemaParser) p;
        sparkFileSchemaParser.setDataFrameVariable(sparkFilesScript.getDataFrameVariable());
        sparkFileSchemaParser.setLimit(-1);
        sampleFileSparkScript = sparkFileSchemaParser.getSparkScript(sparkFilesScript.getFiles());
    } catch (PolicyTransformException e) {
        log.warn("Failed to convert parser", e);
        throw new InternalServerErrorException(STRINGS.getString("discovery.transformError"), e);
    }
    if (sampleFileSparkScript == null) {
        log.warn("Failed to convert parser");
        throw new InternalServerErrorException(STRINGS.getString("discovery.transformError"));
    }
    return Response.ok(sampleFileSparkScript).build();
}
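A client sketch for this JSON endpoint, assuming SparkFilesScript exposes setters matching the getters used above (setParserDescriptor, setDataFrameVariable, setFiles) and reachable at the placeholder URL below.

import java.util.Arrays;

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.Entity;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

public class SparkFilesScriptClient {

    public static void main(String[] args) {
        // Hypothetical endpoint URL; adjust to your Kylo deployment.
        String endpoint = "http://localhost:8400/api/v1/schema-discovery/spark/files-list";
        // Hypothetical parser descriptor JSON, same shape as the one sent to /spark/sample-file.
        String parserJson = "{\"objectClassType\": \"...\"}";

        // Assumed setters; only the corresponding getters appear in the resource method above.
        SparkFilesScript request = new SparkFilesScript();
        request.setParserDescriptor(parserJson);
        request.setDataFrameVariable("df");
        request.setFiles(Arrays.asList(
            "file:///opt/kylo/setup/data/sample-data/csv/userdata1.csv",
            "file:///opt/kylo/setup/data/sample-data/csv/userdata2.csv"));

        Client client = ClientBuilder.newClient();
        try {
            Response response = client.target(endpoint)
                .request(MediaType.APPLICATION_JSON)
                .post(Entity.json(request));
            System.out.println(response.readEntity(String.class));
        } finally {
            client.close();
        }
    }
}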
use of com.thinkbiganalytics.discovery.parser.SampleFileSparkScript in project kylo by Teradata.
the class SparkFileSchemaParserServiceTest method testScript.
@org.junit.Test
public void testScript() {
    // String file1 = "file:///opt/kylo/setup/data/sample-data/avro/userdata1.avro";
    // String file2 = "file:///opt/kylo/setup/data/sample-data/avro/userdata1.avro";
    String file1 = "file:///opt/kylo/setup/data/sample-data/csv/userdata1.csv";
    String file2 = "file:///opt/kylo/setup/data/sample-data/csv/userdata2.csv";
    String file3 = "file:///opt/kylo/setup/data/sample-data/csv/userdata3.csv";
    List<String> files = new ArrayList<>();
    files.add(file1);
    files.add(file2);
    files.add(file3);
    AvroFileSchemaParser avroFileSchemaParser = Mockito.spy(AvroFileSchemaParser.class);
    avroFileSchemaParser.setDataFrameVariable("df");
    OrcFileSchemaParser orcFileSchemaParser = Mockito.spy(OrcFileSchemaParser.class);
    orcFileSchemaParser.setLimit(-1);
    XMLFileSchemaParser xmlFileSchemaParser = Mockito.spy(XMLFileSchemaParser.class);
    xmlFileSchemaParser.setRowTag("row1");
    SparkCSVFileSchemaParser csvFileSchemaParser = Mockito.spy(SparkCSVFileSchemaParser.class);
    csvFileSchemaParser.setDataFrameVariable("df");
    // orcFileSchemaParser.setDataFrameVariable("df");
    try {
        SampleFileSparkScript sparkScript = setup(csvFileSchemaParser).getSparkScript(files);
        System.out.println(sparkScript.getScript());
    } catch (Exception e) {
        e.printStackTrace();
    }
}