use of com.thinkbiganalytics.kylo.metadata.file.FileMetadata in project kylo by Teradata.
the class FileMetadataResultModifier method modifySuccessfulResults.
/**
 * Builds the {@link FileMetadataResponse} for a successful transform.
 *
 * <p>Each result row is converted into a {@code ParsedFileMetadata}; the rows are grouped by
 * {@code ParsedFileMetadata::getKey}, and every group becomes one {@code FileDataSet} that is
 * registered under the file path of the first file in that group.</p>
 *
 * @param modifiedTransformResponse the response that receives the assembled {@link FileMetadataResponse}
 */
@Override
public void modifySuccessfulResults(ModifiedTransformResponse<FileMetadataResponse> modifiedTransformResponse) {
    // Turn every result row into a ParsedFileMetadata and bucket the rows by their grouping key.
    Map<String, List<FileMetadataResponse.ParsedFileMetadata>> groupedMetadata = response.getResults().getRows().stream().map(objects -> {
        FileMetadataResponse.ParsedFileMetadata metadata = new FileMetadataResponse.ParsedFileMetadata();
        metadata.setMimeType((String) getRowValue("mimeType", objects));
        metadata.setDelimiter((String) getRowValue("delimiter", objects));
        metadata.setFilePath((String) getRowValue("resource", objects));
        metadata.setEncoding((String) getRowValue("encoding", objects));
        return metadata;
    }).collect(Collectors.groupingBy(FileMetadataResponse.ParsedFileMetadata::getKey));

    // One data set per group, keyed by the path of the group's first file.
    // Collectors.toMap has no merge function here, so two groups whose first file shares
    // the same path would raise an IllegalStateException.
    Map<String, FileMetadataResponse.FileDataSet> fileDatasets = groupedMetadata.values().stream()
        .collect(Collectors.toMap(files -> files.get(0).getFilePath(), files -> {
            FileMetadataResponse.ParsedFileMetadata firstFile = files.get(0);
            FileMetadataResponse.FileDataSet dataSet = new FileMetadataResponse.FileDataSet();
            dataSet.setFiles(files);
            dataSet.setMimeType(firstFile.getMimeType());

            findSchemaParserForMimeType(firstFile.getMimeType()).ifPresent(schemaParserDescriptor -> {
                dataSet.setSchemaParser(schemaParserDescriptor);

                // If the parser exposes a delimiter property, seed it with the detected delimiter.
                // Constant-first equals keeps a property without an object-property name from throwing.
                // NOTE(review): this mutates the descriptor returned by findSchemaParserForMimeType;
                // confirm that the descriptor is not shared between data sets.
                schemaParserDescriptor.getProperties().stream()
                    .filter(property -> "separatorChar".equals(property.getObjectProperty()))
                    .findFirst()
                    .ifPresent(property -> property.setValue(firstFile.getDelimiter()));

                // Only Spark-based parsers can supply a sample script for the grouped files.
                Optional<FileSchemaParser> parser = fileSchemaParser(schemaParserDescriptor);
                if (parser.isPresent() && parser.get() instanceof SparkFileSchemaParser) {
                    List<String> paths = files.stream()
                        .map(FileMetadataResponse.ParsedFileMetadata::getFilePath)
                        .collect(Collectors.toList());
                    SampleFileSparkScript sparkScript = ((SparkFileSchemaParser) parser.get()).getSparkScript(paths);
                    dataSet.setSparkScript(sparkScript);
                }
            });
            return dataSet;
        }));

    FileMetadataResponse fileMetadataResponse = new FileMetadataResponse();
    fileMetadataResponse.setDatasets(fileDatasets);
    modifiedTransformResponse.setResults(fileMetadataResponse);
}
use of com.thinkbiganalytics.kylo.metadata.file.FileMetadata in project kylo by Teradata.
the class TikaParserTest method testXml.
/**
 * Verifies that an XML file is detected as {@code application/xml} and that the
 * {@code rowTag} property reported for {@code test.xml} is {@code catalog}.
 */
@Test
public void testXml() throws Exception {
    String file = "test.xml";
    FileMetadata type;
    // detectFromStream does not close the stream it is given, so close it here.
    try (java.io.InputStream in = getFile(file).getInputStream()) {
        type = FileMetadataService.detectFromStream(in, file);
    }
    Assert.assertEquals("application/xml", type.getMimeType());
    Assert.assertEquals("catalog", type.getProperties().get("rowTag"));
}
use of com.thinkbiganalytics.kylo.metadata.file.FileMetadata in project kylo by Teradata.
the class TikaParserTest method testCsv.
/**
 * Verifies that delimited text files are detected as {@code text/csv} and that the
 * detected {@code delimiter} property matches the separator used in each sample file.
 */
@Test
public void testCsv() throws Exception {
    // {file name, expected delimiter} - checked in the same order as before.
    String[][] cases = {
        {"MOCK_DATA.commasep.txt", ","},
        {"MOCK_DATA.tab_unix.txt", "\t"},
        {"MOCK_DATA.pipe.txt", "|"},
        {"MOCK_DATA.plus_unix.txt", "+"}
    };
    for (String[] testCase : cases) {
        String file = testCase[0];
        String expectedDelimiter = testCase[1];
        FileMetadata type;
        // detectFromStream does not close the stream it is given, so close it here.
        try (java.io.InputStream in = getFile(file).getInputStream()) {
            type = FileMetadataService.detectFromStream(in, file);
        }
        // The file name in the message identifies which sample failed now that the checks share a loop.
        Assert.assertEquals("mime type of " + file, "text/csv", type.getMimeType());
        Assert.assertEquals("delimiter of " + file, expectedDelimiter, type.getProperties().get("delimiter"));
    }
}
use of com.thinkbiganalytics.kylo.metadata.file.FileMetadata in project kylo by Teradata.
the class FileMetadataService method detectFromStream.
/**
 * Detect file format and metadata/mimetype from the incoming input stream.
 *
 * <p>Detection works on the header bytes returned by {@code InputStreamUtil.readHeader}.
 * Files whose name ends with {@code .csv} go straight to CSV detection; everything else is
 * passed to the Tika detector first, with plain text re-checked as CSV and XML additionally
 * inspected for its root element, which is stored as the {@code rowTag} property.
 * This method does not close {@code is}; the caller remains responsible for it.</p>
 *
 * @param is       the file input stream
 * @param fileName the name of the file (optional)
 * @return the metadata object
 * @throws Exception if reading the stream or detecting the charset/media type fails
 */
public static FileMetadata detectFromStream(InputStream is, String fileName) throws Exception {
    Tika tika = new Tika();
    Metadata md = new Metadata();
    md.set(Metadata.RESOURCE_NAME_KEY, fileName);

    byte[] header = InputStreamUtil.readHeader(is, bytesToTest);

    // Detect the character set from the header; the reader is only needed for that and is closed right away.
    Charset charset;
    try (AutoDetectReader reader = new AutoDetectReader(InputStreamUtil.asStream(header))) {
        charset = reader.getCharset();
    }

    MediaType mediaType;
    if (fileName != null && fileName.endsWith(".csv")) {
        mediaType = detectCsv(InputStreamUtil.asStream(header), md);
    } else {
        try (TikaInputStream tikaStream = TikaInputStream.get(InputStreamUtil.asStream(header))) {
            mediaType = tika.getDetector().detect(tikaStream, md);
        }
        // Constant-first comparisons stay safe even if no media type was produced.
        if (MediaType.TEXT_PLAIN.equals(mediaType)) {
            // then if we didn't get a concrete type attempt to parse it via csv
            mediaType = detectCsv(InputStreamUtil.asStream(header), md);
        } else if (MediaType.APPLICATION_XML.equals(mediaType)) {
            XmlRootExtractor rowTagExtractor = new XmlRootExtractor();
            QName root = rowTagExtractor.extractRootElement(InputStreamUtil.asStream(header));
            // When no root element can be extracted the rowTag property is simply left unset.
            if (root != null) {
                md.set("rowTag", root.getLocalPart());
            }
        }
    }

    // Fall back to a generic binary type when nothing more specific was detected.
    if (mediaType == null) {
        mediaType = MediaType.OCTET_STREAM;
    }

    FileMetadata fileMetadata = new FileMetadata(mediaType.toString());
    fileMetadata.addProperties(metadataToMap(md));
    fileMetadata.setSubType(mediaType.getSubtype());
    String encoding = charset.name();
    fileMetadata.setEncoding(StringUtils.isBlank(encoding) ? StandardCharsets.UTF_8.name() : encoding);
    return fileMetadata;
}
use of com.thinkbiganalytics.kylo.metadata.file.FileMetadata in project kylo by Teradata.
the class SparkMetadataExtractor method parse.
/**
 * Loads file metadata for each of the given paths through the
 * {@code com.thinkbiganalytics.spark.file.metadata} data source, unions the resulting
 * data frames and collects the rows as {@link FileMetadata} beans.
 *
 * @param filePaths the paths to load
 * @return the collected metadata, one entry per row of the unioned data frame
 */
@Override
public List<FileMetadata> parse(String[] filePaths) {
    List<DataFrame> dataFrameList = new ArrayList<>(filePaths.length);
    for (String path : filePaths) {
        dataFrameList.add(sqlContext.read().format("com.thinkbiganalytics.spark.file.metadata").load(path));
    }
    DataFrame unionDf = SparkUtil.unionAll(dataFrameList);
    // Keep the element type on the Dataset so collectAsList() yields List<FileMetadata> without an unchecked conversion.
    Encoder<FileMetadata> encoder = Encoders.bean(FileMetadata.class);
    Dataset<FileMetadata> dataset = unionDf.as(encoder);
    return dataset.collectAsList();
}
Aggregations