Use of io.jhdf.api.Dataset in project drill by apache.
The class HDF5BatchReader, method projectMetadataRow.
/**
 * Writes one row of HDF5 metadata.
 * @param rowWriter The input rowWriter object
 */
private void projectMetadataRow(RowSetLoader rowWriter) {
  HDF5DrillMetadata metadataRow = metadataIterator.next();
  rowWriter.start();
  pathWriter.setString(metadataRow.getPath());
  dataTypeWriter.setString(metadataRow.getDataType());
  fileNameWriter.setString(fileName);
  linkWriter.setBoolean(metadataRow.isLink());

  // Write attributes if present
  if (metadataRow.getAttributes().size() > 0) {
    writeAttributes(rowWriter, metadataRow);
  }

  if (metadataRow.getDataType().equalsIgnoreCase("DATASET")) {
    Dataset dataset = hdfFile.getDatasetByPath(metadataRow.getPath());
    // Project dataset metadata
    dataSizeWriter.setLong(dataset.getSizeInBytes());
    elementCountWriter.setLong(dataset.getSize());
    datasetTypeWriter.setString(dataset.getJavaType().getName());
    dimensionsWriter.setString(Arrays.toString(dataset.getDimensions()));

    // Do not project links
    if (!metadataRow.isLink() && showMetadataPreview) {
      projectDataset(rowWriter, metadataRow.getPath());
    }
  }
  rowWriter.save();
}
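For reference, the metadata values written above come directly from jHDF's Dataset accessors. Below is a minimal standalone sketch of the same calls, assuming jHDF's HdfFile can be opened from a java.nio Path and closed via try-with-resources; the file path /tmp/example.h5 and dataset path /group/dset are hypothetical, not from the Drill source.

import java.nio.file.Paths;
import java.util.Arrays;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;

public class DatasetMetadataExample {
  public static void main(String[] args) {
    // Hypothetical file and dataset path, for illustration only
    try (HdfFile hdfFile = new HdfFile(Paths.get("/tmp/example.h5"))) {
      Dataset dataset = hdfFile.getDatasetByPath("/group/dset");
      // The same accessors the reader uses to populate a metadata row
      System.out.println("size in bytes: " + dataset.getSizeInBytes());
      System.out.println("element count: " + dataset.getSize());
      System.out.println("java type:     " + dataset.getJavaType().getName());
      System.out.println("dimensions:    " + Arrays.toString(dataset.getDimensions()));
    }
  }
}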
Use of io.jhdf.api.Dataset in project drill by apache.
The class HDF5BatchReader, method projectDataset.
/**
 * Writes one row of data in a metadata query. The number of dimensions here
 * is n+1, so if the actual dataset is a 1D column, it is written as a list.
 * This function is only called in metadata queries, as the schema is not
 * known in advance. If the dataset is larger than 16MB, the data is
 * truncated in the metadata view.
 *
 * @param rowWriter The rowWriter to which the data will be written
 * @param datapath The datapath from which the data will be read
 */
private void projectDataset(RowSetLoader rowWriter, String datapath) {
  String fieldName = HDF5Utils.getNameFromPath(datapath);
  Dataset dataset = hdfFile.getDatasetByPath(datapath);

  // Warn if the dataset is larger than 16MB; the preview data will be truncated
  if (dataset.getSizeInBytes() > MAX_DATASET_SIZE) {
    logger.warn("Dataset {} is greater than 16MB. Data will be truncated in Metadata view.", datapath);
  }

  int[] dimensions = dataset.getDimensions();
  // Case for one-dimensional data
  if (dimensions.length == 1) {
    MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
    Object data;
    try {
      data = dataset.getData();
    } catch (Exception e) {
      logger.debug("Error reading {}", datapath);
      return;
    }
    assert currentDataType != null;

    // Skip null datasets
    if (data == null) {
      return;
    }

    switch (currentDataType) {
      case GENERIC_OBJECT:
        logger.warn("Couldn't read {}", datapath);
        break;
      case VARCHAR:
        String[] stringData = (String[]) data;
        writeStringListColumn(rowWriter, fieldName, stringData);
        break;
      case TIMESTAMP:
        long[] longList = (long[]) data;
        writeTimestampListColumn(rowWriter, fieldName, longList);
        break;
      case INT:
        int[] intList = (int[]) data;
        writeIntListColumn(rowWriter, fieldName, intList);
        break;
      case SMALLINT:
        short[] shortList = (short[]) data;
        writeSmallIntColumn(rowWriter, fieldName, shortList);
        break;
      case TINYINT:
        byte[] byteList = (byte[]) data;
        writeByteListColumn(rowWriter, fieldName, byteList);
        break;
      case FLOAT4:
        float[] tempFloatList = (float[]) data;
        writeFloat4ListColumn(rowWriter, fieldName, tempFloatList);
        break;
      case FLOAT8:
        double[] tempDoubleList = (double[]) data;
        writeFloat8ListColumn(rowWriter, fieldName, tempDoubleList);
        break;
      case BIGINT:
        long[] tempBigIntList = (long[]) data;
        writeLongListColumn(rowWriter, fieldName, tempBigIntList);
        break;
      case MAP:
        try {
          getAndMapCompoundData(datapath, hdfFile, rowWriter);
        } catch (Exception e) {
          throw UserException
            .dataReadError()
            .message("Error writing Compound Field: " + e.getMessage())
            .addContext(errorContext)
            .build(logger);
        }
        break;
      default:
        // Case for data types that cannot be read
        logger.warn("{} not implemented.", currentDataType.name());
    }
  } else if (dimensions.length == 2) {
    // Case for 2D datasets; these are projected as lists of lists or maps of maps
    int cols = dimensions[1];
    int rows = dimensions[0];
    // TODO Add Boolean, SmallInt and TinyInt data types
    switch (HDF5Utils.getDataType(dataset.getDataType())) {
      case INT:
        int[][] colData = (int[][]) dataset.getData();
        mapIntMatrixField(colData, cols, rows, rowWriter);
        break;
      case FLOAT4:
        float[][] floatData = (float[][]) dataset.getData();
        mapFloatMatrixField(floatData, cols, rows, rowWriter);
        break;
      case FLOAT8:
        double[][] doubleData = (double[][]) dataset.getData();
        mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
        break;
      case BIGINT:
        long[][] longData = (long[][]) dataset.getData();
        mapBigIntMatrixField(longData, cols, rows, rowWriter);
        break;
      default:
        logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
    }
  } else if (dimensions.length > 2) {
    // Case for datasets with more than two dimensions
    int cols = dimensions[1];
    int rows = dimensions[0];
    switch (HDF5Utils.getDataType(dataset.getDataType())) {
      case INT:
        int[][] intMatrix = HDF5Utils.toIntMatrix((Object[]) dataset.getData());
        mapIntMatrixField(intMatrix, cols, rows, rowWriter);
        break;
      case FLOAT4:
        float[][] floatData = HDF5Utils.toFloatMatrix((Object[]) dataset.getData());
        mapFloatMatrixField(floatData, cols, rows, rowWriter);
        break;
      case FLOAT8:
        double[][] doubleData = HDF5Utils.toDoubleMatrix((Object[]) dataset.getData());
        mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
        break;
      case BIGINT:
        long[][] longData = HDF5Utils.toLongMatrix((Object[]) dataset.getData());
        mapBigIntMatrixField(longData, cols, rows, rowWriter);
        break;
      default:
        logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
    }
  }
}
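The write*ListColumn helpers called in the one-dimensional branch are not part of this excerpt. Below is a rough sketch of what one such helper could look like using Drill's EVF column writers; the PREVIEW_ROW_LIMIT constant and the on-the-fly column-creation details are assumptions for illustration, not the project's exact code.

// Sketch only: writes a 1D int array as a repeated INT column, adding the
// column dynamically because the metadata schema is not known in advance.
private static final int PREVIEW_ROW_LIMIT = 20;  // assumed preview cap, not from the source

private void writeIntListColumn(TupleWriter rowWriter, String name, int[] list) {
  int index = rowWriter.tupleSchema().index(name);
  if (index == -1) {
    ColumnMetadata colSchema = MetadataUtils.newScalar(name, MinorType.INT, DataMode.REPEATED);
    index = rowWriter.addColumn(colSchema);
  }
  ScalarWriter arrayWriter = rowWriter.column(index).array().scalar();
  int maxElements = Math.min(list.length, PREVIEW_ROW_LIMIT);
  for (int i = 0; i < maxElements; i++) {
    arrayWriter.setInt(list[i]);
  }
}

Since RowSetLoader extends TupleWriter, projectDataset can pass its rowWriter to a helper with this signature directly.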
Use of io.jhdf.api.Dataset in project drill by apache.
The class HDF5BatchReader, method open.
@Override
public boolean open(FileSchemaNegotiator negotiator) {
  split = negotiator.split();
  errorContext = negotiator.parentErrorContext();
  // Since the HDF5 reader uses a stream to actually read the file, the file name
  // from the module is incorrect, so take it from the split path instead.
  fileName = split.getPath().getName();
  try {
    openFile(negotiator);
  } catch (IOException e) {
    throw UserException
      .dataReadError(e)
      .addContext("Failed to open input file: %s", split.getPath())
      .addContext(errorContext)
      .build(logger);
  }

  ResultSetLoader loader;
  if (readerConfig.defaultPath == null) {
    // Get the file metadata
    List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
    metadataIterator = metadata.iterator();

    // Schema for the metadata query
    SchemaBuilder builder = new SchemaBuilder()
      .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
      .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
      .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
      .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
      .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
      .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
      .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
      .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
    negotiator.tableSchema(builder.buildSchema(), false);
    loader = negotiator.build();
    dimensions = new int[0];
    rowWriter = loader.writer();
  } else {
    // This is the case when a default path is specified. Since the user is explicitly
    // asking for a dataset, Drill can obtain the schema from the dataset's data types
    // below and ultimately map that schema to columns.
    Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
    dimensions = dataSet.getDimensions();
    loader = negotiator.build();
    rowWriter = loader.writer();
    writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
    if (dimensions.length <= 1) {
      buildSchemaFor1DimensionalDataset(dataSet);
    } else if (dimensions.length == 2) {
      buildSchemaFor2DimensionalDataset(dataSet);
    } else {
      // Case for datasets with more than two dimensions; these are automatically flattened
      buildSchemaFor2DimensionalDataset(dataSet);
    }
  }

  if (readerConfig.defaultPath == null) {
    pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
    dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
    fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
    dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
    linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
    elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
    datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
    dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
  }
  return true;
}
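getFileMetadata(...), which builds the list consumed by the metadata iterator, is also not shown in this excerpt. Below is a rough sketch of the kind of traversal it implies, walking the file tree with jHDF's Group/Node API and visiting every Dataset; the method name and the decision to collect only paths are assumptions for illustration.

// Sketch: recursively visit all nodes under a group and collect dataset paths.
// Assumes imports of io.jhdf.api.Group, io.jhdf.api.Node and io.jhdf.api.Dataset.
private void collectDatasetPaths(Group group, List<String> paths) {
  for (Node node : group.getChildren().values()) {
    if (node instanceof Group) {
      // Descend into child groups
      collectDatasetPaths((Group) node, paths);
    } else if (node instanceof Dataset) {
      paths.add(node.getPath());
    }
  }
}

The root call can pass the HdfFile itself, since HdfFile implements Group.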