Example 1 with Dataset

Use of io.jhdf.api.Dataset in project drill by apache: class HDF5BatchReader, method projectMetadataRow.

/**
 * Writes one row of HDF5 metadata.
 * @param rowWriter The input rowWriter object
 */
private void projectMetadataRow(RowSetLoader rowWriter) {
    HDF5DrillMetadata metadataRow = metadataIterator.next();
    rowWriter.start();
    pathWriter.setString(metadataRow.getPath());
    dataTypeWriter.setString(metadataRow.getDataType());
    fileNameWriter.setString(fileName);
    linkWriter.setBoolean(metadataRow.isLink());
    // Write attributes if present
    if (!metadataRow.getAttributes().isEmpty()) {
        writeAttributes(rowWriter, metadataRow);
    }
    if (metadataRow.getDataType().equalsIgnoreCase("DATASET")) {
        Dataset dataset = hdfFile.getDatasetByPath(metadataRow.getPath());
        // Project Dataset Metadata
        dataSizeWriter.setLong(dataset.getSizeInBytes());
        elementCountWriter.setLong(dataset.getSize());
        datasetTypeWriter.setString(dataset.getJavaType().getName());
        dimensionsWriter.setString(Arrays.toString(dataset.getDimensions()));
        // Do not project links
        if (!metadataRow.isLink() && showMetadataPreview) {
            projectDataset(rowWriter, metadataRow.getPath());
        }
    }
    rowWriter.save();
}
Also used: Dataset (io.jhdf.api.Dataset)
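
For orientation, the jHDF metadata calls used above (getSizeInBytes, getSize, getJavaType, getDimensions) can be exercised standalone. The following is a minimal sketch; the file name sample.h5 and the dataset path /g1/ds1 are hypothetical placeholders, not taken from the Drill source.

import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;

import java.nio.file.Paths;
import java.util.Arrays;

public class DatasetMetadataSketch {
    public static void main(String[] args) {
        // HdfFile is AutoCloseable; try-with-resources releases the file handle.
        try (HdfFile hdfFile = new HdfFile(Paths.get("sample.h5"))) {
            // Hypothetical dataset path; use a path that exists in your file.
            Dataset dataset = hdfFile.getDatasetByPath("/g1/ds1");
            System.out.println("bytes:      " + dataset.getSizeInBytes());
            System.out.println("elements:   " + dataset.getSize());
            System.out.println("java type:  " + dataset.getJavaType().getName());
            System.out.println("dimensions: " + Arrays.toString(dataset.getDimensions()));
        }
    }
}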

Example 2 with Dataset

Use of io.jhdf.api.Dataset in project drill by apache: class HDF5BatchReader, method projectDataset.

/**
 * Writes one row of data in a metadata query. The number of dimensions here
 * is n+1, so if the actual dataset is a 1D column, it is written as a list.
 * This function is only called in metadata queries, since the schema is not
 * known in advance. If the dataset is larger than 16 MB, a warning is logged
 * and the metadata preview is truncated.
 *
 * @param rowWriter
 *          The rowWriter to which the data will be written
 * @param datapath
 *          The datapath from which the data will be read
 */
private void projectDataset(RowSetLoader rowWriter, String datapath) {
    String fieldName = HDF5Utils.getNameFromPath(datapath);
    Dataset dataset = hdfFile.getDatasetByPath(datapath);
    // Warn if the dataset exceeds 16MB; the metadata preview will be truncated
    if (dataset.getSizeInBytes() > MAX_DATASET_SIZE) {
        logger.warn("Dataset {} is greater than 16MB. Data will be truncated in Metadata view.", datapath);
    }
    int[] dimensions = dataset.getDimensions();
    // Case for single dimensional data
    if (dimensions.length == 1) {
        MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
        Object data;
        try {
            data = dataset.getData();
        } catch (Exception e) {
            logger.debug("Error reading {}", datapath);
            return;
        }
        assert currentDataType != null;
        // Skip null datasets
        if (data == null) {
            return;
        }
        switch(currentDataType) {
            case GENERIC_OBJECT:
                logger.warn("Couldn't read {}", datapath);
                break;
            case VARCHAR:
                String[] stringData = (String[]) data;
                writeStringListColumn(rowWriter, fieldName, stringData);
                break;
            case TIMESTAMP:
                long[] longList = (long[]) data;
                writeTimestampListColumn(rowWriter, fieldName, longList);
                break;
            case INT:
                int[] intList = (int[]) data;
                writeIntListColumn(rowWriter, fieldName, intList);
                break;
            case SMALLINT:
                short[] shortList = (short[]) data;
                writeSmallIntColumn(rowWriter, fieldName, shortList);
                break;
            case TINYINT:
                byte[] byteList = (byte[]) data;
                writeByteListColumn(rowWriter, fieldName, byteList);
                break;
            case FLOAT4:
                float[] tempFloatList = (float[]) data;
                writeFloat4ListColumn(rowWriter, fieldName, tempFloatList);
                break;
            case FLOAT8:
                double[] tempDoubleList = (double[]) data;
                writeFloat8ListColumn(rowWriter, fieldName, tempDoubleList);
                break;
            case BIGINT:
                long[] tempBigIntList = (long[]) data;
                writeLongListColumn(rowWriter, fieldName, tempBigIntList);
                break;
            case MAP:
                try {
                    getAndMapCompoundData(datapath, hdfFile, rowWriter);
                } catch (Exception e) {
                    throw UserException.dataReadError()
                        .message("Error writing Compound Field: " + e.getMessage())
                        .addContext(errorContext)
                        .build(logger);
                }
                break;
            default:
                // Case for data types that cannot be read
                logger.warn("{} not implemented.", currentDataType.name());
        }
    } else if (dimensions.length == 2) {
        // Case for 2D data sets.  These are projected as lists of lists or maps of maps
        int cols = dimensions[1];
        int rows = dimensions[0];
        // TODO Add Boolean, Small and TinyInt data types
        switch(HDF5Utils.getDataType(dataset.getDataType())) {
            case INT:
                int[][] colData = (int[][]) dataset.getData();
                mapIntMatrixField(colData, cols, rows, rowWriter);
                break;
            case FLOAT4:
                float[][] floatData = (float[][]) dataset.getData();
                mapFloatMatrixField(floatData, cols, rows, rowWriter);
                break;
            case FLOAT8:
                double[][] doubleData = (double[][]) dataset.getData();
                mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
                break;
            case BIGINT:
                long[][] longData = (long[][]) dataset.getData();
                mapBigIntMatrixField(longData, cols, rows, rowWriter);
                break;
            default:
                logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
        }
    } else if (dimensions.length > 2) {
        // Case for data sets with dimensions > 2
        int cols = dimensions[1];
        int rows = dimensions[0];
        switch(HDF5Utils.getDataType(dataset.getDataType())) {
            case INT:
                int[][] intMatrix = HDF5Utils.toIntMatrix((Object[]) dataset.getData());
                mapIntMatrixField(intMatrix, cols, rows, rowWriter);
                break;
            case FLOAT4:
                float[][] floatData = HDF5Utils.toFloatMatrix((Object[]) dataset.getData());
                mapFloatMatrixField(floatData, cols, rows, rowWriter);
                break;
            case FLOAT8:
                double[][] doubleData = HDF5Utils.toDoubleMatrix((Object[]) dataset.getData());
                mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
                break;
            case BIGINT:
                long[][] longData = HDF5Utils.toLongMatrix((Object[]) dataset.getData());
                mapBigIntMatrixField(longData, cols, rows, rowWriter);
                break;
            default:
                logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
        }
    }
}
Also used: Dataset (io.jhdf.api.Dataset), UserException (org.apache.drill.common.exceptions.UserException), HdfException (io.jhdf.exceptions.HdfException), IOException (java.io.IOException), MinorType (org.apache.drill.common.types.TypeProtos.MinorType)
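
The casts in the switches above rely on a jHDF convention: Dataset.getData() returns a plain Java array whose runtime type mirrors the dataset's rank and element type, e.g. int[] for a 1D integer dataset and double[][] for a 2D double dataset. A minimal sketch of that dispatch, again with hypothetical file and path names:

import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;

import java.nio.file.Paths;
import java.util.Arrays;

public class DatasetDataSketch {
    public static void main(String[] args) {
        try (HdfFile hdfFile = new HdfFile(Paths.get("sample.h5"))) {
            Dataset dataset = hdfFile.getDatasetByPath("/g1/ds1");  // hypothetical path
            Object data = dataset.getData();
            int rank = dataset.getDimensions().length;
            // The runtime array type of getData() matches rank and element type.
            if (rank == 1 && data instanceof int[]) {
                System.out.println(Arrays.toString((int[]) data));
            } else if (rank == 2 && data instanceof double[][]) {
                System.out.println(Arrays.deepToString((double[][]) data));
            } else {
                System.out.println("Unhandled shape/type: " + data.getClass());
            }
        }
    }
}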

Example 3 with Dataset

Use of io.jhdf.api.Dataset in project drill by apache: class HDF5BatchReader, method open.

@Override
public boolean open(FileSchemaNegotiator negotiator) {
    split = negotiator.split();
    errorContext = negotiator.parentErrorContext();
    // Since the HDF5 reader consumes the file through a stream, the file name
    // must be taken from the split path.
    fileName = split.getPath().getName();
    try {
        openFile(negotiator);
    } catch (IOException e) {
        throw UserException.dataReadError(e)
            .addContext("Failed to open input file: %s", split.getPath())
            .addContext(errorContext)
            .build(logger);
    }
    ResultSetLoader loader;
    if (readerConfig.defaultPath == null) {
        // Get file metadata
        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
        metadataIterator = metadata.iterator();
        // Schema for Metadata query
        SchemaBuilder builder = new SchemaBuilder()
            .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
            .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
            .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
            .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
            .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
        negotiator.tableSchema(builder.buildSchema(), false);
        loader = negotiator.build();
        dimensions = new int[0];
        rowWriter = loader.writer();
    } else {
        // This is the case when the default path is specified. Since the user is explicitly asking for a
        // dataset, Drill can obtain the schema from the dataset's data types and map that schema to columns.
        Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
        dimensions = dataSet.getDimensions();
        loader = negotiator.build();
        rowWriter = loader.writer();
        writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
        if (dimensions.length <= 1) {
            buildSchemaFor1DimensionalDataset(dataSet);
        } else if (dimensions.length == 2) {
            buildSchemaFor2DimensionalDataset(dataSet);
        } else {
            // Case for datasets of greater than 2D
            // These are automatically flattened
            buildSchemaFor2DimensionalDataset(dataSet);
        }
    }
    if (readerConfig.defaultPath == null) {
        pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
        dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
        fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
        dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
        linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
        elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
        datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
        dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
    }
    return true;
}
Also used: ResultSetLoader (org.apache.drill.exec.physical.resultSet.ResultSetLoader), Dataset (io.jhdf.api.Dataset), SchemaBuilder (org.apache.drill.exec.record.metadata.SchemaBuilder), WriterSpec (org.apache.drill.exec.store.hdf5.writers.WriterSpec), IOException (java.io.IOException)
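
The getFileMetadata() call above (not shown on this page) recursively walks the HDF5 node tree to collect the metadata rows. A minimal sketch of such a walk using jHDF's Group and Node interfaces; the file name is again a placeholder, and this recursion is illustrative rather than Drill's actual implementation:

import io.jhdf.HdfFile;
import io.jhdf.api.Group;
import io.jhdf.api.Node;

import java.nio.file.Paths;

public class TreeWalkSketch {
    public static void main(String[] args) {
        try (HdfFile hdfFile = new HdfFile(Paths.get("sample.h5"))) {
            walk(hdfFile);  // HdfFile is itself the root Group
        }
    }

    private static void walk(Group group) {
        // Group implements Iterable<Node>, so children can be traversed directly.
        for (Node node : group) {
            System.out.println(node.getType() + "  " + node.getPath());
            if (node instanceof Group) {
                walk((Group) node);
            }
        }
    }
}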

Aggregations

Dataset (io.jhdf.api.Dataset): 3 usages
IOException (java.io.IOException): 2 usages
HdfException (io.jhdf.exceptions.HdfException): 1 usage
UserException (org.apache.drill.common.exceptions.UserException): 1 usage
MinorType (org.apache.drill.common.types.TypeProtos.MinorType): 1 usage
ResultSetLoader (org.apache.drill.exec.physical.resultSet.ResultSetLoader): 1 usage
SchemaBuilder (org.apache.drill.exec.record.metadata.SchemaBuilder): 1 usage
WriterSpec (org.apache.drill.exec.store.hdf5.writers.WriterSpec): 1 usage