Search in sources :

Example 1 with WriterSpec

Usage of org.apache.drill.exec.store.hdf5.writers.WriterSpec in the Apache Drill project.

From the class HDF5BatchReader, the method open.

@Override
public boolean open(FileSchemaNegotiator negotiator) {
    split = negotiator.split();
    errorContext = negotiator.parentErrorContext();
    // Since the HDF file reader uses a stream to actually read the file, the file name from the
    // module is incorrect.
    fileName = split.getPath().getName();
    try {
        openFile(negotiator);
    } catch (IOException e) {
        // Bug fix: the failure happens while *opening* the file; the previous
        // message incorrectly said "Failed to close input file".
        throw UserException.dataReadError(e)
            .addContext("Failed to open input file: %s", split.getPath())
            .addContext(errorContext)
            .build(logger);
    }
    ResultSetLoader loader;
    if (readerConfig.defaultPath == null) {
        // Metadata query: no dataset path was provided, so each row describes the
        // file's structure (paths, types, sizes, link info) rather than data.
        List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
        metadataIterator = metadata.iterator();
        // Fixed schema for the metadata query
        SchemaBuilder builder = new SchemaBuilder()
            .addNullable(PATH_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(DATA_TYPE_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(FILE_NAME_COLUMN_NAME, MinorType.VARCHAR)
            .addNullable(DATA_SIZE_COLUMN_NAME, MinorType.BIGINT)
            .addNullable(IS_LINK_COLUMN_NAME, MinorType.BIT)
            .addNullable(ELEMENT_COUNT_NAME, MinorType.BIGINT)
            .addNullable(DATASET_DATA_TYPE_NAME, MinorType.VARCHAR)
            .addNullable(DIMENSIONS_FIELD_NAME, MinorType.VARCHAR);
        negotiator.tableSchema(builder.buildSchema(), false);
        loader = negotiator.build();
        dimensions = new int[0];
        rowWriter = loader.writer();
        // Cache the scalar column writers once; the metadata schema above is fixed.
        // (Previously done in a second, redundant defaultPath == null check.)
        pathWriter = rowWriter.scalar(PATH_COLUMN_NAME);
        dataTypeWriter = rowWriter.scalar(DATA_TYPE_COLUMN_NAME);
        fileNameWriter = rowWriter.scalar(FILE_NAME_COLUMN_NAME);
        dataSizeWriter = rowWriter.scalar(DATA_SIZE_COLUMN_NAME);
        linkWriter = rowWriter.scalar(IS_LINK_COLUMN_NAME);
        elementCountWriter = rowWriter.scalar(ELEMENT_COUNT_NAME);
        datasetTypeWriter = rowWriter.scalar(DATASET_DATA_TYPE_NAME);
        dimensionsWriter = rowWriter.scalar(DIMENSIONS_FIELD_NAME);
    } else {
        // This is the case when the default path is specified. Since the user is explicitly
        // asking for a dataset, Drill can obtain the schema by getting the data types below
        // and ultimately mapping that schema to columns.
        Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
        dimensions = dataSet.getDimensions();
        loader = negotiator.build();
        rowWriter = loader.writer();
        writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(), negotiator.parentErrorContext());
        if (dimensions.length <= 1) {
            buildSchemaFor1DimensionalDataset(dataSet);
        } else {
            // Datasets with 2 or more dimensions; anything above 2D is
            // automatically flattened by the 2D schema builder.
            buildSchemaFor2DimensionalDataset(dataSet);
        }
    }
    return true;
}
Also used : ResultSetLoader(org.apache.drill.exec.physical.resultSet.ResultSetLoader) Dataset(io.jhdf.api.Dataset) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) WriterSpec(org.apache.drill.exec.store.hdf5.writers.WriterSpec) IOException(java.io.IOException)

Example 2 with WriterSpec

Usage of org.apache.drill.exec.store.hdf5.writers.WriterSpec in the Apache Drill project.

From the class HDF5BatchReader, the method buildSchemaFor2DimensionalDataset.

/**
 * Builds the Drill schema for a dataset with two or more dimensions by
 * registering one data writer per column in the {@code dataWriters} list.
 * HDF5 restricts multidimensional datasets to INT, LONG, DOUBLE and FLOAT
 * element types, so this method handles fewer types than the
 * one-dimensional schema builder.
 *
 * @param dataset
 *          the dataset from which Drill builds the schema
 */
private void buildSchemaFor2DimensionalDataset(Dataset dataset) {
    MinorType dataType = HDF5Utils.getDataType(dataset.getDataType());
    // Skip datasets whose element type cannot be mapped to a Drill type.
    if (dataType == null) {
        logger.warn("Couldn't add {}", dataset.getJavaType().getName());
        return;
    }
    long columnCount = dimensions[1];
    for (int col = 0; col < columnCount; col++) {
        String fieldName;
        switch (dataType) {
            case INT:
                fieldName = INT_COLUMN_PREFIX + col;
                dataWriters.add(new HDF5IntDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, fieldName, col));
                break;
            case BIGINT:
                fieldName = LONG_COLUMN_PREFIX + col;
                dataWriters.add(new HDF5LongDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, fieldName, col));
                break;
            case FLOAT8:
                fieldName = DOUBLE_COLUMN_PREFIX + col;
                dataWriters.add(new HDF5DoubleDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, fieldName, col));
                break;
            case FLOAT4:
                fieldName = FLOAT_COLUMN_PREFIX + col;
                dataWriters.add(new HDF5FloatDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, fieldName, col));
                break;
            default:
                throw new UnsupportedOperationException(dataType.name());
        }
    }
}
Also used : HDF5DoubleDataWriter(org.apache.drill.exec.store.hdf5.writers.HDF5DoubleDataWriter) HDF5IntDataWriter(org.apache.drill.exec.store.hdf5.writers.HDF5IntDataWriter) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) HDF5FloatDataWriter(org.apache.drill.exec.store.hdf5.writers.HDF5FloatDataWriter) HDF5LongDataWriter(org.apache.drill.exec.store.hdf5.writers.HDF5LongDataWriter)

Aggregations

Dataset (io.jhdf.api.Dataset)1 IOException (java.io.IOException)1 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)1 ResultSetLoader (org.apache.drill.exec.physical.resultSet.ResultSetLoader)1 SchemaBuilder (org.apache.drill.exec.record.metadata.SchemaBuilder)1 HDF5DoubleDataWriter (org.apache.drill.exec.store.hdf5.writers.HDF5DoubleDataWriter)1 HDF5FloatDataWriter (org.apache.drill.exec.store.hdf5.writers.HDF5FloatDataWriter)1 HDF5IntDataWriter (org.apache.drill.exec.store.hdf5.writers.HDF5IntDataWriter)1 HDF5LongDataWriter (org.apache.drill.exec.store.hdf5.writers.HDF5LongDataWriter)1 WriterSpec (org.apache.drill.exec.store.hdf5.writers.WriterSpec)1