
Example 1 with CreateStatementBuilder

Use of co.cask.cdap.explore.table.CreateStatementBuilder in project cdap by caskdata.

From class ExploreTableManager, method generateFileSetCreateStatement.

/**
 * Generate a create statement for a ((time-)partitioned) file set.
 *
 * @param dataset the instantiated dataset
 * @param datasetId the dataset id
 * @param properties the properties from dataset specification
 * @param truncating whether this call to create() is part of a truncate() operation. The effect is:
 *                   if possessExisting is true, the truncate() has just dropped this dataset,
 *                   which also deleted the Explore table, so we must recreate it.
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws IllegalArgumentException if the schema cannot be parsed, or if required format
 *                                  properties are missing from the dataset spec.
 */
@Nullable
private String generateFileSetCreateStatement(DatasetId datasetId, Dataset dataset,
                                              Map<String, String> properties,
                                              boolean truncating)
    throws IllegalArgumentException, ExploreException {
    String tableName = tableNaming.getTableName(datasetId, properties);
    String databaseName = ExploreProperties.getExploreDatabaseName(properties);
    Map<String, String> tableProperties = FileSetProperties.getTableProperties(properties);
    // if this dataset reuses an existing table, do not attempt to create it
    if (FileSetProperties.isUseExisting(tableProperties)
        || (FileSetProperties.isPossessExisting(tableProperties) && !truncating)) {
        try {
            exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
            // table exists: do not attempt to create
            return null;
        } catch (TableNotFoundException e) {
            throw new ExploreException(String.format(
                "Dataset '%s' is configured to use an existing explore table, but table '%s' does not "
                + "exist in database '%s'. ", datasetId.getDataset(), tableName, databaseName));
        }
    }
    Location baseLocation;
    Partitioning partitioning = null;
    if (dataset instanceof PartitionedFileSet) {
        partitioning = ((PartitionedFileSet) dataset).getPartitioning();
        baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
    } else {
        baseLocation = ((FileSet) dataset).getBaseLocation();
    }
    CreateStatementBuilder createStatementBuilder =
        new CreateStatementBuilder(datasetId.getDataset(), databaseName, tableName, shouldEscapeColumns)
            .setLocation(baseLocation)
            .setPartitioning(partitioning)
            .setTableProperties(tableProperties);
    String schema = FileSetProperties.getExploreSchema(properties);
    String format = FileSetProperties.getExploreFormat(properties);
    if (format != null) {
        if ("parquet".equals(format)) {
            return createStatementBuilder.setSchema(FileSetProperties.getExploreSchema(properties))
                .buildWithFileFormat("parquet");
        }
        // for text and csv, we know what to do
        Preconditions.checkArgument("text".equals(format) || "csv".equals(format), "Only text and csv are supported as native formats");
        Preconditions.checkNotNull(schema, "for native formats, explore schema must be given in dataset properties");
        String delimiter = null;
        if ("text".equals(format)) {
            delimiter = FileSetProperties.getExploreFormatProperties(properties).get("delimiter");
        } else if ("csv".equals(format)) {
            delimiter = ",";
        }
        return createStatementBuilder.setSchema(schema)
            .setRowFormatDelimited(delimiter, null)
            .buildWithFileFormat("TEXTFILE");
    } else {
        // A schema is optional here: Hive tables can be created without one. For example,
        // Avro tables can be created by setting the avro.schema.literal table property instead.
        if (schema != null) {
            createStatementBuilder.setSchema(schema);
        }
        // format not given, look for serde, input format, etc.
        String serde = FileSetProperties.getSerDe(properties);
        String inputFormat = FileSetProperties.getExploreInputFormat(properties);
        String outputFormat = FileSetProperties.getExploreOutputFormat(properties);
        Preconditions.checkArgument(serde != null && inputFormat != null && outputFormat != null,
            "All of SerDe, InputFormat and OutputFormat must be given in dataset properties");
        return createStatementBuilder.setRowFormatSerde(serde)
            .buildWithFormats(inputFormat, outputFormat);
    }
}
Also used: Partitioning (co.cask.cdap.api.dataset.lib.Partitioning), CreateStatementBuilder (co.cask.cdap.explore.table.CreateStatementBuilder), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), Location (org.apache.twill.filesystem.Location), Nullable (javax.annotation.Nullable)
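
For context, the sketch below shows the kind of dataset properties that steer this method into the text/csv branch. It is a minimal sketch assuming CDAP's FileSetProperties.Builder API; the base path and explore schema are invented values, not taken from the example above.

import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.FileSetProperties;

public class CsvFileSetPropertiesSketch {
    public static DatasetProperties csvExploreProperties() {
        // Explore-enabled CSV file set: generateFileSetCreateStatement would take the
        // "text"/"csv" branch, pick "," as the delimiter, and build a TEXTFILE table.
        return FileSetProperties.builder()
            .setBasePath("events")                       // hypothetical base path
            .setEnableExploreOnCreate(true)              // makes FileSetProperties.isExploreEnabled(...) true
            .setExploreFormat("csv")                     // format != null; "csv" implies delimiter ","
            .setExploreSchema("ts bigint, body string")  // native formats require an explore schema
            .build();
    }
}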

Example 2 with CreateStatementBuilder

Use of co.cask.cdap.explore.table.CreateStatementBuilder in project cdap by caskdata.

From class ExploreTableManager, method generateEnableStatement.

/**
 * Generate a Hive DDL statement to create a Hive table for the given dataset.
 *
 * @param dataset the instantiated dataset
 * @param spec the dataset specification
 * @param datasetId the dataset id
 * @param truncating whether this call to create() is part of a truncate() operation, which is in some
 *                   cases implemented using disableExplore() followed by enableExplore()
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws UnsupportedTypeException if the dataset is a RecordScannable of a type that is not supported by Hive
 */
@Nullable
private String generateEnableStatement(Dataset dataset, DatasetSpecification spec, DatasetId datasetId,
                                       String tableName, boolean truncating)
    throws UnsupportedTypeException, ExploreException {
    String datasetName = datasetId.getDataset();
    Map<String, String> serdeProperties = ImmutableMap.of(
        Constants.Explore.DATASET_NAME, datasetId.getDataset(),
        Constants.Explore.DATASET_NAMESPACE, datasetId.getNamespace());
    // To be enabled for explore, a dataset must either be RecordScannable or RecordWritable,
    // or it must be a FileSet or a PartitionedFileSet with explore enabled in its properties.
    if (dataset instanceof Table) {
        // It is valid for a table not to have a schema property; this logic should really be in Table.
        return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, false);
    }
    if (dataset instanceof ObjectMappedTable) {
        return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
    }
    boolean isRecordScannable = dataset instanceof RecordScannable;
    boolean isRecordWritable = dataset instanceof RecordWritable;
    if (isRecordScannable || isRecordWritable) {
        Type recordType = isRecordScannable
            ? ((RecordScannable) dataset).getRecordType()
            : ((RecordWritable) dataset).getRecordType();
        // Use == rather than instanceof: we want exactly StructuredRecord, not a subclass.
        if (StructuredRecord.class == recordType) {
            return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
        }
        // otherwise, derive the schema from the record type
        LOG.debug("Enabling explore for dataset instance {}", datasetName);
        String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
        return new CreateStatementBuilder(datasetName, databaseName, tableName, shouldEscapeColumns)
            .setSchema(hiveSchemaFor(recordType))
            .setTableComment("CDAP Dataset")
            .buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
    } else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
        Map<String, String> properties = spec.getProperties();
        if (FileSetProperties.isExploreEnabled(properties)) {
            LOG.debug("Enabling explore for dataset instance {}", datasetName);
            return generateFileSetCreateStatement(datasetId, dataset, properties, truncating);
        }
    }
    // dataset is not explorable
    return null;
}
Also used: Table (co.cask.cdap.api.dataset.table.Table), ObjectMappedTable (co.cask.cdap.api.dataset.lib.ObjectMappedTable), RecordWritable (co.cask.cdap.api.data.batch.RecordWritable), FileSet (co.cask.cdap.api.dataset.lib.FileSet), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), CreateStatementBuilder (co.cask.cdap.explore.table.CreateStatementBuilder), RecordScannable (co.cask.cdap.api.data.batch.RecordScannable), Type (java.lang.reflect.Type), DatasetStorageHandler (co.cask.cdap.hive.datasets.DatasetStorageHandler), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Nullable (javax.annotation.Nullable)
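
To make the reflection branch concrete, here is a hypothetical record type (PurchaseRecord is an invented name) that a RecordScannable dataset might expose. Because it is not StructuredRecord, generateEnableStatement falls through to hiveSchemaFor(recordType) and derives the Hive columns from the class's fields.

// Hypothetical record type for a RecordScannable<PurchaseRecord> dataset.
// hiveSchemaFor(PurchaseRecord.class) would derive a Hive schema roughly like
// (customer string, item string, price double).
public class PurchaseRecord {
    public String customer;
    public String item;
    public double price;
}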

Example 3 with CreateStatementBuilder

Use of co.cask.cdap.explore.table.CreateStatementBuilder in project cdap by caskdata.

From class ExploreTableManager, method enableStream.

/**
 * Enable exploration on a stream by creating a corresponding Hive table. Enabling exploration on a
 * stream that has already been enabled is a no-op. Assumes the stream actually exists.
 *
 * @param tableName name of the Hive table to create
 * @param streamId the ID of the stream
 * @param formatSpec the format specification for the table
 * @return query handle for creating the Hive table for the stream
 * @throws UnsupportedTypeException if the stream schema is not compatible with Hive
 * @throws ExploreException if there was an exception submitting the create table statement
 * @throws SQLException if there was a problem with the create table statement
 */
public QueryHandle enableStream(String tableName, StreamId streamId, FormatSpecification formatSpec)
    throws UnsupportedTypeException, ExploreException, SQLException {
    String streamName = streamId.getStream();
    LOG.debug("Enabling explore for stream {} with table {}", streamId, tableName);
    // The schema of a stream is always the timestamp, the headers, and then the schema of the body.
    List<Schema.Field> fields = Lists.newArrayList(
        Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))));
    if (formatSpec.getSchema() != null) {
        fields.addAll(formatSpec.getSchema().getFields());
    }
    Schema schema = Schema.recordOf("streamEvent", fields);
    Map<String, String> serdeProperties = ImmutableMap.of(
        Constants.Explore.STREAM_NAME, streamName,
        Constants.Explore.STREAM_NAMESPACE, streamId.getNamespace(),
        Constants.Explore.FORMAT_SPEC, GSON.toJson(formatSpec));
    String createStatement = new CreateStatementBuilder(streamName, null, tableName, shouldEscapeColumns)
        .setSchema(schema)
        .setTableComment("CDAP Stream")
        .buildWithStorageHandler(StreamStorageHandler.class.getName(), serdeProperties);
    LOG.debug("Running create statement for stream {} with table {}: {}", streamName, tableName, createStatement);
    return exploreService.execute(streamId.getParent(), createStatement);
}
Also used: StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), Schema (co.cask.cdap.api.data.schema.Schema), CreateStatementBuilder (co.cask.cdap.explore.table.CreateStatementBuilder), StreamStorageHandler (co.cask.cdap.hive.stream.StreamStorageHandler)
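
As a usage illustration, here is a minimal sketch, assuming CDAP's FormatSpecification and Formats APIs, of the kind of format specification this method accepts; the body field names are invented. enableStream would prepend the ts and headers fields to this body schema before creating the Hive table.

import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.Formats;
import co.cask.cdap.api.data.schema.Schema;
import java.util.Collections;

public class StreamFormatSketch {
    public static FormatSpecification csvBodyFormat() {
        // Body schema of the stream: two CSV columns (hypothetical names).
        Schema body = Schema.recordOf("body",
            Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
            Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
        // enableStream() prepends "ts" and "headers", so the resulting Hive table
        // would have columns (ts, headers, name, price).
        return new FormatSpecification(Formats.CSV, body, Collections.<String, String>emptyMap());
    }
}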

Example 4 with CreateStatementBuilder

Use of co.cask.cdap.explore.table.CreateStatementBuilder in project cdap by caskdata.

From class ExploreTableManager, method generateCreateStatementFromSchemaProperty.

/**
 * Generate a create statement from the "schema" property of the dataset (specification). This is used for
 * Table, ObjectMappedTable and RecordScannables with record type StructuredRecord, all of which use the
 * {@link DatasetStorageHandler}.
 *
 * @param spec the dataset specification
 * @param datasetId the dataset id
 * @param serdeProperties properties to be passed to the {@link co.cask.cdap.hive.datasets.DatasetSerDe}
 * @param shouldErrorOnMissingSchema whether the schema is required.
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws UnsupportedTypeException if the schema cannot be represented in Hive
 * @throws IllegalArgumentException if the schema cannot be parsed, or if shouldErrorOnMissingSchema is true and
 *                                  the dataset spec does not contain a schema.
 */
@Nullable
private String generateCreateStatementFromSchemaProperty(DatasetSpecification spec, DatasetId datasetId,
                                                         String tableName,
                                                         Map<String, String> serdeProperties,
                                                         boolean shouldErrorOnMissingSchema)
    throws UnsupportedTypeException {
    Schema schema = getSchemaFromProperty(spec, datasetId, shouldErrorOnMissingSchema);
    if (schema == null) {
        return null;
    }
    String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
    return new CreateStatementBuilder(datasetId.getDataset(), databaseName, tableName, shouldEscapeColumns)
        .setSchema(schema)
        .setTableComment("CDAP Dataset")
        .buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
}
Also used: DatasetStorageHandler (co.cask.cdap.hive.datasets.DatasetStorageHandler), Schema (co.cask.cdap.api.data.schema.Schema), CreateStatementBuilder (co.cask.cdap.explore.table.CreateStatementBuilder), Nullable (javax.annotation.Nullable)
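
For reference, a minimal sketch of parsing the kind of "schema" dataset property this method consumes, assuming CDAP's Schema.parseJson; the JSON content is an invented example. The getSchemaFromProperty(...) helper above presumably does something similar, plus the missing-schema handling controlled by shouldErrorOnMissingSchema.

import co.cask.cdap.api.data.schema.Schema;
import java.io.IOException;

public class SchemaPropertySketch {
    public static Schema parseSchemaProperty() throws IOException {
        // The dataset's "schema" property holds an Avro-style JSON schema (invented example).
        String json = "{\"type\":\"record\",\"name\":\"purchase\","
            + "\"fields\":[{\"name\":\"customer\",\"type\":\"string\"},"
            + "{\"name\":\"price\",\"type\":\"double\"}]}";
        // Schema.parseJson turns it into the CDAP Schema that CreateStatementBuilder.setSchema expects.
        return Schema.parseJson(json);
    }
}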

Aggregations

CreateStatementBuilder (co.cask.cdap.explore.table.CreateStatementBuilder): 4 uses
Nullable (javax.annotation.Nullable): 3 uses
Schema (co.cask.cdap.api.data.schema.Schema): 2 uses
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2 uses
DatasetStorageHandler (co.cask.cdap.hive.datasets.DatasetStorageHandler): 2 uses
RecordScannable (co.cask.cdap.api.data.batch.RecordScannable): 1 use
RecordWritable (co.cask.cdap.api.data.batch.RecordWritable): 1 use
FileSet (co.cask.cdap.api.dataset.lib.FileSet): 1 use
ObjectMappedTable (co.cask.cdap.api.dataset.lib.ObjectMappedTable): 1 use
Partitioning (co.cask.cdap.api.dataset.lib.Partitioning): 1 use
Table (co.cask.cdap.api.dataset.table.Table): 1 use
StreamStorageHandler (co.cask.cdap.hive.stream.StreamStorageHandler): 1 use
ImmutableMap (com.google.common.collect.ImmutableMap): 1 use
Type (java.lang.reflect.Type): 1 use
Map (java.util.Map): 1 use
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 1 use
Location (org.apache.twill.filesystem.Location): 1 use