Search in sources :

Example 1 with ObjectDeserializer

use of co.cask.cdap.hive.serde.ObjectDeserializer in project cdap by caskdata.

the class StreamSerDe method initialize.

/**
 * Prepares this SerDe for a stream-backed Hive table by building the object inspector and,
 * when a context is available, the stream format and deserializer.
 *
 * <p>initialize gets called multiple times by Hive. It may seem like a good idea to put additional
 * settings into the conf, but be very careful when doing so. If there are multiple hive tables involved
 * in a query, initialize for each table is called before input splits are fetched for any table. It is
 * therefore not safe to put anything the input format may need into conf in this method. Rather, use
 * StorageHandler's method to place needed config into the properties map there, which will get passed
 * here and also copied into the job conf for the input format to consume.
 *
 * @param conf configuration used only to look up the {@code ContextManager.Context}
 * @param properties table properties; read for the stream name, the stream namespace and (through
 *                   {@code ObjectDeserializer}) the column information
 * @throws SerDeException if the namespace is set but the stream name is missing or empty, or if the
 *                        stream config or record format cannot be obtained
 */
@Override
public void initialize(Configuration conf, Properties properties) throws SerDeException {
    // The columns property comes from the Hive metastore, which has it from the create table statement.
    // It is then important that this schema be accurate and in the right order - the same order as
    // object inspectors will reflect them.
    String streamName = properties.getProperty(Constants.Explore.STREAM_NAME);
    String streamNamespace = properties.getProperty(Constants.Explore.STREAM_NAMESPACE);

    // To avoid a null pointer exception that prevents dropping a table, the null namespace case is
    // handled here. An ObjectInspector is still needed, as Hive uses it to check what columns the
    // table has.
    if (streamNamespace == null) {
        this.inspector = new ObjectDeserializer(properties, null).getInspector();
        return;
    }

    // Fail with the declared checked exception rather than letting the StreamId constructor
    // reject a missing name with an unchecked one.
    if (streamName == null || streamName.isEmpty()) {
        throw new SerDeException("Stream name not found in serde properties.");
    }

    StreamId streamId = new StreamId(streamNamespace, streamName);
    try (ContextManager.Context context = ContextManager.getContext(conf)) {
        // The schema stays null when no context is available; in that case initialize is only
        // being called to obtain the object inspector.
        Schema schema = null;
        if (context != null) {
            // Get the stream format from the stream config.
            FormatSpecification formatSpec = getFormatSpec(properties, streamId, context);
            this.streamFormat = (AbstractStreamEventRecordFormat) RecordFormats.createInitializedFormat(formatSpec);
            schema = formatSpec.getSchema();
        }
        this.deserializer = new ObjectDeserializer(properties, schema, BODY_OFFSET);
        this.inspector = deserializer.getInspector();
    } catch (UnsupportedTypeException e) {
        // This should have been validated up front when the schema was set on the stream.
        // If we hit this, something went wrong much earlier.
        LOG.error("Schema unsupported by format.", e);
        throw new SerDeException("Schema unsupported by format.", e);
    } catch (IOException e) {
        LOG.error("Could not get the config for stream {}.", streamName, e);
        throw new SerDeException("Could not get the config for stream " + streamName, e);
    } catch (Exception e) {
        // Catch-all for anything else raised while creating the format or closing the context.
        LOG.error("Could not create the format for stream {}.", streamName, e);
        throw new SerDeException("Could not create the format for stream " + streamName, e);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) ContextManager(co.cask.cdap.hive.context.ContextManager) Schema(co.cask.cdap.api.data.schema.Schema) FormatSpecification(co.cask.cdap.api.data.format.FormatSpecification) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException) IOException(java.io.IOException) ObjectDeserializer(co.cask.cdap.hive.serde.ObjectDeserializer) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)

Example 2 with ObjectDeserializer

use of co.cask.cdap.hive.serde.ObjectDeserializer in project cdap by caskdata.

the class DatasetSerDe method initialize.

/**
 * Prepares this SerDe for a dataset-backed Hive table: builds the deserializer, the serializer
 * and the object inspector from the table properties.
 *
 * @param conf configuration passed on to {@code getDatasetSchema}
 * @param properties table properties; read for the dataset name, the dataset namespace and the
 *                   comma-separated {@code columns} list
 * @throws SerDeException if the namespace is set but the dataset name is missing or empty
 */
@Override
public void initialize(Configuration conf, Properties properties) throws SerDeException {
    // The columns property comes from the Hive metastore, which has it from the create table statement.
    // It is then important that this schema be accurate and in the right order - the same order as
    // object inspectors will reflect them.
    final String datasetName = properties.getProperty(Constants.Explore.DATASET_NAME);
    final String namespace = properties.getProperty(Constants.Explore.DATASET_NAMESPACE);

    // A table without a namespace must still be droppable, so avoid a null pointer exception here.
    // Hive still needs an ObjectInspector to check what columns the table has.
    if (namespace == null) {
        ObjectDeserializer columnsOnlyDeserializer = new ObjectDeserializer(properties, null);
        this.objectInspector = columnsOnlyDeserializer.getInspector();
        return;
    }

    final boolean datasetNameMissing = datasetName == null || datasetName.isEmpty();
    if (datasetNameMissing) {
        throw new SerDeException("Dataset name not found in serde properties.");
    }

    // Only look the schema up while none is set yet.
    // NOTE(review): presumably getDatasetSchema stores its result in the schema field, since the
    // return value is not used here - confirm.
    if (schema == null) {
        getDatasetSchema(conf, new DatasetId(namespace, datasetName));
    }
    this.deserializer = new ObjectDeserializer(properties, schema);

    // The column names are kept for the serializer; per the original author's note, the inspector
    // handed to #serialize does not preserve them (maybe because it's an external table).
    String columnsProperty = properties.getProperty("columns");
    ArrayList<String> columnNames = Lists.newArrayList(StringUtils.split(columnsProperty, ","));
    this.serializer = new ObjectSerializer(columnNames);

    this.objectInspector = deserializer.getInspector();
}
Also used : ObjectSerializer(co.cask.cdap.hive.serde.ObjectSerializer) ObjectDeserializer(co.cask.cdap.hive.serde.ObjectDeserializer) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) DatasetId(co.cask.cdap.proto.id.DatasetId)

Aggregations

ObjectDeserializer (co.cask.cdap.hive.serde.ObjectDeserializer)2 SerDeException (org.apache.hadoop.hive.serde2.SerDeException)2 FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification)1 Schema (co.cask.cdap.api.data.schema.Schema)1 UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException)1 ContextManager (co.cask.cdap.hive.context.ContextManager)1 ObjectSerializer (co.cask.cdap.hive.serde.ObjectSerializer)1 DatasetId (co.cask.cdap.proto.id.DatasetId)1 StreamId (co.cask.cdap.proto.id.StreamId)1 IOException (java.io.IOException)1