
Example 16 with Deserializer

use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

the class VectorMapOperator method process.

@Override
public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The VectorPartitionContext needs to be changed if the input file has changed.
    ExecMapperContext context = getExecContext();
    if (context != null && context.inputFileChanged()) {
        // Let the child operators clean up since the input file has changed.
        cleanUpInputFileChanged();
    }
    if (!oneRootOperator.getDone()) {
        /*
         * Three different kinds of vectorized reading are supported:
         *
         *   1) Read the Vectorized Input File Format, which returns a VectorizedRowBatch as the row.
         *
         *   2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
         *
         *   3) Read using the regular partition deserializer to get the row object, then assign
         *      the row object into the VectorizedRowBatch with VectorAssignRow.
         */
        try {
            if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
                /*
                 * The Vectorized Input File Format reader has already set the partition column
                 * values, reset and filled in the batch, etc.
                 *
                 * We pass the VectorizedRowBatch through here.
                 */
                batchCounter++;
                if (value != null) {
                    numRows += ((VectorizedRowBatch) value).size;
                }
                oneRootOperator.process(value, 0);
                if (oneRootOperator.getDone()) {
                    setDone(true);
                    return;
                }
            } else {
                /*
                 * We have "regular" single rows from the Input File Format reader that we will
                 * need to deserialize.
                 */
                Preconditions.checkState(currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);
                if (deserializerBatch.size == deserializerBatch.DEFAULT_SIZE) {
                    numRows += deserializerBatch.size;
                    /*
                     * Feed the current full batch to the operator tree.
                     */
                    batchCounter++;
                    oneRootOperator.process(deserializerBatch, 0);
                    /*
                     * Only reset the current data columns. Do not reset data columns that were
                     * defaulted to NULL because they are not present in the partition, nor the
                     * partition columns.
                     */
                    for (int c = 0; c < currentDataColumnCount; c++) {
                        ColumnVector colVector = deserializerBatch.cols[c];
                        if (colVector != null) {
                            colVector.reset();
                            colVector.init();
                        }
                    }
                    deserializerBatch.selectedInUse = false;
                    deserializerBatch.size = 0;
                    deserializerBatch.endOfFile = false;
                    if (oneRootOperator.getDone()) {
                        setDone(true);
                        return;
                    }
                }
                /*
                 * Do the {vector|row} deserialization of the one row into the VectorizedRowBatch.
                 */
                switch(currentReadType) {
                    case VECTOR_DESERIALIZE:
                        {
                            BinaryComparable binComp = (BinaryComparable) value;
                            currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());
                            // Deserialize and append new row using the current batch size as the index.
                            try {
                                currentVectorDeserializeRow.deserialize(deserializerBatch, deserializerBatch.size++);
                            } catch (Exception e) {
                                throw new HiveException("\nDeserializeRead detail: " + currentVectorDeserializeRow.getDetailedReadPositionString(), e);
                            }
                        }
                        break;
                    case ROW_DESERIALIZE:
                        {
                            Object deserialized = currentPartDeserializer.deserialize(value);
                            // Note: Regardless of what the Input File Format returns, we have determined
                            // with VectorAssignRow.initConversion that only currentDataColumnCount columns
                            // have values we want.
                            //
                            // Any extra columns needed by the table schema were set to repeating null
                            // in the batch by setupPartitionContextVars.
                            // Convert input row to standard objects.
                            List<Object> standardObjects = new ArrayList<Object>();
                            ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized, currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
                            if (standardObjects.size() < currentDataColumnCount) {
                                throw new HiveException("Input File Format returned row with too few columns");
                            }
                            // Append the deserialized standard object row using the current batch size
                            // as the index.
                            currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++, standardObjects, currentDataColumnCount);
                        }
                        break;
                    default:
                        throw new RuntimeException("Unexpected vector MapOperator read type " + currentReadType.name());
                }
            }
        } catch (Exception e) {
            throw new HiveException("Hive Runtime Error while processing row ", e);
        }
    }
}
Also used : ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) BinaryComparable(org.apache.hadoop.io.BinaryComparable) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) List(java.util.List) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
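
For orientation, the flush-when-full pattern above can be exercised on its own: append rows to a VectorizedRowBatch until it reaches DEFAULT_SIZE, hand the full batch downstream, then reset the column vectors for reuse. A minimal sketch under that assumption follows; forwardBatch is a hypothetical stand-in for oneRootOperator.process, and the single long column is invented for illustration.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class BatchFlushSketch {
    // Hypothetical stand-in for oneRootOperator.process(batch, 0).
    static void forwardBatch(VectorizedRowBatch batch) {
        System.out.println("forwarding batch of " + batch.size + " rows");
    }

    public static void main(String[] args) {
        VectorizedRowBatch batch = new VectorizedRowBatch(1);
        batch.cols[0] = new LongColumnVector();
        for (long row = 0; row < 3000; row++) {
            if (batch.size == VectorizedRowBatch.DEFAULT_SIZE) {
                // Batch is full: feed it downstream, then reset the column vectors for reuse.
                forwardBatch(batch);
                batch.cols[0].reset();
                batch.cols[0].init();
                batch.selectedInUse = false;
                batch.size = 0;
            }
            // Append one row using the current batch size as the index.
            ((LongColumnVector) batch.cols[0]).vector[batch.size++] = row;
        }
        if (batch.size > 0) {
            forwardBatch(batch);  // flush the final partial batch
        }
    }
}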

Example 17 with Deserializer

use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

the class OpProcFactory method pushFilterToStorageHandler.

/**
   * Attempts to push a predicate down into a storage handler.  For
   * native tables, this is a no-op.
   *
   * @param tableScanOp table scan against which predicate applies
   *
   * @param originalPredicate predicate to be pushed down
   *
   * @param owi object walk info
   *
   * @param hiveConf Hive configuration
   *
   * @return portion of predicate which needs to be evaluated
   * by Hive as a post-filter, or null if it was possible
   * to push down the entire predicate
   */
private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(TableScanOperator tableScanOp, ExprNodeGenericFuncDesc originalPredicate, OpWalkerInfo owi, HiveConf hiveConf) {
    TableScanDesc tableScanDesc = tableScanOp.getConf();
    Table tbl = tableScanDesc.getTableMetadata();
    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER)) {
        // attach the original predicate to the table scan operator for index
        // optimizations that require the pushed predicate before pcr & later
        // optimizations are applied
        tableScanDesc.setFilterExpr(originalPredicate);
    }
    if (!tbl.isNonNative()) {
        return originalPredicate;
    }
    HiveStorageHandler storageHandler = tbl.getStorageHandler();
    if (!(storageHandler instanceof HiveStoragePredicateHandler)) {
        // The storage handler does not provide predicate decomposition
        // support, so we'll implement the entire filter in Hive.  However,
        // we still provide the full predicate to the storage handler in
        // case it wants to do any of its own prefiltering.
        tableScanDesc.setFilterExpr(originalPredicate);
        return originalPredicate;
    }
    HiveStoragePredicateHandler predicateHandler = (HiveStoragePredicateHandler) storageHandler;
    JobConf jobConf = new JobConf(owi.getParseContext().getConf());
    Utilities.setColumnNameList(jobConf, tableScanOp);
    Utilities.setColumnTypeList(jobConf, tableScanOp);
    Utilities.copyTableJobPropertiesToConf(Utilities.getTableDesc(tbl), jobConf);
    Deserializer deserializer = tbl.getDeserializer();
    HiveStoragePredicateHandler.DecomposedPredicate decomposed = predicateHandler.decomposePredicate(jobConf, deserializer, originalPredicate);
    if (decomposed == null) {
        // not able to push anything down
        if (LOG.isDebugEnabled()) {
            LOG.debug("No pushdown possible for predicate:  " + originalPredicate.getExprString());
        }
        return originalPredicate;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Original predicate:  " + originalPredicate.getExprString());
        if (decomposed.pushedPredicate != null) {
            LOG.debug("Pushed predicate:  " + decomposed.pushedPredicate.getExprString());
        }
        if (decomposed.residualPredicate != null) {
            LOG.debug("Residual predicate:  " + decomposed.residualPredicate.getExprString());
        }
    }
    tableScanDesc.setFilterExpr(decomposed.pushedPredicate);
    tableScanDesc.setFilterObject(decomposed.pushedPredicateObject);
    return decomposed.residualPredicate;
}
Also used : HiveStoragePredicateHandler(org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Table(org.apache.hadoop.hive.ql.metadata.Table) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) JobConf(org.apache.hadoop.mapred.JobConf)
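
The contract being exercised here is HiveStoragePredicateHandler.decomposePredicate, which splits the scan predicate into a part the storage layer will evaluate (pushedPredicate) and a part Hive must still apply as a post-filter (residualPredicate). A rough sketch of a handler that declines to push anything down might look like the following; PassthroughPredicateHandler is a made-up name, and the cast assumes the planner passes an ExprNodeGenericFuncDesc, as it does in the caller above.

import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical handler that pushes no predicate down to the storage layer.
public class PassthroughPredicateHandler implements HiveStoragePredicateHandler {
    @Override
    public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer,
            ExprNodeDesc predicate) {
        DecomposedPredicate decomposed = new DecomposedPredicate();
        // Nothing is pushed down, so Hive evaluates the full predicate as a post-filter.
        decomposed.pushedPredicate = null;
        decomposed.residualPredicate = (ExprNodeGenericFuncDesc) predicate;
        return decomposed;
    }
}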

Example 18 with Deserializer

use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

the class PlanUtils method getTableDesc.

/**
   * Generate a table descriptor from a createTableDesc.
   */
public static TableDesc getTableDesc(CreateTableDesc crtTblDesc, String cols, String colTypes) {
    TableDesc ret;
    // Resolve storage handler (if any)
    try {
        HiveStorageHandler storageHandler = null;
        if (crtTblDesc.getStorageHandler() != null) {
            storageHandler = HiveUtils.getStorageHandler(SessionState.getSessionConf(), crtTblDesc.getStorageHandler());
        }
        Class<? extends Deserializer> serdeClass = LazySimpleSerDe.class;
        String separatorCode = Integer.toString(Utilities.ctrlaCode);
        String columns = cols;
        String columnTypes = colTypes;
        boolean lastColumnTakesRestOfTheLine = false;
        if (storageHandler != null) {
            serdeClass = storageHandler.getSerDeClass();
        } else if (crtTblDesc.getSerName() != null) {
            serdeClass = JavaUtils.loadClass(crtTblDesc.getSerName());
        }
        if (crtTblDesc.getFieldDelim() != null) {
            separatorCode = crtTblDesc.getFieldDelim();
        }
        ret = getTableDesc(serdeClass, separatorCode, columns, columnTypes, lastColumnTakesRestOfTheLine, false);
        // set other table properties
        Properties properties = ret.getProperties();
        if (crtTblDesc.getStorageHandler() != null) {
            properties.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, crtTblDesc.getStorageHandler());
        }
        if (crtTblDesc.getCollItemDelim() != null) {
            properties.setProperty(serdeConstants.COLLECTION_DELIM, crtTblDesc.getCollItemDelim());
        }
        if (crtTblDesc.getMapKeyDelim() != null) {
            properties.setProperty(serdeConstants.MAPKEY_DELIM, crtTblDesc.getMapKeyDelim());
        }
        if (crtTblDesc.getFieldEscape() != null) {
            properties.setProperty(serdeConstants.ESCAPE_CHAR, crtTblDesc.getFieldEscape());
        }
        if (crtTblDesc.getLineDelim() != null) {
            properties.setProperty(serdeConstants.LINE_DELIM, crtTblDesc.getLineDelim());
        }
        if (crtTblDesc.getNullFormat() != null) {
            properties.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, crtTblDesc.getNullFormat());
        }
        if (crtTblDesc.getTableName() != null && crtTblDesc.getDatabaseName() != null) {
            properties.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME, crtTblDesc.getTableName());
        }
        if (crtTblDesc.getTblProps() != null) {
            properties.putAll(crtTblDesc.getTblProps());
        }
        if (crtTblDesc.getSerdeProps() != null) {
            properties.putAll(crtTblDesc.getSerdeProps());
        }
        // replace the default input & output file format with those found in
        // crtTblDesc
        Class<? extends InputFormat> in_class;
        if (storageHandler != null) {
            in_class = storageHandler.getInputFormatClass();
        } else {
            in_class = JavaUtils.loadClass(crtTblDesc.getInputFormat());
        }
        Class<? extends OutputFormat> out_class;
        if (storageHandler != null) {
            out_class = storageHandler.getOutputFormatClass();
        } else {
            out_class = JavaUtils.loadClass(crtTblDesc.getOutputFormat());
        }
        ret.setInputFileFormatClass(in_class);
        ret.setOutputFileFormatClass(out_class);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException("Unable to find class in getTableDesc: " + e.getMessage(), e);
    } catch (HiveException e) {
        throw new RuntimeException("Error loading storage handler in getTableDesc: " + e.getMessage(), e);
    }
    return ret;
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) Properties(java.util.Properties)
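
The SerDe resolution order above is: storage handler first, then an explicit SERDE name from the CREATE TABLE statement, and finally LazySimpleSerDe as the default. A condensed sketch of just that selection step, with hypothetical arguments standing in for the values normally read from CreateTableDesc:

import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

public class SerdeResolutionSketch {
    // storageHandler and serdeName are hypothetical inputs standing in for the
    // values normally taken from CreateTableDesc.
    static Class<? extends Deserializer> resolveSerde(HiveStorageHandler storageHandler,
            String serdeName) throws ClassNotFoundException {
        Class<? extends Deserializer> serdeClass = LazySimpleSerDe.class;  // default
        if (storageHandler != null) {
            serdeClass = storageHandler.getSerDeClass();       // storage handler wins
        } else if (serdeName != null) {
            serdeClass = JavaUtils.loadClass(serdeName);       // explicit ROW FORMAT SERDE
        }
        return serdeClass;
    }

    public static void main(String[] args) throws Exception {
        // With no storage handler and no explicit SerDe, the default applies.
        System.out.println(resolveSerde(null, null));
    }
}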

Example 19 with Deserializer

use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

the class MetaStoreUtils method getDeserializer.

/**
   * getDeserializer
   *
   * Get the Deserializer for a partition.
   *
   * @param conf
   *          - hadoop config
   * @param part
   *          the partition
   * @param table the table
   * @return
   *   Returns instantiated deserializer by looking up class name of deserializer stored in
   *   storage descriptor of passed in partition. Also, initializes the deserializer with
   *   schema of partition.
   * @exception MetaException
   *              if any problems instantiating the Deserializer
   *
   */
public static Deserializer getDeserializer(Configuration conf, org.apache.hadoop.hive.metastore.api.Partition part, org.apache.hadoop.hive.metastore.api.Table table) throws MetaException {
    String lib = part.getSd().getSerdeInfo().getSerializationLib();
    try {
        Deserializer deserializer = ReflectionUtil.newInstance(conf.getClassByName(lib).asSubclass(Deserializer.class), conf);
        SerDeUtils.initializeSerDe(deserializer, conf, MetaStoreUtils.getTableMetadata(table), MetaStoreUtils.getPartitionMetadata(part, table));
        return deserializer;
    } catch (RuntimeException e) {
        throw e;
    } catch (Exception e) {
        LOG.error("error in initSerDe: " + e.getClass().getName() + " " + e.getMessage(), e);
        throw new MetaException(e.getClass().getName() + " " + e.getMessage());
    }
}
Also used : Deserializer(org.apache.hadoop.hive.serde2.Deserializer) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) InvocationTargetException(java.lang.reflect.InvocationTargetException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) IOException(java.io.IOException) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
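
The same instantiate-then-initialize pattern can be tried directly against a concrete SerDe. A minimal sketch, assuming LazySimpleSerDe and a two-column schema supplied through the standard columns/columns.types properties (no partition-level overrides):

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.io.Text;

public class DeserializerInitSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Properties tableProps = new Properties();
        tableProps.setProperty("columns", "id,name");
        tableProps.setProperty("columns.types", "int,string");

        // Instantiate the SerDe, then initialize it with the table schema (no partition overrides).
        Deserializer deserializer = new LazySimpleSerDe();
        SerDeUtils.initializeSerDe(deserializer, conf, tableProps, null);

        // LazySimpleSerDe's default field delimiter is Ctrl-A (\u0001).
        Object row = deserializer.deserialize(new Text("7\u0001alice"));
        System.out.println(deserializer.getObjectInspector().getTypeName());
        System.out.println(row);
    }
}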

Example 20 with Deserializer

use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

the class MapOperator method populateVirtualColumnValues.

public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
    if (vcs == null) {
        return vcValues;
    }
    if (vcValues == null) {
        vcValues = new Object[vcs.size()];
    }
    for (int i = 0; i < vcs.size(); i++) {
        switch(vcs.get(i)) {
            case FILENAME:
                if (ctx.inputFileChanged()) {
                    vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
                }
                break;
            case BLOCKOFFSET:
                {
                    long current = ctx.getIoCxt().getCurrentBlockStart();
                    LongWritable old = (LongWritable) vcValues[i];
                    if (old == null) {
                        old = new LongWritable(current);
                        vcValues[i] = old;
                        continue;
                    }
                    if (current != old.get()) {
                        old.set(current);
                    }
                }
                break;
            case ROWOFFSET:
                {
                    long current = ctx.getIoCxt().getCurrentRow();
                    LongWritable old = (LongWritable) vcValues[i];
                    if (old == null) {
                        old = new LongWritable(current);
                        vcValues[i] = old;
                        continue;
                    }
                    if (current != old.get()) {
                        old.set(current);
                    }
                }
                break;
            case RAWDATASIZE:
                long current = 0L;
                SerDeStats stats = deserializer.getSerDeStats();
                if (stats != null) {
                    current = stats.getRawDataSize();
                }
                LongWritable old = (LongWritable) vcValues[i];
                if (old == null) {
                    old = new LongWritable(current);
                    vcValues[i] = old;
                    continue;
                }
                if (current != old.get()) {
                    old.set(current);
                }
                break;
            case ROWID:
                if (ctx.getIoCxt().getRecordIdentifier() == null) {
                    vcValues[i] = null;
                } else {
                    if (vcValues[i] == null) {
                        vcValues[i] = new Object[RecordIdentifier.Field.values().length];
                    }
                    RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(), (Object[]) vcValues[i]);
                    // Clear the record identifier so we don't accidentally cache the value; this
                    // shouldn't happen since the IO layer either knows how to produce ROW__ID or
                    // not - but to be safe.
                    ctx.getIoCxt().setRecordIdentifier(null);
                }
                break;
        }
    }
    return vcValues;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable)
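
The BLOCKOFFSET, ROWOFFSET, and RAWDATASIZE branches all follow the same reuse pattern: keep a mutable LongWritable in the values array and update it in place only when the underlying value changes, rather than allocating a new Writable per row. A stripped-down sketch of that pattern (updateSlot is illustrative, not a Hive API):

import org.apache.hadoop.io.LongWritable;

public class WritableReuseSketch {
    // Set slot i of vcValues to 'current', reusing the existing LongWritable when possible.
    static void updateSlot(Object[] vcValues, int i, long current) {
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
            vcValues[i] = new LongWritable(current);   // first row: allocate once
        } else if (current != old.get()) {
            old.set(current);                          // later rows: mutate in place
        }
    }

    public static void main(String[] args) {
        Object[] vcValues = new Object[1];
        updateSlot(vcValues, 0, 100L);
        LongWritable first = (LongWritable) vcValues[0];
        updateSlot(vcValues, 0, 200L);
        // Same object, updated value: no per-row allocation.
        System.out.println((first == vcValues[0]) + " " + vcValues[0]);
    }
}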

Aggregations

Deserializer (org.apache.hadoop.hive.serde2.Deserializer)19 ArrayList (java.util.ArrayList)14 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)14 SerDeException (org.apache.hadoop.hive.serde2.SerDeException)13 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)10 ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)10 IOException (java.io.IOException)7 Properties (java.util.Properties)7 Path (org.apache.hadoop.fs.Path)6 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)6 Configuration (org.apache.hadoop.conf.Configuration)5 HashMap (java.util.HashMap)4 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)4 PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)4 LazySimpleSerDe (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe)4 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)4 List (java.util.List)3 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)3 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)3 StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField)3