use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class HiveInputFormat method wrapForLlap.
public static InputFormat<WritableComparable, Writable> wrapForLlap(
    InputFormat<WritableComparable, Writable> inputFormat, Configuration conf,
    PartitionDesc part) throws HiveException {
  if (!HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon())) {
    // LLAP not enabled, no-op.
    return inputFormat;
  }
  String ifName = inputFormat.getClass().getCanonicalName();
  boolean isSupported = inputFormat instanceof LlapWrappableInputFormatInterface;
  boolean isCacheOnly = inputFormat instanceof LlapCacheOnlyInputFormatInterface;
  boolean isVectorized = Utilities.getIsVectorized(conf);
  if (!isVectorized) {
    // Pretend it's vectorized if the non-vector wrapper is enabled.
    isVectorized = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED)
        && (Utilities.getPlanPath(conf) != null);
  }
  boolean isSerdeBased = false;
  if (isVectorized && !isSupported
      && HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENCODE_ENABLED)) {
    // See if we can use re-encoding to read the format through the IO elevator.
    isSupported = isSerdeBased = checkInputFormatForLlapEncode(conf, ifName);
  }
  if ((!isSupported || !isVectorized) && !isCacheOnly) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Not using llap for " + ifName + ": supported = " + isSupported
          + ", vectorized = " + isVectorized + ", cache only = " + isCacheOnly);
    }
    return inputFormat;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Processing " + ifName);
  }
  @SuppressWarnings("unchecked")
  LlapIo<VectorizedRowBatch> llapIo = LlapProxy.getIo();
  if (llapIo == null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Not using LLAP IO because it is not initialized");
    }
    return inputFormat;
  }
  Deserializer serde = null;
  if (isSerdeBased) {
    if (part == null) {
      if (isCacheOnly) {
        LOG.info("Using cache only because there's no partition spec for SerDe-based IF");
        injectLlapCaches(inputFormat, llapIo);
      } else {
        LOG.info("Not using LLAP IO because there's no partition spec for SerDe-based IF");
      }
      return inputFormat;
    }
    serde = findSerDeForLlapSerDeIf(conf, part);
  }
  if (isSupported && isVectorized) {
    InputFormat<?, ?> wrappedIf = llapIo.getInputFormat(inputFormat, serde);
    // null means we cannot wrap; the cause is logged inside.
    if (wrappedIf != null) {
      return castInputFormat(wrappedIf);
    }
  }
  if (isCacheOnly) {
    injectLlapCaches(inputFormat, llapIo);
  }
  return inputFormat;
}
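For the SerDe-based path, findSerDeForLlapSerDeIf has to hand the LLAP IO layer a fully initialized Deserializer built from the partition's table metadata. Below is a minimal sketch of that initialization pattern, assuming imports for PartitionDesc, TableDesc, Deserializer, ReflectionUtils and Configuration; the helper name is hypothetical and this is not Hive's actual implementation, only the Deserializer calls also used in the other snippets on this page.

// Illustrative sketch only: build a Deserializer from a PartitionDesc the way a
// SerDe-based LLAP wrapper needs one. The helper name is hypothetical.
private static Deserializer createPartitionDeserializer(Configuration conf, PartitionDesc part)
    throws Exception {
  TableDesc tableDesc = part.getTableDesc();
  // Instantiate the configured SerDe class and initialize it with the table properties
  // (column names, column types, serialization format, ...).
  Deserializer serde = ReflectionUtils.newInstance(tableDesc.getDeserializerClass(), conf);
  serde.initialize(conf, tableDesc.getProperties());
  return serde;
}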
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class DynamicValueRegistryTez method init.
@Override
public void init(RegistryConf conf) throws Exception {
  RegistryConfTez rct = (RegistryConfTez) conf;
  for (String inputSourceName : rct.baseWork.getInputSourceToRuntimeValuesInfo().keySet()) {
    LOG.info("Runtime value source: " + inputSourceName);
    LogicalInput runtimeValueInput = rct.inputs.get(inputSourceName);
    RuntimeValuesInfo runtimeValuesInfo =
        rct.baseWork.getInputSourceToRuntimeValuesInfo().get(inputSourceName);
    // Set up the deserializer/object inspector for the incoming data source.
    Deserializer deserializer =
        ReflectionUtils.newInstance(runtimeValuesInfo.getTableDesc().getDeserializerClass(), null);
    deserializer.initialize(rct.conf, runtimeValuesInfo.getTableDesc().getProperties());
    ObjectInspector inspector = deserializer.getObjectInspector();
    // Set up column expressions for the dynamic values using this input.
    List<ExprNodeEvaluator> colExprEvaluators = new ArrayList<ExprNodeEvaluator>();
    for (ExprNodeDesc expr : runtimeValuesInfo.getColExprs()) {
      ExprNodeEvaluator exprEval = ExprNodeEvaluatorFactory.get(expr, null);
      exprEval.initialize(inspector);
      colExprEvaluators.add(exprEval);
    }
    runtimeValueInput.start();
    List<Input> inputList = new ArrayList<Input>();
    inputList.add(runtimeValueInput);
    rct.processorContext.waitForAllInputsReady(inputList);
    KeyValueReader kvReader = (KeyValueReader) runtimeValueInput.getReader();
    long rowCount = 0;
    while (kvReader.next()) {
      Object row = deserializer.deserialize((Writable) kvReader.getCurrentValue());
      rowCount++;
      for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) {
        // Evaluate each expression and save the result to the value registry.
        ExprNodeEvaluator eval = colExprEvaluators.get(colIdx);
        Object val = eval.evaluate(row);
        setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), val);
      }
    }
    // For now, expecting a single row (min/max, aggregated bloom filter) or no rows.
    if (rowCount == 0) {
      LOG.debug("No input rows from " + inputSourceName + ", filling dynamic values with nulls");
      for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) {
        setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), null);
      }
    } else if (rowCount > 1) {
      throw new IllegalStateException("Expected 0 or 1 rows from " + inputSourceName
          + ", got " + rowCount);
    }
  }
}
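The snippet above follows the standard Deserializer lifecycle: instantiate, initialize with the table properties, then turn each incoming Writable into a row object that an ObjectInspector can walk. The standalone sketch below shows that same lifecycle with LazySimpleSerDe on delimited text; the schema and sample row are made up for illustration and are not taken from Hive.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.serdeConstants;
import org.apache.hadoop.io.Text;

public class DeserializerLifecycleSketch {
  public static void main(String[] args) throws Exception {
    // Table properties: two columns, comma-delimited text.
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "name,age");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string:int");
    props.setProperty(serdeConstants.FIELD_DELIM, ",");

    Deserializer deserializer = new LazySimpleSerDe();
    deserializer.initialize(new Configuration(), props);

    // Deserialize one Writable row and read it through the ObjectInspector.
    StructObjectInspector oi = (StructObjectInspector) deserializer.getObjectInspector();
    Object row = deserializer.deserialize(new Text("alice,42"));
    // Convert the lazy row to plain Java objects, e.g. [alice, 42].
    System.out.println(
        ObjectInspectorUtils.copyToStandardObject(row, oi, ObjectInspectorCopyOption.JAVA));
  }
}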
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class VectorMapOperator method process.
@Override
public void process(Writable value) throws HiveException {
  // A mapper can span multiple files/partitions.
  // The VectorPartitionContext needs to be changed if the input file changes.
  ExecMapperContext context = getExecContext();
  if (context != null && context.inputFileChanged()) {
    // Clean up the child operators if the input file has changed.
    cleanUpInputFileChanged();
  }
  if (!oneRootOperator.getDone()) {
    /*
     * 3 different kinds of vectorized reading are supported:
     *
     * 1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
     *
     * 2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
     *
     * 3) Read using the regular partition deserializer to get the row object and assign
     *    the row object into the VectorizedRowBatch with VectorAssignRow.
     */
    try {
      if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
        if (!deliverVectorizedRowBatch(value)) {
          // Operator tree is now done.
          return;
        }
      } else if (value instanceof VectorizedRowBatch) {
        /*
         * Clear out any rows we may have processed in row-mode for the current partition.
         */
        if (!flushDeserializerBatch()) {
          // Operator tree is now done.
          return;
        }
        if (!deliverVectorizedRowBatch(value)) {
          // Operator tree is now done.
          return;
        }
      } else {
        /*
         * We have a "regular" single row from the Input File Format reader that we will need
         * to deserialize.
         */
        Preconditions.checkState(currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE
            || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);
        if (deserializerBatch.size == deserializerBatch.DEFAULT_SIZE) {
          numRows += deserializerBatch.size;
          /*
           * Feed the current full batch to the operator tree.
           */
          batchCounter++;
          oneRootOperator.process(deserializerBatch, 0);
          /*
           * Only reset the current data columns. Not any data columns defaulted to NULL
           * because they are not present in the partition, and not partition columns.
           */
          for (int c = 0; c < currentDataColumnCount; c++) {
            ColumnVector colVector = deserializerBatch.cols[c];
            if (colVector != null) {
              colVector.reset();
              colVector.init();
            }
          }
          deserializerBatch.selectedInUse = false;
          deserializerBatch.size = 0;
          deserializerBatch.endOfFile = false;
          if (oneRootOperator.getDone()) {
            setDone(true);
            return;
          }
        }
        /*
         * Do the {vector|row} deserialization of the one row into the VectorizedRowBatch.
         */
        switch (currentReadType) {
        case VECTOR_DESERIALIZE:
          {
            BinaryComparable binComp = (BinaryComparable) value;
            currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());
            // Deserialize and append the new row using the current batch size as the index.
            try {
              currentVectorDeserializeRow.deserialize(deserializerBatch, deserializerBatch.size++);
            } catch (Exception e) {
              throw new HiveException("\nDeserializeRead detail: "
                  + currentVectorDeserializeRow.getDetailedReadPositionString(), e);
            }
          }
          break;
        case ROW_DESERIALIZE:
          {
            Object deserialized = currentPartDeserializer.deserialize(value);
            // Note: Regardless of what the Input File Format returns, we have determined
            // with VectorAppendRow.initConversion that only currentDataColumnCount columns
            // have values we want.
            //
            // Any extra columns needed by the table schema were set to repeating null
            // in the batch by setupPartitionContextVars.
            // Convert the input row to standard objects.
            List<Object> standardObjects = new ArrayList<Object>();
            ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
                currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
            if (standardObjects.size() < currentDataColumnCount) {
              throw new HiveException("Input File Format returned row with too few columns");
            }
            // Append the deserialized standard object row using the current batch size
            // as the index.
            currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
                standardObjects, currentDataColumnCount);
          }
          break;
        default:
          throw new RuntimeException("Unexpected vector MapOperator read type "
              + currentReadType.name());
        }
      }
    } catch (Exception e) {
      throw new HiveException("Hive Runtime Error while processing row", e);
    }
  }
}
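The ROW_DESERIALIZE branch is where the Deserializer actually does its work: the partition SerDe turns the Writable into a row object, ObjectInspectorUtils normalizes it, and VectorAssignRow writes it into the batch. The condensed sketch below isolates just that hand-off; the enclosing helper is hypothetical, while the fields and calls are the ones used in process() above.

// Hypothetical helper isolating the ROW_DESERIALIZE hand-off shown above.
// currentPartDeserializer, currentPartRawRowObjectInspector, currentVectorAssign,
// deserializerBatch and currentDataColumnCount are the fields used in process().
private void appendRowDeserialized(Writable value) throws HiveException {
  try {
    // 1) Let the partition's Deserializer turn the Writable into a (possibly lazy) row object.
    Object deserialized = currentPartDeserializer.deserialize(value);
    // 2) Copy the struct fields into standard writable objects.
    List<Object> standardObjects = new ArrayList<Object>();
    ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
        currentPartRawRowObjectInspector, ObjectInspectorCopyOption.WRITABLE);
    // 3) Assign the row into the VectorizedRowBatch at the next free slot.
    currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
        standardObjects, currentDataColumnCount);
  } catch (Exception e) {
    throw new HiveException("Row-mode deserialization failed", e);
  }
}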
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class MapOperator method getConvertedOI.
// Returns the mapping from table descriptor to the expected table OI.
/**
 * Traverse all the partitions for a table, and get the OI for the table.
 * Note that a conversion is required if any of the partition OIs is different
 * from the table OI. For example, if the query references table T (partitions P1, P2),
 * and P1's schema is the same as T, whereas P2's schema is different from T, conversion
 * might be needed for both P1 and P2, since a SettableOI might be needed for T.
 */
private Map<TableDesc, StructObjectInspector> getConvertedOI(Map<String, Configuration> tableToConf)
    throws HiveException {
  Map<TableDesc, StructObjectInspector> tableDescOI =
      new HashMap<TableDesc, StructObjectInspector>();
  Set<TableDesc> identityConverterTableDesc = new HashSet<TableDesc>();
  try {
    Map<ObjectInspector, Boolean> oiSettableProperties = new HashMap<ObjectInspector, Boolean>();
    for (Path onefile : conf.getPathToAliases().keySet()) {
      PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
      TableDesc tableDesc = pd.getTableDesc();
      Configuration hconf = tableToConf.get(tableDesc.getTableName());
      Deserializer partDeserializer = pd.getDeserializer(hconf);
      StructObjectInspector partRawRowObjectInspector;
      boolean isAcid = AcidUtils.isTablePropertyTransactional(tableDesc.getProperties());
      if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid)
          && Utilities.isInputFileFormatSelfDescribing(pd)) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector();
      } else {
        partRawRowObjectInspector = (StructObjectInspector) partDeserializer.getObjectInspector();
      }
      StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc);
      if ((tblRawRowObjectInspector == null) || (identityConverterTableDesc.contains(tableDesc))) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        tblRawRowObjectInspector = (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(
            partRawRowObjectInspector, tblDeserializer.getObjectInspector(), oiSettableProperties);
        if (identityConverterTableDesc.contains(tableDesc)) {
          if (!partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
            identityConverterTableDesc.remove(tableDesc);
          }
        } else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
          identityConverterTableDesc.add(tableDesc);
        }
        tableDescOI.put(tableDesc, tblRawRowObjectInspector);
      }
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
  return tableDescOI;
}
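The converted OI produced here is only half of the story: at read time a matching converter adapts each partition row into the table's shape. Below is a hedged sketch of how getConvertedOI pairs with ObjectInspectorConverters.getConverter; the helper and its arguments are illustrative stand-ins (assuming the usual serde2 imports, including ObjectInspectorConverters.Converter), not code from MapOperator.

// Illustrative sketch: read a partition row through an OI converted toward the table schema.
static Object toTableShape(Deserializer partDeserializer, Deserializer tblDeserializer,
    Writable rawValue) throws Exception {
  ObjectInspector partOI = partDeserializer.getObjectInspector();
  ObjectInspector tableOI = tblDeserializer.getObjectInspector();
  // Same idea as in getConvertedOI: produce an OI with settable fields where conversion is needed.
  ObjectInspector convertedOI = ObjectInspectorConverters.getConvertedOI(partOI, tableOI);
  // A converter from the partition shape to the table-compatible shape; identity if they match.
  Converter partToTable = ObjectInspectorConverters.getConverter(partOI, convertedOI);
  Object partRow = partDeserializer.deserialize(rawValue);
  return partToTable.convert(partRow);
}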
use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.
the class MapOperator method populateVirtualColumnValues.
public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs,
    Object[] vcValues, Deserializer deserializer) {
  if (vcs == null) {
    return vcValues;
  }
  if (vcValues == null) {
    vcValues = new Object[vcs.size()];
  }
  for (int i = 0; i < vcs.size(); i++) {
    switch (vcs.get(i)) {
    case FILENAME:
      if (ctx.inputFileChanged()) {
        vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
      }
      break;
    case BLOCKOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentBlockStart();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case ROWOFFSET:
      {
        long current = ctx.getIoCxt().getCurrentRow();
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case RAWDATASIZE:
      {
        long current = 0L;
        SerDeStats stats = deserializer.getSerDeStats();
        if (stats != null) {
          current = stats.getRawDataSize();
        }
        LongWritable old = (LongWritable) vcValues[i];
        if (old == null) {
          old = new LongWritable(current);
          vcValues[i] = old;
          continue;
        }
        if (current != old.get()) {
          old.set(current);
        }
      }
      break;
    case ROWID:
      if (ctx.getIoCxt().getRecordIdentifier() == null) {
        vcValues[i] = null;
      } else {
        if (vcValues[i] == null) {
          vcValues[i] = new Object[RecordIdentifier.Field.values().length];
        }
        RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(),
            (Object[]) vcValues[i]);
        // Clear the record identifier so we don't accidentally cache the value; this shouldn't
        // happen since the IO layer either knows how to produce ROW__ID or not - but to be safe.
        ctx.getIoCxt().setRecordIdentifier(null);
      }
      break;
    }
  }
  return vcValues;
}
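A hedged sketch of a call site for this method follows; ctx and partDeserializer are stand-ins for the ExecMapperContext and partition Deserializer the enclosing operator already holds, and only the RAWDATASIZE column actually consults the Deserializer (via getSerDeStats()).

// Hypothetical call site: refresh virtual column values for the current row.
List<VirtualColumn> vcs = Arrays.asList(
    VirtualColumn.FILENAME, VirtualColumn.BLOCKOFFSET, VirtualColumn.RAWDATASIZE);
Object[] vcValues = null;
// Reuse the returned array across rows; entries are updated in place where possible.
vcValues = MapOperator.populateVirtualColumnValues(ctx, vcs, vcValues, partDeserializer);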