
Example 1 with ObjectInspectorUtils

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils in project hive by apache.

In class ReduceSinkOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    try {
        numRows = 0;
        cntr = 1;
        logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
        List<ExprNodeDesc> keys = conf.getKeyCols();
        if (LOG.isDebugEnabled()) {
            LOG.debug("keys size is " + keys.size());
            for (ExprNodeDesc k : keys) {
                LOG.debug("Key exprNodeDesc " + k.getExprString());
            }
        }
        keyEval = new ExprNodeEvaluator[keys.size()];
        int i = 0;
        for (ExprNodeDesc e : keys) {
            if (e instanceof ExprNodeGenericFuncDesc && ((ExprNodeGenericFuncDesc) e).getGenericUDF() instanceof GenericUDFBucketNumber) {
                buckColIdxInKeyForSdpo = i;
            }
            keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        numDistributionKeys = conf.getNumDistributionKeys();
        distinctColIndices = conf.getDistinctColumnIndices();
        numDistinctExprs = distinctColIndices.size();
        valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getValueCols()) {
            valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getPartitionCols()) {
            int index = ExprNodeDescUtils.indexOf(e, keys);
            partitionEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
        }
        if (conf.getBucketCols() != null && !conf.getBucketCols().isEmpty()) {
            bucketEval = new ExprNodeEvaluator[conf.getBucketCols().size()];
            i = 0;
            for (ExprNodeDesc e : conf.getBucketCols()) {
                int index = ExprNodeDescUtils.indexOf(e, keys);
                bucketEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
            }
            buckColIdxInKey = conf.getPartitionCols().size();
        }
        tag = conf.getTag();
        tagByte[0] = (byte) tag;
        skipTag = conf.getSkipTag();
        LOG.info("Using tag = " + tag);
        TableDesc keyTableDesc = conf.getKeySerializeInfo();
        AbstractSerDe keySerDe = keyTableDesc.getSerDeClass().newInstance();
        keySerDe.initialize(null, keyTableDesc.getProperties(), null);
        keySerializer = keySerDe;
        keyIsText = keySerializer.getSerializedClass().equals(Text.class);
        TableDesc valueTableDesc = conf.getValueSerializeInfo();
        AbstractSerDe valueSerDe = valueTableDesc.getSerDeClass().newInstance();
        valueSerDe.initialize(null, valueTableDesc.getProperties(), null);
        valueSerializer = valueSerDe;
        int limit = conf.getTopN();
        float memUsage = conf.getTopNMemoryUsage();
        if (limit >= 0 && memUsage > 0) {
            reducerHash = conf.isPTFReduceSink() ? new PTFTopNHash() : new TopNHash();
            reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
        }
        useUniformHash = conf.getReducerTraits().contains(UNIFORM);
        firstRow = true;
        // The acidOp flag has to be checked so that the JAVA hash, which works like an
        // identity function for integers, is used; this is necessary to read
        // RecordIdentifier in case of ACID updates/deletes.
        boolean acidOp = conf.getWriteType() == AcidUtils.Operation.UPDATE || conf.getWriteType() == AcidUtils.Operation.DELETE;
        hashFunc = getConf().getBucketingVersion() == 2 && !acidOp ? ObjectInspectorUtils::getBucketHashCode : ObjectInspectorUtils::getBucketHashCodeOld;
    } catch (Exception e) {
        String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
        LOG.error(msg, e);
        throw new RuntimeException(e);
    }
}
Also used : GenericUDFBucketNumber(org.apache.hadoop.hive.ql.udf.generic.GenericUDFBucketNumber) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
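The ObjectInspectorUtils usage here is the final assignment: the operator picks ObjectInspectorUtils::getBucketHashCode (the bucketing-version-2 hash) or ObjectInspectorUtils::getBucketHashCodeOld depending on the table's bucketing version and whether the write is an ACID UPDATE/DELETE. Below is a minimal standalone sketch of that selection, not the operator itself; the BiFunction type for hashFunc and the sample field values are illustrative assumptions, while the two ObjectInspectorUtils methods are the ones referenced above.

import java.util.function.BiFunction;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class BucketHashCodeSketch {
    public static void main(String[] args) {
        // Inputs the operator derives from its descriptor; hard-coded here for illustration.
        int bucketingVersion = 2;
        boolean acidOp = false; // true for ACID UPDATE/DELETE, which forces the old hash

        BiFunction<Object[], ObjectInspector[], Integer> hashFunc =
                bucketingVersion == 2 && !acidOp
                        ? ObjectInspectorUtils::getBucketHashCode
                        : ObjectInspectorUtils::getBucketHashCodeOld;

        // Evaluated bucket/partition column values plus their matching object inspectors.
        Object[] fields = new Object[] { 42, "hive" };
        ObjectInspector[] inspectors = new ObjectInspector[] {
                PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector };

        System.out.println("bucket hash = " + hashFunc.apply(fields, inspectors));
    }
}

In the operator the field values come from the evaluated key/partition expressions and the inspectors from the initialized evaluators; only the selection logic is reproduced here.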

Example 2 with ObjectInspectorUtils

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils in project hive by apache.

In class FileSinkOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    try {
        this.hconf = hconf;
        filesCreated = false;
        isTemporary = conf.isTemporary();
        multiFileSpray = conf.isMultiFileSpray();
        this.isBucketed = hconf.getInt(hive_metastoreConstants.BUCKET_COUNT, 0) > 0;
        totalFiles = conf.getTotalFiles();
        numFiles = conf.getNumFiles();
        dpCtx = conf.getDynPartCtx();
        lbCtx = conf.getLbCtx();
        fsp = prevFsp = null;
        valToPaths = new HashMap<String, FSPaths>();
        taskId = originalTaskId = Utilities.getTaskId(hconf);
        initializeSpecPath();
        fs = specPath.getFileSystem(hconf);
        if (hconf instanceof JobConf) {
            jc = (JobConf) hconf;
        } else {
            // test code path
            jc = new JobConf(hconf);
        }
        try {
            createHiveOutputFormat(jc);
        } catch (HiveException ex) {
            logOutputFormatError(hconf, ex);
            throw ex;
        }
        isCompressed = conf.getCompressed();
        if (conf.isLinkedFileSink() && conf.isDirectInsert()) {
            parent = Utilities.toTempPath(conf.getFinalDirName());
        } else {
            parent = Utilities.toTempPath(conf.getDirName());
        }
        statsFromRecordWriter = new boolean[numFiles];
        AbstractSerDe serde = conf.getTableInfo().getSerDeClass().newInstance();
        serde.initialize(unsetNestedColumnPaths(hconf), conf.getTableInfo().getProperties(), null);
        serializer = serde;
        outputClass = serializer.getSerializedClass();
        destTablePath = conf.getDestPath();
        isInsertOverwrite = conf.getInsertOverwrite();
        counterGroup = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVECOUNTERGROUP);
        LOG.info("Using serializer : " + serializer + " and formatter : " + hiveOutputFormat + (isCompressed ? " with compression" : ""));
        // Timeout is chosen to make sure that even if one iteration takes more than
        // half of the script.timeout but less than script.timeout, we will still
        // be able to report progress.
        timeOut = hconf.getInt("mapred.healthChecker.script.timeout", 600000) / 2;
        if (multiFileSpray) {
            partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
            int i = 0;
            for (ExprNodeDesc e : conf.getPartitionCols()) {
                partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
            }
            partitionObjectInspectors = initEvaluators(partitionEval, outputObjInspector);
            prtner = (HivePartitioner<HiveKey, Object>) ReflectionUtils.newInstance(jc.getPartitionerClass(), null);
        }
        if (dpCtx != null && !inspectPartitionValues()) {
            dpSetup();
        }
        if (lbCtx != null) {
            lbSetup();
        }
        if (!bDynParts) {
            fsp = new FSPaths(specPath, conf.isMmTable(), conf.isDirectInsert(), conf.getInsertOverwrite(), conf.getAcidOperation());
            fsp.subdirAfterTxn = combinePathFragments(generateListBucketingDirName(null), unionPath);
            if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                Utilities.FILE_OP_LOGGER.trace("creating new paths " + System.identityHashCode(fsp) + " from ctor; childSpec " + unionPath + ": tmpPath " + fsp.buildTmpPath() + ", task path " + fsp.buildTaskOutputTempPath());
            }
            // createBucketFiles(fsp);
            if (!this.isSkewedStoredAsSubDirectories) {
                // special entry for non-DP case
                valToPaths.put("", fsp);
            }
        }
        final StoragePolicyValue tmpStorage = StoragePolicyValue.lookup(HiveConf.getVar(hconf, HIVE_TEMPORARY_TABLE_STORAGE));
        if (isTemporary && fsp != null && tmpStorage != StoragePolicyValue.DEFAULT) {
            // Not supported for temp tables.
            assert !conf.isMmTable();
            final Path outputPath = fsp.buildTaskOutputTempPath();
            StoragePolicyShim shim = ShimLoader.getHadoopShims().getStoragePolicyShim(fs);
            if (shim != null) {
                // directory creation is otherwise within the writers
                fs.mkdirs(outputPath);
                shim.setStoragePolicy(outputPath, tmpStorage);
            }
        }
        if (conf.getWriteType() == AcidUtils.Operation.UPDATE || conf.getWriteType() == AcidUtils.Operation.DELETE) {
            // ROW__ID is always in the first field
            recIdField = ((StructObjectInspector) outputObjInspector).getAllStructFieldRefs().get(0);
            recIdInspector = (StructObjectInspector) recIdField.getFieldObjectInspector();
            // bucket is the second field in the record id
            bucketField = recIdInspector.getAllStructFieldRefs().get(1);
            bucketInspector = (IntObjectInspector) bucketField.getFieldObjectInspector();
        }
        numRows = 0;
        cntr = 1;
        logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
        statsMap.put(getCounterName(Counter.RECORDS_OUT), row_count);
        // Setup hashcode
        hashFunc = conf.getTableInfo().getBucketingVersion() == 2 ? ObjectInspectorUtils::getBucketHashCode : ObjectInspectorUtils::getBucketHashCodeOld;
        // This count is used to get total number of rows in an insert query.
        if (conf.getTableInfo() != null && conf.getTableInfo().getTableName() != null) {
            statsMap.put(TOTAL_TABLE_ROWS_WRITTEN, row_count);
        }
    } catch (HiveException e) {
        throw e;
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) StoragePolicyValue(org.apache.hadoop.hive.shims.HadoopShims.StoragePolicyValue) HiveFatalException(org.apache.hadoop.hive.ql.metadata.HiveFatalException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) HiveKey(org.apache.hadoop.hive.ql.io.HiveKey) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) JobConf(org.apache.hadoop.mapred.JobConf) StoragePolicyShim(org.apache.hadoop.hive.shims.HadoopShims.StoragePolicyShim) SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
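Besides the same bucketing-version-dependent hashFunc selection seen in ReduceSinkOperator, this operator bootstraps its serializer by instantiating the SerDe class named in the TableDesc and calling the three-argument initialize with the table properties before the first row is written. The sketch below exercises that pattern directly; the choice of LazySimpleSerDe and the column properties are assumptions for illustration, and the three-argument initialize overload is the one used in the code above.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

public class SerDeInitSketch {
    public static void main(String[] args) throws Exception {
        // Table schema that TableDesc.getProperties() would normally provide.
        Properties tableProps = new Properties();
        tableProps.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
        tableProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");

        // The operator does this reflectively via conf.getTableInfo().getSerDeClass().newInstance().
        AbstractSerDe serde = new LazySimpleSerDe();
        serde.initialize(new Configuration(), tableProps, null);

        // FileSinkOperator records this as outputClass; LazySimpleSerDe serializes to Text.
        System.out.println("serialized class: " + serde.getSerializedClass());
    }
}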

Example 3 with ObjectInspectorUtils

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils in project hive by apache.

In class VectorReduceSinkObjectHashOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    VectorExpression.doTransientInit(reduceSinkBucketExpressions, hconf);
    VectorExpression.doTransientInit(reduceSinkPartitionExpressions, hconf);
    if (!isEmptyKey) {
        // For this variation, we serialize the key without caring whether it is a
        // single Long, a single String, a multi-column key, etc.
        keyOutput = new Output();
        keyBinarySortableSerializeWrite.set(keyOutput);
        keyVectorSerializeRow = new VectorSerializeRow<BinarySortableSerializeWrite>(keyBinarySortableSerializeWrite);
        keyVectorSerializeRow.init(reduceSinkKeyTypeInfos, reduceSinkKeyColumnMap);
    }
    if (isEmptyBuckets) {
        numBuckets = 0;
    } else {
        numBuckets = conf.getNumBuckets();
        bucketObjectInspectors = getObjectInspectorArray(reduceSinkBucketTypeInfos);
        bucketVectorExtractRow = new VectorExtractRow();
        bucketVectorExtractRow.init(reduceSinkBucketTypeInfos, reduceSinkBucketColumnMap);
        bucketFieldValues = new Object[reduceSinkBucketTypeInfos.length];
    }
    if (isEmptyPartitions) {
        nonPartitionRandom = new Random(12345);
    } else {
        partitionObjectInspectors = getObjectInspectorArray(reduceSinkPartitionTypeInfos);
        partitionVectorExtractRow = new VectorExtractRow();
        partitionVectorExtractRow.init(reduceSinkPartitionTypeInfos, reduceSinkPartitionColumnMap);
        partitionFieldValues = new Object[reduceSinkPartitionTypeInfos.length];
    }
    // Set hashFunc
    hashFunc = getConf().getBucketingVersion() == 2 && !vectorDesc.getIsAcidChange() ? ObjectInspectorUtils::getBucketHashCode : ObjectInspectorUtils::getBucketHashCodeOld;
    // Set function to evaluate _bucket_number if needed.
    if (reduceSinkKeyExpressions != null) {
        for (VectorExpression ve : reduceSinkKeyExpressions) {
            if (ve instanceof BucketNumExpression) {
                bucketExpr = (BucketNumExpression) ve;
                break;
            }
        }
    }
}
Also used : Random(java.util.Random) Output(org.apache.hadoop.hive.serde2.ByteStream.Output) BucketNumExpression(org.apache.hadoop.hive.ql.exec.vector.expressions.BucketNumExpression) VectorExpression(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression) BinarySortableSerializeWrite(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite) VectorExtractRow(org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow)

Example 4 with ObjectInspectorUtils

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils in project hive by apache.

In class TestLazyBinarySerDe, method compareDiffSizedStructs:

/**
 * Compare two structs that have different number of fields. We just compare
 * the first few common fields, ignoring the fields existing in one struct but
 * not the other.
 *
 * @see ObjectInspectorUtils#compare(Object, ObjectInspector, Object,
 *      ObjectInspector)
 */
int compareDiffSizedStructs(Object o1, ObjectInspector oi1, Object o2, ObjectInspector oi2) {
    StructObjectInspector soi1 = (StructObjectInspector) oi1;
    StructObjectInspector soi2 = (StructObjectInspector) oi2;
    List<? extends StructField> fields1 = soi1.getAllStructFieldRefs();
    List<? extends StructField> fields2 = soi2.getAllStructFieldRefs();
    int minimum = Math.min(fields1.size(), fields2.size());
    for (int i = 0; i < minimum; i++) {
        int result = ObjectInspectorUtils.compare(soi1.getStructFieldData(o1, fields1.get(i)), fields1.get(i).getFieldObjectInspector(), soi2.getStructFieldData(o2, fields2.get(i)), fields2.get(i).getFieldObjectInspector());
        if (result != 0) {
            return result;
        }
    }
    return 0;
}
Also used : StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
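A hedged usage sketch for the helper above: build two standard struct object inspectors of different widths and compare only the common prefix of fields with ObjectInspectorUtils.compare, which is exactly what compareDiffSizedStructs does field by field. The class name, field names, and sample values are illustrative, not taken from the Hive test.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class CompareDiffSizedStructsSketch {
    public static void main(String[] args) {
        StructObjectInspector narrow = ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("id", "name"),
                Arrays.<ObjectInspector>asList(
                        PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector));
        StructObjectInspector wide = ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("id", "name", "extra"),
                Arrays.<ObjectInspector>asList(
                        PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                        PrimitiveObjectInspectorFactory.javaBooleanObjectInspector));

        // Standard structs are just lists of field values in declaration order.
        Object s1 = Arrays.asList(1, "a");
        Object s2 = Arrays.asList(1, "a", Boolean.TRUE);

        List<? extends StructField> f1 = narrow.getAllStructFieldRefs();
        List<? extends StructField> f2 = wide.getAllStructFieldRefs();
        int result = 0;
        for (int i = 0; i < Math.min(f1.size(), f2.size()) && result == 0; i++) {
            result = ObjectInspectorUtils.compare(
                    narrow.getStructFieldData(s1, f1.get(i)), f1.get(i).getFieldObjectInspector(),
                    wide.getStructFieldData(s2, f2.get(i)), f2.get(i).getFieldObjectInspector());
        }
        System.out.println("comparison over the common fields = " + result); // 0: equal prefix
    }
}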

Aggregations

IOException (java.io.IOException): 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2
Random (java.util.Random): 1
Path (org.apache.hadoop.fs.Path): 1
VectorExtractRow (org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow): 1
BucketNumExpression (org.apache.hadoop.hive.ql.exec.vector.expressions.BucketNumExpression): 1
VectorExpression (org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression): 1
HiveKey (org.apache.hadoop.hive.ql.io.HiveKey): 1
HiveFatalException (org.apache.hadoop.hive.ql.metadata.HiveFatalException): 1
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc): 1
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 1
GenericUDFBucketNumber (org.apache.hadoop.hive.ql.udf.generic.GenericUDFBucketNumber): 1
AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe): 1
Output (org.apache.hadoop.hive.serde2.ByteStream.Output): 1
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 1
BinarySortableSerializeWrite (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite): 1
SubStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector): 1
StoragePolicyShim (org.apache.hadoop.hive.shims.HadoopShims.StoragePolicyShim): 1