
Example 1 with SubStructObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector in project hive by apache.

From the class FileSinkOperator, the method dpSetup():

/**
   * Set up for dynamic partitioning including a new ObjectInspector for the output row.
   */
private void dpSetup() {
    this.bDynParts = false;
    this.numDynParts = dpCtx.getNumDPCols();
    this.dpColNames = dpCtx.getDPColNames();
    this.maxPartitions = dpCtx.getMaxPartitionsPerNode();
    assert numDynParts == dpColNames.size() : "number of dynamic partitions should be the same as the size of DP mapping";
    if (dpColNames != null && dpColNames.size() > 0) {
        this.bDynParts = true;
        assert inputObjInspectors.length == 1 : "FileSinkOperator should have 1 parent, but it has " + inputObjInspectors.length;
        StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];
        this.dpStartCol = Utilities.getDPColOffset(conf);
        this.subSetOI = new SubStructObjectInspector(soi, 0, this.dpStartCol);
        this.dpVals = new ArrayList<String>(numDynParts);
        this.dpWritables = new ArrayList<Object>(numDynParts);
    }
}
Also used : SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
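For context, a minimal sketch (not from the Hive sources) of how SubStructObjectInspector exposes only a leading slice of a struct's fields, the way dpSetup() wraps the input inspector to cover just the non-partition columns. The column names (id, name, ds, hr) and the wrapper class are hypothetical; only the Hive serde2 classes imported below are assumed.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SubStructExample {

    public static void main(String[] args) {
        // Full row layout: two data columns followed by two dynamic-partition columns.
        List<String> names = Arrays.asList("id", "name", "ds", "hr");
        List<ObjectInspector> inspectors = Arrays.<ObjectInspector>asList(
                PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        StructObjectInspector fullOI =
                ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);

        // Expose only the first two (non-partition) fields, mirroring
        // new SubStructObjectInspector(soi, 0, dpStartCol) in dpSetup().
        SubStructObjectInspector subOI = new SubStructObjectInspector(fullOI, 0, 2);

        // prints 2: only "id" and "name" are visible through the sub-inspector
        System.out.println(subOI.getAllStructFieldRefs().size());
    }
}

Serializing a row against this sub-inspector writes only the leading columns, which is how process() uses subSetOI in Example 2 to keep the dynamic-partition values out of the written record.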

Example 2 with SubStructObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector in project hive by apache.

From the class FileSinkOperator, the method process():

@Override
public void process(Object row, int tag) throws HiveException {
    runTimeNumRows++;
    /* Create list bucketing sub-directory only if stored-as-directories is on. */
    String lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
    if (!bDynParts && !filesCreated) {
        if (lbDirName != null) {
            FSPaths fsp2 = lookupListBucketingPaths(lbDirName);
        } else {
            createBucketFiles(fsp);
        }
    }
    try {
        updateProgress();
        // if DP is enabled, get the final output writers and prepare the real output row
        assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT : "input object inspector is not struct";
        if (bDynParts) {
            // we need to read bucket number which is the last column in value (after partition columns)
            if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
                numDynParts += 1;
            }
            // copy the DP column values from the input row to dpVals
            dpVals.clear();
            dpWritables.clear();
            ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts, (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
            // pass the null value along to the escaping process to determine what the dir should be
            for (Object o : dpWritables) {
                if (o == null || o.toString().length() == 0) {
                    dpVals.add(dpCtx.getDefaultPartitionName());
                } else {
                    dpVals.add(o.toString());
                }
            }
            String invalidPartitionVal;
            if ((invalidPartitionVal = HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals, dpCtx.getWhiteListPattern())) != null) {
                throw new HiveFatalException("Partition value '" + invalidPartitionVal + "' contains a character not matched by whitelist pattern '" + dpCtx.getWhiteListPattern().toString() + "'.  " + "(configure with " + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
            }
            fpaths = getDynOutPaths(dpVals, lbDirName);
            // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
            recordValue = serializer.serialize(row, subSetOI);
        } else {
            if (lbDirName != null) {
                fpaths = lookupListBucketingPaths(lbDirName);
            } else {
                fpaths = fsp;
            }
            recordValue = serializer.serialize(row, inputObjInspectors[0]);
            // some serializers buffer rows and serialize() returns null until the buffer is full
            // (the buffered size is kept track of in the SerDe), so skip the write in that case
            if (recordValue == null) {
                return;
            }
        }
        rowOutWriters = fpaths.outWriters;
        // check if all record writers implement statistics. if at least one RW
        // doesn't implement the stats interface we will fall back to the conventional way
        // of gathering stats
        isCollectRWStats = areAllTrue(statsFromRecordWriter);
        if (conf.isGatherStats() && !isCollectRWStats) {
            SerDeStats stats = serializer.getSerDeStats();
            if (stats != null) {
                fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
            }
            fpaths.stat.addToStat(StatsSetupConst.ROW_COUNT, 1);
        }
        if ((++numRows == cntr) && isLogInfoEnabled) {
            cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
            if (cntr < 0 || numRows < 0) {
                cntr = 0;
                numRows = 1;
            }
            LOG.info(toString() + ": records written - " + numRows);
        }
        // This should always be 0 for the final result file
        int writerOffset = findWriterOffset(row);
        // the ACID RecordUpdater expects the actual row, not a serialized version of it, so for
        // ACID writes we pass the row rather than recordValue.
        if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID) {
            rowOutWriters[writerOffset].write(recordValue);
        } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
            fpaths.updaters[writerOffset].insert(conf.getTransactionId(), row);
        } else {
            // TODO I suspect we could skip much of the stuff above this in the function in the case
            // of update and delete.  But I don't understand all of the side effects of the above
            // code and don't want to skip over it yet.
            // Find the bucket id, and switch buckets if need to
            ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
            Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
            int bucketNum = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
            if (fpaths.acidLastBucket != bucketNum) {
                fpaths.acidLastBucket = bucketNum;
                // Switch files
                fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : ++fpaths.acidFileOffset] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(), bucketNum, conf, fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset], rowInspector, reporter, 0);
                if (isDebugEnabled) {
                    LOG.debug("Created updater for bucket number " + bucketNum + " using file " + fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset]);
                }
            }
            if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
                fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset].update(conf.getTransactionId(), row);
            } else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
                fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset].delete(conf.getTransactionId(), row);
            } else {
                throw new HiveException("Unknown write type " + conf.getWriteType().toString());
            }
        }
    } catch (IOException e) {
        throw new HiveException(e);
    } catch (SerDeException e) {
        throw new HiveException(e);
    }
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HiveFatalException(org.apache.hadoop.hive.ql.metadata.HiveFatalException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
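To isolate the DP-value extraction step above, here is a minimal sketch (not from the Hive sources) of ObjectInspectorUtils.partialCopyToStandardObject copying just the partition columns out of a List-backed row, mirroring the call in process(). The row layout, literal values, and the wrapper class are hypothetical.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class PartialCopyExample {

    public static void main(String[] args) {
        // Row layout: (id, name, ds, hr) with the partition columns starting at offset 2.
        List<String> names = Arrays.asList("id", "name", "ds", "hr");
        List<ObjectInspector> inspectors = Arrays.<ObjectInspector>asList(
                PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        StructObjectInspector rowOI =
                ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);

        // A standard (List-backed) row matching rowOI.
        List<Object> row = Arrays.<Object>asList(1, "alice", "2024-01-01", "07");

        // Copy only the two partition columns, starting at column offset 2, the way
        // process() fills dpWritables before computing the dynamic output path.
        List<Object> dpValues = new ArrayList<Object>(2);
        ObjectInspectorUtils.partialCopyToStandardObject(
                dpValues, row, 2, 2, rowOI, ObjectInspectorCopyOption.JAVA);

        // prints [2024-01-01, 07]
        System.out.println(dpValues);
    }
}

process() passes ObjectInspectorCopyOption.WRITABLE so the copied values come back as Hadoop Writables; the JAVA option here just keeps the sketch to plain Java objects.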

Aggregations

StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 2
SubStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) 2
IOException (java.io.IOException) 1
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 1
HiveFatalException (org.apache.hadoop.hive.ql.metadata.HiveFatalException) 1
SerDeException (org.apache.hadoop.hive.serde2.SerDeException) 1
SerDeStats (org.apache.hadoop.hive.serde2.SerDeStats) 1
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) 1
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) 1