Use of org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector in project hive by apache.
The class FileSinkOperator, method dpSetup.
/**
 * Set up for dynamic partitioning including a new ObjectInspector for the output row.
 */
private void dpSetup() {
  this.bDynParts = false;
  this.numDynParts = dpCtx.getNumDPCols();
  this.dpColNames = dpCtx.getDPColNames();
  this.maxPartitions = dpCtx.getMaxPartitionsPerNode();

  assert numDynParts == dpColNames.size()
      : "number of dynamic partitions should be the same as the size of DP mapping";

  if (dpColNames != null && dpColNames.size() > 0) {
    this.bDynParts = true;
    assert inputObjInspectors.length == 1
        : "FileSinkOperator should have 1 parent, but it has " + inputObjInspectors.length;
    StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];
    this.dpStartCol = Utilities.getDPColOffset(conf);
    this.subSetOI = new SubStructObjectInspector(soi, 0, this.dpStartCol);
    this.dpVals = new ArrayList<String>(numDynParts);
    this.dpWritables = new ArrayList<Object>(numDynParts);
  }
}
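The constructor call above keeps only the columns before dpStartCol, so the dynamic-partition columns at the end of the row are hidden from whatever is handed subSetOI. The following is a minimal, self-contained sketch of that behavior; the standalone class and the (id, name, ds) column layout are invented for illustration and are not part of FileSinkOperator.

// Hypothetical sketch: the column names and this class are illustrative assumptions.
import java.util.Arrays;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SubStructObjectInspectorSketch {
  public static void main(String[] args) {
    // Full row layout: two data columns followed by one dynamic-partition column.
    StructObjectInspector fullRowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("id", "name", "ds"),
        Arrays.<ObjectInspector>asList(
            PrimitiveObjectInspectorFactory.javaIntObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector));

    // Same construction as in dpSetup(): keep the columns in [0, dpStartCol).
    int dpStartCol = 2;
    StructObjectInspector subSetOI = new SubStructObjectInspector(fullRowOI, 0, dpStartCol);

    // Only "id" and "name" are visible; the partition column "ds" is hidden
    // from anything (such as a serializer) that inspects rows through subSetOI.
    for (StructField f : subSetOI.getAllStructFieldRefs()) {
      System.out.println(f.getFieldName());
    }
  }
}

This is what lets process() below hand subSetOI to the serializer: the data columns get written to the file while the partition columns only determine the output directory.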
Use of org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector in project hive by apache.
The class FileSinkOperator, method process.
@Override
public void process(Object row, int tag) throws HiveException {
  runTimeNumRows++;
  /* Create list bucketing sub-directory only if stored-as-directories is on. */
  String lbDirName = null;
  lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
  if (!bDynParts && !filesCreated) {
    if (lbDirName != null) {
      FSPaths fsp2 = lookupListBucketingPaths(lbDirName);
    } else {
      createBucketFiles(fsp);
    }
  }
  try {
    updateProgress();
    // if DP is enabled, get the final output writers and prepare the real output row
    assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT
        : "input object inspector is not struct";
    if (bDynParts) {
      // we need to read the bucket number, which is the last column in the value
      // (after the partition columns)
      if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
        numDynParts += 1;
      }
      // copy the DP column values from the input row to dpVals
      dpVals.clear();
      dpWritables.clear();
      ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
          (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
      // pass the null value along to the escaping process to determine what the dir should be
      for (Object o : dpWritables) {
        if (o == null || o.toString().length() == 0) {
          dpVals.add(dpCtx.getDefaultPartitionName());
        } else {
          dpVals.add(o.toString());
        }
      }
      String invalidPartitionVal;
      if ((invalidPartitionVal =
          HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals, dpCtx.getWhiteListPattern())) != null) {
        throw new HiveFatalException("Partition value '" + invalidPartitionVal
            + "' contains a character not matched by whitelist pattern '"
            + dpCtx.getWhiteListPattern().toString() + "'. " + "(configure with "
            + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
      }
      fpaths = getDynOutPaths(dpVals, lbDirName);
      // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
      recordValue = serializer.serialize(row, subSetOI);
    } else {
      if (lbDirName != null) {
        fpaths = lookupListBucketingPaths(lbDirName);
      } else {
        fpaths = fsp;
      }
      recordValue = serializer.serialize(row, inputObjInspectors[0]);
      // some SerDes buffer rows internally and return null from serialize() until the buffer
      // is full (the size of the buffer is kept track of in the SerDe); skip the row in that case
      if (recordValue == null) {
        return;
      }
    }
    rowOutWriters = fpaths.outWriters;
    // check if all record writers implement statistics. if at least one RW
    // doesn't implement the stats interface we will fall back to the conventional way
    // of gathering stats
    isCollectRWStats = areAllTrue(statsFromRecordWriter);
    if (conf.isGatherStats() && !isCollectRWStats) {
      SerDeStats stats = serializer.getSerDeStats();
      if (stats != null) {
        fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
      }
      fpaths.stat.addToStat(StatsSetupConst.ROW_COUNT, 1);
    }
    if ((++numRows == cntr) && isLogInfoEnabled) {
      cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
      if (cntr < 0 || numRows < 0) {
        cntr = 0;
        numRows = 1;
      }
      LOG.info(toString() + ": records written - " + numRows);
    }
    // This should always be 0 for the final result file
    int writerOffset = findWriterOffset(row);
    // the RecordUpdater used for ACID writes expects the actual row rather than a
    // serialized value, so the branches below pass the row rather than recordValue.
    if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID) {
      rowOutWriters[writerOffset].write(recordValue);
    } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
      fpaths.updaters[writerOffset].insert(conf.getTransactionId(), row);
    } else {
      // TODO I suspect we could skip much of the stuff above this in the function in the case
      // of update and delete. But I don't understand all of the side effects of the above
      // code and don't want to skip over it yet.
      // Find the bucket id, and switch buckets if need to
      ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
      Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
      int bucketNum = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
      if (fpaths.acidLastBucket != bucketNum) {
        fpaths.acidLastBucket = bucketNum;
        // Switch files
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : ++fpaths.acidFileOffset] = HiveFileFormatUtils.getAcidRecordUpdater(
                jc, conf.getTableInfo(), bucketNum, conf,
                fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
                    ? 0 : fpaths.acidFileOffset],
                rowInspector, reporter, 0);
        if (isDebugEnabled) {
          LOG.debug("Created updater for bucket number " + bucketNum + " using file "
              + fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
                  ? 0 : fpaths.acidFileOffset]);
        }
      }
      if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : fpaths.acidFileOffset].update(conf.getTransactionId(), row);
      } else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : fpaths.acidFileOffset].delete(conf.getTransactionId(), row);
      } else {
        throw new HiveException("Unknown write type " + conf.getWriteType().toString());
      }
    }
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
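In the bDynParts branch above, the row is effectively split in two: partialCopyToStandardObject pulls out the trailing dynamic-partition columns (whose string values feed getDynOutPaths and become directory names), while serialize(row, subSetOI) writes only the leading data columns. Below is a minimal sketch of that split, reusing the hypothetical (id, name, ds) layout from the previous example; the row values and the dpStartCol/numDynParts constants are invented for illustration.

// Hypothetical sketch of the two-step split performed in the dynamic-partition branch.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class DynamicPartitionSplitSketch {
  public static void main(String[] args) {
    StructObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("id", "name", "ds"),
        Arrays.<ObjectInspector>asList(
            PrimitiveObjectInspectorFactory.javaIntObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector));
    Object row = Arrays.<Object>asList(42, "alice", "2024-01-01");

    int dpStartCol = 2;   // index of the first dynamic-partition column (assumed)
    int numDynParts = 1;  // number of dynamic-partition columns (assumed)

    // Step 1: copy just the DP column values out of the row, as process() does;
    // their string forms end up in the partition directory name.
    List<Object> dpWritables = new ArrayList<Object>(numDynParts);
    ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
        rowOI, ObjectInspectorCopyOption.WRITABLE);
    System.out.println("partition values: " + dpWritables);   // [2024-01-01]

    // Step 2: the serializer only ever sees the non-partition prefix of the row,
    // because the SubStructObjectInspector hides the trailing DP columns.
    StructObjectInspector subSetOI = new SubStructObjectInspector(rowOI, 0, dpStartCol);
    Object dataColumnsOnly = ObjectInspectorUtils.copyToStandardObject(row, subSetOI);
    System.out.println("columns handed to the serializer: " + dataColumnsOnly);   // [42, alice]
  }
}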