Use of org.apache.hadoop.hive.ql.metadata.HiveFatalException in project hive by apache.
The class FileSinkOperator, method process.
@Override
public void process(Object row, int tag) throws HiveException {
  runTimeNumRows++;
  /* Create list bucketing sub-directory only if stored-as-directories is on. */
  String lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
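  // For non-dynamic-partition queries the output writers are created lazily on the first row;
  // with dynamic partitions the writers are created per partition directory in getDynOutPaths().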
  if (!bDynParts && !filesCreated) {
    if (lbDirName != null) {
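      // The returned FSPaths is not used here; the lookup is done for its side effect of
      // creating the writers for this list-bucketing directory if they do not exist yet.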
      FSPaths fsp2 = lookupListBucketingPaths(lbDirName);
    } else {
      createBucketFiles(fsp);
    }
  }
  try {
    updateProgress();
    // if DP is enabled, get the final output writers and prepare the real output row
    assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT
        : "input object inspector is not struct";
    if (bDynParts) {
      // we need to read bucket number which is the last column in value (after partition columns)
      if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
        numDynParts += 1;
      }
      // copy the DP column values from the input row to dpVals
      dpVals.clear();
      dpWritables.clear();
      ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
          (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
      // pass the null value along to the escaping process to determine what the dir should be
      for (Object o : dpWritables) {
        if (o == null || o.toString().length() == 0) {
          dpVals.add(dpCtx.getDefaultPartitionName());
        } else {
          dpVals.add(o.toString());
        }
      }
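      // A partition value that violates the whitelist is reported as HiveFatalException:
      // the error is not transient, so retrying the task cannot succeed.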
      String invalidPartitionVal;
      if ((invalidPartitionVal = HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals,
          dpCtx.getWhiteListPattern())) != null) {
        throw new HiveFatalException("Partition value '" + invalidPartitionVal
            + "' contains a character not matched by whitelist pattern '"
            + dpCtx.getWhiteListPattern().toString() + "'. (configure with "
            + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
      }
      fpaths = getDynOutPaths(dpVals, lbDirName);
      // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
      recordValue = serializer.serialize(row, subSetOI);
    } else {
      if (lbDirName != null) {
        fpaths = lookupListBucketingPaths(lbDirName);
      } else {
        fpaths = fsp;
      }
      recordValue = serializer.serialize(row, inputObjInspectors[0]);
      // some SerDes buffer rows internally and return null until the buffer is full
      // (the buffer size is kept track of in the SerDe); in that case there is nothing to write yet
      if (recordValue == null) {
        return;
      }
    }
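    // At this point fpaths points at the output for this row: a dynamic partition directory,
    // a list-bucketing sub-directory, or the default output paths in fsp.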
    rowOutWriters = fpaths.outWriters;
    // check if all record writers implement statistics. if at least one RW
    // doesn't implement the stats interface we fall back to the conventional
    // way of gathering stats
    isCollectRWStats = areAllTrue(statsFromRecordWriter);
    if (conf.isGatherStats() && !isCollectRWStats) {
      SerDeStats stats = serializer.getSerDeStats();
      if (stats != null) {
        fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
      }
      fpaths.stat.addToStat(StatsSetupConst.ROW_COUNT, 1);
    }
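    // Throttled progress logging: the interval grows tenfold each time it is hit unless a
    // fixed interval (logEveryNRows) has been configured; reset on numeric overflow.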
    if ((++numRows == cntr) && isLogInfoEnabled) {
      cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
      if (cntr < 0 || numRows < 0) {
        cntr = 0;
        numRows = 1;
      }
      LOG.info(toString() + ": records written - " + numRows);
    }
    // This should always be 0 for the final result file
    int writerOffset = findWriterOffset(row);
    // The RecordUpdater expects the actual row rather than a serialized version of it,
    // so the ACID branches below pass the row rather than recordValue.
    if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID) {
      rowOutWriters[writerOffset].write(recordValue);
    } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
      fpaths.updaters[writerOffset].insert(conf.getTransactionId(), row);
    } else {
      // TODO I suspect we could skip much of the stuff above this in the function in the case
      // of update and delete. But I don't understand all of the side effects of the above
      // code and don't want to skip over it yet.
      // Find the bucket id, and switch buckets if need to
      ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
      Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
      int bucketNum = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
      if (fpaths.acidLastBucket != bucketNum) {
        fpaths.acidLastBucket = bucketNum;
        // Switch files
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : ++fpaths.acidFileOffset] = HiveFileFormatUtils.getAcidRecordUpdater(
                jc, conf.getTableInfo(), bucketNum, conf,
                fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
                    ? 0 : fpaths.acidFileOffset],
                rowInspector, reporter, 0);
        if (isDebugEnabled) {
          LOG.debug("Created updater for bucket number " + bucketNum + " using file "
              + fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
                  ? 0 : fpaths.acidFileOffset]);
        }
      }
      if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : fpaths.acidFileOffset].update(conf.getTransactionId(), row);
      } else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)
            ? 0 : fpaths.acidFileOffset].delete(conf.getTransactionId(), row);
      } else {
        throw new HiveException("Unknown write type " + conf.getWriteType().toString());
      }
    }
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
Use of org.apache.hadoop.hive.ql.metadata.HiveFatalException in project hive by apache.
The class FileSinkOperator, method getDynOutPaths.
protected FSPaths getDynOutPaths(List<String> row, String lbDirName) throws HiveException {
  FSPaths fp;
  // get the path corresponding to the dynamic partition columns,
  String dpDir = getDynPartDirectory(row, dpColNames);
  String pathKey = null;
  if (dpDir != null) {
    dpDir = appendToSource(lbDirName, dpDir);
    pathKey = dpDir;
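    // In PARTITION_BUCKET_SORTED mode the bucket number arrives as the last dynamic partition
    // value, so it is folded into the path key to give each (partition, bucket) its own FSPaths.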
    if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
      String buckNum = row.get(row.size() - 1);
      taskId = Utilities.replaceTaskIdFromFilename(taskId, buckNum);
      pathKey = appendToSource(taskId, dpDir);
    }
    FSPaths fsp2 = valToPaths.get(pathKey);
    if (fsp2 == null) {
      // check # of dp
      if (valToPaths.size() > maxPartitions) {
        // we cannot proceed and need to tell the hive client that retries won't succeed either
        throw new HiveFatalException(
            ErrorMsg.DYNAMIC_PARTITIONS_TOO_MANY_PER_NODE_ERROR.getErrorCodedMsg()
            + "Maximum was set to " + maxPartitions + " partitions per node"
            + ", number of dynamic partitions on this node: " + valToPaths.size());
      }
      if (!conf.getDpSortState().equals(DPSortState.NONE) && prevFsp != null) {
        // close the previous fsp as it is no longer needed
        prevFsp.closeWriters(false);
        // since the previous fsp's writers are being closed, collect any stats the record
        // writer can provide and store them in the previous fsp that is cached
        if (conf.isGatherStats() && isCollectRWStats) {
          SerDeStats stats = null;
          if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID) {
            RecordWriter outWriter = prevFsp.outWriters[0];
            if (outWriter != null) {
              stats = ((StatsProvidingRecordWriter) outWriter).getStats();
            }
          } else if (prevFsp.updaters[0] != null) {
            stats = prevFsp.updaters[0].getStats();
          }
          if (stats != null) {
            prevFsp.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
            prevFsp.stat.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
          }
        }
        // let writers release the memory for garbage collection
        prevFsp.outWriters[0] = null;
        prevFsp = null;
      }
      fsp2 = createNewPaths(dpDir);
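      // Cache the newly created paths as prevFsp so that, for sorted dynamic partition inserts,
      // its writers can be closed and its stats harvested when the next partition starts.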
      if (prevFsp == null) {
        prevFsp = fsp2;
      }
      if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
        createBucketForFileIdx(fsp2, 0);
        valToPaths.put(pathKey, fsp2);
      }
    }
    fp = fsp2;
  } else {
    fp = fsp;
  }
  return fp;
}
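Both methods above throw HiveFatalException (a subclass of HiveException) for errors that retries cannot fix, such as a partition value rejected by the whitelist pattern or too many dynamic partitions on one node. As a minimal sketch of how a caller might act on that distinction, assuming a hypothetical wrapper class and row callback that are not part of FileSinkOperator:

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveFatalException;

/** Hypothetical wrapper illustrating the catch order; not part of Hive. */
class RowForwarder {
  interface RowSink {
    void processRow(Object row, int tag) throws HiveException;
  }

  private final RowSink sink;

  RowForwarder(RowSink sink) {
    this.sink = sink;
  }

  void forward(Object row, int tag) throws HiveException {
    try {
      sink.processRow(row, tag);
    } catch (HiveFatalException fatal) {
      // HiveFatalException extends HiveException, so it must be caught first.
      // It marks errors that will fail again on retry (invalid partition value,
      // too many dynamic partitions per node), so rethrow without any retry logic.
      throw fatal;
    } catch (HiveException e) {
      // Other HiveExceptions (e.g. wrapped IOException/SerDeException) may be
      // transient; a caller could log them and decide whether to retry the task.
      throw e;
    }
  }
}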