Use of org.apache.hadoop.hive.ql.metadata.HiveException in the Apache Hive project.
Class FileSinkOperator, method createBucketFiles:
protected void createBucketFiles(FSPaths fsp) throws HiveException {
  try {
    int filesIdx = 0;
    Set<Integer> seenBuckets = new HashSet<Integer>();
    for (int idx = 0; idx < totalFiles; idx++) {
      if (this.getExecContext() != null && this.getExecContext().getFileId() != null) {
        if (isInfoEnabled) {
          LOG.info("replace taskId from execContext ");
        }
        taskId = Utilities.replaceTaskIdFromFilename(taskId, this.getExecContext().getFileId());
        if (isInfoEnabled) {
          LOG.info("new taskId: FS " + taskId);
        }
        assert !multiFileSpray;
        assert totalFiles == 1;
      }
      int bucketNum = 0;
      if (multiFileSpray) {
        key.setHashCode(idx);
        // Does this hashcode belong to this reducer
        int numReducers = totalFiles / numFiles;
        if (numReducers > 1) {
          int currReducer = Integer.parseInt(Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)));
          int reducerIdx = prtner.getPartition(key, null, numReducers);
          if (currReducer != reducerIdx) {
            continue;
          }
        }
        bucketNum = prtner.getBucket(key, null, totalFiles);
        if (seenBuckets.contains(bucketNum)) {
          continue;
        }
        seenBuckets.add(bucketNum);
        bucketMap.put(bucketNum, filesIdx);
        taskId = Utilities.replaceTaskIdFromFilename(Utilities.getTaskId(hconf), bucketNum);
      }
      createBucketForFileIdx(fsp, filesIdx);
      filesIdx++;
    }
    assert filesIdx == numFiles;
    // in recent hadoop versions, use deleteOnExit to clean tmp files.
    if (isNativeTable && fs != null && fsp != null) {
      autoDelete = fs.deleteOnExit(fsp.outPaths[0]);
    }
  } catch (Exception e) {
    e.printStackTrace();
    throw new HiveException(e);
  }
  filesCreated = true;
}
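The method above converts any failure into a HiveException and, for native tables, asks the FileSystem to delete the task's temp output on JVM exit. Below is a minimal, self-contained sketch of that wrap-and-rethrow pattern; the class, helper name, path, and configuration are illustrative stand-ins, not part of FileSinkOperator.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class BucketFileCleanupSketch {
  // hypothetical helper, not Hive code: mark a temp output file for deletion on JVM exit
  static boolean markForCleanup(Configuration conf, Path tmpOutPath) throws HiveException {
    try {
      FileSystem fs = tmpOutPath.getFileSystem(conf);
      // mirrors the deleteOnExit call in createBucketFiles
      return fs.deleteOnExit(tmpOutPath);
    } catch (IOException e) {
      // same idiom as above: wrap the checked exception in a HiveException
      throw new HiveException(e);
    }
  }
}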
Use of org.apache.hadoop.hive.ql.metadata.HiveException in the Apache Hive project.
Class FileSinkOperator, method process:
@Override
public void process(Object row, int tag) throws HiveException {
  runTimeNumRows++;
  /* Create list bucketing sub-directory only if stored-as-directories is on. */
  String lbDirName = null;
  lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
  if (!bDynParts && !filesCreated) {
    if (lbDirName != null) {
      FSPaths fsp2 = lookupListBucketingPaths(lbDirName);
    } else {
      createBucketFiles(fsp);
    }
  }
  try {
    updateProgress();
    // if DP is enabled, get the final output writers and prepare the real output row
    assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT : "input object inspector is not struct";
    if (bDynParts) {
      // we need to read bucket number which is the last column in value (after partition columns)
      if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
        numDynParts += 1;
      }
      // copy the DP column values from the input row to dpVals
      dpVals.clear();
      dpWritables.clear();
      ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts, (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
      // pass the null value along to the escaping process to determine what the dir should be
      for (Object o : dpWritables) {
        if (o == null || o.toString().length() == 0) {
          dpVals.add(dpCtx.getDefaultPartitionName());
        } else {
          dpVals.add(o.toString());
        }
      }
      String invalidPartitionVal;
      if ((invalidPartitionVal = HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals, dpCtx.getWhiteListPattern())) != null) {
        throw new HiveFatalException("Partition value '" + invalidPartitionVal + "' contains a character not matched by whitelist pattern '" + dpCtx.getWhiteListPattern().toString() + "'. " + "(configure with " + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
      }
      fpaths = getDynOutPaths(dpVals, lbDirName);
      // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
      recordValue = serializer.serialize(row, subSetOI);
    } else {
      if (lbDirName != null) {
        fpaths = lookupListBucketingPaths(lbDirName);
      } else {
        fpaths = fsp;
      }
      recordValue = serializer.serialize(row, inputObjInspectors[0]);
      // a buffering serializer (e.g. ThriftJDBCBinarySerDe) may return null until its buffer is full
      // (the size of the buffer is kept track of in the SerDe)
      if (recordValue == null) {
        return;
      }
    }
    rowOutWriters = fpaths.outWriters;
    // check if all record writers implement statistics. if at least one RW
    // doesn't implement the stats interface we will fall back to the conventional way
    // of gathering stats
    isCollectRWStats = areAllTrue(statsFromRecordWriter);
    if (conf.isGatherStats() && !isCollectRWStats) {
      SerDeStats stats = serializer.getSerDeStats();
      if (stats != null) {
        fpaths.stat.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
      }
      fpaths.stat.addToStat(StatsSetupConst.ROW_COUNT, 1);
    }
    if ((++numRows == cntr) && isLogInfoEnabled) {
      cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
      if (cntr < 0 || numRows < 0) {
        cntr = 0;
        numRows = 1;
      }
      LOG.info(toString() + ": records written - " + numRows);
    }
    // This should always be 0 for the final result file
    int writerOffset = findWriterOffset(row);
    // the RecordUpdater expects the actual row, not a serialized version of it,
    // so pass the row rather than recordValue.
    if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID) {
      rowOutWriters[writerOffset].write(recordValue);
    } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
      fpaths.updaters[writerOffset].insert(conf.getTransactionId(), row);
    } else {
      // TODO I suspect we could skip much of the stuff above this in the function in the case
      // of update and delete. But I don't understand all of the side effects of the above
      // code and don't want to skip over it yet.
      // Find the bucket id, and switch buckets if we need to
      ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
      Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
      int bucketNum = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
      if (fpaths.acidLastBucket != bucketNum) {
        fpaths.acidLastBucket = bucketNum;
        // Switch files
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : ++fpaths.acidFileOffset] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(), bucketNum, conf, fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset], rowInspector, reporter, 0);
        if (isDebugEnabled) {
          LOG.debug("Created updater for bucket number " + bucketNum + " using file " + fpaths.outPaths[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset]);
        }
      }
      if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset].update(conf.getTransactionId(), row);
      } else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
        fpaths.updaters[conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED) ? 0 : fpaths.acidFileOffset].delete(conf.getTransactionId(), row);
      } else {
        throw new HiveException("Unknown write type " + conf.getWriteType().toString());
      }
    }
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
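Two details of process() are worth isolating. Rows are counted and logged against a growing threshold so logging stays cheap on large inputs, and all IO and SerDe failures surface as HiveException. The snippet below is a minimal sketch of just the log-throttling arithmetic, assuming the same field names; it is not the operator itself.

public class RecordLogThrottleSketch {
  long numRows = 0;
  long cntr = 1;
  long logEveryNRows = 0; // 0 means the threshold grows 10x each time, as in FileSinkOperator

  // returns true when a "records written" log line should be emitted for this row
  boolean countAndCheck() {
    if (++numRows != cntr) {
      return false;
    }
    cntr = (logEveryNRows == 0) ? cntr * 10 : numRows + logEveryNRows;
    if (cntr < 0 || numRows < 0) {
      // guard against long overflow, exactly as the operator does
      cntr = 0;
      numRows = 1;
    }
    return true;
  }
}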
Use of org.apache.hadoop.hive.ql.metadata.HiveException in the Apache Hive project.
Class FileSinkOperator, method jobCloseOp:
@Override
public void jobCloseOp(Configuration hconf, boolean success) throws HiveException {
  try {
    if ((conf != null) && isNativeTable) {
      Path specPath = conf.getDirName();
      DynamicPartitionCtx dpCtx = conf.getDynPartCtx();
      if (conf.isLinkedFileSink() && (dpCtx != null)) {
        specPath = conf.getParentDir();
      }
      Utilities.mvFileToFinalPath(specPath, hconf, success, LOG, dpCtx, conf, reporter);
    }
  } catch (IOException e) {
    throw new HiveException(e);
  }
  super.jobCloseOp(hconf, success);
}
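jobCloseOp delegates the actual move to Utilities.mvFileToFinalPath and only translates IOException into HiveException. A rough sketch of that promote-or-clean-up idea, under the simplifying assumption of a single rename-based move (the real Utilities method also handles duplicate task outputs and dynamic partitions), could look like this; the class and method names are illustrative only.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class JobCloseSketch {
  // illustrative helper, not Utilities.mvFileToFinalPath
  static void promoteToFinal(Configuration conf, Path stagingDir, Path finalDir, boolean success)
      throws HiveException {
    try {
      FileSystem fs = stagingDir.getFileSystem(conf);
      if (success) {
        fs.rename(stagingDir, finalDir); // move the job output into place
      } else {
        fs.delete(stagingDir, true); // drop the staging data if the job failed
      }
    } catch (IOException e) {
      throw new HiveException(e); // same wrapping as jobCloseOp above
    }
  }
}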
Use of org.apache.hadoop.hive.ql.metadata.HiveException in the Apache Hive project.
Class FileSinkOperator, method publishStats:
private void publishStats() throws HiveException {
  boolean isStatsReliable = conf.isStatsReliable();
  // Initializing a stats publisher
  StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);
  if (statsPublisher == null) {
    // just return, stats gathering should not block the main query
    LOG.error("StatsPublishing error: StatsPublisher is not initialized.");
    if (isStatsReliable) {
      throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
    }
    return;
  }
  StatsCollectionContext sContext = new StatsCollectionContext(hconf);
  sContext.setStatsTmpDir(conf.getStatsTmpDir());
  if (!statsPublisher.connect(sContext)) {
    // just return, stats gathering should not block the main query
    LOG.error("StatsPublishing error: cannot connect to database");
    if (isStatsReliable) {
      throw new HiveException(ErrorMsg.STATSPUBLISHER_CONNECTION_ERROR.getErrorCodedMsg());
    }
    return;
  }
  String spSpec = conf.getStaticSpec();
  for (Map.Entry<String, FSPaths> entry : valToPaths.entrySet()) {
    // DP/LB
    String fspKey = entry.getKey();
    FSPaths fspValue = entry.getValue();
    // in the PARTITION_BUCKET_SORTED case the taskId has been added to the fspKey.
    if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
      String taskID = Utilities.getTaskIdFromFilename(fspKey);
      // If the length of (prefix/ds=__HIVE_DEFAULT_PARTITION__/000000_0) is greater than the max key prefix
      // while (prefix/ds=10/000000_0) is less than the max key prefix, the former gets hashed
      // to a smaller prefix (MD5hash/000000_0) and the latter is stored as-is in the staging stats table.
      // When stats get aggregated in StatsTask, only the keys that start with "prefix" are fetched.
      // Since (prefix/ds=__HIVE_DEFAULT_PARTITION__) is hashed to a smaller prefix, it would
      // not be retrieved from the staging table and hence not aggregated. To avoid this issue
      // we remove the taskId from the key, which is redundant anyway.
      fspKey = fspKey.split(taskID)[0];
    }
    // split[0] = DP, split[1] = LB
    String[] split = splitKey(fspKey);
    String dpSpec = split[0];
    // key = database.table/SP/DP/LB/
    // Hive stores lowercase table names in the metastore, and Counters are case sensitive, so we
    // use the lowercase table name as the prefix here, since StatsTask gets the table name from the metastore to fetch the counter.
    String prefix = conf.getTableInfo().getTableName().toLowerCase();
    prefix = Utilities.join(prefix, spSpec, dpSpec);
    prefix = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR;
    Map<String, String> statsToPublish = new HashMap<String, String>();
    for (String statType : fspValue.stat.getStoredStats()) {
      statsToPublish.put(statType, Long.toString(fspValue.stat.getStat(statType)));
    }
    if (!statsPublisher.publishStat(prefix, statsToPublish)) {
      // Not changing the interface to maintain backward compatibility
      if (isStatsReliable) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_PUBLISHING_ERROR.getErrorCodedMsg());
      }
    }
  }
  sContext.setIndexForTezUnion(this.getIndexForTezUnion());
  if (!statsPublisher.closeConnection(sContext)) {
    // Not changing the interface to maintain backward compatibility
    if (isStatsReliable) {
      throw new HiveException(ErrorMsg.STATSPUBLISHER_CLOSING_ERROR.getErrorCodedMsg());
    }
  }
}
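The error-handling policy in publishStats() is consistent throughout: a stats failure is logged and ignored unless stats are marked reliable, in which case it becomes a HiveException. The sketch below shows only that policy; it uses a stand-in publisher interface and message text rather than Hive's StatsPublisher and ErrorMsg constants.

import java.util.Map;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class StatsPolicySketch {
  // stand-in for StatsPublisher.publishStat; hypothetical, for illustration only
  interface Publisher {
    boolean publish(String prefix, Map<String, String> stats);
  }

  static void publishOrFail(Publisher publisher, String prefix, Map<String, String> stats,
      boolean isStatsReliable) throws HiveException {
    if (!publisher.publish(prefix, stats)) {
      if (isStatsReliable) {
        // reliable stats: a publishing failure must fail the query
        throw new HiveException("StatsPublishing error: failed to publish stats for " + prefix);
      }
      // otherwise stats gathering should not block the main query, so just continue
    }
  }
}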
Use of org.apache.hadoop.hive.ql.metadata.HiveException in the Apache Hive project.
Class FileSinkOperator, method initializeOp:
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  try {
    this.hconf = hconf;
    filesCreated = false;
    isNativeTable = !conf.getTableInfo().isNonNative();
    isTemporary = conf.isTemporary();
    multiFileSpray = conf.isMultiFileSpray();
    totalFiles = conf.getTotalFiles();
    numFiles = conf.getNumFiles();
    dpCtx = conf.getDynPartCtx();
    lbCtx = conf.getLbCtx();
    fsp = prevFsp = null;
    valToPaths = new HashMap<String, FSPaths>();
    taskId = Utilities.getTaskId(hconf);
    initializeSpecPath();
    fs = specPath.getFileSystem(hconf);
    try {
      createHiveOutputFormat(hconf);
    } catch (HiveException ex) {
      logOutputFormatError(hconf, ex);
      throw ex;
    }
    isCompressed = conf.getCompressed();
    parent = Utilities.toTempPath(conf.getDirName());
    statsFromRecordWriter = new boolean[numFiles];
    serializer = (Serializer) conf.getTableInfo().getDeserializerClass().newInstance();
    serializer.initialize(unsetNestedColumnPaths(hconf), conf.getTableInfo().getProperties());
    outputClass = serializer.getSerializedClass();
    if (isLogInfoEnabled) {
      LOG.info("Using serializer : " + serializer + " and formatter : " + hiveOutputFormat + (isCompressed ? " with compression" : ""));
    }
    // Timeout is chosen to make sure that even if one iteration takes more than
    // half of the script.timeout but less than script.timeout, we will still
    // be able to report progress.
    timeOut = hconf.getInt("mapred.healthChecker.script.timeout", 600000) / 2;
    if (hconf instanceof JobConf) {
      jc = (JobConf) hconf;
    } else {
      // test code path
      jc = new JobConf(hconf);
    }
    if (multiFileSpray) {
      partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
      int i = 0;
      for (ExprNodeDesc e : conf.getPartitionCols()) {
        partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }
      partitionObjectInspectors = initEvaluators(partitionEval, outputObjInspector);
      prtner = (HivePartitioner<HiveKey, Object>) ReflectionUtils.newInstance(jc.getPartitionerClass(), null);
    }
    if (dpCtx != null) {
      dpSetup();
    }
    if (lbCtx != null) {
      lbSetup();
    }
    if (!bDynParts) {
      fsp = new FSPaths(specPath);
      // createBucketFiles(fsp);
      if (!this.isSkewedStoredAsSubDirectories) {
        // special entry for non-DP case
        valToPaths.put("", fsp);
      }
    }
    final StoragePolicyValue tmpStorage = StoragePolicyValue.lookup(HiveConf.getVar(hconf, HIVE_TEMPORARY_TABLE_STORAGE));
    if (isTemporary && fsp != null && tmpStorage != StoragePolicyValue.DEFAULT) {
      final Path outputPath = fsp.taskOutputTempPath;
      StoragePolicyShim shim = ShimLoader.getHadoopShims().getStoragePolicyShim(fs);
      if (shim != null) {
        // directory creation is otherwise within the writers
        fs.mkdirs(outputPath);
        shim.setStoragePolicy(outputPath, tmpStorage);
      }
    }
    if (conf.getWriteType() == AcidUtils.Operation.UPDATE || conf.getWriteType() == AcidUtils.Operation.DELETE) {
      // ROW__ID is always in the first field
      recIdField = ((StructObjectInspector) outputObjInspector).getAllStructFieldRefs().get(0);
      recIdInspector = (StructObjectInspector) recIdField.getFieldObjectInspector();
      // bucket is the second field in the record id
      bucketField = recIdInspector.getAllStructFieldRefs().get(1);
      bucketInspector = (IntObjectInspector) bucketField.getFieldObjectInspector();
    }
    numRows = 0;
    cntr = 1;
    logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
    statsMap.put(getCounterName(Counter.RECORDS_OUT), row_count);
  } catch (HiveException e) {
    throw e;
  } catch (Exception e) {
    e.printStackTrace();
    throw new HiveException(e);
  }
}
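The catch blocks at the end of initializeOp() show the standard translation idiom for operator setup: a HiveException is rethrown unchanged, while any other exception is wrapped so the method keeps its "throws HiveException" contract. A minimal sketch of that idiom follows, with a hypothetical setup step standing in for the serializer and partitioner initialization above.

import java.io.IOException;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class OperatorInitSketch {
  // hypothetical setup step; stands in for the serializer/partitioner work in initializeOp
  void setUpWriters() throws HiveException, IOException {
    // ... operator-specific setup would go here ...
  }

  void initialize() throws HiveException {
    try {
      setUpWriters();
    } catch (HiveException e) {
      throw e; // already the declared type: rethrow as-is
    } catch (Exception e) {
      throw new HiveException(e); // anything else (e.g. IOException) gets wrapped
    }
  }
}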