Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.
In class FileSinkOperator, method closeOp:
@Override
public void closeOp(boolean abort) throws HiveException {
  row_count.set(numRows);
  LOG.info(toString() + ": records written - " + numRows);
  if (!bDynParts && !filesCreated) {
    boolean skipFiles = "tez".equalsIgnoreCase(
        HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE));
    if (skipFiles) {
      Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
      skipFiles = !StreamingOutputFormat.class.isAssignableFrom(clazz);
    }
    if (!skipFiles) {
      createBucketFiles(fsp);
    }
  }
  lastProgressReport = System.currentTimeMillis();
  if (!abort) {
    // For ThriftJDBCBinarySerDe, serialize whatever rows are still buffered and write them out
    // (the size of the buffer is kept track of in the ThriftJDBCBinarySerDe).
    if (conf.isUsingThriftJDBCBinarySerDe()) {
      try {
        recordValue = serializer.serialize(null, inputObjInspectors[0]);
        if (null != fpaths) {
          rowOutWriters = fpaths.outWriters;
          rowOutWriters[0].write(recordValue);
        }
      } catch (SerDeException | IOException e) {
        throw new HiveException(e);
      }
    }
    List<Path> commitPaths = new ArrayList<>();
    for (FSPaths fsp : valToPaths.values()) {
      fsp.closeWriters(abort);
      // Collect the statistics accumulated by the record writers/updaters; they will be
      // aggregated in case of spray writers.
      if (conf.isGatherStats() && isCollectRWStats) {
        if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
          for (int idx = 0; idx < fsp.outWriters.length; idx++) {
            RecordWriter outWriter = fsp.outWriters[idx];
            if (outWriter != null) {
              SerDeStats stats = ((StatsProvidingRecordWriter) outWriter).getStats();
              if (stats != null) {
                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
              }
            }
          }
        } else {
          for (int i = 0; i < fsp.updaters.length; i++) {
            if (fsp.updaters[i] != null) {
              SerDeStats stats = fsp.updaters[i].getStats();
              if (stats != null) {
                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
              }
            }
          }
        }
      }
      if (isNativeTable()) {
        fsp.commit(fs, commitPaths);
      }
    }
    if (conf.isMmTable()) {
      Utilities.writeMmCommitManifest(commitPaths, specPath, fs, taskId,
          conf.getTableWriteId(), conf.getStatementId(), unionPath, conf.getInsertOverwrite());
    }
    // Only publish stats if this operator's flag was set to gather stats
    if (conf.isGatherStats()) {
      publishStats();
    }
  } else {
    // Abort case: close() is still called even if an exception was thrown in map() or
    // reduce(), so abort the writers and clean up here.
    for (FSPaths fsp : valToPaths.values()) {
      fsp.abortWriters(fs, abort, !autoDelete && isNativeTable() && !conf.isMmTable());
    }
  }
  fsp = prevFsp = null;
  super.closeOp(abort);
}
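The cast to StatsProvidingRecordWriter in closeOp() only works because the table's output format returned a record writer that implements the interface (see the instanceof check in createBucketForFileIdx below). The following is a minimal illustrative sketch of such a writer; it is not part of the Hive source, the class name CountingStatsRecordWriter and the delegate pattern are assumptions, and real implementations (for example the ORC record writer) derive raw data size from the actual column values rather than the crude placeholder used here.

// Illustrative sketch only, not Hive code: a record writer that exposes stats the way
// FileSinkOperator.closeOp() expects to read them.
import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.Writable;

public class CountingStatsRecordWriter implements RecordWriter, StatsProvidingRecordWriter {

  private final RecordWriter delegate; // the writer that actually persists the rows
  private final SerDeStats stats = new SerDeStats();
  private long rowCount;
  private long rawDataSize;

  public CountingStatsRecordWriter(RecordWriter delegate) {
    this.delegate = delegate;
  }

  @Override
  public void write(Writable w) throws IOException {
    delegate.write(w);
    rowCount++;
    // Placeholder heuristic only; real writers compute raw data size from decoded column data.
    rawDataSize += w.toString().length();
  }

  @Override
  public void close(boolean abort) throws IOException {
    delegate.close(abort);
  }

  @Override
  public SerDeStats getStats() {
    // This is the object FileSinkOperator reads via getRowCount()/getRawDataSize().
    stats.setRowCount(rowCount);
    stats.setRawDataSize(rawDataSize);
    return stats;
  }
}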
Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.
In class FileSinkOperator, method getDynOutPaths:
protected FSPaths getDynOutPaths(List<String> row, String lbDirName) throws HiveException {
  FSPaths fp;
  // get the path corresponding to the dynamic partition columns
  String dpDir = getDynPartDirectory(row, dpColNames);
  String pathKey = null;
  if (dpDir != null) {
    dpDir = appendToSource(lbDirName, dpDir);
    pathKey = dpDir;
    if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
      String buckNum = row.get(row.size() - 1);
      taskId = Utilities.replaceTaskIdFromFilename(taskId, buckNum);
      pathKey = appendToSource(taskId, dpDir);
    }
    FSPaths fsp2 = valToPaths.get(pathKey);
    if (fsp2 == null) {
      // check the number of dynamic partitions created so far (see the CREATED_DYNAMIC_PARTITION counter)
      if (valToPaths.size() > maxPartitions) {
        // we cannot proceed and need to tell the hive client that retries won't succeed either
        throw new HiveFatalException(ErrorMsg.DYNAMIC_PARTITIONS_TOO_MANY_PER_NODE_ERROR.getErrorCodedMsg()
            + "Maximum was set to " + maxPartitions + " partitions per node"
            + ", number of dynamic partitions on this node: " + valToPaths.size());
      }
      if (!conf.getDpSortState().equals(DPSortState.NONE) && prevFsp != null) {
        // close the previous fsp as it is no longer needed
        prevFsp.closeWriters(false);
        // since its writers are being closed, collect the stats from the record writer
        // (or updater) and store them in the previous fsp that is cached
        if (conf.isGatherStats() && isCollectRWStats) {
          SerDeStats stats = null;
          if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
            RecordWriter outWriter = prevFsp.outWriters[0];
            if (outWriter != null) {
              stats = ((StatsProvidingRecordWriter) outWriter).getStats();
            }
          } else if (prevFsp.updaters[0] != null) {
            stats = prevFsp.updaters[0].getStats();
          }
          if (stats != null) {
            prevFsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
            prevFsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
          }
        }
        // let writers release the memory for garbage collection
        prevFsp.outWriters[0] = null;
        prevFsp = null;
      }
      fsp2 = createNewPaths(dpDir);
      if (prevFsp == null) {
        prevFsp = fsp2;
      }
      if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
        createBucketForFileIdx(fsp2, 0);
        valToPaths.put(pathKey, fsp2);
      }
    }
    fp = fsp2;
  } else {
    fp = fsp;
  }
  return fp;
}
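Both closeOp() and getDynOutPaths() repeat the same pattern for harvesting writer statistics into an FSPaths instance. The sketch below restates that pattern as a helper intended to live inside FileSinkOperator; the method name mergeWriterStats is an assumption and no such helper exists in the Hive source.

// Illustrative helper (not present in FileSinkOperator): fold the stats exposed by a
// StatsProvidingRecordWriter into the FSPaths bookkeeping, exactly as the snippets above do inline.
private void mergeWriterStats(FSPaths target, RecordWriter writer) {
  if (writer instanceof StatsProvidingRecordWriter) {
    SerDeStats stats = ((StatsProvidingRecordWriter) writer).getStats();
    if (stats != null) {
      target.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
      target.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
    }
  }
}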
Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.
In class FileSinkOperator, method createBucketForFileIdx:
protected void createBucketForFileIdx(FSPaths fsp, int filesIdx) throws HiveException {
  try {
    fsp.initializeBucketPaths(filesIdx, taskId, isNativeTable(), isSkewedStoredAsSubDirectories);
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("createBucketForFileIdx " + filesIdx + ": final path "
          + fsp.finalPaths[filesIdx] + "; out path " + fsp.outPaths[filesIdx]
          + " (spec path " + specPath + ", tmp path " + fsp.getTmpPath() + ", task " + taskId + ")");
    }
    if (LOG.isInfoEnabled()) {
      LOG.info("New Final Path: FS " + fsp.finalPaths[filesIdx]);
    }
    if (isNativeTable() && !conf.isMmTable()) {
      // in recent hadoop versions, use deleteOnExit to clean tmp files.
      autoDelete = fs.deleteOnExit(fsp.outPaths[filesIdx]);
    }
    updateDPCounters(fsp, filesIdx);
    Utilities.copyTableJobPropertiesToConf(conf.getTableInfo(), jc);
    // If MM wants to create a new base for IOW (instead of delta dir), it should specify it here
    if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
      Path outPath = fsp.outPaths[filesIdx];
      if (conf.isMmTable() && !FileUtils.mkdir(fs, outPath.getParent(), hconf)) {
        LOG.warn("Unable to create directory with inheritPerms: " + outPath);
      }
      fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter(jc, conf.getTableInfo(),
          outputClass, conf, outPath, reporter);
      // If the record writer provides stats, get it from there instead of the serde
      statsFromRecordWriter[filesIdx] =
          fsp.outWriters[filesIdx] instanceof StatsProvidingRecordWriter;
    } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
      // Only set up the updater for insert. For update and delete we don't know until we see
      // the row.
      ObjectInspector inspector = bDynParts ? subSetOI : outputObjInspector;
      int acidBucketNum = Integer.parseInt(Utilities.getTaskIdFromFilename(taskId));
      fsp.updaters[filesIdx] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(),
          acidBucketNum, conf, fsp.outPaths[filesIdx], inspector, reporter, -1);
    }
    // increment the CREATED_FILES counter
    if (reporter != null) {
      reporter.incrCounter(counterGroup, Operator.HIVE_COUNTER_CREATED_FILES, 1);
    }
  } catch (IOException e) {
    throw new HiveException(e);
  }
}
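The statsFromRecordWriter flag set above is what ultimately feeds the isCollectRWStats check used in closeOp() and getDynOutPaths(). The exact wiring lives elsewhere in FileSinkOperator; the sketch below only illustrates the idea, and the helper name allStatsProviding is an assumption rather than an existing method.

// Illustrative sketch only: presumably stats can be taken from the record writers
// only when every writer created for the bucket set reports them.
private static boolean allStatsProviding(boolean[] statsFromRecordWriter) {
  for (boolean provides : statsFromRecordWriter) {
    if (!provides) {
      return false;
    }
  }
  return true;
}
// e.g. isCollectRWStats = allStatsProviding(statsFromRecordWriter);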