use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.
the class DagUtils method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 *          by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork, Map<String, LocalResource> localResources) throws Exception {
Vertex vertex;
// simply dispatch the call to the right method for the actual (sub-) type of
// BaseWork.
VertexType vertexType = tezWork.getVertexType(workUnit);
if (workUnit instanceof MapWork) {
vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
} else if (workUnit instanceof ReduceWork) {
vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
} else if (workUnit instanceof MergeJoinWork) {
vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
// set VertexManagerPlugin if whether it's a cross product destination vertex
List<String> crossProductSources = new ArrayList<>();
for (BaseWork parentWork : tezWork.getParents(workUnit)) {
if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
crossProductSources.add(parentWork.getName());
}
}
if (!crossProductSources.isEmpty()) {
CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
vertex.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
// parallelism shouldn't be set for cartesian product vertex
}
} else {
// something is seriously wrong if this is happening
throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
}
VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
vertex.addTaskLocalFiles(localResources);
vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
vertex.setExecutionContext(vertexExecutionContext);
// initialize stats publisher if necessary
if (workUnit.isGatheringStats()) {
StatsPublisher statsPublisher;
StatsFactory factory = StatsFactory.newFactory(conf);
if (factory != null) {
StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
statsPublisher = factory.getStatsPublisher();
if (!statsPublisher.init(sCntxt)) {
// creating stats table if not exists
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
}
}
}
final Class outputKlass;
if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
// Hive uses this output format, when it is going to write all its data through FS operator
outputKlass = NullMROutput.class;
} else {
outputKlass = MROutput.class;
}
// If there is a fileSink add a DataSink to the vertex
boolean hasFileSink = workUnit.getAllOperators().stream().anyMatch(o -> o instanceof FileSinkOperator);
// final vertices need to have at least one output
boolean endVertex = tezWork.getLeaves().contains(workUnit);
if (endVertex || hasFileSink) {
OutputCommitterDescriptor ocd = null;
String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
if (committer != null && !committer.isEmpty()) {
ocd = OutputCommitterDescriptor.create(committer);
}
vertex.addDataSink("out_" + workUnit.getName(), new DataSinkDescriptor(OutputDescriptor.create(outputKlass.getName()).setUserPayload(vertex.getProcessorDescriptor().getUserPayload()), ocd, null));
}
return vertex;
}
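The stats-related part of createVertex follows a pattern that recurs throughout these examples: build a StatsCollectionContext from the job configuration, point it at the temp directories of the work unit, and let the configured publisher initialize against it. The sketch below isolates that pattern so it can be read on its own; it only assumes the calls shown above, and the helper name and wrapper class are illustrative, not part of Hive.

import java.util.List;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.mapred.JobConf;

public class StatsPublisherInitSketch {
  // Illustrative helper, not a Hive API: initialize the configured stats publisher
  // against the temp dirs it should write to, mirroring the block in createVertex above.
  static void initStatsPublisher(JobConf conf, List<String> statsTmpDirs) throws HiveException {
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory == null) {
      return; // no stats implementation configured
    }
    // the context carries the configuration plus the temp dirs the publisher should use
    StatsCollectionContext context = new StatsCollectionContext(conf);
    context.setStatsTmpDirs(statsTmpDirs);
    StatsPublisher publisher = factory.getStatsPublisher();
    if (!publisher.init(context)) {
      // only abort the query if reliable stats were requested
      if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
      }
    }
  }
}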
use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.
the class ExplainSemanticAnalyzer method aggregateStats.
private Map<String, Long> aggregateStats(Path localTmpPath) {
  Map<String, Long> opIdToRuntimeNumRows = new HashMap<String, Long>();
  // localTmpPath is the root of all the stats.
  // Under it, there will be SEL_1/statsfiles, SEL_2/statsfiles etc. where SEL_1 and SEL_2 are the op ids.
  FileSystem fs;
  FileStatus[] statuses = null;
  try {
    fs = localTmpPath.getFileSystem(conf);
    statuses = fs.listStatus(localTmpPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
    // statuses can be null if it is DDL, etc.
  } catch (IOException e) {
    LOG.warn(e.toString());
  }
  if (statuses != null) {
    for (FileStatus status : statuses) {
      if (status.isDir()) {
        StatsCollectionContext scc = new StatsCollectionContext(conf);
        String[] names = status.getPath().toString().split(Path.SEPARATOR);
        String opId = names[names.length - 1];
        scc.setStatsTmpDir(status.getPath().toString());
        StatsAggregator statsAggregator = new FSStatsAggregator();
        if (!statsAggregator.connect(scc)) {
          // -1 means that there are no stats
          opIdToRuntimeNumRows.put(opId, -1L);
        } else {
          String value = statsAggregator.aggregateStats("", StatsSetupConst.RUN_TIME_ROW_COUNT);
          opIdToRuntimeNumRows.put(opId, Long.parseLong(value));
        }
        if (statsAggregator != null) {
          statsAggregator.closeConnection(scc);
        }
      }
    }
  }
  return opIdToRuntimeNumRows;
}
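aggregateStats only covers the read side: it walks one subdirectory per operator id under the explain root and pulls RUN_TIME_ROW_COUNT out of each with an FSStatsAggregator. Below is a hedged sketch of the matching write side, compressing the compile-time init and the run-time connect/publish steps into one method for illustration. The class and method names are invented, and the directory layout (one stats dir per operator id) is taken from the comment in aggregateStats; only the StatsCollectionContext and FSStatsPublisher calls shown in these examples are assumed.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.ql.stats.fs.FSStatsPublisher;

public class RuntimeRowCountPublishSketch {
  // Illustrative helper, not a Hive API: write a runtime row count into the per-operator
  // stats directory that aggregateStats() later reads back.
  static void publishRuntimeRowCount(Configuration conf, String opStatsDir, String key, long numRows)
      throws Exception {
    StatsCollectionContext context = new StatsCollectionContext(conf);
    context.setStatsTmpDir(opStatsDir); // e.g. <explainRootPath>/SEL_1
    StatsPublisher publisher = new FSStatsPublisher();
    // init() normally happens at compile time and connect() at run time; combined here for brevity
    if (!publisher.init(context) || !publisher.connect(context)) {
      throw new IllegalStateException("could not initialize FS stats publisher");
    }
    Map<String, String> stats = new HashMap<String, String>();
    stats.put(StatsSetupConst.RUN_TIME_ROW_COUNT, Long.toString(numRows));
    publisher.publishStat(key, stats);
    publisher.closeConnection(context);
  }
}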
use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.
the class AnnotateRunTimeStatsOptimizer method setRuntimeStatsDir.
private static void setRuntimeStatsDir(Operator<? extends OperatorDesc> op, ParseContext pctx) throws SemanticException {
  try {
    OperatorDesc conf = op.getConf();
    if (conf != null) {
      LOG.info("setRuntimeStatsDir for " + op.getOperatorId());
      String path = new Path(pctx.getContext().getExplainConfig().getExplainRootPath(), op.getOperatorId()).toString();
      StatsPublisher statsPublisher = new FSStatsPublisher();
      StatsCollectionContext runtimeStatsContext = new StatsCollectionContext(pctx.getConf());
      runtimeStatsContext.setStatsTmpDir(path);
      if (!statsPublisher.init(runtimeStatsContext)) {
        LOG.error("StatsPublishing error: StatsPublisher is not initialized.");
        throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
      }
      conf.setRuntimeStatsTmpDir(path);
    } else {
      LOG.debug("skip setRuntimeStatsDir for " + op.getOperatorId() + " because OperatorDesc is null");
    }
  } catch (HiveException e) {
    throw new SemanticException(e);
  }
}
use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.
the class DagUtils method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 *          by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param appJarLr Local resource for hive-exec.
 * @param additionalLr
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
Vertex v = null;
// BaseWork.
if (work instanceof MapWork) {
v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
} else if (work instanceof ReduceWork) {
v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
} else if (work instanceof MergeJoinWork) {
v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
} else {
// something is seriously wrong if this is happening
throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
}
// initialize stats publisher if necessary
if (work.isGatheringStats()) {
StatsPublisher statsPublisher;
StatsFactory factory = StatsFactory.newFactory(conf);
if (factory != null) {
StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
statsPublisher = factory.getStatsPublisher();
if (!statsPublisher.init(sCntxt)) {
// creating stats table if not exists
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
}
}
}
// final vertices need to have at least one output
if (!hasChildren) {
v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
}
return v;
}
use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.
the class PartialScanMapper method publishStats.
/**
 * Publish statistics.
 * similar to FileSinkOperator.java publishStats()
 *
 * @throws HiveException
 */
private void publishStats() throws HiveException {
  // Initializing a stats publisher
  StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);
  if (statsPublisher == null) {
    // stats gathering is the main purpose of this job, so fail if no publisher could be obtained
    LOG.error("StatsPublishing error: StatsPublisher is not initialized.");
    throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
  }
  StatsCollectionContext sc = new StatsCollectionContext(jc);
  sc.setStatsTmpDir(jc.get(StatsSetupConst.STATS_TMP_LOC, ""));
  if (!statsPublisher.connect(sc)) {
    // should fail since stats gathering is the main purpose of the job
    LOG.error("StatsPublishing error: cannot connect to database");
    throw new HiveException(ErrorMsg.STATSPUBLISHER_CONNECTION_ERROR.getErrorCodedMsg());
  }
  // construct key used to store stats in intermediate db
  String key = statsAggKeyPrefix.endsWith(Path.SEPARATOR) ? statsAggKeyPrefix : statsAggKeyPrefix + Path.SEPARATOR;
  // construct statistics to be stored
  Map<String, String> statsToPublish = new HashMap<String, String>();
  statsToPublish.put(StatsSetupConst.RAW_DATA_SIZE, Long.toString(uncompressedFileSize));
  statsToPublish.put(StatsSetupConst.ROW_COUNT, Long.toString(rowNo));
  if (!statsPublisher.publishStat(key, statsToPublish)) {
    // Not changing the interface to maintain backward compatibility
    throw new HiveException(ErrorMsg.STATSPUBLISHER_PUBLISHING_ERROR.getErrorCodedMsg());
  }
  if (!statsPublisher.closeConnection(sc)) {
    // Not changing the interface to maintain backward compatibility
    throw new HiveException(ErrorMsg.STATSPUBLISHER_CLOSING_ERROR.getErrorCodedMsg());
  }
}
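publishStats writes RAW_DATA_SIZE and ROW_COUNT under a key prefix but does not show how those values are read back. The sketch below is a hedged illustration of the read-back side using FSStatsAggregator, as in aggregateStats earlier; the actual aggregator depends on which publisher Utilities.getStatsPublisher(jc) returned, and the class and method names here are invented for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator;

public class PartialScanStatsReadbackSketch {
  // Illustrative helper, not a Hive API: read the row count previously published
  // under keyPrefix from an FS-backed stats location.
  static long readRowCount(Configuration conf, String statsTmpDir, String keyPrefix) {
    StatsCollectionContext context = new StatsCollectionContext(conf);
    context.setStatsTmpDir(statsTmpDir);
    StatsAggregator aggregator = new FSStatsAggregator();
    if (!aggregator.connect(context)) {
      return -1L; // same convention as aggregateStats(): -1 means no stats available
    }
    try {
      String value = aggregator.aggregateStats(keyPrefix, StatsSetupConst.ROW_COUNT);
      return value == null ? -1L : Long.parseLong(value);
    } finally {
      aggregator.closeConnection(context);
    }
  }
}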