
Example 6 with StatsCollectionContext

use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.

the class DagUtils method createVertex.

/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork, Map<String, LocalResource> localResources) throws Exception {
    Vertex vertex;
    // simply dispatch the call to the right method for the actual (sub-) type of
    // BaseWork.
    VertexType vertexType = tezWork.getVertexType(workUnit);
    if (workUnit instanceof MapWork) {
        vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
    } else if (workUnit instanceof ReduceWork) {
        vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
    } else if (workUnit instanceof MergeJoinWork) {
        vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
        // set the VertexManagerPlugin if this is a cross product destination vertex
        List<String> crossProductSources = new ArrayList<>();
        for (BaseWork parentWork : tezWork.getParents(workUnit)) {
            if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
                crossProductSources.add(parentWork.getName());
            }
        }
        if (!crossProductSources.isEmpty()) {
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            vertex.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
        // parallelism shouldn't be set for a cartesian product vertex
        }
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
    vertex.addTaskLocalFiles(localResources);
    vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    vertex.setExecutionContext(vertexExecutionContext);
    // initialize stats publisher if necessary
    if (workUnit.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // init() creates the stats table if it does not already exist
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    final Class<?> outputKlass;
    if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
        // Hive uses this output format when it is going to write all its data through the FileSink operator
        outputKlass = NullMROutput.class;
    } else {
        outputKlass = MROutput.class;
    }
    // If there is a fileSink add a DataSink to the vertex
    boolean hasFileSink = workUnit.getAllOperators().stream().anyMatch(o -> o instanceof FileSinkOperator);
    // final vertices need to have at least one output
    boolean endVertex = tezWork.getLeaves().contains(workUnit);
    if (endVertex || hasFileSink) {
        OutputCommitterDescriptor ocd = null;
        String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
        if (committer != null && !committer.isEmpty()) {
            ocd = OutputCommitterDescriptor.create(committer);
        }
        vertex.addDataSink("out_" + workUnit.getName(), new DataSinkDescriptor(OutputDescriptor.create(outputKlass.getName()).setUserPayload(vertex.getProcessorDescriptor().getUserPayload()), ocd, null));
    }
    return vertex;
}
Also used : StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VertexExecutionContext(org.apache.tez.dag.api.Vertex.VertexExecutionContext) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) ArrayList(java.util.ArrayList) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) HiveOutputFormatImpl(org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) VertexType(org.apache.hadoop.hive.ql.plan.TezWork.VertexType) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)
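
The stats-related portion of this method is a recurring pattern across these examples. Below is a minimal standalone sketch of that bootstrap, using only the Hive calls that appear in the example itself; the class and method names are hypothetical, introduced only to make the snippet self-contained.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.mapred.JobConf;

public final class StatsPublisherBootstrap {

    // Hypothetical helper: the body is lifted from the createVertex example above.
    public static void initIfGatheringStats(JobConf conf, BaseWork workUnit) throws HiveException {
        if (!workUnit.isGatheringStats()) {
            return;
        }
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory == null) {
            // no stats implementation is configured; nothing to initialize
            return;
        }
        StatsCollectionContext context = new StatsCollectionContext(conf);
        context.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
        StatsPublisher publisher = factory.getStatsPublisher();
        // init() signals failure through its return value; abort only when the
        // user demanded reliable stats, otherwise the query proceeds without them.
        if (!publisher.init(context)
                && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
            throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
    }
}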

Example 7 with StatsCollectionContext

use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.

the class ExplainSemanticAnalyzer method aggregateStats.

private Map<String, Long> aggregateStats(Path localTmpPath) {
    Map<String, Long> opIdToRuntimeNumRows = new HashMap<String, Long>();
    // localTmpPath is the root of all the stats.
    // Under it, there will be SEL_1/statsfiles, SEL_2/statsfiles etc where SEL_1 and SEL_2 are the op ids.
    FileSystem fs;
    FileStatus[] statuses = null;
    try {
        fs = localTmpPath.getFileSystem(conf);
        statuses = fs.listStatus(localTmpPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
    // statuses can be null for DDL statements, etc.
    } catch (IOException e) {
        LOG.warn(e.toString());
    }
    if (statuses != null) {
        for (FileStatus status : statuses) {
            if (status.isDirectory()) {
                StatsCollectionContext scc = new StatsCollectionContext(conf);
                String[] names = status.getPath().toString().split(Path.SEPARATOR);
                String opId = names[names.length - 1];
                scc.setStatsTmpDir(status.getPath().toString());
                StatsAggregator statsAggregator = new FSStatsAggregator();
                if (!statsAggregator.connect(scc)) {
                    // -1 means that there are no stats
                    opIdToRuntimeNumRows.put(opId, -1L);
                } else {
                    String value = statsAggregator.aggregateStats("", StatsSetupConst.RUN_TIME_ROW_COUNT);
                    opIdToRuntimeNumRows.put(opId, Long.parseLong(value));
                }
                statsAggregator.closeConnection(scc);
            }
        }
    }
    return opIdToRuntimeNumRows;
}
Also used : StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) FileStatus(org.apache.hadoop.fs.FileStatus) HashMap(java.util.HashMap) FileSystem(org.apache.hadoop.fs.FileSystem) FSStatsAggregator(org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator) IOException(java.io.IOException) StatsAggregator(org.apache.hadoop.hive.ql.stats.StatsAggregator) FSStatsAggregator(org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator)
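
The per-directory handshake in the loop above (connect, read one counter, close) can be isolated as follows. This is a minimal sketch assuming a stats tmp dir written by one of the publishers in the other examples; the class and method names are hypothetical, introduced only to make the snippet self-contained.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator;

public final class RuntimeRowCountReader {

    // Hypothetical helper mirroring the loop body of aggregateStats() above.
    public static long readRuntimeRowCount(Configuration conf, String statsTmpDir) {
        StatsCollectionContext scc = new StatsCollectionContext(conf);
        scc.setStatsTmpDir(statsTmpDir);
        StatsAggregator aggregator = new FSStatsAggregator();
        if (!aggregator.connect(scc)) {
            // as in the example above, -1 signals that no stats were found
            return -1L;
        }
        try {
            String value = aggregator.aggregateStats("", StatsSetupConst.RUN_TIME_ROW_COUNT);
            // null guard added here; the original parses the value directly
            return value == null ? -1L : Long.parseLong(value);
        } finally {
            aggregator.closeConnection(scc);
        }
    }
}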

Example 8 with StatsCollectionContext

use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.

the class AnnotateRunTimeStatsOptimizer method setRuntimeStatsDir.

private static void setRuntimeStatsDir(Operator<? extends OperatorDesc> op, ParseContext pctx) throws SemanticException {
    try {
        OperatorDesc conf = op.getConf();
        if (conf != null) {
            LOG.info("setRuntimeStatsDir for " + op.getOperatorId());
            String path = new Path(pctx.getContext().getExplainConfig().getExplainRootPath(), op.getOperatorId()).toString();
            StatsPublisher statsPublisher = new FSStatsPublisher();
            StatsCollectionContext runtimeStatsContext = new StatsCollectionContext(pctx.getConf());
            runtimeStatsContext.setStatsTmpDir(path);
            if (!statsPublisher.init(runtimeStatsContext)) {
                LOG.error("StatsPublishing error: StatsPublisher is not initialized.");
                throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
            }
            conf.setRuntimeStatsTmpDir(path);
        } else {
            LOG.debug("skip setRuntimeStatsDir for " + op.getOperatorId() + " because OperatorDesc is null");
        }
    } catch (HiveException e) {
        throw new SemanticException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) FSStatsPublisher(org.apache.hadoop.hive.ql.stats.fs.FSStatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FSStatsPublisher(org.apache.hadoop.hive.ql.stats.fs.FSStatsPublisher) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
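
For the publisher side of this filesystem-based pair, the setup reduces to three calls. A sketch under the assumption that statsDir is the per-operator path computed above; the class and method names are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.ql.stats.fs.FSStatsPublisher;

public final class RuntimeStatsPublisherSetup {

    // Hypothetical helper: prepares an FSStatsPublisher for one operator's tmp dir.
    public static boolean init(Configuration conf, String statsDir) {
        StatsPublisher publisher = new FSStatsPublisher();
        StatsCollectionContext ctx = new StatsCollectionContext(conf);
        ctx.setStatsTmpDir(statsDir);
        // init() returns false when the publisher cannot set up its directory
        return publisher.init(ctx);
    }
}

The FSStatsPublisher in this example writes to the same per-operator directories that the FSStatsAggregator in Example 7 later reads back, which is how the runtime row counts reach the EXPLAIN output.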

Example 9 with StatsCollectionContext

use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.

the class DagUtils method createVertex.

/**
   * Create a vertex from a given work object.
   *
   * @param conf JobConf to be used for this execution unit
   * @param work The instance of BaseWork representing the actual work to be performed
   * by this vertex.
   * @param scratchDir HDFS scratch dir for this execution unit.
   * @param appJarLr Local resource for hive-exec.
   * @param additionalLr Additional local resources needed by this vertex
   * @param fileSystem FS corresponding to scratchDir and LocalResources
   * @param ctx This query's context
   * @return Vertex
   */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
    Vertex v = null;
    // simply dispatch the call to the right method for the actual (sub-) type of
    // BaseWork.
    if (work instanceof MapWork) {
        v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else if (work instanceof ReduceWork) {
        v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
    } else if (work instanceof MergeJoinWork) {
        v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // init() creates the stats table if it does not already exist
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    // final vertices need to have at least one output
    if (!hasChildren) {
        v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
    }
    return v;
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor)
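
Both createVertex variants finish by giving leaf vertices a data sink, since a final vertex must have at least one output. A minimal sketch of that step, using only the Tez calls that appear in the examples; the helper class and method names are hypothetical.

import java.io.IOException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.mapred.JobConf;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.output.MROutput;

public final class LeafVertexSink {

    // Hypothetical helper mirroring the data sink attachment above.
    public static void addDataSink(Vertex vertex, BaseWork work, JobConf conf) throws IOException {
        vertex.addDataSink("out_" + work.getName(),
                new DataSinkDescriptor(
                        OutputDescriptor.create(MROutput.class.getName())
                                .setUserPayload(TezUtils.createUserPayloadFromConf(conf)),
                        // no custom OutputCommitter and no extra credentials
                        null, null));
    }
}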

Example 10 with StatsCollectionContext

use of org.apache.hadoop.hive.ql.stats.StatsCollectionContext in project hive by apache.

the class PartialScanMapper method publishStats.

/**
   * Publish statistics.
   * Similar to publishStats() in FileSinkOperator.java.
   *
   * @throws HiveException
   */
private void publishStats() throws HiveException {
    // Initializing a stats publisher
    StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);
    if (statsPublisher == null) {
        // fail here: unlike in FileSinkOperator, stats gathering is the main purpose of this job
        LOG.error("StatsPublishing error: StatsPublisher is not initialized.");
        throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
    }
    StatsCollectionContext sc = new StatsCollectionContext(jc);
    sc.setStatsTmpDir(jc.get(StatsSetupConst.STATS_TMP_LOC, ""));
    if (!statsPublisher.connect(sc)) {
        // should fail, since stats gathering is the main purpose of the job
        LOG.error("StatsPublishing error: cannot connect to database");
        throw new HiveException(ErrorMsg.STATSPUBLISHER_CONNECTION_ERROR.getErrorCodedMsg());
    }
    // construct key used to store stats in intermediate db
    String key = statsAggKeyPrefix.endsWith(Path.SEPARATOR) ? statsAggKeyPrefix : statsAggKeyPrefix + Path.SEPARATOR;
    // construct statistics to be stored
    Map<String, String> statsToPublish = new HashMap<String, String>();
    statsToPublish.put(StatsSetupConst.RAW_DATA_SIZE, Long.toString(uncompressedFileSize));
    statsToPublish.put(StatsSetupConst.ROW_COUNT, Long.toString(rowNo));
    if (!statsPublisher.publishStat(key, statsToPublish)) {
        // Not changing the interface to maintain backward compatibility
        throw new HiveException(ErrorMsg.STATSPUBLISHER_PUBLISHING_ERROR.getErrorCodedMsg());
    }
    if (!statsPublisher.closeConnection(sc)) {
        // Not changing the interface to maintain backward compatibility
        throw new HiveException(ErrorMsg.STATSPUBLISHER_CLOSING_ERROR.getErrorCodedMsg());
    }
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashMap(java.util.HashMap)
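
The whole lifecycle of this method (connect, publish under a separator-terminated key, close) condenses to the sketch below. It returns false at the first failed step instead of throwing; the class name, method name, and signature are hypothetical.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.mapred.JobConf;

public final class StatsPublishLifecycle {

    // Hypothetical helper following publishStats() above step for step.
    public static boolean publish(StatsPublisher publisher, JobConf jc,
            String keyPrefix, long rawDataSize, long rowCount) {
        StatsCollectionContext sc = new StatsCollectionContext(jc);
        sc.setStatsTmpDir(jc.get(StatsSetupConst.STATS_TMP_LOC, ""));
        if (!publisher.connect(sc)) {
            return false;
        }
        // keys are directory-like: always terminate the prefix with a separator
        String key = keyPrefix.endsWith(Path.SEPARATOR) ? keyPrefix : keyPrefix + Path.SEPARATOR;
        Map<String, String> statsToPublish = new HashMap<>();
        statsToPublish.put(StatsSetupConst.RAW_DATA_SIZE, Long.toString(rawDataSize));
        statsToPublish.put(StatsSetupConst.ROW_COUNT, Long.toString(rowCount));
        boolean published = publisher.publishStat(key, statsToPublish);
        // closeConnection() also reports failure via its return value
        return publisher.closeConnection(sc) && published;
    }
}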

Aggregations

StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext): 15
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 14
StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher): 12
StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory): 7
HashMap (java.util.HashMap): 5
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 5
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 5
IOException (java.io.IOException): 4
FileStatus (org.apache.hadoop.fs.FileStatus): 4
ArrayList (java.util.ArrayList): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 3
Path (org.apache.hadoop.fs.Path): 3
Context (org.apache.hadoop.hive.ql.Context): 3
MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork): 3
DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor): 3
PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex): 3
Vertex (org.apache.tez.dag.api.Vertex): 3
LogInitializationException (org.apache.hadoop.hive.common.LogUtils.LogInitializationException): 2
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 2
DriverContext (org.apache.hadoop.hive.ql.DriverContext): 2