Example 1 with DriverContext

use of org.apache.hadoop.hive.ql.DriverContext in project hive by apache.

the class GenMRTableScan1 method handlePartialScanCommand.

/**
   * Handle partial scan command. It is composed of a PartialScanTask followed by a StatsTask.
   * @param op
   * @param ctx
   * @param parseCtx
   * @param currTask
   * @param statsWork
   * @param statsTask
   * @throws SemanticException
   */
private void handlePartialScanCommand(TableScanOperator op, GenMRProcContext ctx, ParseContext parseCtx, Task<? extends Serializable> currTask, StatsWork statsWork, Task<StatsWork> statsTask) throws SemanticException {
    String aggregationKey = op.getConf().getStatsAggPrefix();
    StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey);
    List<Path> inputPaths = GenMapRedUtils.getInputPathsForPartialScan(op, aggregationKeyBuffer);
    aggregationKey = aggregationKeyBuffer.toString();
    // scan work
    PartialScanWork scanWork = new PartialScanWork(inputPaths);
    scanWork.setMapperCannotSpanPartns(true);
    scanWork.setAggKey(aggregationKey);
    scanWork.setStatsTmpDir(op.getConf().getTmpStatsDir(), parseCtx.getConf());
    // stats work
    statsWork.setPartialScanAnalyzeCommand(true);
    // partial scan task
    DriverContext driverCxt = new DriverContext();
    Task<PartialScanWork> psTask = TaskFactory.get(scanWork, parseCtx.getConf());
    psTask.initialize(parseCtx.getQueryState(), null, driverCxt, op.getCompilationOpContext());
    psTask.setWork(scanWork);
    // task dependency
    ctx.getRootTasks().remove(currTask);
    ctx.getRootTasks().add(psTask);
    psTask.addDependentTask(statsTask);
    List<Task<? extends Serializable>> parentTasks = new ArrayList<Task<? extends Serializable>>();
    parentTasks.add(psTask);
    statsTask.setParentTasks(parentTasks);
}
Also used : Path(org.apache.hadoop.fs.Path) DriverContext(org.apache.hadoop.hive.ql.DriverContext) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Serializable(java.io.Serializable) PartialScanWork(org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork) ArrayList(java.util.ArrayList)
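
The example above boils down to three steps: build a work object, turn it into a Task via TaskFactory, then initialize it with a DriverContext and wire the dependency chain. The sketch below is a condensed restatement of that wiring, not code from the Hive sources; the helper name rewirePartialScan is hypothetical, and the TaskFactory.get and Task.initialize signatures are the ones used in the example.

private static Task<PartialScanWork> rewirePartialScan(GenMRProcContext ctx, ParseContext parseCtx,
        TableScanOperator op, Task<? extends Serializable> currTask, Task<StatsWork> statsTask,
        PartialScanWork scanWork) {
    // 1. Build the task from its work object.
    Task<PartialScanWork> psTask = TaskFactory.get(scanWork, parseCtx.getConf());
    // 2. Initialize it with a fresh DriverContext so it can be executed later.
    psTask.initialize(parseCtx.getQueryState(), null, new DriverContext(), op.getCompilationOpContext());
    psTask.setWork(scanWork);
    // 3. Swap it in as the root task and chain the stats task behind it.
    ctx.getRootTasks().remove(currTask);
    ctx.getRootTasks().add(psTask);
    psTask.addDependentTask(statsTask);
    List<Task<? extends Serializable>> parents = new ArrayList<Task<? extends Serializable>>();
    parents.add(psTask);
    statsTask.setParentTasks(parents);
    return psTask;
}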

Example 2 with DriverContext

use of org.apache.hadoop.hive.ql.DriverContext in project hive by apache.

the class PartialScanTask method main.

public static void main(String[] args) {
    String inputPathStr = null;
    String outputDir = null;
    String jobConfFileName = null;
    try {
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-input")) {
                inputPathStr = args[++i];
            } else if (args[i].equals("-jobconffile")) {
                jobConfFileName = args[++i];
            } else if (args[i].equals("-outputDir")) {
                outputDir = args[++i];
            }
        }
    } catch (IndexOutOfBoundsException e) {
        System.err.println("Missing argument to option");
        printUsage();
    }
    if (inputPathStr == null || outputDir == null || outputDir.trim().equals("")) {
        printUsage();
    }
    List<Path> inputPaths = new ArrayList<Path>();
    String[] paths = inputPathStr.split(INPUT_SEPERATOR);
    if (paths == null || paths.length == 0) {
        printUsage();
    }
    FileSystem fs = null;
    JobConf conf = new JobConf(PartialScanTask.class);
    for (String path : paths) {
        try {
            Path pathObj = new Path(path);
            if (fs == null) {
                fs = FileSystem.get(pathObj.toUri(), conf);
            }
            FileStatus fstatus = fs.getFileStatus(pathObj);
            if (fstatus.isDir()) {
                FileStatus[] fileStatus = fs.listStatus(pathObj);
                for (FileStatus st : fileStatus) {
                    inputPaths.add(st.getPath());
                }
            } else {
                inputPaths.add(fstatus.getPath());
            }
        } catch (IOException e) {
            e.printStackTrace(System.err);
        }
    }
    if (jobConfFileName != null) {
        conf.addResource(new Path(jobConfFileName));
    }
    org.slf4j.Logger LOG = LoggerFactory.getLogger(PartialScanTask.class.getName());
    boolean isSilent = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESESSIONSILENT);
    LogHelper console = new LogHelper(LOG, isSilent);
    // print the location of the execution log so that it's easy to find the reason for local mode execution failures
    for (Appender appender : ((Logger) LogManager.getRootLogger()).getAppenders().values()) {
        if (appender instanceof FileAppender) {
            console.printInfo("Execution log at: " + ((FileAppender) appender).getFileName());
        } else if (appender instanceof RollingFileAppender) {
            console.printInfo("Execution log at: " + ((RollingFileAppender) appender).getFileName());
        }
    }
    QueryState queryState = new QueryState(new HiveConf(conf, PartialScanTask.class));
    PartialScanWork mergeWork = new PartialScanWork(inputPaths);
    DriverContext driverCxt = new DriverContext();
    PartialScanTask taskExec = new PartialScanTask();
    taskExec.initialize(queryState, null, driverCxt, new CompilationOpContext());
    taskExec.setWork(mergeWork);
    int ret = taskExec.execute(driverCxt);
    if (ret != 0) {
        System.exit(2);
    }
}
Also used : DriverContext(org.apache.hadoop.hive.ql.DriverContext) FileStatus(org.apache.hadoop.fs.FileStatus) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) RollingFileAppender(org.apache.logging.log4j.core.appender.RollingFileAppender) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) Appender(org.apache.logging.log4j.core.Appender) FileAppender(org.apache.logging.log4j.core.appender.FileAppender) IOException(java.io.IOException) QueryState(org.apache.hadoop.hive.ql.QueryState) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext)
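
For reference, main() above can also be driven programmatically. The snippet below is a hypothetical sketch, not part of the Hive sources: the option names match the ones parsed above, while the paths are placeholder values; note that main exits the JVM with status 2 if the task fails.

public static void runPartialScan() {
    // Option names mirror the parser in main(); the paths are placeholders.
    String[] args = new String[] {
        "-input", "/tmp/warehouse/t1/000000_0",
        "-jobconffile", "/tmp/partialscan-job.xml",
        "-outputDir", "/tmp/partialscan-out"
    };
    PartialScanTask.main(args);
}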

Example 3 with DriverContext

use of org.apache.hadoop.hive.ql.DriverContext in project hive by apache.

the class LocalHiveSparkClient method execute.

@Override
public SparkJobRef execute(DriverContext driverContext, SparkWork sparkWork) throws Exception {
    Context ctx = driverContext.getCtx();
    HiveConf hiveConf = (HiveConf) ctx.getConf();
    refreshLocalResources(sparkWork, hiveConf);
    JobConf jobConf = new JobConf(hiveConf);
    // Create temporary scratch dir
    Path emptyScratchDir;
    emptyScratchDir = ctx.getMRTmpPath();
    FileSystem fs = emptyScratchDir.getFileSystem(jobConf);
    fs.mkdirs(emptyScratchDir);
    // Update credential provider location
    // the password to the credential provider is already set in the sparkConf
    // in HiveSparkClientFactory
    HiveConfUtil.updateJobCredentialProviders(jobConf);
    SparkCounters sparkCounters = new SparkCounters(sc);
    Map<String, List<String>> prefixes = sparkWork.getRequiredCounterPrefix();
    if (prefixes != null) {
        for (String group : prefixes.keySet()) {
            for (String counterName : prefixes.get(group)) {
                sparkCounters.createCounter(group, counterName);
            }
        }
    }
    SparkReporter sparkReporter = new SparkReporter(sparkCounters);
    // Generate Spark plan
    SparkPlanGenerator gen = new SparkPlanGenerator(sc, ctx, jobConf, emptyScratchDir, sparkReporter);
    SparkPlan plan = gen.generate(sparkWork);
    if (driverContext.isShutdown()) {
        throw new HiveException("Operation is cancelled.");
    }
    // Execute generated plan.
    JavaPairRDD<HiveKey, BytesWritable> finalRDD = plan.generateGraph();
    sc.setJobGroup("queryId = " + sparkWork.getQueryId(), DagUtils.getQueryName(jobConf));
    // We use Spark RDD async action to submit job as it's the only way to get jobId now.
    JavaFutureAction<Void> future = finalRDD.foreachAsync(HiveVoidFunction.getInstance());
    // As we always use foreach action to submit RDD graph, it would only trigger one job.
    int jobId = future.jobIds().get(0);
    LocalSparkJobStatus sparkJobStatus = new LocalSparkJobStatus(sc, jobId, jobMetricsListener, sparkCounters, plan.getCachedRDDIds(), future);
    return new LocalSparkJobRef(Integer.toString(jobId), hiveConf, sparkJobStatus, sc);
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Context(org.apache.hadoop.hive.ql.Context) DriverContext(org.apache.hadoop.hive.ql.DriverContext) Path(org.apache.hadoop.fs.Path) SparkCounters(org.apache.hive.spark.counter.SparkCounters) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) BytesWritable(org.apache.hadoop.io.BytesWritable) LocalSparkJobStatus(org.apache.hadoop.hive.ql.exec.spark.status.impl.LocalSparkJobStatus) HiveKey(org.apache.hadoop.hive.ql.io.HiveKey) FileSystem(org.apache.hadoop.fs.FileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ArrayList(java.util.ArrayList) List(java.util.List) LocalSparkJobRef(org.apache.hadoop.hive.ql.exec.spark.status.impl.LocalSparkJobRef) JobConf(org.apache.hadoop.mapred.JobConf)
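
A caller typically binds the DriverContext to the query's Context before invoking execute(). The sketch below illustrates that handoff under a few assumptions: it assumes this Hive version has the DriverContext(Context) constructor (the example only shows getCtx()) and a SparkJobRef.getJobId() accessor, and localClient and sparkWork stand for already-built objects.

SparkJobRef submitLocal(LocalHiveSparkClient localClient, SparkWork sparkWork, HiveConf hiveConf)
        throws Exception {
    // Per-query context: scratch dirs, temp paths, cancellation state.
    Context ctx = new Context(hiveConf);
    DriverContext driverContext = new DriverContext(ctx);
    SparkJobRef jobRef = localClient.execute(driverContext, sparkWork);
    System.out.println("Submitted Spark job " + jobRef.getJobId());
    return jobRef;
}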

Example 4 with DriverContext

use of org.apache.hadoop.hive.ql.DriverContext in project hive by apache.

the class RemoteHiveSparkClient method submit.

private SparkJobRef submit(final DriverContext driverContext, final SparkWork sparkWork) throws Exception {
    final Context ctx = driverContext.getCtx();
    final HiveConf hiveConf = (HiveConf) ctx.getConf();
    refreshLocalResources(sparkWork, hiveConf);
    final JobConf jobConf = new JobConf(hiveConf);
    // update the credential provider location in the jobConf
    HiveConfUtil.updateJobCredentialProviders(jobConf);
    // Create temporary scratch dir
    final Path emptyScratchDir = ctx.getMRTmpPath();
    FileSystem fs = emptyScratchDir.getFileSystem(jobConf);
    fs.mkdirs(emptyScratchDir);
    // make sure NullScanFileSystem can be loaded - HIVE-18442
    jobConf.set("fs." + NullScanFileSystem.getBaseScheme() + ".impl", NullScanFileSystem.class.getCanonicalName());
    byte[] jobConfBytes = KryoSerializer.serializeJobConf(jobConf);
    byte[] scratchDirBytes = KryoSerializer.serialize(emptyScratchDir);
    byte[] sparkWorkBytes = KryoSerializer.serialize(sparkWork);
    JobStatusJob job = new JobStatusJob(jobConfBytes, scratchDirBytes, sparkWorkBytes);
    if (driverContext.isShutdown()) {
        throw new HiveException("Operation is cancelled.");
    }
    JobHandle<Serializable> jobHandle = remoteClient.submit(job);
    RemoteSparkJobStatus sparkJobStatus = new RemoteSparkJobStatus(remoteClient, jobHandle, sparkClientTimtout);
    return new RemoteSparkJobRef(hiveConf, jobHandle, sparkJobStatus);
}
Also used : Context(org.apache.hadoop.hive.ql.Context) DriverContext(org.apache.hadoop.hive.ql.DriverContext) JobContext(org.apache.hive.spark.client.JobContext) Path(org.apache.hadoop.fs.Path) Serializable(java.io.Serializable) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RemoteSparkJobRef(org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobRef) RemoteSparkJobStatus(org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus) FileSystem(org.apache.hadoop.fs.FileSystem) NullScanFileSystem(org.apache.hadoop.hive.ql.io.NullScanFileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JobConf(org.apache.hadoop.mapred.JobConf)
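
Both the local and remote Spark clients guard submission with driverContext.isShutdown(), so a query cancelled during planning never reaches the cluster. Below is a minimal sketch of that guard factored into a hypothetical helper; it only uses the calls shown in the examples.

private static void failIfCancelled(DriverContext driverContext) throws HiveException {
    // isShutdown() is flipped when the Driver cancels the query; any work built
    // up to this point is abandoned instead of being submitted.
    if (driverContext.isShutdown()) {
        throw new HiveException("Operation is cancelled.");
    }
}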

Example 5 with DriverContext

use of org.apache.hadoop.hive.ql.DriverContext in project hive by apache.

the class ReplLoadTask method execute.

@Override
protected int execute(DriverContext driverContext) {
    try {
        int maxTasks = conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS);
        Context context = new Context(conf, getHive(), work.sessionStateLineageState, work.currentTransactionId);
        TaskTracker loadTaskTracker = new TaskTracker(maxTasks);
        /*
          For now, for simplicity, we handle just one directory (one database). Revisit support for
          multiple databases once the basic flow for chaining task creation for a single
          database (directory) is in place.
        */
        BootstrapEventsIterator iterator = work.iterator();
        ConstraintEventsIterator constraintIterator = work.constraintIterator();
        /*
          These trackers exist to hold a reference during the current round of task creation; they are
          initialized with zero tasks so that they are inconsequential in any task-tracker compositions.
        */
        TaskTracker dbTracker = new TaskTracker(ZERO_TASKS);
        TaskTracker tableTracker = new TaskTracker(ZERO_TASKS);
        Scope scope = new Scope();
        boolean loadingConstraint = false;
        if (!iterator.hasNext() && constraintIterator.hasNext()) {
            loadingConstraint = true;
        }
        while ((iterator.hasNext() || (loadingConstraint && constraintIterator.hasNext())) && loadTaskTracker.canAddMoreTasks()) {
            BootstrapEvent next;
            if (!loadingConstraint) {
                next = iterator.next();
            } else {
                next = constraintIterator.next();
            }
            switch(next.eventType()) {
                case Database:
                    DatabaseEvent dbEvent = (DatabaseEvent) next;
                    dbTracker = new LoadDatabase(context, dbEvent, work.dbNameToLoadIn, loadTaskTracker).tasks();
                    loadTaskTracker.update(dbTracker);
                    if (work.hasDbState()) {
                        loadTaskTracker.update(updateDatabaseLastReplID(maxTasks, context, scope));
                    }
                    work.updateDbEventState(dbEvent.toState());
                    scope.database = true;
                    scope.rootTasks.addAll(dbTracker.tasks());
                    dbTracker.debugLog("database");
                    break;
                case Table:
                    {
                        /*
                          The implicit assumption here is that the database level is processed before the
                          table level. This depends on the iterator, which should provide the higher-level
                          directory listing before the lower-level one. It is also required so that dbTracker
                          and tableTracker are always set up correctly.
                        */
                        TableContext tableContext = new TableContext(dbTracker, work.dbNameToLoadIn, work.tableNameToLoadIn);
                        TableEvent tableEvent = (TableEvent) next;
                        LoadTable loadTable = new LoadTable(tableEvent, context, iterator.replLogger(), tableContext, loadTaskTracker);
                        tableTracker = loadTable.tasks();
                        if (!scope.database) {
                            scope.rootTasks.addAll(tableTracker.tasks());
                            scope.table = true;
                        }
                        setUpDependencies(dbTracker, tableTracker);
                        /*
                          For table replication, if we hit the maximum number of tasks, the next run will try
                          to reload the same table. This is mainly to keep the code easy to follow: we avoid
                          having to distinguish "load partitions for this table because creating the table hit
                          the task limit" from "load the next table because the current one has no partitions".
                        */
                        // for a table we explicitly try to load partitions as there is no separate partitions events.
                        LoadPartitions loadPartitions = new LoadPartitions(context, iterator.replLogger(), loadTaskTracker, tableEvent, work.dbNameToLoadIn, tableContext);
                        TaskTracker partitionsTracker = loadPartitions.tasks();
                        partitionsPostProcessing(iterator, scope, loadTaskTracker, tableTracker, partitionsTracker);
                        tableTracker.debugLog("table");
                        partitionsTracker.debugLog("partitions for table");
                        break;
                    }
                case Partition:
                    {
                        /*
                          This happens only when loading tables and the limit on the number of tasks we can
                          create is reached; hence we know the table should exist and there should be a
                          lastPartitionName.
                        */
                        PartitionEvent event = (PartitionEvent) next;
                        TableContext tableContext = new TableContext(dbTracker, work.dbNameToLoadIn, work.tableNameToLoadIn);
                        LoadPartitions loadPartitions = new LoadPartitions(context, iterator.replLogger(), tableContext, loadTaskTracker, event.asTableEvent(), work.dbNameToLoadIn, event.lastPartitionReplicated());
                        /*
                          The tableTracker here should be a new instance, not an existing one, since this can
                          only happen when we break off in the middle of loading partitions.
                        */
                        TaskTracker partitionsTracker = loadPartitions.tasks();
                        partitionsPostProcessing(iterator, scope, loadTaskTracker, tableTracker, partitionsTracker);
                        partitionsTracker.debugLog("partitions");
                        break;
                    }
                case Function:
                    {
                        LoadFunction loadFunction = new LoadFunction(context, iterator.replLogger(), (FunctionEvent) next, work.dbNameToLoadIn, dbTracker);
                        TaskTracker functionsTracker = loadFunction.tasks();
                        if (!scope.database) {
                            scope.rootTasks.addAll(functionsTracker.tasks());
                        } else {
                            setUpDependencies(dbTracker, functionsTracker);
                        }
                        loadTaskTracker.update(functionsTracker);
                        functionsTracker.debugLog("functions");
                        break;
                    }
                case Constraint:
                    {
                        LoadConstraint loadConstraint = new LoadConstraint(context, (ConstraintEvent) next, work.dbNameToLoadIn, dbTracker);
                        TaskTracker constraintTracker = loadConstraint.tasks();
                        scope.rootTasks.addAll(constraintTracker.tasks());
                        loadTaskTracker.update(constraintTracker);
                        constraintTracker.debugLog("constraints");
                    }
            }
            if (!loadingConstraint && !iterator.currentDbHasNext()) {
                createEndReplLogTask(context, scope, iterator.replLogger());
            }
        }
        boolean addAnotherLoadTask = iterator.hasNext() || loadTaskTracker.hasReplicationState() || constraintIterator.hasNext();
        createBuilderTask(scope.rootTasks, addAnotherLoadTask);
        if (!iterator.hasNext() && !constraintIterator.hasNext()) {
            loadTaskTracker.update(updateDatabaseLastReplID(maxTasks, context, scope));
            work.updateDbEventState(null);
        }
        this.childTasks = scope.rootTasks;
        /*
          There can be multiple rounds of this run, all tied to the same query id (generated in the
          compile phase), so an additional UUID is appended to print each run to a separate file.
        */
        LOG.info("Root Tasks / Total Tasks : {} / {} ", childTasks.size(), loadTaskTracker.numberOfTasks());
        // Populate the driver context with the scratch dir info from the repl context, so that the temp dirs will be cleaned up later
        driverContext.getCtx().getFsScratchDirs().putAll(context.pathInfo.getFsScratchDirs());
    } catch (Exception e) {
        LOG.error("failed replication", e);
        setException(e);
        return 1;
    }
    LOG.info("completed load task run : {}", work.executedLoadTask());
    return 0;
}
Also used : Context(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.util.Context) DriverContext(org.apache.hadoop.hive.ql.DriverContext) TableContext(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.TableContext) TaskTracker(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.TaskTracker) LoadFunction(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadFunction) LoadConstraint(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadConstraint) BootstrapEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.BootstrapEvent) LoadPartitions(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadPartitions) BootstrapEventsIterator(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.filesystem.BootstrapEventsIterator) LoadTable(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadTable) PartitionEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.PartitionEvent) ConstraintEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.ConstraintEvent) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) TableEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.TableEvent) LoadDatabase(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase) FunctionEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.FunctionEvent) ConstraintEventsIterator(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.filesystem.ConstraintEventsIterator) DatabaseEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.DatabaseEvent)
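
Stripped of the replication logic, ReplLoadTask follows the usual Task.execute(DriverContext) contract: return 0 on success, record the failure with setException and return a non-zero code otherwise. A bare-bones sketch of that shape (the body comments are placeholders, not Hive code):

@Override
protected int execute(DriverContext driverContext) {
    try {
        // ... build child tasks or do this stage's work, e.g. via driverContext.getCtx() ...
        return 0;        // 0 tells the Driver this stage succeeded
    } catch (Exception e) {
        setException(e); // surfaced later so the Driver can report the cause
        return 1;        // any non-zero return code marks the stage as failed
    }
}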

Aggregations

DriverContext (org.apache.hadoop.hive.ql.DriverContext): 17 usages
Path (org.apache.hadoop.fs.Path): 13 usages
IOException (java.io.IOException): 9 usages
Context (org.apache.hadoop.hive.ql.Context): 9 usages
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 7 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 7 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 6 usages
ArrayList (java.util.ArrayList): 4 usages
JobClient (org.apache.hadoop.mapred.JobClient): 4 usages
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 3 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 3 usages
JobConf (org.apache.hadoop.mapred.JobConf): 3 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 3 usages
File (java.io.File): 2 usages
OutputStream (java.io.OutputStream): 2 usages
Serializable (java.io.Serializable): 2 usages
HashMap (java.util.HashMap): 2 usages
Map (java.util.Map): 2 usages
Properties (java.util.Properties): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages