Example 6 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

The class SparkMergeFileRecordHandler, method init:

@SuppressWarnings("unchecked")
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
    super.init(job, output, reporter);
    try {
        jc = job;
        MapWork mapWork = Utilities.getMapWork(job);
        if (mapWork instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
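            // A merge plan registers exactly one alias; its mapped operator is the file-merge operator.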
            String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
            op = mergeFileWork.getAliasToWork().get(alias);
            if (op instanceof AbstractFileMergeOperator) {
                mergeOp = (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
                mergeOp.initializeOp(jc);
                row = new Object[2];
                abort = false;
            } else {
                abort = true;
                throw new IllegalStateException("Merge file work's top operator should be an" + " instance of AbstractFileMergeOperator");
            }
        } else {
            abort = true;
            throw new IllegalStateException("Map work should be a merge file work.");
        }
        LOG.info(mergeOp.dump(0));
    } catch (HiveException e) {
        abort = true;
        throw new RuntimeException(e);
    }
}
Also used: MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), AbstractFileMergeOperator (org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator)
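
The guard logic above reduces to one extraction step: take the sole alias registered in the merge plan and verify that its operator is a file-merge operator. Below is a minimal sketch of that step as a standalone helper; the method name extractMergeOperator is hypothetical (not a Hive API), and only types already visible in the snippet are used.

@SuppressWarnings("unchecked")
private static AbstractFileMergeOperator<? extends FileMergeDesc> extractMergeOperator(MapWork mapWork) {
    if (!(mapWork instanceof MergeFileWork)) {
        // Mirrors the snippet: a merge record handler only accepts MergeFileWork plans.
        throw new IllegalStateException("Map work should be a merge file work.");
    }
    MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
    // Take the single registered alias and the operator mapped to it.
    String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
    Operator<? extends OperatorDesc> op = mergeFileWork.getAliasToWork().get(alias);
    if (!(op instanceof AbstractFileMergeOperator)) {
        throw new IllegalStateException("Merge file work's top operator should be an instance of AbstractFileMergeOperator");
    }
    return (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
}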

Example 7 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

The class DagUtils, method createVertex:

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
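    // -1 leaves the task parallelism to the data source; the client-side branch below overrides it with the real split count.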
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // Tez combines splits for us only when the input format is HiveInputFormat
        if (inputFormatClass == HiveInputFormat.class) {
            groupSplitsInInputInitializer = true;
        } else {
            groupSplitsInInputInitializer = false;
        }
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // if splits are generated on the AM, we just need to set up the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
                // SMB Join.
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
        InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
        InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName())
                .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                        .setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
                        .setSplits(inputSplitInfo.getSplitsProto())
                        .build().toByteString().asReadOnlyByteBuffer()));
        dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
        numTasks = inputSplitInfo.getNumTasks();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used: Path (org.apache.hadoop.fs.Path), InputDescriptor (org.apache.tez.dag.api.InputDescriptor), Vertex (org.apache.tez.dag.api.Vertex), PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex), MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), UserPayload (org.apache.tez.dag.api.UserPayload), VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext), InputSplitInfo (org.apache.tez.mapreduce.hadoop.InputSplitInfo), IOException (java.io.IOException), MRInputSplitDistributor (org.apache.tez.mapreduce.common.MRInputSplitDistributor), FileSystem (org.apache.hadoop.fs.FileSystem), InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
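
Everything else in createVertex is bookkeeping around one decision: where split grouping happens. A condensed sketch of just that decision is below; the helper name chooseDataSource is hypothetical, and it restates the three branches above using the same Tez builder APIs (MRInputLegacy, MultiMRInput, InputInitializerDescriptor) that appear in the snippet.

@SuppressWarnings("rawtypes")
private static DataSourceDescriptor chooseDataSource(JobConf conf, Class inputFormatClass,
        boolean groupSplitsInInputInitializer, boolean vertexHasCustomInput,
        VertexType vertexType) throws Exception {
    if (groupSplitsInInputInitializer) {
        // HiveSplitGenerator runs in the input initializer and groups splits on the AM.
        InputInitializerDescriptor descriptor =
                InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
        return MRInputLegacy.createConfigBuilder(conf, inputFormatClass)
                .groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
    }
    if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
        // SMB join: keep sources as separate inputs; the custom vertex groups them at run time.
        return MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
    }
    // Default: legacy MRInput with no grouping in the initializer.
    return MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
}

The design point worth noting: grouping either happens once on the AM (input initializer) or is deferred to a custom vertex manager, never both, which is why the custom-input branch also disables serialized event payloads in the snippet above.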

Example 8 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

The class MergeFileRecordProcessor, method init:

@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
    // TODO HIVE-14042. Abort handling.
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
    super.init(mrReporter, inputs, outputs);
    execContext = new ExecMapperContext(jconf);
    // Update the JobConf from MRInput; details such as the input file name arrive via these config updates
    mrInput = getMRInput(inputs);
    Configuration updatedConf = mrInput.getConfigUpdates();
    if (updatedConf != null) {
        for (Map.Entry<String, String> entry : updatedConf) {
            jconf.set(entry.getKey(), entry.getValue());
        }
    }
    createOutputMap();
    // Start all the Outputs.
    for (Map.Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
        outputEntry.getValue().start();
        ((TezProcessor.TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
    }
    String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID);
    cache = ObjectCacheFactory.getCache(jconf, queryId, true);
    try {
        execContext.setJc(jconf);
        cacheKey = MAP_PLAN_KEY;
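        // Look up the MapWork in the per-query object cache; the Callable deserializes the plan only on a cache miss.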
        MapWork mapWork = (MapWork) cache.retrieve(cacheKey, new Callable<Object>() {

            @Override
            public Object call() {
                return Utilities.getMapWork(jconf);
            }
        });
        Utilities.setMapWork(jconf, mapWork);
        if (mapWork instanceof MergeFileWork) {
            mfWork = (MergeFileWork) mapWork;
        } else {
            throw new RuntimeException("MapWork should be an instance of MergeFileWork.");
        }
        String alias = mfWork.getAliasToWork().keySet().iterator().next();
        mergeOp = mfWork.getAliasToWork().get(alias);
        LOG.info(mergeOp.dump(0));
        MapredContext.init(true, new JobConf(jconf));
        ((TezContext) MapredContext.get()).setInputs(inputs);
        mergeOp.passExecContext(execContext);
        mergeOp.initializeLocalWork(jconf);
        mergeOp.initialize(jconf, null);
        OperatorUtils.setChildrenCollector(mergeOp.getChildOperators(), outMap);
        mergeOp.setReporter(reporter);
        MapredContext.get().setReporter(reporter);
    } catch (Throwable e) {
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else if (e instanceof InterruptedException) {
            l4j.info("Hit an interrupt while initializing MergeFileRecordProcessor. Message={}", e.getMessage());
            throw (InterruptedException) e;
        } else {
            throw new RuntimeException("Map operator initialization failed", e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used: ExecMapperContext (org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext), MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), Configuration (org.apache.hadoop.conf.Configuration), LogicalOutput (org.apache.tez.runtime.api.LogicalOutput), Callable (java.util.concurrent.Callable), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), Map (java.util.Map), JobConf (org.apache.hadoop.mapred.JobConf)
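
The part of this init worth lifting out is the plan-cache lookup: the MapWork is deserialized once per query and shared through the object cache across tasks in the same container. Here is the same retrieval written with a lambda (Java 8+); the helper name retrieveMergeFileWork is hypothetical, and the cache type is assumed to be Hive's org.apache.hadoop.hive.ql.exec.ObjectCache as returned by ObjectCacheFactory.getCache above.

private static MergeFileWork retrieveMergeFileWork(JobConf jconf, String cacheKey) throws Exception {
    String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID);
    // Per-query cache: the plan is deserialized on first use and reused afterwards.
    org.apache.hadoop.hive.ql.exec.ObjectCache cache = ObjectCacheFactory.getCache(jconf, queryId, true);
    MapWork mapWork = (MapWork) cache.retrieve(cacheKey, () -> Utilities.getMapWork(jconf));
    Utilities.setMapWork(jconf, mapWork);
    if (!(mapWork instanceof MergeFileWork)) {
        throw new RuntimeException("MapWork should be an instance of MergeFileWork.");
    }
    return (MergeFileWork) mapWork;
}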

Example 9 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

The class DDLTask, method mergeFiles:

/**
 * First, make sure the source table/partition is not archived, has no
 * indexes, and is stored as RCFile or ORC. If any of these checks fails,
 * throw an exception.
 *
 * The merge itself is done by building a merge task from the
 * mergeFilesDesc: a MergeFileTask, or a TezTask wrapping the merge work
 * when Tez is the execution engine.
 *
 * @param db
 * @param mergeFilesDesc
 * @return
 * @throws HiveException
 */
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
    ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
    boolean lbatc = lbCtx == null ? false : lbCtx.isSkewedStoredAsDir();
    int lbd = lbCtx == null ? 0 : lbCtx.calculateListBucketingLevel();
    // merge work only needs input and output.
    MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName(), mergeFilesDesc.getTableDesc());
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    ArrayList<String> inputDirstr = new ArrayList<String>(1);
    inputDirstr.add(mergeFilesDesc.getInputDir().toString());
    pathToAliases.put(mergeFilesDesc.getInputDir().get(0), inputDirstr);
    mergeWork.setPathToAliases(pathToAliases);
    mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx());
    mergeWork.resolveConcatenateMerge(db.getConf());
    mergeWork.setMapperCannotSpanPartns(true);
    mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName());
    final FileMergeDesc fmd;
    if (mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)) {
        fmd = new RCFileMergeDesc();
    } else {
        // safe to assume else is ORC as semantic analyzer will check for RC/ORC
        fmd = new OrcFileMergeDesc();
    }
    fmd.setDpCtx(null);
    fmd.setHasDynamicPartitions(false);
    fmd.setListBucketingAlterTableConcatenate(lbatc);
    fmd.setListBucketingDepth(lbd);
    fmd.setOutputPath(mergeFilesDesc.getOutputDir());
    CompilationOpContext opContext = driverContext.getCtx().getOpContext();
    Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(opContext, fmd);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    aliasToWork.put(mergeFilesDesc.getInputDir().toString(), mergeOp);
    mergeWork.setAliasToWork(aliasToWork);
    DriverContext driverCxt = new DriverContext();
    Task<?> task;
    if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        TezWork tezWork = new TezWork(queryState.getQueryId(), conf);
        mergeWork.setName("File Merge");
        tezWork.add(mergeWork);
        task = new TezTask();
        ((TezTask) task).setWork(tezWork);
    } else {
        task = new MergeFileTask();
        ((MergeFileTask) task).setWork(mergeWork);
    }
    // initialize the task and execute
    task.initialize(queryState, getQueryPlan(), driverCxt, opContext);
    subtask = task;
    int ret = task.execute(driverCxt);
    if (subtask.getException() != null) {
        setException(subtask.getException());
    }
    return ret;
}
Also used: Path (org.apache.hadoop.fs.Path), DriverContext (org.apache.hadoop.hive.ql.DriverContext), MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), RCFileMergeDesc (org.apache.hadoop.hive.ql.plan.RCFileMergeDesc), OrcFileMergeDesc (org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc), FileMergeDesc (org.apache.hadoop.hive.ql.plan.FileMergeDesc), ArrayList (java.util.ArrayList), TezTask (org.apache.hadoop.hive.ql.exec.tez.TezTask), SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint), CheckConstraint (org.apache.hadoop.hive.ql.metadata.CheckConstraint), NotNullConstraint (org.apache.hadoop.hive.ql.metadata.NotNullConstraint), SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint), SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint), DefaultConstraint (org.apache.hadoop.hive.ql.metadata.DefaultConstraint), UniqueConstraint (org.apache.hadoop.hive.ql.metadata.UniqueConstraint), SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint), LinkedHashMap (java.util.LinkedHashMap), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), ListBucketingCtx (org.apache.hadoop.hive.ql.plan.ListBucketingCtx), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), TezWork (org.apache.hadoop.hive.ql.plan.TezWork), MergeFileTask (org.apache.hadoop.hive.ql.io.merge.MergeFileTask)
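
The closing branch of mergeFiles, choosing between a Tez task and the MR-era MergeFileTask, is a dispatch pattern that recurs in DDLTask. A minimal sketch of just that step; the helper name buildMergeTask is hypothetical, and only the task APIs from the snippet are used.

private static Task<?> buildMergeTask(HiveConf conf, String queryId, MergeFileWork mergeWork) {
    if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        // Tez: wrap the merge work in a single-vertex TezWork and run it as a TezTask.
        TezWork tezWork = new TezWork(queryId, conf);
        mergeWork.setName("File Merge");
        tezWork.add(mergeWork);
        TezTask tezTask = new TezTask();
        tezTask.setWork(tezWork);
        return tezTask;
    }
    // MR: a dedicated MergeFileTask drives the merge directly.
    MergeFileTask mergeFileTask = new MergeFileTask();
    mergeFileTask.setWork(mergeWork);
    return mergeFileTask;
}

Either way, the returned task still needs initialize() and execute(), as the end of the snippet shows.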

Aggregations

MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork) 9
Path (org.apache.hadoop.fs.Path) 6
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 4
JobConf (org.apache.hadoop.mapred.JobConf) 4
IOException (java.io.IOException) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
ArrayList (java.util.ArrayList) 2
LinkedHashMap (java.util.LinkedHashMap) 2
ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper) 2
MergeFileMapper (org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) 2
FileMergeDesc (org.apache.hadoop.hive.ql.plan.FileMergeDesc) 2
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 2
DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor) 2
InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor) 2
PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex) 2
UserPayload (org.apache.tez.dag.api.UserPayload) 2
Vertex (org.apache.tez.dag.api.Vertex) 2
VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext) 2
HashMap (java.util.HashMap) 1
Map (java.util.Map) 1