
Example 61 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class DagUtils method createVertex.

private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
    if (mergeJoinWork.getMainWork() instanceof MapWork) {
        List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
        MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
        Vertex mergeVx = createVertex(conf, mapWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set
        // to false when using this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
        for (int i = 0; i < mapWorkList.size(); i++) {
            mapWork = (MapWork) (mapWorkList.get(i));
            conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
            conf.set(Utilities.INPUT_NAME, mapWork.getName());
            LOG.info("Going through each work and adding MultiMRInput");
            mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
        }
        VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
        // the +1 to the size is because of the main work.
        CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
        DataOutputBuffer dob = new DataOutputBuffer();
        vertexConf.write(dob);
        byte[] userPayload = dob.getData();
        desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
        mergeVx.setVertexManagerPlugin(desc);
        return mergeVx;
    } else {
        Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr, additionalLr, fs, mrScratchDir, ctx);
        return mergeVx;
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) VertexManagerPluginDescriptor(org.apache.tez.dag.api.VertexManagerPluginDescriptor) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
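The DataOutputBuffer/UserPayload steps above are a generic pattern for handing a Writable configuration to a Tez vertex manager plugin. A minimal sketch of that packing, assuming only the Hadoop and Tez APIs already referenced in this example (the class and method names PayloadUtil/toUserPayload are hypothetical):

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.tez.dag.api.UserPayload;

public final class PayloadUtil {
    private PayloadUtil() {
    }

    // Serialize any Writable into a UserPayload suitable for
    // VertexManagerPluginDescriptor.setUserPayload(...).
    public static UserPayload toUserPayload(Writable conf) throws IOException {
        DataOutputBuffer dob = new DataOutputBuffer();
        conf.write(dob);
        // Wrap only the bytes actually written; dob.getData() returns the whole
        // backing array, which may be larger than dob.getLength().
        return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
    }
}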

Example 62 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class SparkMergeFileRecordHandler method init.

@SuppressWarnings("unchecked")
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
    super.init(job, output, reporter);
    try {
        jc = job;
        MapWork mapWork = Utilities.getMapWork(job);
        if (mapWork instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
            String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
            op = mergeFileWork.getAliasToWork().get(alias);
            if (op instanceof AbstractFileMergeOperator) {
                mergeOp = (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
                mergeOp.initializeOp(jc);
                row = new Object[2];
                abort = false;
            } else {
                abort = true;
                throw new IllegalStateException("Merge file work's top operator should be an" + " instance of AbstractFileMergeOperator");
            }
        } else {
            abort = true;
            throw new IllegalStateException("Map work should be a merge file work.");
        }
        LOG.info(mergeOp.dump(0));
    } catch (HiveException e) {
        abort = true;
        throw new RuntimeException(e);
    }
}
Also used : MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) AbstractFileMergeOperator(org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator)
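The two instanceof checks in init are the interesting part: the handler only accepts a MapWork that is a MergeFileWork whose single top operator is a file-merge operator. A sketch of that validation factored into a helper, using the same types listed under "Also used" above (the method name resolveMergeOperator is hypothetical; the calls mirror the ones in init):

// Validates a MapWork and returns its single file-merge operator, throwing
// the same IllegalStateExceptions as init() above when the shape is wrong.
@SuppressWarnings("unchecked")
private static AbstractFileMergeOperator<? extends FileMergeDesc> resolveMergeOperator(MapWork mapWork) {
    if (!(mapWork instanceof MergeFileWork)) {
        throw new IllegalStateException("Map work should be a merge file work.");
    }
    MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
    String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
    Operator<? extends OperatorDesc> op = mergeFileWork.getAliasToWork().get(alias);
    if (!(op instanceof AbstractFileMergeOperator)) {
        throw new IllegalStateException("Merge file work's top operator should be an"
            + " instance of AbstractFileMergeOperator");
    }
    return (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
}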

Example 63 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class SparkPlanGenerator method generateParentTran.

// Generate (possibly get from a cached result) parent SparkTran
private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork, BaseWork work) throws Exception {
    if (cloneToWork.containsKey(work)) {
        BaseWork originalWork = cloneToWork.get(work);
        if (workToParentWorkTranMap.containsKey(originalWork)) {
            return workToParentWorkTranMap.get(originalWork);
        }
    }
    SparkTran result;
    if (work instanceof MapWork) {
        result = generateMapInput(sparkPlan, (MapWork) work);
        sparkPlan.addTran(result);
    } else if (work instanceof ReduceWork) {
        List<BaseWork> parentWorks = sparkWork.getParents(work);
        result = generate(sparkPlan, sparkWork.getEdgeProperty(parentWorks.get(0), work), cloneToWork.containsKey(work));
        sparkPlan.addTran(result);
        for (BaseWork parentWork : parentWorks) {
            sparkPlan.connect(workToTranMap.get(parentWork), result);
        }
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
    if (cloneToWork.containsKey(work)) {
        workToParentWorkTranMap.put(cloneToWork.get(work), result);
    }
    return result;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) List(java.util.List) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
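The early return at the top and the put at the end implement a small memoization scheme keyed on the original (un-cloned) work, so clones of the same work share one parent translation. A generic, self-contained sketch of that caching pattern, with placeholder type parameters W and T standing in for BaseWork and SparkTran:

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

// Placeholder types W (work) and T (translation) stand in for BaseWork and
// SparkTran; this only illustrates the clone-aware caching, not the Spark wiring.
class CloneAwareCache<W, T> {
    private final Map<W, W> cloneToOriginal = new HashMap<>();
    private final Map<W, T> originalToResult = new HashMap<>();

    T getOrCompute(W work, Function<W, T> generator) {
        W original = cloneToOriginal.get(work);
        // Reuse the result already computed for the original of a cloned work.
        if (original != null && originalToResult.containsKey(original)) {
            return originalToResult.get(original);
        }
        T result = generator.apply(work);
        // Remember the result under the original so later clones can share it.
        if (original != null) {
            originalToResult.put(original, result);
        }
        return result;
    }
}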

Example 64 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class SplitGrouper method generateGroupedSplits.

/** Generate groups of splits, separated by schema evolution boundaries */
public Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf, Configuration conf, InputSplit[] splits, float waves, int availableSlots, String inputName, boolean groupAcrossFiles, SplitLocationProvider locationProvider) throws Exception {
    MapWork work = populateMapWork(jobConf, inputName);
    // ArrayListMultimap is important here to retain the ordering for the splits.
    Multimap<Integer, InputSplit> bucketSplitMultiMap = ArrayListMultimap.<Integer, InputSplit>create();
    int i = 0;
    InputSplit prevSplit = null;
    for (InputSplit s : splits) {
        // schema boundaries
        if (schemaEvolved(s, prevSplit, groupAcrossFiles, work)) {
            ++i;
            prevSplit = s;
        }
        bucketSplitMultiMap.put(i, s);
    }
    LOG.info("# Src groups for split generation: " + (i + 1));
    // group them into the chunks we want
    Multimap<Integer, InputSplit> groupedSplits = this.group(jobConf, bucketSplitMultiMap, availableSlots, waves, locationProvider);
    return groupedSplits;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputSplit(org.apache.hadoop.mapred.InputSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint)
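The loop above only decides where each split's group boundary falls; the actual sizing of the groups happens later in group(). A small self-contained sketch of the boundary-bucketing idiom using Guava's ArrayListMultimap (the first-character check is an illustrative stand-in for the real schemaEvolved test, and the prev handling is simplified):

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class BoundaryGrouping {
    public static Multimap<Integer, String> group(String[] items) {
        // ArrayListMultimap preserves insertion order within each bucket.
        Multimap<Integer, String> groups = ArrayListMultimap.create();
        int bucket = 0;
        String prev = null;
        for (String item : items) {
            // Start a new bucket whenever a boundary is detected between
            // consecutive items (here: the first character changes).
            if (prev != null && item.charAt(0) != prev.charAt(0)) {
                bucket++;
            }
            groups.put(bucket, item);
            prev = item;
        }
        return groups;
    }

    public static void main(String[] args) {
        System.out.println(group(new String[] {"a1", "a2", "b1", "b2", "a3"}));
        // prints {0=[a1, a2], 1=[b1, b2], 2=[a3]}
    }
}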

Example 65 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project haivvreo by jghoman.

the class AvroSerDe method determineCorrectProperties.

// Hive passes different properties in at different times.  If we're in a MR job,
// we'll get properties for the partition rather than the table, which will give
// us old values for the schema (if it's evolved).  Therefore, in an MR job
// we need to extract the table properties.
// Also, in join queries, multiple properties will be included, so we need
// to extract out the one appropriate to the table we're serde'ing.
private Properties determineCorrectProperties(Configuration configuration, Properties properties) {
    if ((configuration instanceof JobConf) && HaivvreoUtils.insideMRJob((JobConf) configuration)) {
        LOG.info("In MR job, extracting table-level properties");
        MapWork mapWork = Utilities.getMapWork(configuration);
        LinkedHashMap<String, PartitionDesc> a = mapWork.getAliasToPartnInfo();
        if (a.size() == 1) {
            LOG.info("Only one PartitionDesc found.  Returning that Properties");
            PartitionDesc p = a.values().iterator().next();
            TableDesc tableDesc = p.getTableDesc();
            return tableDesc.getProperties();
        } else {
            String tableName = properties.getProperty("name");
            LOG.info("Multiple PartitionDescs.  Return properties for " + tableName);
            for (Map.Entry<String, PartitionDesc> partitionDescs : a.entrySet()) {
                Properties p = partitionDescs.getValue().getTableDesc().getProperties();
                if (p.get("name").equals(tableName)) {
                    // We've found the matching table partition
                    LOG.info("Matched table name against " + partitionDescs.getKey() + ", return its properties");
                    return p;
                }
            }
            // Didn't find anything in partitions to match on.  WARN, at least.
            LOG.warn("Couldn't find any matching properties for table: " + tableName + ". Returning original properties");
        }
    }
    return properties;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) Properties(java.util.Properties) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)
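The matching loop compares each partition's table-level "name" property against the table name Hive passed in. A small self-contained sketch of that lookup (the helper findTableProperties and the Map<String, Properties> shape are illustrative simplifications of the PartitionDesc/TableDesc walk above), written defensively so a missing "name" key cannot cause a NullPointerException:

import java.util.Map;
import java.util.Objects;
import java.util.Properties;

public final class TablePropsLookup {
    private TablePropsLookup() {
    }

    // Returns the Properties whose "name" equals tableName, or the fallback
    // when nothing matches (mirroring the loop in determineCorrectProperties).
    public static Properties findTableProperties(Map<String, Properties> aliasToTableProps,
                                                 String tableName, Properties fallback) {
        for (Map.Entry<String, Properties> e : aliasToTableProps.entrySet()) {
            Properties p = e.getValue();
            // Objects.equals guards against a partition whose properties lack "name".
            if (Objects.equals(p.getProperty("name"), tableName)) {
                return p;
            }
        }
        return fallback;
    }
}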

Aggregations

MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 65 usages
ArrayList (java.util.ArrayList): 20 usages
Path (org.apache.hadoop.fs.Path): 20 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 19 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 17 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 16 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 14 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 12 usages
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 12 usages
Test (org.junit.Test): 12 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 11 usages
JobConf (org.apache.hadoop.mapred.JobConf): 11 usages
Serializable (java.io.Serializable): 10 usages
LinkedHashMap (java.util.LinkedHashMap): 10 usages
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 10 usages
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 10 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 9 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9 usages
List (java.util.List): 8 usages
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 8 usages