Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class TestInputOutputFormat, method testCombinationInputFormatWithAcid.
// test non-vectorized, acid, combine
@Test
public void testCombinationInputFormatWithAcid() throws Exception {
  // get the object inspector for MyRow
  StructObjectInspector inspector;
  final int PARTITIONS = 2;
  final int BUCKETS = 3;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combinationAcid",
      inspector, false, PARTITIONS, MockFileSystem.class.getName());
  // write the orc file to the mock file system
  Path[] partDir = new Path[PARTITIONS];
  String[] paths = conf.getStrings("mapred.input.dir");
  for (int p = 0; p < PARTITIONS; ++p) {
    partDir[p] = new Path(paths[p]);
  }
  // write a base file in partition 0
  OrcRecordUpdater writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf)
      .maximumWriteId(10).writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir[0]));
  for (int i = 0; i < 10; ++i) {
    writer.insert(10, new MyRow(i, 2 * i));
  }
  writer.close(false);
  // base file
  Path base0 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00000");
  setBlocks(base0, conf, new MockBlock("host1", "host2"));
  // write a second base file (bucket 1) in partition 0
  writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf)
      .maximumWriteId(10).writingBase(true).bucket(1).inspector(inspector).finalDestination(partDir[0]));
  for (int i = 10; i < 20; ++i) {
    writer.insert(10, new MyRow(i, 2 * i));
  }
  writer.close(false);
  Path base1 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00001");
  setBlocks(base1, conf, new MockBlock("host1", "host2"));
  // write three files in partition 1
  for (int bucket = 0; bucket < BUCKETS; ++bucket) {
    Path path = new Path(partDir[1], "00000" + bucket + "_0");
    Writer orc = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    orc.addRow(new MyRow(1, 2));
    orc.close();
    setBlocks(path, conf, new MockBlock("host3", "host4"));
  }
  // call getsplits
  conf.setInt(hive_metastoreConstants.BUCKET_COUNT, BUCKETS);
  setupAcidProperties(conf, RowType.MYROW);
  HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
  InputSplit[] splits = inputFormat.getSplits(conf, 1);
  assertEquals(3, splits.length);
  HiveInputFormat.HiveInputSplit split = (HiveInputFormat.HiveInputSplit) splits[0];
  assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
  assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000", split.getPath().toString());
  assertEquals(0, split.getStart());
  assertEquals(702, split.getLength());
  split = (HiveInputFormat.HiveInputSplit) splits[1];
  assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
  assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00001", split.getPath().toString());
  assertEquals(0, split.getStart());
  assertEquals(726, split.getLength());
  CombineHiveInputFormat.CombineHiveInputSplit combineSplit =
      (CombineHiveInputFormat.CombineHiveInputSplit) splits[2];
  assertEquals(BUCKETS, combineSplit.getNumPaths());
  for (int bucket = 0; bucket < BUCKETS; ++bucket) {
    assertEquals("mock:/combinationAcid/p=1/00000" + bucket + "_0",
        combineSplit.getPath(bucket).toString());
    assertEquals(0, combineSplit.getOffset(bucket));
    assertEquals(253, combineSplit.getLength(bucket));
  }
  String[] hosts = combineSplit.getLocations();
  assertEquals(2, hosts.length);
}
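The assertions show the split layout this test expects: the two ACID base buckets in partition p=0 each keep their own HiveInputSplit, while the three flat files in p=1 are merged into a single combine split. As a hedged sketch (not part of the Hive test), the same API can be driven from plain client code, assuming the input directories and ACID table properties have already been placed on the conf:

// A real query obtains its input format from the hive.input.format setting; this test
// constructs CombineHiveInputFormat directly and drives getSplits() itself.
JobConf conf = new JobConf();
conf.set("hive.input.format", CombineHiveInputFormat.class.getName());
// Assumes mapred.input.dir and the ACID table properties are already set on conf,
// as createMockExecutionEnvironment and setupAcidProperties do in the test above.
HiveInputFormat<WritableComparable, Writable> inputFormat =
    new CombineHiveInputFormat<WritableComparable, Writable>();
InputSplit[] splits = inputFormat.getSplits(conf, 1);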
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class TestInputOutputFormat, method testVectorizationWithBuckets.
/**
 * Test vectorization, non-acid, non-combine.
 * @throws Exception
 */
@Test
public void testVectorizationWithBuckets() throws Exception {
  // get the object inspector for MyRow
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorBuckets",
      inspector, true, 1, MockFileSystem.class.getName());
  // write the orc file to the mock file system
  Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
  Writer writer = OrcFile.createWriter(path,
      OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  setBlocks(path, conf, new MockBlock("host0", "host1"));
  // call getsplits
  conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3);
  HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
  InputSplit[] splits = inputFormat.getSplits(conf, 10);
  assertEquals(1, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader =
      inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
  NullWritable key = reader.createKey();
  VectorizedRowBatch value = reader.createValue();
  assertEquals(true, reader.next(key, value));
  assertEquals(10, value.count());
  LongColumnVector col0 = (LongColumnVector) value.cols[0];
  for (int i = 0; i < 10; i++) {
    assertEquals("checking " + i, i, col0.vector[i]);
  }
  assertEquals(false, reader.next(key, value));
}
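Beyond the single-batch assertions above, a vectorized reader is normally drained in a loop. A minimal sketch under the same assumptions as the test (the reader obtained above, a dense batch with no row selection, and the LongColumnVector cast the test itself uses):

NullWritable k = reader.createKey();
VectorizedRowBatch batch = reader.createValue();
while (reader.next(k, batch)) {
  LongColumnVector first = (LongColumnVector) batch.cols[0];
  for (int r = 0; r < batch.size; ++r) {
    // assumes batch.selectedInUse is false, as in this test;
    // otherwise the row index would come from batch.selected[r]
    long v = first.vector[r];
  }
}
reader.close();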
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class FetchOperator, method getInputFormatFromCache.
@SuppressWarnings("unchecked")
static InputFormat getInputFormatFromCache(Class<? extends InputFormat> inputFormatClass,
    JobConf conf) throws IOException {
  if (Configurable.class.isAssignableFrom(inputFormatClass)
      || JobConfigurable.class.isAssignableFrom(inputFormatClass)) {
    return ReflectionUtil.newInstance(inputFormatClass, conf);
  }
  // TODO: why is this copy-pasted from HiveInputFormat?
  InputFormat format = inputFormats.get(inputFormatClass.getName());
  if (format == null) {
    try {
      format = ReflectionUtil.newInstance(inputFormatClass, conf);
      inputFormats.put(inputFormatClass.getName(), format);
    } catch (Exception e) {
      throw new IOException("Cannot create an instance of InputFormat class "
          + inputFormatClass.getName() + " as specified in mapredWork!", e);
    }
  }
  return format;
}
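The branch structure is the point of the method: formats that implement Configurable or JobConfigurable need the current JobConf injected, so they are rebuilt on every call, while stateless formats are cached by class name in the inputFormats map. A minimal standalone sketch of the same distinction using Hadoop's ReflectionUtils (newOrCached and its parameters are hypothetical names, not FetchOperator members):

static InputFormat newOrCached(Class<? extends InputFormat> formatClass, JobConf conf,
    Map<String, InputFormat> cache) {
  if (Configurable.class.isAssignableFrom(formatClass)
      || JobConfigurable.class.isAssignableFrom(formatClass)) {
    // conf-sensitive formats could carry a stale configuration if reused from a cache
    return ReflectionUtils.newInstance(formatClass, conf);
  }
  return cache.computeIfAbsent(formatClass.getName(),
      name -> ReflectionUtils.newInstance(formatClass, conf));
}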
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class FetchOperator, method processCurrPathForMmWriteIds.
private String processCurrPathForMmWriteIds(InputFormat inputFormat) throws IOException {
  if (inputFormat instanceof HiveInputFormat) {
    // No need to process here.
    return StringUtils.escapeString(currPath.toString());
  }
  ValidWriteIdList validWriteIdList;
  if (AcidUtils.isInsertOnlyTable(currDesc.getTableDesc().getProperties())) {
    validWriteIdList = extractValidWriteIdList();
  } else {
    // non-MM case
    validWriteIdList = null;
  }
  if (validWriteIdList != null) {
    Utilities.FILE_OP_LOGGER.info("Processing " + currDesc.getTableName() + " for MM paths");
  }
  Path[] dirs = HiveInputFormat.processPathsForMmRead(
      Lists.newArrayList(currPath), job, validWriteIdList);
  if (dirs == null || dirs.length == 0) {
    // No valid inputs. This condition is logged inside the call.
    return null;
  }
  StringBuffer str = new StringBuffer(StringUtils.escapeString(dirs[0].toString()));
  for (int i = 1; i < dirs.length; i++) {
    str.append(",").append(StringUtils.escapeString(dirs[i].toString()));
  }
  return str.toString();
}
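The tail of the method escapes each surviving directory and joins them with commas, which is the form mapred.input.dir expects. A small sketch of the same join (joinEscaped is a hypothetical helper, not FetchOperator code), using StringBuilder since no synchronization is needed:

static String joinEscaped(Path[] dirs) {
  // escape each path before joining, mirroring the loop above
  StringBuilder sb = new StringBuilder(StringUtils.escapeString(dirs[0].toString()));
  for (int i = 1; i < dirs.length; i++) {
    sb.append(',').append(StringUtils.escapeString(dirs[i].toString()));
  }
  return sb.toString();
}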
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class GenSparkSkewJoinProcessor, method processSkewJoin.
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask,
    ReduceWork reduceWork, ParseContext parseCtx) throws SemanticException {
  SparkWork currentWork = ((SparkTask) currTask).getWork();
  if (currentWork.getChildren(reduceWork).size() > 0) {
    LOG.warn("Skip runtime skew join as the ReduceWork has child work and hasn't been split.");
    return;
  }
  List<Task<? extends Serializable>> children = currTask.getChildTasks();
  Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
  JoinDesc joinDescriptor = joinOp.getConf();
  Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
  int numAliases = joinValues.size();
  Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
  Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
  Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
  Byte[] tags = joinDescriptor.getTagOrder();
  // for each joining table, set dir for big key and small keys properly
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    bigKeysDirMap.put(alias, GenMRSkewJoinProcessor.getBigKeysDir(baseTmpDir, alias));
    Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
    smallKeysDirMap.put(alias, smallKeysMap);
    for (Byte src2 : tags) {
      if (!src2.equals(alias)) {
        smallKeysMap.put(src2, GenMRSkewJoinProcessor.getSmallKeysDir(baseTmpDir, alias, src2));
      }
    }
    skewJoinJobResultsDir.put(alias,
        GenMRSkewJoinProcessor.getBigKeysSkewJoinResultDir(baseTmpDir, alias));
  }
  joinDescriptor.setHandleSkewJoin(true);
  joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
  joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
  joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(),
      HiveConf.ConfVars.HIVESKEWJOINKEY));
  // create proper table/column desc for spilled tables
  TableDesc keyTblDesc = (TableDesc) reduceWork.getKeyDesc().clone();
  List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
  List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
  Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
  Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
  Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
  // used for create mapJoinDesc, should be in order
  List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
  for (int i = 0; i < tags.length; i++) {
    newJoinValueTblDesc.add(null);
  }
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    List<ExprNodeDesc> valueCols = joinValues.get(alias);
    String colNames = "";
    String colTypes = "";
    int columnSize = valueCols.size();
    List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
    List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
    ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
    boolean first = true;
    for (int k = 0; k < columnSize; k++) {
      TypeInfo type = valueCols.get(k).getTypeInfo();
      // any name, it does not matter.
      String newColName = i + "_VALUE_" + k;
      ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
      columnInfos.add(columnInfo);
      newValueExpr.add(new ExprNodeColumnDesc(columnInfo.getType(),
          columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
      if (!first) {
        colNames = colNames + ",";
        colTypes = colTypes + ",";
      }
      first = false;
      colNames = colNames + newColName;
      colTypes = colTypes + valueCols.get(k).getTypeString();
    }
    // we are putting join keys at last part of the spilled table
    for (int k = 0; k < joinKeys.size(); k++) {
      if (!first) {
        colNames = colNames + ",";
        colTypes = colTypes + ",";
      }
      first = false;
      colNames = colNames + joinKeys.get(k);
      colTypes = colTypes + joinKeyTypes.get(k);
      ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k),
          TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
      columnInfos.add(columnInfo);
      newKeyExpr.add(new ExprNodeColumnDesc(columnInfo.getType(),
          columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
    }
    newJoinValues.put(alias, newValueExpr);
    newJoinKeys.put(alias, newKeyExpr);
    tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
    rowSchemaList.put(alias, new RowSchema(columnInfos));
    // construct value table Desc
    String valueColNames = "";
    String valueColTypes = "";
    first = true;
    for (int k = 0; k < columnSize; k++) {
      // any name, it does not matter.
      String newColName = i + "_VALUE_" + k;
      if (!first) {
        valueColNames = valueColNames + ",";
        valueColTypes = valueColTypes + ",";
      }
      valueColNames = valueColNames + newColName;
      valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
      first = false;
    }
    newJoinValueTblDesc.set((byte) i, Utilities.getTableDesc(valueColNames, valueColTypes));
  }
  joinDescriptor.setSkewKeysValuesTables(tableDescList);
  joinDescriptor.setKeyTableDesc(keyTblDesc);
  // create N-1 map join tasks
  HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap =
      new HashMap<Path, Task<? extends Serializable>>();
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  for (int i = 0; i < numAliases - 1; i++) {
    Byte src = tags[i];
    HiveConf hiveConf = new HiveConf(parseCtx.getConf(), GenSparkSkewJoinProcessor.class);
    SparkWork sparkWork = new SparkWork(parseCtx.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
    Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(sparkWork);
    skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
    // create N TableScans
    Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
    for (int k = 0; k < tags.length; k++) {
      Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(
          joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
      ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
      parentOps[k] = ts;
    }
    // create the MapJoinOperator
    String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues,
        newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i,
        joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(),
        dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
    mapJoinDescriptor.setTagOrder(tags);
    mapJoinDescriptor.setHandleSkewJoin(false);
    mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
    mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
    // temporarily, mark it as child of all the TS
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        joinOp.getCompilationOpContext(), mapJoinDescriptor, null, parentOps);
    // clone the original join operator, and replace it with the MJ
    // this makes sure MJ has the same downstream operator plan as the original join
    List<Operator<?>> reducerList = new ArrayList<Operator<?>>();
    reducerList.add(reduceWork.getReducer());
    Operator<? extends OperatorDesc> reducer = SerializationUtilities.cloneOperatorTree(reducerList).get(0);
    Preconditions.checkArgument(reducer instanceof JoinOperator,
        "Reducer should be join operator, but actually is " + reducer.getName());
    JoinOperator cloneJoinOp = (JoinOperator) reducer;
    List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> childOp : childOps) {
      childOp.replaceParent(cloneJoinOp, mapJoinOp);
    }
    mapJoinOp.setChildOperators(childOps);
    // set memory usage for the MJ operator
    setMemUsage(mapJoinOp, skewJoinMapJoinTask, parseCtx);
    // create N MapWorks and add them to the SparkWork
    MapWork bigMapWork = null;
    Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
    for (int j = 0; j < tags.length; j++) {
      MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
      sparkWork.add(mapWork);
      // This code has been only added for testing
      boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(
          HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
      mapWork.setMapperCannotSpanPartns(mapperCannotSpanPartns);
      Operator<? extends OperatorDesc> tableScan = parentOps[j];
      String alias = tags[j].toString();
      ArrayList<String> aliases = new ArrayList<String>();
      aliases.add(alias);
      Path path;
      if (j == i) {
        path = bigKeysDirMap.get(tags[j]);
        bigKeysDirToTaskMap.put(path, skewJoinMapJoinTask);
        bigMapWork = mapWork;
      } else {
        path = smallTblDirs.get(tags[j]);
      }
      mapWork.addPathToAlias(path, aliases);
      mapWork.getAliasToWork().put(alias, tableScan);
      PartitionDesc partitionDesc = new PartitionDesc(tableDescList.get(tags[j]), null);
      mapWork.addPathToPartitionInfo(path, partitionDesc);
      mapWork.getAliasToPartnInfo().put(alias, partitionDesc);
      mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
    }
    // connect all small dir map work to the big dir map work
    Preconditions.checkArgument(bigMapWork != null, "Haven't identified big dir MapWork");
    // these 2 flags are intended only for the big-key map work
    bigMapWork.setNumMapTasks(HiveConf.getIntVar(hiveConf,
        HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
    bigMapWork.setMinSplitSize(HiveConf.getLongVar(hiveConf,
        HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
    // use HiveInputFormat so that we can control the number of map tasks
    bigMapWork.setInputformat(HiveInputFormat.class.getName());
    for (BaseWork work : sparkWork.getRoots()) {
      Preconditions.checkArgument(work instanceof MapWork,
          "All root work should be MapWork, but got " + work.getClass().getSimpleName());
      if (work != bigMapWork) {
        sparkWork.connect(work, bigMapWork, new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE));
      }
    }
    // insert SparkHashTableSink and Dummy operators
    for (int j = 0; j < tags.length; j++) {
      if (j != i) {
        insertSHTS(tags[j], (TableScanOperator) parentOps[j], bigMapWork);
      }
    }
    listWorks.add(skewJoinMapJoinTask.getWork());
    listTasks.add(skewJoinMapJoinTask);
  }
  if (children != null) {
    for (Task<? extends Serializable> tsk : listTasks) {
      for (Task<? extends Serializable> oldChild : children) {
        tsk.addDependentTask(oldChild);
      }
    }
    currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
    for (Task<? extends Serializable> oldChild : children) {
      oldChild.getParentTasks().remove(currTask);
    }
    listTasks.addAll(children);
    for (Task<? extends Serializable> oldChild : children) {
      listWorks.add(oldChild.getWork());
    }
  }
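  // Note: at run time, ConditionalResolverSkewJoin inspects the big-key directories recorded in
  // bigKeysDirToTaskMap and launches only the map-join tasks whose directories actually received
  // skewed rows; the context object built below carries that directory-to-task mapping.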
  ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx context =
      new ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  cndTsk.setResolver(new ConditionalResolverSkewJoin());
  cndTsk.setResolverCtx(context);
  currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
  currTask.addDependentTask(cndTsk);
}
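Two details from the loop above are worth pulling out: the big-key MapWork takes its task-count and split-size limits from HiveConf, and it is pinned to HiveInputFormat because, as the upstream comment says, that is what lets the number of map tasks be controlled. A hedged sketch of reading the same settings directly (illustration only, not part of GenSparkSkewJoinProcessor):

HiveConf hiveConf = new HiveConf();
// the hive.skewjoin.mapjoin.* settings back these two ConfVars
int numMapTasks = HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK);
long minSplitSize = HiveConf.getLongVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT);
// forcing the plain HiveInputFormat on a job conf mirrors bigMapWork.setInputformat(...) above
JobConf job = new JobConf(hiveConf);
job.set("hive.input.format", HiveInputFormat.class.getName());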