Example 61 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

From the class GenSparkSkewJoinProcessor, method processSkewJoin:

@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<?> currTask, ReduceWork reduceWork, ParseContext parseCtx) throws SemanticException {
    SparkWork currentWork = ((SparkTask) currTask).getWork();
    if (currentWork.getChildren(reduceWork).size() > 0) {
        LOG.warn("Skip runtime skew join as the ReduceWork has child work and hasn't been split.");
        return;
    }
    List<Task<?>> children = currTask.getChildTasks();
    Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
    JoinDesc joinDescriptor = joinOp.getConf();
    Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
    int numAliases = joinValues.size();
    Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
    Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
    Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
    Byte[] tags = joinDescriptor.getTagOrder();
    // for each joining table, set the dirs for its big keys and small keys
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        bigKeysDirMap.put(alias, GenMRSkewJoinProcessor.getBigKeysDir(baseTmpDir, alias));
        Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
        smallKeysDirMap.put(alias, smallKeysMap);
        for (Byte src2 : tags) {
            if (!src2.equals(alias)) {
                smallKeysMap.put(src2, GenMRSkewJoinProcessor.getSmallKeysDir(baseTmpDir, alias, src2));
            }
        }
        skewJoinJobResultsDir.put(alias, GenMRSkewJoinProcessor.getBigKeysSkewJoinResultDir(baseTmpDir, alias));
    }
    joinDescriptor.setHandleSkewJoin(true);
    joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
    joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
    joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
    // create proper table/column desc for spilled tables
    TableDesc keyTblDesc = (TableDesc) reduceWork.getKeyDesc().clone();
    List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
    List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
    Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
    Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
    Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
    // used to create the MapJoinDesc; entries must be in tag order
    List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
    for (int i = 0; i < tags.length; i++) {
        newJoinValueTblDesc.add(null);
    }
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        List<ExprNodeDesc> valueCols = joinValues.get(alias);
        String colNames = "";
        String colTypes = "";
        int columnSize = valueCols.size();
        List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
        ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
        boolean first = true;
        for (int k = 0; k < columnSize; k++) {
            TypeInfo type = valueCols.get(k).getTypeInfo();
            // any name will do; it does not matter.
            String newColName = i + "_VALUE_" + k;
            ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
            columnInfos.add(columnInfo);
            newValueExpr.add(new ExprNodeColumnDesc(columnInfo.getType(), columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + newColName;
            colTypes = colTypes + valueCols.get(k).getTypeString();
        }
        // the join keys are placed in the last part of the spilled table
        for (int k = 0; k < joinKeys.size(); k++) {
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + joinKeys.get(k);
            colTypes = colTypes + joinKeyTypes.get(k);
            ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
            columnInfos.add(columnInfo);
            newKeyExpr.add(new ExprNodeColumnDesc(columnInfo.getType(), columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
        }
        newJoinValues.put(alias, newValueExpr);
        newJoinKeys.put(alias, newKeyExpr);
        tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
        rowSchemaList.put(alias, new RowSchema(columnInfos));
        // construct the value TableDesc
        String valueColNames = "";
        String valueColTypes = "";
        first = true;
        for (int k = 0; k < columnSize; k++) {
            // any name will do; it does not matter.
            String newColName = i + "_VALUE_" + k;
            if (!first) {
                valueColNames = valueColNames + ",";
                valueColTypes = valueColTypes + ",";
            }
            valueColNames = valueColNames + newColName;
            valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
            first = false;
        }
        newJoinValueTblDesc.set((byte) i, Utilities.getTableDesc(valueColNames, valueColTypes));
    }
    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);
    // create N-1 map join tasks
    HashMap<Path, Task<?>> bigKeysDirToTaskMap = new HashMap<Path, Task<?>>();
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<?>> listTasks = new ArrayList<Task<?>>();
    for (int i = 0; i < numAliases - 1; i++) {
        Byte src = tags[i];
        HiveConf hiveConf = new HiveConf(parseCtx.getConf(), GenSparkSkewJoinProcessor.class);
        SparkWork sparkWork = new SparkWork(parseCtx.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        Task<?> skewJoinMapJoinTask = TaskFactory.get(sparkWork);
        skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
        // create N TableScans
        Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
        for (int k = 0; k < tags.length; k++) {
            Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
            ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
            parentOps[k] = ts;
        }
        // create the MapJoinOperator
        String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
        MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues,
                newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(),
                i, joinDescriptor.getConds(), joinDescriptor.getFilters(),
                joinDescriptor.getNoOuterJoin(), dumpFilePrefix,
                joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
        mapJoinDescriptor.setTagOrder(tags);
        mapJoinDescriptor.setHandleSkewJoin(false);
        mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
        mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
        // temporarily mark it as a child of all the TableScans
        MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, null, parentOps);
        // clone the original join operator and replace it with the MapJoin;
        // this ensures the MapJoin has the same downstream operator plan as the original join
        List<Operator<?>> reducerList = new ArrayList<Operator<?>>();
        reducerList.add(reduceWork.getReducer());
        Operator<? extends OperatorDesc> reducer = SerializationUtilities.cloneOperatorTree(reducerList).get(0);
        Preconditions.checkArgument(reducer instanceof JoinOperator, "Reducer should be join operator, but actually is " + reducer.getName());
        JoinOperator cloneJoinOp = (JoinOperator) reducer;
        List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
        for (Operator<? extends OperatorDesc> childOp : childOps) {
            childOp.replaceParent(cloneJoinOp, mapJoinOp);
        }
        mapJoinOp.setChildOperators(childOps);
        // set memory usage for the MJ operator
        setMemUsage(mapJoinOp, skewJoinMapJoinTask, parseCtx);
        // create N MapWorks and add them to the SparkWork
        MapWork bigMapWork = null;
        Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
        for (int j = 0; j < tags.length; j++) {
            MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
            sparkWork.add(mapWork);
            // This code has only been added for testing
            boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
            mapWork.setMapperCannotSpanPartns(mapperCannotSpanPartns);
            Operator<? extends OperatorDesc> tableScan = parentOps[j];
            String alias = tags[j].toString();
            ArrayList<String> aliases = new ArrayList<String>();
            aliases.add(alias);
            Path path;
            if (j == i) {
                path = bigKeysDirMap.get(tags[j]);
                bigKeysDirToTaskMap.put(path, skewJoinMapJoinTask);
                bigMapWork = mapWork;
            } else {
                path = smallTblDirs.get(tags[j]);
            }
            mapWork.addPathToAlias(path, aliases);
            mapWork.getAliasToWork().put(alias, tableScan);
            PartitionDesc partitionDesc = new PartitionDesc(tableDescList.get(tags[j]), null);
            mapWork.addPathToPartitionInfo(path, partitionDesc);
            mapWork.getAliasToPartnInfo().put(alias, partitionDesc);
            mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        }
        // connect all small-dir MapWorks to the big-dir MapWork
        Preconditions.checkArgument(bigMapWork != null, "Haven't identified big dir MapWork");
        // these 2 flags are intended only for the big-key map work
        bigMapWork.setNumMapTasks(HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
        bigMapWork.setMinSplitSize(HiveConf.getLongVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
        // use HiveInputFormat so that we can control the number of map tasks
        bigMapWork.setInputformat(HiveInputFormat.class.getName());
        for (BaseWork work : sparkWork.getRoots()) {
            Preconditions.checkArgument(work instanceof MapWork, "All root work should be MapWork, but got " + work.getClass().getSimpleName());
            if (work != bigMapWork) {
                sparkWork.connect(work, bigMapWork, new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE));
            }
        }
        // insert SparkHashTableSink and Dummy operators
        for (int j = 0; j < tags.length; j++) {
            if (j != i) {
                insertSHTS(tags[j], (TableScanOperator) parentOps[j], bigMapWork);
            }
        }
        listWorks.add(skewJoinMapJoinTask.getWork());
        listTasks.add(skewJoinMapJoinTask);
    }
    if (children != null) {
        for (Task<?> tsk : listTasks) {
            for (Task<?> oldChild : children) {
                tsk.addDependentTask(oldChild);
            }
        }
        currTask.setChildTasks(new ArrayList<Task<?>>());
        for (Task<?> oldChild : children) {
            oldChild.getParentTasks().remove(currTask);
        }
        listTasks.addAll(children);
        for (Task<?> oldChild : children) {
            listWorks.add(oldChild.getWork());
        }
    }
    ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    cndTsk.setResolver(new ConditionalResolverSkewJoin());
    cndTsk.setResolverCtx(context);
    currTask.setChildTasks(new ArrayList<Task<?>>());
    currTask.addDependentTask(cndTsk);
}
Also used : Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) List(java.util.List) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SparkHashTableSinkOperator(org.apache.hadoop.hive.ql.exec.SparkHashTableSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) ConditionalResolverSkewJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
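
The core PartitionDesc pattern in the method above is the per-alias wiring inside the MapWork loop: each input directory is bound to an alias, an operator tree, and a PartitionDesc built with a null partition spec (the skew-join spill directories are not real partitions). A minimal sketch of just that wiring, using only calls that appear in the method above; the helper name wireAlias is illustrative:

import java.util.ArrayList;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// Bind one input directory to one alias, its scan operator, and its
// partition metadata, as done for each tag in the skew-join loop above.
static void wireAlias(MapWork mapWork, Path inputDir, String alias,
        Operator<? extends OperatorDesc> tableScan, TableDesc spilledTableDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add(alias);
    // path -> alias
    mapWork.addPathToAlias(inputDir, aliases);
    // alias -> operator tree rooted at the temporary table scan
    mapWork.getAliasToWork().put(alias, tableScan);
    // path/alias -> partition metadata; null partition spec, as above
    PartitionDesc partitionDesc = new PartitionDesc(spilledTableDesc, null);
    mapWork.addPathToPartitionInfo(inputDir, partitionDesc);
    mapWork.getAliasToPartnInfo().put(alias, partitionDesc);
}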

Example 62 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

From the class TestPlan, method testPlan:

@Test
public void testPlan() throws Exception {
    final String f1 = "#affiliations";
    final String f2 = "friends[0].friendid";
    try {
        // initialize a complete map reduce configuration
        ExprNodeDesc expr1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, f1, "", false);
        ExprNodeDesc expr2 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, f2, "", false);
        ExprNodeDesc filterExpr = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor().getFuncExprNodeDesc("==", expr1, expr2);
        FilterDesc filterCtx = new FilterDesc(filterExpr, false);
        Operator<FilterDesc> op = OperatorFactory.get(new CompilationOpContext(), FilterDesc.class);
        op.setConf(filterCtx);
        ArrayList<String> aliasList = new ArrayList<String>();
        aliasList.add("a");
        Map<Path, List<String>> pa = new LinkedHashMap<>();
        pa.put(new Path("/tmp/testfolder"), aliasList);
        TableDesc tblDesc = Utilities.defaultTd;
        PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
        LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
        pt.put(new Path("/tmp/testfolder"), partDesc);
        LinkedHashMap<String, Operator<? extends OperatorDesc>> ao = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
        ao.put("a", op);
        MapredWork mrwork = new MapredWork();
        mrwork.getMapWork().setPathToAliases(pa);
        mrwork.getMapWork().setPathToPartitionInfo(pt);
        mrwork.getMapWork().setAliasToWork(ao);
        JobConf job = new JobConf(TestPlan.class);
        // serialize the plan once
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        SerializationUtilities.serializePlan(mrwork, baos);
        baos.close();
        String v1 = baos.toString();
        // store into configuration
        job.set("fs.default.name", "file:///");
        Utilities.setMapRedWork(job, mrwork, new Path(System.getProperty("java.io.tmpdir") + File.separator + System.getProperty("user.name") + File.separator + "hive"));
        MapredWork mrwork2 = Utilities.getMapRedWork(job);
        Utilities.clearWork(job);
        // here we should have some checks of the deserialized object against
        // the original object
        // System.out.println(v1);
        // serialize again
        baos.reset();
        SerializationUtilities.serializePlan(mrwork2, baos);
        baos.close();
        // verify that the two are equal
        assertEquals(v1, baos.toString());
    } catch (Exception excp) {
        excp.printStackTrace();
        throw excp;
    }
    System.out.println("Serialization/Deserialization of plan successful");
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) LinkedHashMap(java.util.LinkedHashMap) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) List(java.util.List) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
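
The heart of the test is a serialize/deserialize/serialize round trip whose two serialized forms must be identical. A minimal sketch of that check factored into a helper, assuming hive-exec and JUnit on the classpath; the method name is illustrative:

import java.io.ByteArrayOutputStream;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.mapred.JobConf;

import static org.junit.Assert.assertEquals;

// Serialize the plan, push it through the job conf, read it back, and
// verify the restored plan serializes to exactly the same form.
static void assertPlanRoundTrips(MapredWork work, JobConf job, Path planDir) throws Exception {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    SerializationUtilities.serializePlan(work, baos);
    String before = baos.toString();
    Utilities.setMapRedWork(job, work, planDir);
    MapredWork restored = Utilities.getMapRedWork(job);
    Utilities.clearWork(job);
    baos.reset();
    SerializationUtilities.serializePlan(restored, baos);
    assertEquals(before, baos.toString());
}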

Example 63 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

From the class TestUtilities, method testGetInputPathsWithMultipleThreadsAndEmptyPartitions:

/**
 * Check that calling {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)}
 * can process two different tables that both have empty partitions when using multiple threads.
 * Some extra logic is placed at the end of the test to validate that no race condition puts the
 * {@link MapWork} object in an invalid state.
 */
@Test
public void testGetInputPathsWithMultipleThreadsAndEmptyPartitions() throws Exception {
    int numPartitions = 15;
    JobConf jobConf = new JobConf();
    jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, Runtime.getRuntime().availableProcessors() * 2);
    MapWork mapWork = new MapWork();
    Path testTablePath = new Path("testTable");
    Path[] testPartitionsPaths = new Path[numPartitions];
    PartitionDesc mockPartitionDesc = mock(PartitionDesc.class);
    TableDesc mockTableDesc = mock(TableDesc.class);
    when(mockTableDesc.isNonNative()).thenReturn(false);
    when(mockTableDesc.getProperties()).thenReturn(new Properties());
    when(mockPartitionDesc.getProperties()).thenReturn(new Properties());
    when(mockPartitionDesc.getTableDesc()).thenReturn(mockTableDesc);
    doReturn(HiveSequenceFileOutputFormat.class).when(mockPartitionDesc).getOutputFileFormatClass();
    for (int i = 0; i < numPartitions; i++) {
        String testPartitionName = "p=" + i;
        testPartitionsPaths[i] = new Path(testTablePath, "p=" + i);
        mapWork.getPathToAliases().put(testPartitionsPaths[i], Lists.newArrayList(testPartitionName));
        mapWork.getAliasToWork().put(testPartitionName, mock(Operator.class));
        mapWork.getPathToPartitionInfo().put(testPartitionsPaths[i], mockPartitionDesc);
    }
    FileSystem fs = FileSystem.getLocal(jobConf);
    try {
        fs.mkdirs(testTablePath);
        List<Path> inputPaths = Utilities.getInputPaths(jobConf, mapWork, new Path(HiveConf.getVar(jobConf, HiveConf.ConfVars.LOCALSCRATCHDIR)), mock(Context.class), false);
        assertEquals(inputPaths.size(), numPartitions);
        for (int i = 0; i < numPartitions; i++) {
            assertNotEquals(inputPaths.get(i), testPartitionsPaths[i]);
        }
        assertEquals(mapWork.getPathToAliases().size(), numPartitions);
        assertEquals(mapWork.getPathToPartitionInfo().size(), numPartitions);
        assertEquals(mapWork.getAliasToWork().size(), numPartitions);
        for (Map.Entry<Path, List<String>> entry : mapWork.getPathToAliases().entrySet()) {
            assertNotNull(entry.getKey());
            assertNotNull(entry.getValue());
            assertEquals(entry.getValue().size(), 1);
            assertTrue(entry.getKey().getFileSystem(new Configuration()).exists(entry.getKey()));
        }
    } finally {
        if (fs.exists(testTablePath)) {
            fs.delete(testTablePath, true);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) Configuration(org.apache.hadoop.conf.Configuration) Properties(java.util.Properties) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) List(java.util.List) ArrayList(java.util.ArrayList) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Test(org.junit.Test)
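
Any test that wants Utilities.getInputPaths to synthesize an empty input for a partition needs the same stubbing as above: a native TableDesc, empty Properties, and a concrete output file format. A sketch of that stubbing factored into a reusable helper, assuming Mockito; the helper name is illustrative:

import java.util.Properties;

import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

// A mocked PartitionDesc for a native table with no real data, stubbed
// exactly as in the test above.
static PartitionDesc mockEmptyPartitionDesc() {
    TableDesc tableDesc = mock(TableDesc.class);
    when(tableDesc.isNonNative()).thenReturn(false);
    when(tableDesc.getProperties()).thenReturn(new Properties());
    PartitionDesc partDesc = mock(PartitionDesc.class);
    when(partDesc.getProperties()).thenReturn(new Properties());
    when(partDesc.getTableDesc()).thenReturn(tableDesc);
    doReturn(HiveSequenceFileOutputFormat.class).when(partDesc).getOutputFileFormatClass();
    return partDesc;
}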

Example 64 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

From the class TestSerializationUtilities, method mockMapWorkWithSomePartitionDescProperties:

private static MapWork mockMapWorkWithSomePartitionDescProperties() throws Exception {
    String tableName = "test_table";
    int numPartitions = 2;
    Path root = new Path("/warehouse", "test_table");
    String[] partPath = new String[numPartitions];
    StringBuilder buffer = new StringBuilder();
    for (int p = 0; p < numPartitions; ++p) {
        partPath[p] = new Path(root, "p=" + p).toString();
        if (p != 0) {
            buffer.append(',');
        }
        buffer.append(partPath[p]);
    }
    Properties tblProps = new Properties();
    TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
    MapWork mapWork = new MapWork();
    Map<Path, List<String>> aliasMap = new LinkedHashMap<>();
    List<String> aliases = new ArrayList<String>();
    aliases.add(tableName);
    LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
    for (int p = 0; p < numPartitions; ++p) {
        Path path = new Path(partPath[p]);
        aliasMap.put(path, aliases);
        LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
        PartitionDesc part = new PartitionDesc(tbl, partSpec);
        part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
        part.getProperties().put("impala_intermediate_stats_chunk1", "asdfghjk12345678");
        part.getProperties().put("impala_intermediate_stats_chunk2", "asdfghjk12345678");
        part.getProperties().put("rawDataSize", "10");
        part.getProperties().put("serialization.ddl", "asdf");
        partMap.put(path, part);
    }
    mapWork.setPathToAliases(aliasMap);
    mapWork.setPathToPartitionInfo(partMap);
    return mapWork;
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) List(java.util.List) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
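
Each partition entry in the fixture follows a single pattern: a PartitionDesc over the shared TableDesc, tagged with a vectorized-input descriptor and per-partition properties. A sketch of one entry, using only calls from the fixture above ("MockInputFileFormatClassName" is the fixture's own placeholder, not a real input format; the helper name is illustrative):

import java.util.LinkedHashMap;

import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;

// One fixture partition: empty partition spec, vectorized input descriptor,
// and a per-partition property, as in the loop above.
static PartitionDesc vectorizedPartition(TableDesc tbl) {
    PartitionDesc part = new PartitionDesc(tbl, new LinkedHashMap<String, String>());
    part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat(
            "MockInputFileFormatClassName", false, null));
    part.getProperties().put("rawDataSize", "10");
    return part;
}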

Example 65 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

From the class TestInputOutputFormat, method createMockExecutionEnvironment:

/**
 * Create a mock execution environment that has enough detail that
 * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
 * explode.
 * @param workDir a local filesystem work directory
 * @param warehouseDir a mock filesystem warehouse directory
 * @param tableName the table name
 * @param objectInspector object inspector for the row
 * @param isVectorized whether the job should run vectorized
 * @return a JobConf that contains the necessary information
 * @throws IOException
 * @throws HiveException
 */
static JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions, String currFileSystemName) throws IOException, HiveException {
    JobConf conf = new JobConf();
    Utilities.clearWorkMap(conf);
    conf.set("hive.exec.plan", workDir.toString());
    conf.set("mapred.job.tracker", "local");
    String isVectorizedString = Boolean.toString(isVectorized);
    conf.set("hive.vectorized.execution.enabled", isVectorizedString);
    conf.set(Utilities.VECTOR_MODE, isVectorizedString);
    conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
    conf.set("fs.mock.impl", currFileSystemName);
    conf.set("mapred.mapper.class", ExecMapper.class.getName());
    Path root = new Path(warehouseDir, tableName);
    // clean out previous contents
    ((MockFileSystem) root.getFileSystem(conf)).clear();
    // build partition strings
    String[] partPath = new String[partitions];
    StringBuilder buffer = new StringBuilder();
    for (int p = 0; p < partitions; ++p) {
        partPath[p] = new Path(root, "p=" + p).toString();
        if (p != 0) {
            buffer.append(',');
        }
        buffer.append(partPath[p]);
    }
    conf.set("mapred.input.dir", buffer.toString());
    StringBuilder columnIds = new StringBuilder();
    StringBuilder columnNames = new StringBuilder();
    StringBuilder columnTypes = new StringBuilder();
    StructObjectInspector structOI = (StructObjectInspector) objectInspector;
    List<? extends StructField> fields = structOI.getAllStructFieldRefs();
    int numCols = fields.size();
    for (int i = 0; i < numCols; ++i) {
        if (i != 0) {
            columnIds.append(',');
            columnNames.append(',');
            columnTypes.append(',');
        }
        columnIds.append(i);
        columnNames.append(fields.get(i).getFieldName());
        columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName());
    }
    conf.set("hive.io.file.readcolumn.ids", columnIds.toString());
    conf.set("partition_columns", "p");
    conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
    conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
    MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);
    fs.clear();
    Properties tblProps = new Properties();
    tblProps.put("name", tableName);
    tblProps.put("serialization.lib", OrcSerde.class.getName());
    tblProps.put("columns", columnNames.toString());
    tblProps.put("columns.types", columnTypes.toString());
    TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
    MapWork mapWork = new MapWork();
    mapWork.setVectorMode(isVectorized);
    if (isVectorized) {
        VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
        vectorizedRowBatchCtx.init(structOI, new String[0]);
        mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
    }
    mapWork.setUseBucketizedHiveInputFormat(false);
    Map<Path, List<String>> aliasMap = new LinkedHashMap<>();
    List<String> aliases = new ArrayList<String>();
    aliases.add(tableName);
    LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
    for (int p = 0; p < partitions; ++p) {
        Path path = new Path(partPath[p]);
        aliasMap.put(path, aliases);
        LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
        PartitionDesc part = new PartitionDesc(tbl, partSpec);
        if (isVectorized) {
            part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
        }
        partMap.put(path, part);
    }
    mapWork.setPathToAliases(aliasMap);
    mapWork.setPathToPartitionInfo(partMap);
    // write the plan out
    FileSystem localFs = FileSystem.getLocal(conf).getRaw();
    Path mapXml = new Path(workDir, "map.xml");
    localFs.delete(mapXml, true);
    FSDataOutputStream planStream = localFs.create(mapXml);
    SerializationUtilities.serializePlan(mapWork, planStream);
    conf.setBoolean(Utilities.HAS_MAP_WORK, true);
    planStream.close();
    return conf;
}
Also used : ArrayList(java.util.ArrayList) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) ValidReadTxnList(org.apache.hadoop.hive.common.ValidReadTxnList) ValidTxnList(org.apache.hadoop.hive.common.ValidTxnList) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) List(java.util.List) LinkedList(java.util.LinkedList) JobConf(org.apache.hadoop.mapred.JobConf) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
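
A hypothetical call site for the factory above; everything not shown in the example (the method name, the warehouse path, the helper's parameters) is illustrative, and the helper assumes it lives in the same test class so createMockExecutionEnvironment is visible:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.JobConf;

// Build the conf for a vectorized, two-partition mock table; rowOI describes
// the row type, and mockFsClassName feeds the "fs.mock.impl" setting above.
static JobConf twoPartitionVectorizedEnv(Path workDir, StructObjectInspector rowOI,
        String mockFsClassName) throws Exception {
    return createMockExecutionEnvironment(workDir, new Path("mock:///warehouse"),
            "test_table", rowOI, true /* isVectorized */, 2 /* partitions */,
            mockFsClassName);
}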

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 90 usages
Path (org.apache.hadoop.fs.Path): 67 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 41 usages
ArrayList (java.util.ArrayList): 39 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 27 usages
LinkedHashMap (java.util.LinkedHashMap): 24 usages
List (java.util.List): 23 usages
JobConf (org.apache.hadoop.mapred.JobConf): 21 usages
Map (java.util.Map): 18 usages
Properties (java.util.Properties): 18 usages
HashMap (java.util.HashMap): 17 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17 usages
IOException (java.io.IOException): 15 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 15 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 14 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 11 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 9 usages
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 9 usages