Example 51 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class TestDynamicPartitionPruner method testMultipleSourcesOrdering1.

@Test(timeout = 5000)
public void testMultipleSourcesOrdering1() throws InterruptedException, SerDeException {
    InputInitializerContext mockInitContext = mock(InputInitializerContext.class);
    doReturn(2).when(mockInitContext).getVertexNumTasks("v1");
    doReturn(3).when(mockInitContext).getVertexNumTasks("v2");
    MapWork mapWork = createMockMapWork(new TestSource("v1", 2), new TestSource("v2", 1));
    DynamicPartitionPruner pruner = new DynamicPartitionPrunerForEventTesting(mockInitContext, mapWork);
    PruneRunnable pruneRunnable = new PruneRunnable(pruner);
    Thread t = new Thread(pruneRunnable);
    t.start();
    try {
        pruneRunnable.start();
        InputInitializerEvent eventV1 = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        eventV1.setSourceVertexName("v1");
        InputInitializerEvent eventV2 = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        eventV2.setSourceVertexName("v2");
        // 2 tasks X 2 events each for v1; 3 tasks X 1 event each for v2
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV2);
        pruner.addEvent(eventV2);
        pruner.addEvent(eventV2);
        pruner.processVertex("v1");
        pruner.processVertex("v2");
        pruneRunnable.awaitEnd();
        assertFalse(pruneRunnable.inError.get());
    } finally {
        t.interrupt();
        t.join();
    }
}
Also used : InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) Test(org.junit.Test)
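
The tests in this group drive the pruner from a background thread through a small PruneRunnable helper. Below is a minimal sketch of what such a wrapper could look like, inferred only from the calls seen above (start(), awaitEnd(), inError, and a pruner whose prune() blocks until all expected events arrive); the actual helper in the Hive test class may differ.

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;

// Sketch of a test wrapper that runs the pruner on its own thread and
// records whether pruning failed; names mirror the usage in the tests.
static class PruneRunnable implements Runnable {
    private final DynamicPartitionPruner pruner;
    private final CountDownLatch startLatch = new CountDownLatch(1);
    private final CountDownLatch endLatch = new CountDownLatch(1);
    final AtomicBoolean inError = new AtomicBoolean(false);

    PruneRunnable(DynamicPartitionPruner pruner) {
        this.pruner = pruner;
    }

    // Called by the test once event delivery may begin.
    void start() {
        startLatch.countDown();
    }

    // Blocks the test until the pruner has finished (or failed).
    void awaitEnd() throws InterruptedException {
        endLatch.await();
    }

    @Override
    public void run() {
        try {
            startLatch.await();
            // Assumed behavior: prune() blocks until all expected events
            // and vertex completions have been observed.
            pruner.prune();
        } catch (Exception e) {
            inError.set(true);
        } finally {
            endLatch.countDown();
        }
    }
}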

Example 52 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class TestDynamicPartitionPruner method testExtraEvents.

@Test(timeout = 5000, expected = IllegalStateException.class)
public void testExtraEvents() throws InterruptedException, IOException, HiveException, SerDeException {
    InputInitializerContext mockInitContext = mock(InputInitializerContext.class);
    doReturn(1).when(mockInitContext).getVertexNumTasks("v1");
    MapWork mapWork = createMockMapWork(new TestSource("v1", 1));
    DynamicPartitionPruner pruner = new DynamicPartitionPrunerForEventTesting(mockInitContext, mapWork);
    PruneRunnable pruneRunnable = new PruneRunnable(pruner);
    Thread t = new Thread(pruneRunnable);
    t.start();
    try {
        pruneRunnable.start();
        InputInitializerEvent event = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        event.setSourceVertexName("v1");
        pruner.addEvent(event);
        pruner.addEvent(event);
        pruner.processVertex("v1");
        pruneRunnable.awaitEnd();
        assertFalse(pruneRunnable.inError.get());
    } finally {
        t.interrupt();
        t.join();
    }
}
Also used : InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) Test(org.junit.Test)
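
The expected IllegalStateException arises because v1 was configured for one task producing one event, yet two events are delivered. The following is a hypothetical sketch of the kind of over-count guard that would produce such a failure; the field and method names are illustrative assumptions, not the actual DynamicPartitionPruner internals.

import java.util.HashMap;
import java.util.Map;

// Hypothetical over-count guard; names are illustrative only.
private final Map<String, Integer> expectedEventCount = new HashMap<>();
private final Map<String, Integer> seenEventCount = new HashMap<>();

void onSourceEvent(String sourceVertex) {
    int seen = seenEventCount.merge(sourceVertex, 1, Integer::sum);
    // Once the source vertex has been processed, the expected total
    // (numTasks x eventsPerTask) is known and extra events are an error.
    Integer expected = expectedEventCount.get(sourceVertex);
    if (expected != null && seen > expected) {
        throw new IllegalStateException(
            "Received too many events for vertex " + sourceVertex);
    }
}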

Example 53 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class TestDynamicPartitionPruner method testMultipleSourcesOrdering3.

@Test(timeout = 5000)
public void testMultipleSourcesOrdering3() throws InterruptedException, SerDeException {
    InputInitializerContext mockInitContext = mock(InputInitializerContext.class);
    doReturn(2).when(mockInitContext).getVertexNumTasks("v1");
    doReturn(3).when(mockInitContext).getVertexNumTasks("v2");
    MapWork mapWork = createMockMapWork(new TestSource("v1", 2), new TestSource("v2", 1));
    DynamicPartitionPruner pruner = new DynamicPartitionPrunerForEventTesting(mockInitContext, mapWork);
    PruneRunnable pruneRunnable = new PruneRunnable(pruner);
    Thread t = new Thread(pruneRunnable);
    t.start();
    try {
        pruneRunnable.start();
        InputInitializerEvent eventV1 = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        eventV1.setSourceVertexName("v1");
        InputInitializerEvent eventV2 = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        eventV2.setSourceVertexName("v2");
        // 2 tasks X 2 events each for v1; 3 tasks X 1 event each for v2
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV1);
        pruner.processVertex("v1");
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV1);
        pruner.addEvent(eventV2);
        pruner.processVertex("v2");
        pruner.addEvent(eventV2);
        pruner.addEvent(eventV2);
        pruneRunnable.awaitEnd();
        assertFalse(pruneRunnable.inError.get());
    } finally {
        t.interrupt();
        t.join();
    }
}
Also used : InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) Test(org.junit.Test)

Example 54 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class TestGenTezWork method testCreateMap.

@Test
public void testCreateMap() throws SemanticException {
    proc.process(rs, null, ctx, (Object[]) null);
    assertNotNull(ctx.currentTask);
    assertTrue(ctx.rootTasks.contains(ctx.currentTask));
    TezWork work = ctx.currentTask.getWork();
    assertEquals(work.getAllWork().size(), 1);
    BaseWork w = work.getAllWork().get(0);
    assertTrue(w instanceof MapWork);
    MapWork mw = (MapWork) w;
    // need to make sure names are set for tez to connect things right
    assertNotNull(w.getName());
    // map work should start with our ts op
    assertSame(mw.getAliasToWork().entrySet().iterator().next().getValue(), ts);
    // preceding work must be set to the newly generated map
    assertSame(ctx.preceedingWork, mw);
    // should have a new root now
    assertSame(ctx.currentRootOperator, fs);
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) Test(org.junit.Test)
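
As a small usage note, the instanceof-and-cast pattern above generalizes to plans with more than one vertex. A minimal sketch collecting every MapWork from a TezWork, assuming only the getAllWork() accessor already exercised by the test:

import java.util.ArrayList;
import java.util.List;

// Gather all map-side vertices of a Tez plan (sketch).
List<MapWork> mapWorks = new ArrayList<>();
for (BaseWork w : work.getAllWork()) {
    if (w instanceof MapWork) {
        mapWorks.add((MapWork) w);
    }
}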

Example 55 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class CommonJoinTaskDispatcher method mergeMapJoinTaskIntoItsChildMapRedTask.

/*
   * A task and its child task have been converted from join to mapjoin.
   * See if the two tasks can be merged.
   */
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
    // Step 1: Check whether mapJoinTask has exactly one child task.
    // If so, check if we can merge mapJoinTask into that child.
    if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
        // No child task, or more than one child task, in which case we don't want to do anything.
        return;
    }
    Task<? extends Serializable> childTask = mapJoinTask.getChildTasks().get(0);
    if (!(childTask instanceof MapRedTask)) {
        // Nothing to do if it is not a MapReduce task.
        return;
    }
    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();
    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
        // Do not merge if the MapredWork of MapJoin has multiple input aliases.
        return;
    }
    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator = OperatorUtils.findSingleOperator(mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
        throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
    }
    // The mapJoinTaskFileSinkOperator writes to a directory that the child MapRedTask reads as its input
    Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
        return;
    }
    String childMRAlias = childMRAliases.get(0);
    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<Path, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
        Path path = entry.getKey();
        List<String> aliases = entry.getValue();
        if (path.equals(childMRPath)) {
            continue;
        }
        if (aliases.contains(mapJoinAlias)) {
            // An alias conflict should not happen here.
            return;
        }
    }
    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
        // Right now we do not merge tasks when either side involves a bucket map join.
        // We should relax this constraint with a follow-up jira.
        return;
    }
    // Check that the total size of the local tables remains under the limit after the merge.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
        // Do not merge.
        return;
    }
    TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias.toString()), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }
    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
        // Do not merge if we do not know how to connect two operator trees.
        return;
    }
    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
    // Step 2.2: Replace the corresponding part of the child task's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias.toString(), mapJoinMapWork, childMapWork);
    // Step 2.3: Merge the MapJoin task's local work into the child's local work.
    if (mapJoinLocalWork != null) {
        if (childLocalWork == null) {
            childMapWork.setMapRedLocalWork(mapJoinLocalWork);
        } else {
            childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
            childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
        }
    }
    // Step 2.4: Remove this MapJoin task
    List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
        childMapRedTask.getParentTasks().addAll(parentTasks);
        for (Task<? extends Serializable> parentTask : parentTasks) {
            parentTask.getChildTasks().remove(mapJoinTask);
            if (!parentTask.getChildTasks().contains(childMapRedTask)) {
                parentTask.getChildTasks().add(childMapRedTask);
            }
        }
    } else {
        if (physicalContext.getRootTasks().contains(mapJoinTask)) {
            physicalContext.removeFromRootTask(mapJoinTask);
            if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
                physicalContext.addToRootTask(childMapRedTask);
            }
        }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
        childMapRedTask.setParentTasks(null);
    }
}
Also used : LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Path(org.apache.hadoop.fs.Path) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
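
For orientation, the merge above pivots on two MapWork maps: pathToAliases (each input path to the aliases read from it) and aliasToWork (each alias to the root operator of its operator tree, typically a TableScanOperator). A small read-only sketch of their relationship, using only accessors that already appear in the method (imports match the "Also used" list above, plus java.util.Map.Entry):

// Walk a MapWork's input mapping: path -> aliases -> root operator.
for (Entry<Path, ArrayList<String>> e : childMapWork.getPathToAliases().entrySet()) {
    for (String alias : e.getValue()) {
        Operator<? extends OperatorDesc> root = childMapWork.getAliasToWork().get(alias);
        System.out.println(e.getKey() + " -> " + alias + " -> " + root.getName());
    }
}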

Aggregations

MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 79
ArrayList (java.util.ArrayList) 25
Path (org.apache.hadoop.fs.Path) 24
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 23
Operator (org.apache.hadoop.hive.ql.exec.Operator) 21
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 17
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 16
JobConf (org.apache.hadoop.mapred.JobConf) 15
Test (org.junit.Test) 15
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork) 14
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 14
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork) 13
Serializable (java.io.Serializable) 12
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 12
Task (org.apache.hadoop.hive.ql.exec.Task) 12
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc) 12
Context (org.apache.hadoop.hive.ql.Context) 11
LinkedHashMap (java.util.LinkedHashMap) 10
FileSystem (org.apache.hadoop.fs.FileSystem) 10
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 10