Search in sources :

Example 1 with LineageState

use of org.apache.hadoop.hive.ql.session.LineageState in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testConditionalMoveTaskIsNotOptimized.

/**
 * With HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED set to "false", checks that every branch of the
 * ConditionalTask produced by GenMapRedUtils.createMRWorkForMergingFiles keeps its separate
 * MoveTasks, even though all paths use the s3a scheme.
 */
@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");

    final Path scratchSinkDir = new Path("s3a://bucket/scratch/-ext-10002");
    final Path scratchFinalDir = new Path("s3a://bucket/scratch/-ext-10000");
    final Path tableDir = new Path("s3a://bucket/warehouse/table");

    final FileSinkOperator sinkOp = createFileSinkOperator(scratchSinkDir);
    final Task<MoveWork> finalMove = createMoveTask(scratchFinalDir, tableDir);
    final List<Task<MoveWork>> finalMoves = Collections.singletonList(finalMove);

    GenMapRedUtils.createMRWorkForMergingFiles(
        sinkOp, scratchFinalDir, null, finalMoves, hiveConf, dummyMRTask, new LineageState());

    // The conditional task is the first child of the MR task; its list holds the three branches.
    final ConditionalTask conditional = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    final Task<? extends Serializable> moveOnly = conditional.getListTasks().get(0);
    final Task<? extends Serializable> mergeOnly = conditional.getListTasks().get(1);
    final Task<? extends Serializable> mergeAndMove = conditional.getListTasks().get(2);

    // Move-only branch is NOT optimized: sink -> final, followed by final -> table.
    assertEquals(1, moveOnly.getChildTasks().size());
    verifyMoveTask(moveOnly, scratchSinkDir, scratchFinalDir);
    final Task<? extends Serializable> moveOnlyFollowUp = moveOnly.getChildTasks().get(0);
    verifyMoveTask(moveOnlyFollowUp, scratchFinalDir, tableDir);

    // Merge-only branch is NOT optimized: its single child moves final -> table.
    assertEquals(1, mergeOnly.getChildTasks().size());
    final Task<? extends Serializable> mergeOnlyFollowUp = mergeOnly.getChildTasks().get(0);
    verifyMoveTask(mergeOnlyFollowUp, scratchFinalDir, tableDir);

    // Merge-and-move branch is NOT optimized: two chained moves hang off it.
    assertEquals(1, mergeAndMove.getChildTasks().size());
    final Task<? extends Serializable> firstMove = mergeAndMove.getChildTasks().get(0);
    assertEquals(1, firstMove.getChildTasks().size());
    verifyMoveTask(firstMove, scratchSinkDir, scratchFinalDir);
    final Task<? extends Serializable> secondMove = firstMove.getChildTasks().get(0);
    verifyMoveTask(secondMove, scratchFinalDir, tableDir);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)

Example 2 with LineageState

use of org.apache.hadoop.hive.ql.session.LineageState in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testConditionalMoveTaskIsOptimized.

/**
 * With HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED set to "true" and all paths on s3a, checks that only
 * the move-only branch of the ConditionalTask is collapsed into a single MoveTask; the merge-only
 * and merge-and-move branches keep their chained MoveTasks.
 */
@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");

    final Path scratchSinkDir = new Path("s3a://bucket/scratch/-ext-10002");
    final Path scratchFinalDir = new Path("s3a://bucket/scratch/-ext-10000");
    final Path tableDir = new Path("s3a://bucket/warehouse/table");

    final FileSinkOperator sinkOp = createFileSinkOperator(scratchSinkDir);
    final Task<MoveWork> finalMove = createMoveTask(scratchFinalDir, tableDir);
    final List<Task<MoveWork>> finalMoves = Collections.singletonList(finalMove);

    GenMapRedUtils.createMRWorkForMergingFiles(
        sinkOp, scratchFinalDir, null, finalMoves, hiveConf, dummyMRTask, new LineageState());

    // The conditional task is the first child of the MR task; its list holds the three branches.
    final ConditionalTask conditional = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    final Task<? extends Serializable> moveOnly = conditional.getListTasks().get(0);
    final Task<? extends Serializable> mergeOnly = conditional.getListTasks().get(1);
    final Task<? extends Serializable> mergeAndMove = conditional.getListTasks().get(2);

    /*
     * OPTIMIZATION
     * Linking two MoveTasks is expensive on blobstorage systems, so the ConditionalTask avoids it:
     * a single MoveTask is created whose source is the first MoveTask's source and whose target is
     * the second MoveTask's target.
     */

    // Move-only branch IS optimized: no child tasks, and it moves sink -> table directly.
    assertNull(moveOnly.getChildTasks());
    verifyMoveTask(moveOnly, scratchSinkDir, tableDir);

    // Merge-only branch is NOT optimized (a merge task writes directly to the final dir, then a
    // MoveTask is executed).
    assertEquals(1, mergeOnly.getChildTasks().size());
    final Task<? extends Serializable> mergeOnlyFollowUp = mergeOnly.getChildTasks().get(0);
    verifyMoveTask(mergeOnlyFollowUp, scratchFinalDir, tableDir);

    // Merge-and-move branch is NOT optimized: two chained moves hang off it.
    assertEquals(1, mergeAndMove.getChildTasks().size());
    final Task<? extends Serializable> firstMove = mergeAndMove.getChildTasks().get(0);
    assertEquals(1, firstMove.getChildTasks().size());
    verifyMoveTask(firstMove, scratchSinkDir, scratchFinalDir);
    final Task<? extends Serializable> secondMove = firstMove.getChildTasks().get(0);
    verifyMoveTask(secondMove, scratchFinalDir, tableDir);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)

Example 3 with LineageState

use of org.apache.hadoop.hive.ql.session.LineageState in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testMergePathValidMoveWorkReturnsNewMoveWork.

/**
 * Checks that GenMapRedUtils.mergeMovePaths, given a MoveWork carrying either a LoadFileDesc or a
 * LoadTableDesc, returns a different MoveWork whose source path is the conditional input path
 * while the original target (directory or table descriptor) is kept.
 */
@Test
public void testMergePathValidMoveWorkReturnsNewMoveWork() {
    final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
    final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
    final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003");
    final MoveWork mockWork = mock(MoveWork.class);
    final LineageState lineageState = new LineageState();
    MoveWork newWork;
    // test using loadFileWork
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "", false));
    newWork = GenMapRedUtils.mergeMovePaths(condInputPath, mockWork, lineageState);
    assertNotNull(newWork);
    // JUnit's argument order is (unexpected, actual): the mock is the value we must NOT get back.
    assertNotEquals(mockWork, newWork);
    assertEquals(condInputPath, newWork.getLoadFileWork().getSourcePath());
    assertEquals(targetMoveWorkPath, newWork.getLoadFileWork().getTargetDir());
    // test using loadTableWork
    TableDesc tableDesc = new TableDesc();
    // Drop the loadFileWork stubbing so only the table descriptor is reported by the mock.
    reset(mockWork);
    when(mockWork.getLoadTableWork()).thenReturn(new LoadTableDesc(condOutputPath, tableDesc, null));
    newWork = GenMapRedUtils.mergeMovePaths(condInputPath, mockWork, lineageState);
    assertNotNull(newWork);
    assertNotEquals(mockWork, newWork);
    assertEquals(condInputPath, newWork.getLoadTableWork().getSourcePath());
    // assertEquals reports both values on failure and fails cleanly (no NPE) if getTable() is null.
    assertEquals(tableDesc, newWork.getLoadTableWork().getTable());
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) LineageState(org.apache.hadoop.hive.ql.session.LineageState) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) Test(org.junit.Test)

Example 4 with LineageState

use of org.apache.hadoop.hive.ql.session.LineageState in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testMergePathWithInvalidMoveWorkThrowsException.

/**
 * Expects GenMapRedUtils.mergeMovePaths to throw IllegalArgumentException when the supplied
 * MoveWork only provides a LoadMultiFilesDesc.
 */
@Test(expected = IllegalArgumentException.class)
public void testMergePathWithInvalidMoveWorkThrowsException() {
    final MoveWork multiFilesOnlyWork = mock(MoveWork.class);
    when(multiFilesOnlyWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc());

    final Path conditionalInput = new Path("s3a://bucket/scratch/-ext-10000");
    final LineageState lineage = new LineageState();

    // The call itself is the assertion: the expected exception must surface from here.
    GenMapRedUtils.mergeMovePaths(conditionalInput, multiFilesOnlyWork, lineage);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LineageState(org.apache.hadoop.hive.ql.session.LineageState) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) Test(org.junit.Test)

Example 5 with LineageState

use of org.apache.hadoop.hive.ql.session.LineageState in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testConditionalMoveOnHdfsIsNotOptimized.

/**
 * With HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED set to "true" but every path on the hdfs scheme,
 * checks that no branch of the ConditionalTask produced by
 * GenMapRedUtils.createMRWorkForMergingFiles has its MoveTasks collapsed.
 */
@Test
public void testConditionalMoveOnHdfsIsNotOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");

    final Path scratchSinkDir = new Path("hdfs://bucket/scratch/-ext-10002");
    final Path scratchFinalDir = new Path("hdfs://bucket/scratch/-ext-10000");
    final Path tableDir = new Path("hdfs://bucket/warehouse/table");

    final FileSinkOperator sinkOp = createFileSinkOperator(scratchSinkDir);
    final Task<MoveWork> finalMove = createMoveTask(scratchFinalDir, tableDir);
    final List<Task<MoveWork>> finalMoves = Collections.singletonList(finalMove);

    GenMapRedUtils.createMRWorkForMergingFiles(
        sinkOp, scratchFinalDir, null, finalMoves, hiveConf, dummyMRTask, new LineageState());

    // The conditional task is the first child of the MR task; its list holds the three branches.
    final ConditionalTask conditional = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    final Task<? extends Serializable> moveOnly = conditional.getListTasks().get(0);
    final Task<? extends Serializable> mergeOnly = conditional.getListTasks().get(1);
    final Task<? extends Serializable> mergeAndMove = conditional.getListTasks().get(2);

    // Move-only branch is NOT optimized: sink -> final, followed by final -> table.
    assertEquals(1, moveOnly.getChildTasks().size());
    verifyMoveTask(moveOnly, scratchSinkDir, scratchFinalDir);
    final Task<? extends Serializable> moveOnlyFollowUp = moveOnly.getChildTasks().get(0);
    verifyMoveTask(moveOnlyFollowUp, scratchFinalDir, tableDir);

    // Merge-only branch is NOT optimized: its single child moves final -> table.
    assertEquals(1, mergeOnly.getChildTasks().size());
    final Task<? extends Serializable> mergeOnlyFollowUp = mergeOnly.getChildTasks().get(0);
    verifyMoveTask(mergeOnlyFollowUp, scratchFinalDir, tableDir);

    // Merge-and-move branch is NOT optimized: two chained moves hang off it.
    assertEquals(1, mergeAndMove.getChildTasks().size());
    final Task<? extends Serializable> firstMove = mergeAndMove.getChildTasks().get(0);
    assertEquals(1, firstMove.getChildTasks().size());
    verifyMoveTask(firstMove, scratchSinkDir, scratchFinalDir);
    final Task<? extends Serializable> secondMove = firstMove.getChildTasks().get(0);
    verifyMoveTask(secondMove, scratchFinalDir, tableDir);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)

Aggregations

Path (org.apache.hadoop.fs.Path)5 MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)5 LineageState (org.apache.hadoop.hive.ql.session.LineageState)5 Test (org.junit.Test)5 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)3 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)3 MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask)3 Task (org.apache.hadoop.hive.ql.exec.Task)3 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)3 LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc)1 LoadMultiFilesDesc (org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc)1 LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc)1 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)1