Search in sources :

Example 11 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the Apache Hive project.

From the class TestGenMapRedUtilsCreateConditionalTask, method createFileSinkOperator.

private FileSinkOperator createFileSinkOperator(Path finalDirName) {
    // Build the descriptor first: a FileSinkDesc pointing at the requested output
    // directory, backed by a TableDesc with empty table properties.
    TableDesc tableDesc = new TableDesc(HiveInputFormat.class, HiveOutputFormat.class, new Properties());
    FileSinkDesc sinkDesc = new FileSinkDesc(finalDirName, tableDesc, false);
    sinkDesc.setDirName(finalDirName);
    sinkDesc.setTableInfo(tableDesc);
    // Mock the operator itself; stub out the accessors the code under test reads.
    FileSinkOperator sink = mock(FileSinkOperator.class);
    when(sink.getConf()).thenReturn(sinkDesc);
    when(sink.getSchema()).thenReturn(mock(RowSchema.class));
    when(sink.getCompilationOpContext()).thenReturn(mock(CompilationOpContext.class));
    return sink;
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) Properties(java.util.Properties)

Example 12 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the Apache Hive project.

From the class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsNotOptimized.

@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
    // With blobstore optimizations disabled, every branch of the conditional
    // task must keep its full chain of MoveTasks.
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
    Path sinkDir = new Path("s3a://bucket/scratch/-ext-10002");
    Path intermediateDir = new Path("s3a://bucket/scratch/-ext-10000");
    Path tableDir = new Path("s3a://bucket/warehouse/table");
    FileSinkOperator sink = createFileSinkOperator(sinkDir);
    Task<MoveWork> moveTask = createMoveTask(intermediateDir, tableDir);
    List<Task<MoveWork>> moveTasks = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(sink, intermediateDir, null, moveTasks, hiveConf, dummyMRTask, new LineageState());
    // The utility attaches a ConditionalTask with three alternative plans.
    ConditionalTask conditional = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<? extends Serializable> moveOnly = conditional.getListTasks().get(0);
    Task<? extends Serializable> mergeOnly = conditional.getListTasks().get(1);
    Task<? extends Serializable> mergeAndMove = conditional.getListTasks().get(2);
    // move-only branch: sink -> intermediate, then intermediate -> table (not collapsed)
    assertEquals(1, moveOnly.getChildTasks().size());
    verifyMoveTask(moveOnly, sinkDir, intermediateDir);
    verifyMoveTask(moveOnly.getChildTasks().get(0), intermediateDir, tableDir);
    // merge-only branch: merge writes to intermediate, then a single move to the table
    assertEquals(1, mergeOnly.getChildTasks().size());
    verifyMoveTask(mergeOnly.getChildTasks().get(0), intermediateDir, tableDir);
    // merge-and-move branch: merge, then two chained moves
    assertEquals(1, mergeAndMove.getChildTasks().size());
    assertEquals(1, mergeAndMove.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMove.getChildTasks().get(0), sinkDir, intermediateDir);
    verifyMoveTask(mergeAndMove.getChildTasks().get(0).getChildTasks().get(0), intermediateDir, tableDir);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)

Example 13 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the Apache Hive project.

From the class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsOptimized.

@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
    // With blobstore optimizations enabled, the move-only branch should be
    // collapsed into a single MoveTask.
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    Path sinkDir = new Path("s3a://bucket/scratch/-ext-10002");
    Path intermediateDir = new Path("s3a://bucket/scratch/-ext-10000");
    Path tableDir = new Path("s3a://bucket/warehouse/table");
    FileSinkOperator sink = createFileSinkOperator(sinkDir);
    Task<MoveWork> moveTask = createMoveTask(intermediateDir, tableDir);
    List<Task<MoveWork>> moveTasks = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(sink, intermediateDir, null, moveTasks, hiveConf, dummyMRTask, new LineageState());
    // The utility attaches a ConditionalTask with three alternative plans.
    ConditionalTask conditional = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<? extends Serializable> moveOnly = conditional.getListTasks().get(0);
    Task<? extends Serializable> mergeOnly = conditional.getListTasks().get(1);
    Task<? extends Serializable> mergeAndMove = conditional.getListTasks().get(2);
    /*
     * OPTIMIZATION
     * The ConditionalTask avoids linking 2 MoveTask that are expensive on blobstorage systems. Instead of
     * linking, it creates one MoveTask where the source is the first MoveTask source, and target is the
     * second MoveTask target.
     */
    // move-only branch IS optimized: one task going straight from sink to table
    assertNull(moveOnly.getChildTasks());
    verifyMoveTask(moveOnly, sinkDir, tableDir);
    // merge-only branch is NOT optimized (merge writes to intermediate, then one MoveTask runs)
    assertEquals(1, mergeOnly.getChildTasks().size());
    verifyMoveTask(mergeOnly.getChildTasks().get(0), intermediateDir, tableDir);
    // merge-and-move branch is NOT optimized: merge, then two chained moves
    assertEquals(1, mergeAndMove.getChildTasks().size());
    assertEquals(1, mergeAndMove.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMove.getChildTasks().get(0), sinkDir, intermediateDir);
    verifyMoveTask(mergeAndMove.getChildTasks().get(0).getChildTasks().get(0), intermediateDir, tableDir);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)

Example 14 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the Apache Hive project.

From the class TestGenTezWork, method setUp.

/**
 * Builds the fixture: a minimal TableScan -&gt; ReduceSink -&gt; FileSink
 * operator pipeline plus the Tez processing context the tests drive.
 *
 * @throws java.lang.Exception
 */
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
    // Configuration and session bootstrap.
    final HiveConf conf = new HiveConf(SemanticAnalyzer.class);
    SessionState.start(conf);
    // Parse context wrapping a fresh query context.
    final ParseContext pctx = new ParseContext();
    pctx.setContext(new Context(conf));
    ctx = new GenTezProcContext(conf, pctx, Collections.EMPTY_LIST, new ArrayList<Task<? extends Serializable>>(), Collections.EMPTY_SET, Collections.EMPTY_SET);
    // GenTezWork with a stubbed setupMapWork that just registers the root
    // operator under a fixed alias, skipping the real partition handling.
    proc = new GenTezWork(new GenTezUtils() {

        @Override
        protected void setupMapWork(MapWork mapWork, GenTezProcContext context, PrunedPartitionList partitions, TableScanOperator root, String alias) throws SemanticException {
            LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasMap = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
            aliasMap.put("foo", root);
            mapWork.setAliasToWork(aliasMap);
        }
    });
    // All operators share one compilation context.
    CompilationOpContext opCtx = new CompilationOpContext();
    fs = new FileSinkOperator(opCtx);
    fs.setConf(new FileSinkDesc());
    rs = new ReduceSinkOperator(opCtx);
    rs.setConf(new ReduceSinkDesc());
    TableDesc keyDesc = new TableDesc();
    keyDesc.setProperties(new Properties());
    rs.getConf().setKeySerializeInfo(keyDesc);
    ts = new TableScanOperator(opCtx);
    ts.setConf(new TableScanDesc(null));
    // Wire the pipeline in both directions: ts -> rs -> fs.
    ts.getChildOperators().add(rs);
    rs.getParentOperators().add(ts);
    rs.getChildOperators().add(fs);
    fs.getParentOperators().add(rs);
    ctx.preceedingWork = null;
    ctx.currentRootOperator = ts;
}
Also used : Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) ArrayList(java.util.ArrayList) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) HiveConf(org.apache.hadoop.hive.conf.HiveConf) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Before(org.junit.Before)

Example 15 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the Apache Hive project.

From the class HiveOutputFormatImpl, method checkOutputSpecs.

@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
    // Delegate output-spec validation to every FileSinkOperator found in the
    // job's reconstructed MapredWork plan.
    MapredWork work = Utilities.getMapRedWork(job);
    for (Operator<?> operator : work.getAllOperators()) {
        if (!(operator instanceof FileSinkOperator)) {
            continue;
        }
        FileSinkOperator sink = (FileSinkOperator) operator;
        sink.checkOutputSpecs(ignored, job);
    }
}
Also used : FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator)

Aggregations

FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)23 Path (org.apache.hadoop.fs.Path)10 Operator (org.apache.hadoop.hive.ql.exec.Operator)9 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)9 ArrayList (java.util.ArrayList)8 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)7 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)6 Task (org.apache.hadoop.hive.ql.exec.Task)6 FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc)6 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)5 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)5 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)5 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)5 Serializable (java.io.Serializable)4 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)4 MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)4 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)4 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)4 LinkedHashMap (java.util.LinkedHashMap)3 LinkedList (java.util.LinkedList)3