Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.
From the class TestGenMapRedUtilsCreateConditionalTask, the method createFileSinkOperator:
private FileSinkOperator createFileSinkOperator(Path finalDirName) {
  // Mock a FileSinkOperator whose descriptor points at the given final directory.
  FileSinkOperator fileSinkOperator = mock(FileSinkOperator.class);
  TableDesc tableDesc = new TableDesc(HiveInputFormat.class, HiveOutputFormat.class, new Properties());
  FileSinkDesc fileSinkDesc = new FileSinkDesc(finalDirName, tableDesc, false);
  fileSinkDesc.setDirName(finalDirName);
  fileSinkDesc.setTableInfo(tableDesc);
  when(fileSinkOperator.getConf()).thenReturn(fileSinkDesc);
  when(fileSinkOperator.getSchema()).thenReturn(mock(RowSchema.class));
  when(fileSinkOperator.getCompilationOpContext()).thenReturn(mock(CompilationOpContext.class));
  return fileSinkOperator;
}
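The two tests below also rely on createMoveTask and verifyMoveTask helpers that this excerpt does not show. A minimal sketch of what they could look like, assuming Mockito mocks and a LoadFileDesc-backed MoveWork (constructor signatures vary across Hive versions, so treat the details as illustrative):

private Task<MoveWork> createMoveTask(Path source, Path destination) {
  Task<MoveWork> moveTask = mock(MoveTask.class);
  MoveWork moveWork = new MoveWork();
  // Describe a plain file move from source to destination.
  moveWork.setLoadFileWork(new LoadFileDesc(source, destination, true, null, null));
  when(moveTask.getWork()).thenReturn(moveWork);
  return moveTask;
}

private void verifyMoveTask(Task<? extends Serializable> task, Path source, Path target) {
  MoveTask moveTask = (MoveTask) task;
  assertEquals(source, moveTask.getWork().getLoadFileWork().getSourcePath());
  assertEquals(target, moveTask.getWork().getLoadFileWork().getTargetDir());
}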
Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.
From the class TestGenMapRedUtilsCreateConditionalTask, the method testConditionalMoveTaskIsNotOptimized:
@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  // Verify moveOnlyTask is NOT optimized
  assertEquals(1, moveOnlyTask.getChildTasks().size());
  verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
  verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.
From the class TestGenMapRedUtilsCreateConditionalTask, the method testConditionalMoveTaskIsOptimized:
@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  /*
   * OPTIMIZATION
   * The ConditionalTask avoids linking two MoveTasks, which are expensive on blobstore systems.
   * Instead of linking them, it creates a single MoveTask whose source is the first MoveTask's
   * source and whose target is the second MoveTask's target.
   */
  // Verify moveOnlyTask is optimized
  assertNull(moveOnlyTask.getChildTasks());
  verifyMoveTask(moveOnlyTask, sinkDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized (a merge task writes directly to finalDirName, then a MoveTask is executed)
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
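Both tests toggle the same flag: HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED resolves to the hive.blobstore.optimizations.enabled property. A small sketch of how the flag could be read and written through HiveConf's typed accessors (illustrative usage, not part of the test):

HiveConf conf = new HiveConf();
// Same effect as hiveConf.set(...varname, "true") in the tests above.
conf.setBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED, true);
boolean blobstoreOptimizationsEnabled =
    conf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED);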
Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.
From the class TestGenTezWork, the method setUp:
/**
 * @throws java.lang.Exception
 */
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
  // Init conf
  final HiveConf conf = new HiveConf(SemanticAnalyzer.class);
  SessionState.start(conf);
  // Init parse context
  final ParseContext pctx = new ParseContext();
  pctx.setContext(new Context(conf));
  ctx = new GenTezProcContext(conf, pctx, Collections.EMPTY_LIST, new ArrayList<Task<? extends Serializable>>(), Collections.EMPTY_SET, Collections.EMPTY_SET);
  proc = new GenTezWork(new GenTezUtils() {
    @Override
    protected void setupMapWork(MapWork mapWork, GenTezProcContext context, PrunedPartitionList partitions, TableScanOperator root, String alias) throws SemanticException {
      LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
      map.put("foo", root);
      mapWork.setAliasToWork(map);
    }
  });
  CompilationOpContext cCtx = new CompilationOpContext();
  fs = new FileSinkOperator(cCtx);
  fs.setConf(new FileSinkDesc());
  rs = new ReduceSinkOperator(cCtx);
  rs.setConf(new ReduceSinkDesc());
  TableDesc tableDesc = new TableDesc();
  tableDesc.setProperties(new Properties());
  rs.getConf().setKeySerializeInfo(tableDesc);
  ts = new TableScanOperator(cCtx);
  ts.setConf(new TableScanDesc(null));
  // Wire up a TS -> RS -> FS operator pipeline.
  ts.getChildOperators().add(rs);
  rs.getParentOperators().add(ts);
  rs.getChildOperators().add(fs);
  fs.getParentOperators().add(rs);
  ctx.preceedingWork = null;
  ctx.currentRootOperator = ts;
}
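A test built on this fixture typically drives the processor over the operators wired above. A minimal sketch, assuming GenTezWork follows Hive's NodeProcessor dispatch contract; the assertion is illustrative, and the real assertions in TestGenTezWork are omitted here:

@Test
public void testCreateMap() throws SemanticException {
  // Dispatch the processor on the ReduceSinkOperator wired up in setUp();
  // GenTezWork should cut the plan at the RS boundary and emit a MapWork.
  proc.process(rs, null, ctx, (Object[]) null);
  // Illustrative check: the context should now carry a current task.
  assertNotNull(ctx.currentTask);
}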
Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.
From the class HiveOutputFormatImpl, the method checkOutputSpecs:
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
  // Delegate output-spec validation to every file sink in the query plan.
  MapredWork work = Utilities.getMapRedWork(job);
  List<Operator<?>> opList = work.getAllOperators();
  for (Operator<?> op : opList) {
    if (op instanceof FileSinkOperator) {
      ((FileSinkOperator) op).checkOutputSpecs(ignored, job);
    }
  }
}
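In the Hadoop OutputFormat contract, checkOutputSpecs runs at job-submission time so a misconfigured sink fails fast instead of mid-job. A hypothetical driver-side sketch (illustrative only: Utilities.getMapRedWork requires that the Hive query plan already be serialized into the JobConf):

// Hypothetical submission-time check; Hadoop's JobClient issues the
// equivalent call automatically when a job is submitted.
JobConf job = new JobConf();
OutputFormat<?, ?> outputFormat = new HiveOutputFormatImpl();
outputFormat.checkOutputSpecs(FileSystem.get(job), job);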