use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
if (op.getNumParent() != 1) {
throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
}
ParseContext parseCtx = opProcCtx.getParseCtx();
parentTask.addDependentTask(childTask);
// Root Task cannot depend on any other task, therefore childTask cannot be
// a root Task
List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
if (rootTasks.contains(childTask)) {
rootTasks.remove(childTask);
}
// Generate the temporary file name
Context baseCtx = parseCtx.getContext();
Path taskTmpDir = baseCtx.getMRTmpPath();
Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
// Create the temporary file, its corresponding FileSinkOperator, and
// its corresponding TableScanOperator.
TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
String streamDesc = taskTmpDir.toUri().toString();
MapredWork cplan = (MapredWork) childTask.getWork();
if (needsTagging(cplan.getReduceWork())) {
Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
String id = null;
if (reducerOp instanceof JoinOperator) {
if (parseCtx.getJoinOps().contains(reducerOp)) {
id = ((JoinOperator) reducerOp).getConf().getId();
}
} else if (reducerOp instanceof MapJoinOperator) {
if (parseCtx.getMapJoinOps().contains(reducerOp)) {
id = ((MapJoinOperator) reducerOp).getConf().getId();
}
} else if (reducerOp instanceof SMBMapJoinOperator) {
if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
}
}
if (id != null) {
streamDesc = id + ":$INTNAME";
} else {
streamDesc = "$INTNAME";
}
String origStreamDesc = streamDesc;
int pos = 0;
while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
streamDesc = origStreamDesc.concat(String.valueOf(++pos));
}
// TODO: Allocate work to remove the temporary files and make that
// dependent on the redTask
cplan.getReduceWork().setNeedsTagging(true);
}
// Add the path to alias mapping
setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
opProcCtx.setCurrTopOp(null);
opProcCtx.setCurrAliasId(null);
opProcCtx.setCurrTask(childTask);
opProcCtx.addRootIfPossible(parentTask);
}
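The alias-disambiguation loop above appends an increasing numeric suffix to the stream alias until aliasToWork has no entry for it. A minimal standalone sketch of the same idea, using a plain Map in place of MapWork.getAliasToWork() (the class and method names here are illustrative, not Hive API):

import java.util.HashMap;
import java.util.Map;

public class UniqueAliasDemo {
    // Returns streamDesc if unused, otherwise streamDesc1, streamDesc2, ... until a free alias is found.
    static String uniqueAlias(Map<String, Object> aliasToWork, String streamDesc) {
        String origStreamDesc = streamDesc;
        int pos = 0;
        while (aliasToWork.get(streamDesc) != null) {
            streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
        return streamDesc;
    }

    public static void main(String[] args) {
        Map<String, Object> aliasToWork = new HashMap<>();
        aliasToWork.put("$INTNAME", new Object());
        aliasToWork.put("$INTNAME1", new Object());
        System.out.println(uniqueAlias(aliasToWork, "$INTNAME")); // prints $INTNAME2
    }
}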
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenMapRedUtils method joinPlan.
/**
* Merge the current task into the old task for the reducer
*
* @param currTask
* the current task for the current reducer
* @param oldTask
* the old task for the current reducer
* @param opProcCtx
* processing context
*/
public static void joinPlan(Task<? extends Serializable> currTask, Task<? extends Serializable> oldTask, GenMRProcContext opProcCtx) throws SemanticException {
assert currTask != null && oldTask != null;
TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
List<Task<? extends Serializable>> parTasks = null;
// terminate the old task and make current task dependent on it
if (currTask.getParentTasks() != null && !currTask.getParentTasks().isEmpty()) {
parTasks = new ArrayList<Task<? extends Serializable>>();
parTasks.addAll(currTask.getParentTasks());
Object[] parTaskArr = parTasks.toArray();
for (Object element : parTaskArr) {
((Task<? extends Serializable>) element).removeDependentTask(currTask);
}
}
if (currTopOp != null) {
mergeInput(currTopOp, opProcCtx, oldTask, false);
}
if (parTasks != null) {
for (Task<? extends Serializable> parTask : parTasks) {
parTask.addDependentTask(oldTask);
}
}
if (oldTask instanceof MapRedTask && currTask instanceof MapRedTask) {
((MapRedTask) currTask).getWork().getMapWork().mergingInto(((MapRedTask) oldTask).getWork().getMapWork());
}
opProcCtx.setCurrTopOp(null);
opProcCtx.setCurrTask(oldTask);
}
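Note how joinPlan snapshots currTask's parent list with toArray() before calling removeDependentTask, which mutates those same parent/child lists; iterating the live list directly could throw ConcurrentModificationException or skip elements. A self-contained sketch of that snapshot-before-mutation pattern (Node is a hypothetical stand-in for Hive's Task, not a Hive class):

import java.util.ArrayList;
import java.util.List;

public class DetachDemo {
    static class Node {
        final List<Node> parents = new ArrayList<>();
        final List<Node> children = new ArrayList<>();

        void removeDependent(Node child) {
            children.remove(child);
            child.parents.remove(this);
        }
    }

    // Detach node from all of its parents, snapshotting first because
    // removeDependent mutates node.parents while we iterate.
    static void detachFromParents(Node node) {
        for (Object p : node.parents.toArray()) {
            ((Node) p).removeDependent(node);
        }
    }

    public static void main(String[] args) {
        Node a = new Node(), b = new Node(), c = new Node();
        c.parents.add(a); a.children.add(c);
        c.parents.add(b); b.children.add(c);
        detachFromParents(c);
        System.out.println(c.parents.isEmpty()); // true
    }
}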
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenMapRedUtils method createMRWorkForMergingFiles.
/**
 * @param fsInput The FileSink operator.
 * @param finalName the final destination path the merge job should output.
 * @param dependencyTask
 * @param mvTasks
 * @param conf
 * @param currTask
 * @param lineageState
 * @throws SemanticException
 *
 * create a Map-only merge job using CombineHiveInputFormat for all partitions with
 * the following operators:
 *   MR job J0:
 *   ...
 *   |
 *   v
 *   FileSinkOperator_1 (fsInput)
 *   |
 *   v
 *   Merge job J1:
 *   |
 *   v
 *   TableScan (using CombineHiveInputFormat) (tsMerge)
 *   |
 *   v
 *   FileSinkOperator (fsMerge)
 *
 * Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
 * do not contain the dynamic partitions (their parent). So after the dynamic partitions are
 * created (after the first job finishes, before the moveTask or ConditionalTask starts),
 * we need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
 * directories.
 */
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<? extends Serializable> currTask, LineageState lineageState) throws SemanticException {
//
// 1. create the operator tree
//
FileSinkDesc fsInputDesc = fsInput.getConf();
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Creating merge work from " + System.identityHashCode(fsInput) + " with write ID " + (fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null) + " into " + finalName);
}
boolean isBlockMerge = (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class));
RowSchema inputRS = fsInput.getSchema();
Long srcMmWriteId = fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null;
FileSinkDesc fsOutputDesc = null;
TableScanOperator tsMerge = null;
if (!isBlockMerge) {
// Create a TableScan operator
tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
// Create a FileSink operator
TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
Path mergeDest = srcMmWriteId == null ? finalName : finalName.getParent();
fsOutputDesc = new FileSinkDesc(mergeDest, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
fsOutputDesc.setMmWriteId(srcMmWriteId);
fsOutputDesc.setIsMerge(true);
// Create and attach the filesink for the merge.
OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
}
// If the input FileSinkOperator is dynamic-partition enabled, the tsMerge input schema
// needs to include the partition columns, and the fsOutput should have
// a DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// adding DP ColumnInfo to the RowSchema signature
ArrayList<ColumnInfo> signature = inputRS.getSignature();
String tblAlias = fsInputDesc.getTableInfo().getTableName();
for (String dpCol : dpCtx.getDPColNames()) {
ColumnInfo colInfo = new ColumnInfo(dpCol, // all partition column types should be string
TypeInfoFactory.stringTypeInfo, tblAlias, // partition columns are virtual columns
true);
signature.add(colInfo);
}
inputRS.setSignature(signature);
if (!isBlockMerge) {
// create another DynamicPartitionCtx, which has a different input-to-DP column mapping
DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
fsOutputDesc.setDynPartCtx(dpCtx2);
}
// update the FileSinkOperator to include partition columns
usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), dpCtx.getDPColNames());
} else {
// non-partitioned table
fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
}
//
// 2. Constructing a conditional task consisting of a move task and a map reduce task
//
Path inputDirName = fsInputDesc.getMergeInputDirName();
MapWork cplan;
Serializable work;
if (isBlockMerge) {
cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = cplan;
}
} else {
cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = new MapredWork();
((MapredWork) work).setMapWork(cplan);
}
}
// use CombineHiveInputFormat for map-only merging
cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
// NOTE: we should gather stats in MR1 rather than MR2 of the merge job, since we don't
// know whether merge MR2 will be triggered at execution time
MoveWork dummyMv = null;
if (srcMmWriteId == null) {
// Only create the movework for non-MM table. No action needed for a MM table.
dummyMv = new MoveWork(null, null, null, new LoadFileDesc(inputDirName, finalName, true, null, null, false), false);
}
// Use the original fsOp path here in case of MM - while the new FSOP merges files inside the
// MM directory, the original MoveTask still commits based on the parent. Note that this path
// can only be triggered for a merge that's part of insert for now; MM tables do not support
// concatenate. Keeping the old logic for non-MM tables with temp directories and stuff.
Path fsopPath = srcMmWriteId != null ? fsInputDesc.getFinalDirName() : finalName;
Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopPath, fsInputDesc.isMmTable());
ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getMergeInputDirName(), finalName, mvTask, dependencyTask, lineageState);
// keep the dynamic partition context in conditional task resolver context
ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
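The engine dispatch (tez/spark/MR) is duplicated verbatim in the two branches above. A hedged refactoring sketch that factors it into one helper, using only the constructors and setters that already appear in the snippet; wrapForEngine is illustrative and not part of GenMapRedUtils:

// Wrap a MapWork into engine-specific work. For block-level merge on plain MR the
// MapWork is used directly; otherwise it is wrapped in a MapredWork.
private static Serializable wrapForEngine(HiveConf conf, MapWork cplan, boolean isBlockMerge) {
    String engine = conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE);
    if (engine.equals("tez")) {
        TezWork tezWork = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
        cplan.setName("File Merge");
        tezWork.add(cplan);
        return tezWork;
    } else if (engine.equals("spark")) {
        SparkWork sparkWork = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
        cplan.setName("Spark Merge File Work");
        sparkWork.add(cplan);
        return sparkWork;
    } else if (isBlockMerge) {
        return cplan;
    } else {
        MapredWork mrWork = new MapredWork();
        mrWork.setMapWork(cplan);
        return mrWork;
    }
}

With such a helper, both branches would reduce to building cplan and then calling work = wrapForEngine(conf, cplan, isBlockMerge).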
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SharedWorkOptimizer method areMergeable.
// FIXME: probably this should also be integrated with the isSame() logic
private static boolean areMergeable(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
// First we check if the two table scan operators can actually be merged
// If schemas do not match, we currently do not merge
List<String> prevTsOpNeededColumns = tsOp1.getNeededColumns();
List<String> tsOpNeededColumns = tsOp2.getNeededColumns();
if (prevTsOpNeededColumns.size() != tsOpNeededColumns.size()) {
return false;
}
boolean notEqual = false;
for (int i = 0; i < prevTsOpNeededColumns.size(); i++) {
if (!prevTsOpNeededColumns.get(i).equals(tsOpNeededColumns.get(i))) {
notEqual = true;
break;
}
}
if (notEqual) {
return false;
}
// If row limit does not match, we currently do not merge
if (tsOp1.getConf().getRowLimit() != tsOp2.getConf().getRowLimit()) {
return false;
}
// If partitions do not match, we currently do not merge
PrunedPartitionList prevTsOpPPList = pctx.getPrunedPartitions(tsOp1);
PrunedPartitionList tsOpPPList = pctx.getPrunedPartitions(tsOp2);
if (!prevTsOpPPList.getPartitions().equals(tsOpPPList.getPartitions())) {
return false;
}
// If there is a DPP, check whether it actually refers to the same target, column, etc.
// Further, the DPP value needs to be generated from the same subtree
List<Operator<?>> dppsOp1 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
List<Operator<?>> dppsOp2 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
if (dppsOp1.isEmpty() && dppsOp2.isEmpty()) {
return true;
}
for (int i = 0; i < dppsOp1.size(); i++) {
Operator<?> op = dppsOp1.get(i);
if (op instanceof ReduceSinkOperator) {
Set<Operator<?>> ascendants = findAscendantWorkOperators(pctx, optimizerCache, op);
if (ascendants.contains(tsOp2)) {
// This should not happen, we cannot merge
return false;
}
}
}
for (int i = 0; i < dppsOp2.size(); i++) {
Operator<?> op = dppsOp2.get(i);
if (op instanceof ReduceSinkOperator) {
Set<Operator<?>> ascendants = findAscendantWorkOperators(pctx, optimizerCache, op);
if (ascendants.contains(tsOp1)) {
// This should not happen, we cannot merge
return false;
}
}
}
if (dppsOp1.size() != dppsOp2.size()) {
// Only the first or the second operator contains DPP pruning
return false;
}
// Check if DPP branches are equal
BitSet bs = new BitSet();
for (int i = 0; i < dppsOp1.size(); i++) {
Operator<?> dppOp1 = dppsOp1.get(i);
for (int j = 0; j < dppsOp2.size(); j++) {
if (!bs.get(j)) {
// If not visited yet
Operator<?> dppOp2 = dppsOp2.get(j);
if (compareAndGatherOps(pctx, dppOp1, dppOp2) != null) {
// The DPP operators/branches are equal
bs.set(j);
break;
}
}
}
if (bs.cardinality() < i + 1) {
return false;
}
}
return true;
}
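The last loop above matches the two DPP-source lists as an unordered one-to-one pairing: each element of dppsOp1 must claim a not-yet-claimed equal element of dppsOp2, with the BitSet tracking claims and the cardinality check failing fast as soon as some element finds no partner. A self-contained sketch of the same matching scheme over plain strings standing in for the DPP branches (it assumes equal-sized lists, as the size check above guarantees):

import java.util.BitSet;
import java.util.List;

public class UnorderedMatchDemo {
    // True iff every element of xs can be paired with a distinct equal element of ys.
    static boolean matches(List<String> xs, List<String> ys) {
        BitSet bs = new BitSet();
        for (int i = 0; i < xs.size(); i++) {
            for (int j = 0; j < ys.size(); j++) {
                if (!bs.get(j) && xs.get(i).equals(ys.get(j))) { // not yet claimed and equal
                    bs.set(j);
                    break;
                }
            }
            if (bs.cardinality() < i + 1) {
                return false; // xs.get(i) found no partner
            }
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(matches(List.of("a", "b"), List.of("b", "a"))); // true
        System.out.println(matches(List.of("a", "a"), List.of("a", "b"))); // false
    }
}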
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SharedWorkOptimizer method pushFilterToTopOfTableScan.
private static void pushFilterToTopOfTableScan(SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp) throws UDFArgumentException {
ExprNodeGenericFuncDesc tableScanExprNode = tsOp.getConf().getFilterExpr();
List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(tsOp.getChildOperators());
for (Operator<? extends OperatorDesc> op : allChildren) {
if (op instanceof FilterOperator) {
FilterOperator filterOp = (FilterOperator) op;
ExprNodeDesc filterExprNode = filterOp.getConf().getPredicate();
if (tableScanExprNode.isSame(filterExprNode)) {
// We do not need to do anything
return;
}
if (tableScanExprNode.getGenericUDF() instanceof GenericUDFOPOr) {
for (ExprNodeDesc childExprNode : tableScanExprNode.getChildren()) {
if (childExprNode.isSame(filterExprNode)) {
// the predicate was probably pushed previously; nothing to do
return;
}
}
}
ExprNodeGenericFuncDesc newPred = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPAnd(), Arrays.<ExprNodeDesc>asList(tableScanExprNode.clone(), filterExprNode));
filterOp.getConf().setPredicate(newPred);
} else {
Operator<FilterDesc> newOp = OperatorFactory.get(tsOp.getCompilationOpContext(), new FilterDesc(tableScanExprNode.clone(), false), new RowSchema(tsOp.getSchema().getSignature()));
tsOp.replaceChild(op, newOp);
newOp.getParentOperators().add(tsOp);
op.replaceParent(tsOp, newOp);
newOp.getChildOperators().add(op);
// Add to cache (same group as tsOp)
optimizerCache.putIfWorkExists(newOp, tsOp);
}
}
}
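The else branch splices a new FilterOperator between the table scan and its child in four steps: swap the scan's child pointer, add the scan as the new node's parent, redirect the child's parent pointer, and add the child to the new node's children. The same rewiring on a minimal DAG of plain nodes (Op is a hypothetical stand-in for Hive's Operator, not a Hive class):

import java.util.ArrayList;
import java.util.List;

public class SpliceDemo {
    static class Op {
        final String name;
        final List<Op> parents = new ArrayList<>();
        final List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }

        void replaceChild(Op oldChild, Op newChild) {
            children.set(children.indexOf(oldChild), newChild);
        }
        void replaceParent(Op oldParent, Op newParent) {
            parents.set(parents.indexOf(oldParent), newParent);
        }
    }

    // Insert mid between parent and child, mirroring the four calls in the else branch above.
    static void splice(Op parent, Op child, Op mid) {
        parent.replaceChild(child, mid);   // tsOp.replaceChild(op, newOp)
        mid.parents.add(parent);           // newOp.getParentOperators().add(tsOp)
        child.replaceParent(parent, mid);  // op.replaceParent(tsOp, newOp)
        mid.children.add(child);           // newOp.getChildOperators().add(op)
    }

    public static void main(String[] args) {
        Op ts = new Op("TS"), sel = new Op("SEL"), fil = new Op("FIL");
        ts.children.add(sel); sel.parents.add(ts);
        splice(ts, sel, fil);
        System.out.println(ts.children.get(0).name + " -> " + fil.children.get(0).name); // FIL -> SEL
    }
}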