use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class IndexWhereProcessor method process.
/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
  TableScanOperator operator = (TableScanOperator) nd;
  List<Node> opChildren = operator.getChildren();
  TableScanDesc operatorDesc = operator.getConf();
  if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
    return null;
  }
  List<Index> indexes = tsToIndices.get(operator);
  ExprNodeDesc predicate = operatorDesc.getFilterExpr();
  IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
  ParseContext pctx = context.getParseContext();
  LOG.info("Processing predicate for index optimization");
  if (predicate == null) {
    LOG.info("null predicate pushed down");
    return null;
  }
  LOG.info(predicate.getExprString());
  // check if we have indexes on all partitions in this table scan
  Set<Partition> queryPartitions;
  try {
    queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
    if (queryPartitions == null) {
      // partitions not covered
      return null;
    }
  } catch (HiveException e) {
    LOG.error("Fatal Error: problem accessing metastore", e);
    throw new SemanticException(e);
  }
  // we can only process MapReduce tasks to check input size
  if (!context.getCurrentTask().isMapRedTask()) {
    return null;
  }
  MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
  // get potential reentrant index queries from each index
  Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
  // make sure we have an index on the table being scanned
  TableDesc tblDesc = operator.getTableDesc();
  Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
  for (Index indexOnTable : indexes) {
    if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
      List<Index> newType = new ArrayList<Index>();
      newType.add(indexOnTable);
      indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
    } else {
      indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
    }
  }
  // choose the index type with the most indexes of that type on the table
  // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
  List<Index> bestIndexes = indexesByType.values().iterator().next();
  for (List<Index> indexTypes : indexesByType.values()) {
    if (bestIndexes.size() < indexTypes.size()) {
      bestIndexes = indexTypes;
    }
  }
  // rewrite index queries for the chosen index type
  HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
  tmpQueryContext.setQueryPartitions(queryPartitions);
  rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
  List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
  if (indexTasks != null && indexTasks.size() > 0) {
    queryContexts.put(bestIndexes.get(0), tmpQueryContext);
  }
  // choose an index rewrite to use
  if (queryContexts.size() > 0) {
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    Index chosenIndex = queryContexts.keySet().iterator().next();
    // modify the parse context to use indexing
    // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
    HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
    // prepare the map reduce job to use indexing
    MapWork work = currentTask.getWork().getMapWork();
    work.setInputformat(queryContext.getIndexInputFormat());
    work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
    // modify inputs based on index query
    Set<ReadEntity> inputs = pctx.getSemanticInputs();
    inputs.addAll(queryContext.getAdditionalSemanticInputs());
    List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
    // add dependencies so index query runs first
    insertIndexQuery(pctx, context, chosenRewrite);
  }
  return null;
}
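The Javadoc above says this processor is matched by a rule in IndexWhereTaskDispatcher. The sketch below shows how a NodeProcessor of this kind is typically bound to a rule and walked over the operator tree with Hive's org.apache.hadoop.hive.ql.lib classes; the rule name, the IndexWhereProcessor constructor argument, and the surrounding method are assumptions for illustration, not the IndexWhereTaskDispatcher source.

// illustrative sketch, not the actual IndexWhereTaskDispatcher source
private void walkWithIndexWhereProcessor(ParseContext parseContext, Map<TableScanOperator, List<Index>> tsToIndices, IndexWhereProcCtx indexWhereProcCtx) throws SemanticException {
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  // "TS%" matches any TableScanOperator and routes it to the processor shown above
  opRules.put(new RuleRegExp("RULE:TS", TableScanOperator.getOperatorName() + "%"), new IndexWhereProcessor(tsToIndices));
  // no default processor; nodes that match no rule are simply skipped
  Dispatcher dispatcher = new DefaultRuleDispatcher(null, opRules, indexWhereProcCtx);
  GraphWalker walker = new DefaultGraphWalker(dispatcher);
  List<Node> topNodes = new ArrayList<Node>(parseContext.getTopOps().values());
  walker.startWalking(topNodes, null);
}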
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class Utilities method reworkMapRedWork.
/**
 * The check here is not especially clean. It first loops over all input
 * formats and collects the ones that implement ReworkMapredInputFormat into a
 * set, and then goes through that set and calls rework for each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. In today's Hive and Hadoop that is not possible,
 * because many of the input formats Hive uses live in Hadoop's code base, and
 * most of Hadoop's input formats only implement the InputFormat interface.
 *
 * @param task
 * @param reworkMapredWork
 * @param conf
 * @throws SemanticException
 */
public static void reworkMapRedWork(Task<? extends Serializable> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
  if (reworkMapredWork && (task instanceof MapRedTask)) {
    try {
      MapredWork mapredWork = ((MapRedTask) task).getWork();
      Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
      for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
        Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
        if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
          reworkInputFormats.add(inputFormatCls);
        }
      }
      if (reworkInputFormats.size() > 0) {
        for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
          ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
          inst.rework(conf, mapredWork);
        }
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
  }
}
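The loop above only assumes that each collected class has a no-argument constructor and exposes rework(conf, mapredWork), which is what the ReworkMapredInputFormat interface it checks for provides. A minimal implementor could look like the sketch below; the class name and the body of rework are invented for illustration, and the interface signature is inferred from the call site above.

// hypothetical implementor, for illustration only
public class ExampleReworkInputFormat extends TextInputFormat implements ReworkMapredInputFormat {

  // called by reworkMapRedWork before the MapRedTask runs, so the input format
  // can inspect or adjust the MapredWork it is about to read for
  @Override
  public void rework(HiveConf job, MapredWork work) throws IOException {
    // illustrative only: fail fast if no input partitions are registered
    if (work.getMapWork().getPathToPartitionInfo().isEmpty()) {
      throw new IOException("no input partitions registered for rework");
    }
  }
}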
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class TestGenMapRedUtilsCreateConditionalTask method setUp.
@Before
public void setUp() {
  dummyMRTask = new MapRedTask();
  SessionState.start(hiveConf);
}
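The dummyMRTask and hiveConf fields used here are declared elsewhere in the test class; a minimal sketch of how such a fixture is commonly declared follows (the field modifiers and the plain HiveConf initialization are assumptions, not the actual TestGenMapRedUtilsCreateConditionalTask source).

// sketch of assumed test fixture fields, not the actual test source
private static HiveConf hiveConf;
private MapRedTask dummyMRTask;

@BeforeClass
public static void initializeSessionConf() {
  // a default HiveConf is typically enough for SessionState.start() in a unit test
  hiveConf = new HiveConf();
}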
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class SamplingOptimizer method resolve.
public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
  for (Task<?> task : pctx.getRootTasks()) {
    if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
      // this could be replaced by bucketing on RS + bucketed fetcher for next MR
      continue;
    }
    MapredWork mrWork = ((MapRedTask) task).getWork();
    MapWork mapWork = mrWork.getMapWork();
    ReduceWork reduceWork = mrWork.getReduceWork();
    if (reduceWork == null || reduceWork.getNumReduceTasks() != 1 || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0 || reduceWork.getReducer() == null) {
      continue;
    }
    // GROUPBY operator in reducer may not be processed in parallel. Skip optimizing.
    if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
      continue;
    }
    Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
    if (!(operator instanceof TableScanOperator)) {
      continue;
    }
    TableScanOperator tsop = (TableScanOperator) operator;
    Table tbl = tsop.getConf().getTableMetadata();
    if (tbl == null) {
      continue;
    }
    if (AcidUtils.isInsertOnlyTable(tbl.getParameters())) {
      // sampler will limit the input to the correct directories, but we don't care about MR.
      continue;
    }
    ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
    if (child == null || child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
      continue;
    }
    child.getConf().setNumReducers(-1);
    reduceWork.setNumReduceTasks(-1);
    mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
  }
  return pctx;
}
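SamplingOptimizer is one of Hive's physical plan resolvers; the resolve(PhysicalContext) signature above is the whole contract. A minimal sketch of how such resolvers are chained by a physical optimizer follows; the explicit list, the loop, and the physicalContext variable assumed to be in scope are illustrative, not the PhysicalOptimizer source.

// illustrative sketch, not the PhysicalOptimizer source:
// run a chain of resolvers, each receiving the context produced by the previous one
List<PhysicalPlanResolver> resolvers = new ArrayList<PhysicalPlanResolver>();
resolvers.add(new SamplingOptimizer());
for (PhysicalPlanResolver resolver : resolvers) {
  physicalContext = resolver.resolve(physicalContext);
}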
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class MoveTask method inferTaskInformation.
private void inferTaskInformation(TaskInformation ti) {
  // find the first ancestor of this MoveTask that is some form of map reduce task
  // (either standard, local, or a merge)
  while (ti.task.getParentTasks() != null && ti.task.getParentTasks().size() == 1) {
    ti.task = (Task) ti.task.getParentTasks().get(0);
    // If it was a merge task or a local map reduce task, nothing can be inferred
    if (ti.task instanceof MergeFileTask || ti.task instanceof MapredLocalTask) {
      break;
    }
    // if it was a standard map reduce task, infer the bucket and sort columns for
    // the directory this move task is moving
    if (ti.task instanceof MapRedTask) {
      MapredWork work = (MapredWork) ti.task.getWork();
      MapWork mapWork = work.getMapWork();
      ti.bucketCols = mapWork.getBucketedColsByDirectory().get(ti.path);
      ti.sortCols = mapWork.getSortedColsByDirectory().get(ti.path);
      if (work.getReduceWork() != null) {
        ti.numBuckets = work.getReduceWork().getNumReduceTasks();
      }
      if (ti.bucketCols != null || ti.sortCols != null) {
        // this must be the final map reduce task (the one containing the
        // operator that writes the final output)
        assert work.isFinalMapRed();
      }
      break;
    }
    // the parent may also be a MoveTask, e.g. when the condition for merging is not met, see GenMRFileSink1.
    if (ti.task instanceof MoveTask) {
      MoveTask mt = (MoveTask) ti.task;
      if (mt.getWork().getLoadFileWork() != null) {
        ti.path = mt.getWork().getLoadFileWork().getSourcePath().toUri().toString();
      }
    }
  }
}