use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class TestOperators method testMapOperator.
public void testMapOperator() throws Throwable {
  try {
    System.out.println("Testing Map Operator");
    // initialize configuration
    JobConf hconf = new JobConf(TestOperators.class);
    hconf.set(MRJobConfig.MAP_INPUT_FILE, "hdfs:///testDir/testFile");
    IOContextMap.get(hconf).setInputPath(new Path("hdfs:///testDir/testFile"));
    // initialize pathToAliases
    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add("a");
    aliases.add("b");
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put(new Path("hdfs:///testDir"), aliases);
    // initialize pathToTableInfo
    // Default: treat the table as a single column "col"
    TableDesc td = Utilities.defaultTd;
    PartitionDesc pd = new PartitionDesc(td, null);
    LinkedHashMap<Path, org.apache.hadoop.hive.ql.plan.PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
    pathToPartitionInfo.put(new Path("hdfs:///testDir"), pd);
    // initialize aliasToWork
    CompilationOpContext ctx = new CompilationOpContext();
    CollectDesc cd = new CollectDesc(Integer.valueOf(1));
    CollectOperator cdop1 = (CollectOperator) OperatorFactory.get(ctx, CollectDesc.class);
    cdop1.setConf(cd);
    CollectOperator cdop2 = (CollectOperator) OperatorFactory.get(ctx, CollectDesc.class);
    cdop2.setConf(cd);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    aliasToWork.put("a", cdop1);
    aliasToWork.put("b", cdop2);
    // initialize mapredWork
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToAliases(pathToAliases);
    mrwork.getMapWork().setPathToPartitionInfo(pathToPartitionInfo);
    mrwork.getMapWork().setAliasToWork(aliasToWork);
    // get map operator and initialize it
    MapOperator mo = new MapOperator(new CompilationOpContext());
    mo.initializeAsRoot(hconf, mrwork.getMapWork());
    Text tw = new Text();
    InspectableObject io1 = new InspectableObject();
    InspectableObject io2 = new InspectableObject();
    for (int i = 0; i < 5; i++) {
      String answer = "[[" + i + ", " + (i + 1) + ", " + (i + 2) + "]]";
      // columns are separated by the default delimiter, ctrl-A (\u0001)
      tw.set("" + i + "\u0001" + (i + 1) + "\u0001" + (i + 2));
      mo.process(tw);
      cdop1.retrieve(io1);
      cdop2.retrieve(io2);
      System.out.println("io1.o.toString() = " + io1.o.toString());
      System.out.println("io2.o.toString() = " + io2.o.toString());
      System.out.println("answer.toString() = " + answer.toString());
      assertEquals(answer.toString(), io1.o.toString());
      assertEquals(answer.toString(), io2.o.toString());
    }
    System.out.println("Map Operator ok");
  } catch (Throwable e) {
    e.printStackTrace();
    throw (e);
  }
}
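Because both aliases "a" and "b" are mapped to the same input path, a single call to mo.process(tw) fans the deserialized row out to both CollectOperators, which is why both assertions compare against the same expected string. The MapredWork-specific part of the test is the wiring of the three MapWork maps; below is a trimmed-down sketch of that wiring for a single alias, reusing only calls that already appear in the test above (the variable names are placeholders):

// Sketch: one input path, one alias, one root operator (mirrors the test above).
CompilationOpContext ctx = new CompilationOpContext();
CollectOperator collectOp = (CollectOperator) OperatorFactory.get(ctx, CollectDesc.class);
collectOp.setConf(new CollectDesc(Integer.valueOf(1)));
Path inputPath = new Path("hdfs:///testDir");
ArrayList<String> aliases = new ArrayList<String>();
aliases.add("a");
LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
pathToAliases.put(inputPath, aliases);
LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
pathToPartitionInfo.put(inputPath, new PartitionDesc(Utilities.defaultTd, null));
LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<>();
aliasToWork.put("a", collectOp);
MapredWork mrwork = new MapredWork();
mrwork.getMapWork().setPathToAliases(pathToAliases);
mrwork.getMapWork().setPathToPartitionInfo(pathToPartitionInfo);
mrwork.getMapWork().setAliasToWork(aliasToWork);
// MapOperator.initializeAsRoot(conf, mrwork.getMapWork()) can then resolve the
// operator tree for any file under hdfs:///testDir.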
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class CompactIndexHandler method generateIndexQuery.
@Override
public void generateIndexQuery(List<Index> indexes, ExprNodeDesc predicate, ParseContext pctx, HiveIndexQueryContext queryContext) {
  Index index = indexes.get(0);
  DecomposedPredicate decomposedPredicate = decomposePredicate(predicate, index, queryContext.getQueryPartitions());
  if (decomposedPredicate == null) {
    queryContext.setQueryTasks(null);
    // abort if we couldn't pull out anything from the predicate
    return;
  }
  // pass residual predicate back out for further processing
  queryContext.setResidualPredicate(decomposedPredicate.residualPredicate);
  // setup TableScanOperator to change input format for original query
  queryContext.setIndexInputFormat(HiveCompactIndexInputFormat.class.getName());
  // Build reentrant QL for index query
  StringBuilder qlCommand = new StringBuilder("INSERT OVERWRITE DIRECTORY ");
  String tmpFile = pctx.getContext().getMRTmpPath().toUri().toString();
  queryContext.setIndexIntermediateFile(tmpFile);
  // QL includes " around file name
  qlCommand.append("\"" + tmpFile + "\" ");
  qlCommand.append("SELECT `_bucketname` , `_offsets` FROM ");
  qlCommand.append(HiveUtils.unparseIdentifier(index.getIndexTableName()));
  qlCommand.append(" WHERE ");
  String predicateString = decomposedPredicate.pushedPredicate.getExprString();
  qlCommand.append(predicateString);
  // generate tasks from index query string
  LOG.info("Generating tasks for re-entrant QL query: " + qlCommand.toString());
  HiveConf queryConf = new HiveConf(pctx.getConf(), CompactIndexHandler.class);
  HiveConf.setBoolVar(queryConf, HiveConf.ConfVars.COMPRESSRESULT, false);
  Driver driver = new Driver(queryConf);
  driver.compile(qlCommand.toString(), false);
  if (pctx.getConf().getBoolVar(ConfVars.HIVE_INDEX_COMPACT_BINARY_SEARCH) && useSorted) {
    // For now, only works if the predicate is a single condition
    MapWork work = null;
    String originalInputFormat = null;
    for (Task task : driver.getPlan().getRootTasks()) {
      // The reentrant index query should have exactly one root-level map reduce task.
      // Otherwise something is wrong, log the problem and continue using the default format
      if (task.getWork() instanceof MapredWork) {
        if (work != null) {
          LOG.error("Tried to use a binary search on a compact index but there were an " + "unexpected number (>1) of root level map reduce tasks in the " + "reentrant query plan.");
          work.setInputformat(null);
          work.setInputFormatSorted(false);
          break;
        }
        if (task.getWork() != null) {
          work = ((MapredWork) task.getWork()).getMapWork();
        }
        String inputFormat = work.getInputformat();
        originalInputFormat = inputFormat;
        if (inputFormat == null) {
          inputFormat = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEINPUTFORMAT);
        }
        // a binary search only works with HiveInputFormat and its subclasses, such as CombineHiveInputFormat
        // and BucketizedHiveInputFormat
        try {
          if (!HiveInputFormat.class.isAssignableFrom(JavaUtils.loadClass(inputFormat))) {
            work = null;
            break;
          }
        } catch (ClassNotFoundException e) {
          LOG.error("Map reduce work's input format class: " + inputFormat + " was not found. " + "Cannot use the fact the compact index is sorted.");
          work = null;
          break;
        }
        work.setInputFormatSorted(true);
      }
    }
    if (work != null) {
      // Find the filter operator and expr node which act on the index column and mark them
      if (!findIndexColumnFilter(work.getAliasToWork().values())) {
        LOG.error("Could not locate the index column's filter operator and expr node. Cannot " + "use the fact the compact index is sorted.");
        work.setInputformat(originalInputFormat);
        work.setInputFormatSorted(false);
      }
    }
  }
  queryContext.addAdditionalSemanticInputs(driver.getPlan().getInputs());
  queryContext.setQueryTasks(driver.getPlan().getRootTasks());
  return;
}
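For reference, the assembled reentrant query always has the same shape. The sketch below re-runs the same string construction with hypothetical values for the MR tmp path, the compact index table name, and the pushed predicate (none of these literals come from the code above):

// Illustrative only: hypothetical tmp path, index table name and pushed predicate.
StringBuilder qlCommand = new StringBuilder("INSERT OVERWRITE DIRECTORY ");
String tmpFile = "hdfs:///tmp/hive/scratch/-mr-10003";
qlCommand.append("\"" + tmpFile + "\" ");
qlCommand.append("SELECT `_bucketname` , `_offsets` FROM ");
qlCommand.append("`default__src_src_key_idx__`");
qlCommand.append(" WHERE ");
qlCommand.append("(key = 100)");
// qlCommand.toString() is now:
// INSERT OVERWRITE DIRECTORY "hdfs:///tmp/hive/scratch/-mr-10003" SELECT `_bucketname` , `_offsets` FROM `default__src_src_key_idx__` WHERE (key = 100)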
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class GenMapRedUtils method addStatsTask.
/**
* Add the StatsTask as a dependent task of the MoveTask
* because the StatsTask will change the Table/Partition metadata. For atomicity, we
* should not change the metadata before the data is actually in place, which is
* done by the MoveTask.
*
* @param nd
* the FileSinkOperator whose results are taken care of by the MoveTask.
* @param mvTask
* The MoveTask that moves the FileSinkOperator's results.
* @param currTask
* The MapRedTask that the FileSinkOperator belongs to.
* @param hconf
* HiveConf
*/
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
  MoveWork mvWork = mvTask.getWork();
  StatsWork statsWork = null;
  if (mvWork.getLoadTableWork() != null) {
    statsWork = new StatsWork(mvWork.getLoadTableWork());
  } else if (mvWork.getLoadFileWork() != null) {
    statsWork = new StatsWork(mvWork.getLoadFileWork());
  }
  assert statsWork != null : "Error when generating StatsTask";
  statsWork.setSourceTask(currTask);
  statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
  statsWork.setStatsTmpDir(nd.getConf().getStatsTmpDir());
  if (currTask.getWork() instanceof MapredWork) {
    MapredWork mrWork = (MapredWork) currTask.getWork();
    mrWork.getMapWork().setGatheringStats(true);
    if (mrWork.getReduceWork() != null) {
      mrWork.getReduceWork().setGatheringStats(true);
    }
  } else if (currTask.getWork() instanceof SparkWork) {
    SparkWork work = (SparkWork) currTask.getWork();
    for (BaseWork w : work.getAllWork()) {
      w.setGatheringStats(true);
    }
  } else {
    // must be TezWork
    TezWork work = (TezWork) currTask.getWork();
    for (BaseWork w : work.getAllWork()) {
      w.setGatheringStats(true);
    }
  }
  // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix
  // in FileSinkDesc is used for stats publishing. They should be consistent.
  statsWork.setAggKey(nd.getConf().getStatsAggPrefix());
  Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);
  // subscribe feeds from the MoveTask so that the MoveTask can forward the list
  // of dynamic partitions to the StatsTask
  mvTask.addDependentTask(statsTask);
  statsTask.subscribeFeed(mvTask);
}
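The ordering requirement described in the Javadoc ends up as a simple task chain. A minimal sketch of that chain, assuming a load-table MoveTask and using hypothetical variables queryTask, moveTask and hconf in place of the method's currTask, mvTask and hconf parameters:

// Hypothetical sketch: queryTask -> moveTask -> statsTask, so the metadata update
// only runs after the data has been moved into place.
StatsWork statsWork = new StatsWork(moveTask.getWork().getLoadTableWork());
statsWork.setSourceTask(queryTask);
Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);
moveTask.addDependentTask(statsTask);
// let the MoveTask forward its dynamic partition list to the StatsTask
statsTask.subscribeFeed(moveTask);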
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class GenMapRedUtils method initPlan.
/**
* Initialize the current plan by adding it to root tasks.
*
* @param op
* the reduce sink operator encountered
* @param opProcCtx
* processing context
*/
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException {
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  MapredWork plan = (MapredWork) currTask.getWork();
  HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
  TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
  opTaskMap.put(reducer, currTask);
  plan.setReduceWork(new ReduceWork());
  plan.getReduceWork().setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
  if (needsTagging(plan.getReduceWork())) {
    plan.getReduceWork().setNeedsTagging(true);
  }
  assert currTopOp != null;
  String currAliasId = opProcCtx.getCurrAliasId();
  if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
    setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
  }
  currTopOp = null;
  currAliasId = null;
  opProcCtx.setCurrTask(currTask);
  opProcCtx.setCurrTopOp(currTopOp);
  opProcCtx.setCurrAliasId(currAliasId);
}
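Stripped of the context bookkeeping, the method's effect on the MapredWork is attaching a ReduceWork. A minimal sketch using only the setters shown above, with reducer and numReducers as placeholders:

// Minimal sketch: hang a reduce stage off an existing map-reduce plan.
// 'reducer' is the operator below the ReduceSink; 'numReducers' comes from its desc.
MapredWork plan = new MapredWork();
ReduceWork reduceWork = new ReduceWork();
plan.setReduceWork(reduceWork);
reduceWork.setReducer(reducer);
reduceWork.setNumReduceTasks(numReducers);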
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);
  // A root task cannot depend on any other task, therefore childTask cannot be
  // a root task
  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (rootTasks.contains(childTask)) {
    rootTasks.remove(childTask);
  }
  // Generate the temporary file name
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    if (id != null) {
      streamDesc = id + ":$INTNAME";
    } else {
      streamDesc = "$INTNAME";
    }
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask
    cplan.getReduceWork().setNeedsTagging(true);
  }
  // Add the path to alias mapping
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
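One detail worth calling out is the alias de-duplication for the intermediate stream. A self-contained sketch of that loop, with a hypothetical set of aliases already present in the child MapWork:

// Sketch of the de-duplication above: if "$INTNAME" (and "$INTNAME1") are already
// used as aliases, the loop probes "$INTNAME1", "$INTNAME2", ... until a free
// name is found. The 'taken' set below is hypothetical.
java.util.Set<String> taken = new java.util.HashSet<>(java.util.Arrays.asList("$INTNAME", "$INTNAME1"));
String origStreamDesc = "$INTNAME";
String streamDesc = origStreamDesc;
int pos = 0;
while (taken.contains(streamDesc)) {
  streamDesc = origStreamDesc.concat(String.valueOf(++pos));
}
// streamDesc is now "$INTNAME2"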