use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class TaskCompiler method genColumnStatsTask.
/**
* A helper function to generate a column stats task on top of map-red task. The column stats
* task fetches from the output of the map-red task, constructs the column stats object and
* persists it to the metastore.
*
* This method generates a plan with a column stats task on top of map-red task and sets up the
* appropriate metadata to be used during execution.
*
* @param analyzeRewrite
* @param loadTableWork
* @param loadFileWork
* @param rootTasks
* @param outerQueryLimit
*/
@SuppressWarnings("unchecked")
protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite, List<LoadFileDesc> loadFileWork, Set<Task<? extends Serializable>> leafTasks, int outerQueryLimit, int numBitVector) {
ColumnStatsTask cStatsTask = null;
ColumnStatsWork cStatsWork = null;
FetchWork fetch = null;
String tableName = analyzeRewrite.getTableName();
List<String> colName = analyzeRewrite.getColName();
List<String> colType = analyzeRewrite.getColType();
boolean isTblLevel = analyzeRewrite.isTblLvl();
String cols = loadFileWork.get(0).getColumns();
String colTypes = loadFileWork.get(0).getColumnTypes();
String resFileFormat;
TableDesc resultTab;
if (SessionState.get().isHiveServerQuery() && conf.getBoolVar(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
resFileFormat = "SequenceFile";
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, ThriftJDBCBinarySerDe.class);
} else {
resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, LazySimpleSerDe.class);
}
fetch = new FetchWork(loadFileWork.get(0).getSourcePath(), resultTab, outerQueryLimit);
ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, colName, colType, isTblLevel, numBitVector);
cStatsWork = new ColumnStatsWork(fetch, cStatsDesc);
cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf);
for (Task<? extends Serializable> tsk : leafTasks) {
tsk.addDependentTask(cStatsTask);
}
}
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class TaskCompiler method compile.
@SuppressWarnings({ "nls", "unchecked" })
public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks, final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {
Context ctx = pCtx.getContext();
GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
List<Task<MoveWork>> mvTask = new ArrayList<Task<MoveWork>>();
List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();
boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();
if (pCtx.getFetchTask() != null) {
if (pCtx.getFetchTask().getTblDesc() == null) {
return;
}
pCtx.getFetchTask().getWork().setHiveServerQuery(SessionState.get().isHiveServerQuery());
TableDesc resultTab = pCtx.getFetchTask().getTblDesc();
// then either the ThriftFormatter or the DefaultFetchFormatter should be used.
if (!resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
if (SessionState.get().isHiveServerQuery()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, ThriftFormatter.class.getName());
} else {
String formatterName = conf.get(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER);
if (formatterName == null || formatterName.isEmpty()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, DefaultFetchFormatter.class.getName());
}
}
}
return;
}
optimizeOperatorPlan(pCtx, inputs, outputs);
/*
* In case of a select, use a fetch task instead of a move task.
* If the select is from analyze table column rewrite, don't create a fetch task. Instead create
* a column stats task later.
*/
if (pCtx.getQueryProperties().isQuery() && !isCStats) {
if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
throw new SemanticException(ErrorMsg.INVALID_LOAD_TABLE_FILE_WORK.getMsg());
}
LoadFileDesc loadFileDesc = loadFileWork.get(0);
String cols = loadFileDesc.getColumns();
String colTypes = loadFileDesc.getColumnTypes();
String resFileFormat;
TableDesc resultTab = pCtx.getFetchTableDesc();
if (resultTab == null) {
resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
if (SessionState.get().getIsUsingThriftJDBCBinarySerDe() && (resFileFormat.equalsIgnoreCase("SequenceFile"))) {
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, ThriftJDBCBinarySerDe.class);
// Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
// read formatted thrift objects from the output SequenceFile written by Tasks.
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
} else {
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, LazySimpleSerDe.class);
}
} else {
if (resultTab.getProperties().getProperty(serdeConstants.SERIALIZATION_LIB).equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
// Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
// read formatted thrift objects from the output SequenceFile written by Tasks.
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
}
}
FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
boolean isHiveServerQuery = SessionState.get().isHiveServerQuery();
fetch.setHiveServerQuery(isHiveServerQuery);
fetch.setSource(pCtx.getFetchSource());
fetch.setSink(pCtx.getFetchSink());
if (isHiveServerQuery && null != resultTab && resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName()) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
fetch.setIsUsingThriftJDBCBinarySerDe(true);
} else {
fetch.setIsUsingThriftJDBCBinarySerDe(false);
}
pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch, conf));
// For the FetchTask, the limit optimization requires we fetch all the rows
// in memory and count how many rows we get. It's not practical if the
// limit factor is too big
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit + ". Doesn't qualify limit optimization.");
globalLimitCtx.disableOpt();
}
if (outerQueryLimit == 0) {
// Believe it or not, some tools do generate queries with limit 0 and than expect
// query to run quickly. Lets meet their requirement.
LOG.info("Limit 0. No query execution needed.");
return;
}
} else if (!isCStats) {
for (LoadTableDesc ltd : loadTableWork) {
Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
mvTask.add(tsk);
// Check to see if we are stale'ing any indexes and auto-update them if we want
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, inputs, conf);
try {
List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
tsk.addDependentTask(updateTask);
}
} catch (HiveException e) {
console.printInfo("WARNING: could not auto-update stale indexes, which are not in sync");
}
}
}
boolean oneLoadFile = true;
for (LoadFileDesc lfd : loadFileWork) {
if (pCtx.getQueryProperties().isCTAS() || pCtx.getQueryProperties().isMaterializedView()) {
// should not have more than 1 load file for
assert (oneLoadFile);
// CTAS
// make the movetask's destination directory the table's destination.
Path location;
String loc = pCtx.getQueryProperties().isCTAS() ? pCtx.getCreateTable().getLocation() : pCtx.getCreateViewDesc().getLocation();
if (loc == null) {
// get the default location
Path targetPath;
try {
String protoName = null;
if (pCtx.getQueryProperties().isCTAS()) {
protoName = pCtx.getCreateTable().getTableName();
} else if (pCtx.getQueryProperties().isMaterializedView()) {
protoName = pCtx.getCreateViewDesc().getViewName();
}
String[] names = Utilities.getDbTableName(protoName);
if (!db.databaseExists(names[0])) {
throw new SemanticException("ERROR: The database " + names[0] + " does not exist.");
}
Warehouse wh = new Warehouse(conf);
targetPath = wh.getTablePath(db.getDatabase(names[0]), names[1]);
} catch (HiveException e) {
throw new SemanticException(e);
} catch (MetaException e) {
throw new SemanticException(e);
}
location = targetPath;
} else {
location = new Path(loc);
}
lfd.setTargetDir(location);
oneLoadFile = false;
}
mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false), conf));
}
}
generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);
// For each task, set the key descriptor for the reducer
for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
}
// to be used, please do so
for (Task<? extends Serializable> rootTask : rootTasks) {
setInputFormat(rootTask);
}
optimizeTaskPlan(rootTasks, pCtx, ctx);
/*
* If the query was the result of analyze table column compute statistics rewrite, create
* a column stats task instead of a fetch task to persist stats to the metastore.
*/
if (isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty()) {
Set<Task<? extends Serializable>> leafTasks = new LinkedHashSet<Task<? extends Serializable>>();
getLeafTasks(rootTasks, leafTasks);
if (isCStats) {
genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, leafTasks, outerQueryLimit, 0);
} else {
for (ColumnStatsAutoGatherContext columnStatsAutoGatherContext : pCtx.getColumnStatsAutoGatherContexts()) {
if (!columnStatsAutoGatherContext.isInsertInto()) {
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), leafTasks, outerQueryLimit, 0);
} else {
int numBitVector;
try {
numBitVector = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
} catch (Exception e) {
throw new SemanticException(e.getMessage());
}
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), leafTasks, outerQueryLimit, numBitVector);
}
}
}
}
decideExecMode(rootTasks, ctx, globalLimitCtx);
if (pCtx.getQueryProperties().isCTAS() && !pCtx.getCreateTable().isMaterialization()) {
// generate a DDL task and make it a dependent task of the leaf
CreateTableDesc crtTblDesc = pCtx.getCreateTable();
crtTblDesc.validate(conf);
Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(inputs, outputs, crtTblDesc), conf);
patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtTblTask);
} else if (pCtx.getQueryProperties().isMaterializedView()) {
// generate a DDL task and make it a dependent task of the leaf
CreateViewDesc viewDesc = pCtx.getCreateViewDesc();
Task<? extends Serializable> crtViewTask = TaskFactory.get(new DDLWork(inputs, outputs, viewDesc), conf);
patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtViewTask);
}
if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
}
if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
List<ExecDriver> mrTasks = Utilities.getMRTasks(rootTasks);
for (ExecDriver tsk : mrTasks) {
tsk.setRetryCmdWhenFail(true);
}
List<SparkTask> sparkTasks = Utilities.getSparkTasks(rootTasks);
for (SparkTask sparkTask : sparkTasks) {
sparkTask.setRetryCmdWhenFail(true);
}
}
Interner<TableDesc> interner = Interners.newStrongInterner();
for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.internTableDesc(rootTask, interner);
GenMapRedUtils.deriveFinalExplainAttributes(rootTask, pCtx.getConf());
}
}
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class GenMRSkewJoinProcessor method processSkewJoin.
/**
* Create tasks for processing skew joins. The idea is (HIVE-964) to use
* separated jobs and map-joins to handle skew joins.
* <p>
* <ul>
* <li>
* Number of mr jobs to handle skew keys is the number of table minus 1 (we
* can stream the last table, so big keys in the last table will not be a
* problem).
* <li>
* At runtime in Join, we output big keys in one table into one corresponding
* directories, and all same keys in other tables into different dirs(one for
* each table). The directories will look like:
* <ul>
* <li>
* dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
* which is big in T1),dir-T3-keys(containing keys which is big in T1), ...
* <li>
* dir-T1-keys(containing keys which is big in T2), dir-T2-bigkeys(containing
* big keys in T2),dir-T3-keys(containing keys which is big in T2), ...
* <li>
* dir-T1-keys(containing keys which is big in T3), dir-T2-keys(containing big
* keys in T3),dir-T3-bigkeys(containing keys which is big in T3), ... .....
* </ul>
* </ul>
* For each table, we launch one mapjoin job, taking the directory containing
* big keys in this table and corresponding dirs in other tables as input.
* (Actally one job for one row in the above.)
*
* <p>
* For more discussions, please check
* https://issues.apache.org/jira/browse/HIVE-964.
*
*/
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
// now does not work with outer joins
if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
return;
}
List<Task<? extends Serializable>> children = currTask.getChildTasks();
Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
JoinDesc joinDescriptor = joinOp.getConf();
Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
int numAliases = joinValues.size();
Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
Byte[] tags = joinDescriptor.getTagOrder();
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
smallKeysDirMap.put(alias, smallKeysMap);
for (Byte src2 : tags) {
if (!src2.equals(alias)) {
smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
}
}
skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
}
joinDescriptor.setHandleSkewJoin(true);
joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
MapredWork currPlan = (MapredWork) currTask.getWork();
TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
// used for create mapJoinDesc, should be in order
List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
for (Byte tag : tags) {
newJoinValueTblDesc.add(null);
}
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
List<ExprNodeDesc> valueCols = joinValues.get(alias);
String colNames = "";
String colTypes = "";
int columnSize = valueCols.size();
List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
boolean first = true;
for (int k = 0; k < columnSize; k++) {
TypeInfo type = valueCols.get(k).getTypeInfo();
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
columnInfos.add(columnInfo);
newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
}
first = false;
colNames = colNames + newColName;
colTypes = colTypes + valueCols.get(k).getTypeString();
}
// we are putting join keys at last part of the spilled table
for (int k = 0; k < joinKeys.size(); k++) {
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
}
first = false;
colNames = colNames + joinKeys.get(k);
colTypes = colTypes + joinKeyTypes.get(k);
ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
columnInfos.add(columnInfo);
newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
}
newJoinValues.put(alias, newValueExpr);
newJoinKeys.put(alias, newKeyExpr);
tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
rowSchemaList.put(alias, new RowSchema(columnInfos));
// construct value table Desc
String valueColNames = "";
String valueColTypes = "";
first = true;
for (int k = 0; k < columnSize; k++) {
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
if (!first) {
valueColNames = valueColNames + ",";
valueColTypes = valueColTypes + ",";
}
valueColNames = valueColNames + newColName;
valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
first = false;
}
newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
}
joinDescriptor.setSkewKeysValuesTables(tableDescList);
joinDescriptor.setKeyTableDesc(keyTblDesc);
for (int i = 0; i < numAliases - 1; i++) {
Byte src = tags[i];
MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
// This code has been only added for testing
boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
for (int k = 0; k < tags.length; k++) {
Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
((TableScanOperator) ts).setTableDesc(tableDescList.get((byte) k));
parentOps[k] = ts;
}
Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
ArrayList<String> aliases = new ArrayList<String>();
String alias = src.toString().intern();
aliases.add(alias);
Path bigKeyDirPath = bigKeysDirMap.get(src);
newPlan.addPathToAlias(bigKeyDirPath, aliases);
newPlan.getAliasToWork().put(alias, tblScan_op);
PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
newPlan.getAliasToPartnInfo().put(alias, part);
Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
assert reducer instanceof JoinOperator;
JoinOperator cloneJoinOp = (JoinOperator) reducer;
String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
mapJoinDescriptor.setTagOrder(tags);
mapJoinDescriptor.setHandleSkewJoin(false);
mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
for (int j = 0; j < numAliases; j++) {
if (j == i) {
continue;
}
Byte small_alias = tags[j];
Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
Path tblDir = smallTblDirs.get(small_alias);
localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
}
newPlan.setMapRedLocalWork(localPlan);
// construct a map join and set it as the child operator of tblScan_op
MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
// change the children of the original join operator to point to the map
// join operator
List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
for (Operator<? extends OperatorDesc> childOp : childOps) {
childOp.replaceParent(cloneJoinOp, mapJoinOp);
}
mapJoinOp.setChildOperators(childOps);
HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
newPlan.setInputformat(HiveInputFormat.class.getName());
MapredWork w = new MapredWork();
w.setMapWork(newPlan);
Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w, jc);
skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
listWorks.add(skewJoinMapJoinTask.getWork());
listTasks.add(skewJoinMapJoinTask);
}
if (children != null) {
for (Task<? extends Serializable> tsk : listTasks) {
for (Task<? extends Serializable> oldChild : children) {
tsk.addDependentTask(oldChild);
}
}
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
for (Task<? extends Serializable> oldChild : children) {
oldChild.getParentTasks().remove(currTask);
}
listTasks.addAll(children);
}
ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
cndTsk.setListTasks(listTasks);
cndTsk.setResolver(new ConditionalResolverSkewJoin());
cndTsk.setResolverCtx(context);
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
currTask.addDependentTask(cndTsk);
return;
}
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class SortMergeJoinTaskDispatcher method genSMBJoinWork.
// Convert the work in the SMB plan to a regular join
// Note that the operator tree is not fixed, only the path/alias mappings in the
// plan are fixed. The operator tree will still contain the SMBJoinOperator
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
// Remove the paths which are not part of aliasToPartitionInfo
Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
List<Path> removePaths = new ArrayList<>();
for (Map.Entry<Path, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
boolean keepPath = false;
for (String alias : entry.getValue()) {
if (aliasToPartitionInfo.containsKey(alias)) {
keepPath = true;
break;
}
}
// Remove if the path is not present
if (!keepPath) {
removePaths.add(entry.getKey());
}
}
List<String> removeAliases = new ArrayList<String>();
for (Path removePath : removePaths) {
removeAliases.addAll(currWork.getPathToAliases().get(removePath));
currWork.removePathToAlias(removePath);
currWork.removePathToPartitionInfo(removePath);
}
for (String alias : removeAliases) {
currWork.getAliasToPartnInfo().remove(alias);
currWork.getAliasToWork().remove(alias);
}
// Get the MapredLocalWork
MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork().entrySet()) {
String alias = entry.getKey();
Operator<? extends OperatorDesc> op = entry.getValue();
FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
// Add the entry in mapredwork
currWork.getAliasToWork().put(alias, op);
PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
if (fetchWork.getTblDir() != null) {
currWork.mergeAliasedInput(alias, fetchWork.getTblDir(), partitionInfo);
} else {
for (Path pathDir : fetchWork.getPartDir()) {
currWork.mergeAliasedInput(alias, pathDir, partitionInfo);
}
}
}
// Remove the dummy store operator from the tree
for (Operator<? extends OperatorDesc> parentOp : smbJoinOp.getParentOperators()) {
if (parentOp instanceof DummyStoreOperator) {
Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
smbJoinOp.replaceParent(parentOp, grandParentOp);
grandParentOp.setChildOperators(parentOp.getChildOperators());
parentOp.setParentOperators(null);
parentOp.setParentOperators(null);
}
}
}
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class TestHCatMultiOutputFormat method getTableData.
/**
* Method to fetch table data
*
* @param table table name
* @param database database
* @return list of columns in comma seperated way
* @throws Exception if any error occurs
*/
private List<String> getTableData(String table, String database) throws Exception {
QueryState queryState = new QueryState(null);
HiveConf conf = queryState.getConf();
conf.addResource("hive-site.xml");
ArrayList<String> results = new ArrayList<String>();
ArrayList<String> temp = new ArrayList<String>();
Hive hive = Hive.get(conf);
org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
FetchWork work;
if (!tbl.getPartCols().isEmpty()) {
List<Partition> partitions = hive.getPartitions(tbl);
List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
List<Path> partLocs = new ArrayList<Path>();
TableDesc tableDesc = Utilities.getTableDesc(tbl);
for (Partition part : partitions) {
partLocs.add(part.getDataLocation());
partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
}
work = new FetchWork(partLocs, partDesc, tableDesc);
work.setLimit(100);
} else {
work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
}
FetchTask task = new FetchTask();
task.setWork(work);
task.initialize(queryState, null, null, new CompilationOpContext());
task.fetch(temp);
for (String str : temp) {
results.add(str.replace("\t", ","));
}
return results;
}
Aggregations