Use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.
Class SemanticAnalyzer, method useCachedResult.
/**
* Set the query plan to use the cache entry passed in to return the query results.
* @param cacheEntry The results cache entry that will be used to resolve the query.
*/
private void useCachedResult(QueryResultsCache.CacheEntry cacheEntry, boolean needsReset) {
if (needsReset) {
reset(true);
inputs.clear();
}
// Change query FetchTask to use new location specified in results cache.
FetchTask fetchTask = (FetchTask) TaskFactory.get(cacheEntry.getFetchWork());
setFetchTask(fetchTask);
queryState.setCommandType(cacheEntry.getQueryInfo().getHiveOperation());
resultSchema = cacheEntry.getQueryInfo().getResultSchema();
setTableAccessInfo(cacheEntry.getQueryInfo().getTableAccessInfo());
setColumnAccessInfo(cacheEntry.getQueryInfo().getColumnAccessInfo());
inputs.addAll(cacheEntry.getQueryInfo().getInputs());
// Set recursive traversal in case the cached query was UNION generated by Tez.
conf.setBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, true);
// Indicate that the query will use a cached result.
setCacheUsage(new CacheUsage(CacheUsage.CacheStatus.QUERY_USING_CACHE, cacheEntry));
}
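The method above only rewires an existing plan. Below is a minimal sketch (not Hive code; class and method names are illustrative) of the same pattern: turning a cached FetchWork into a FetchTask and enabling recursive input listing. The FileInputFormat import is assumed to be the mapreduce variant of the constant referenced above.
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Illustrative helper, not part of Hive.
public final class CachedFetchTaskSketch {

  // Wrap a cached FetchWork in a FetchTask so the driver can return results
  // straight from the results cache directory instead of re-executing the query.
  public static FetchTask fromCachedWork(FetchWork cachedWork, HiveConf conf) {
    FetchTask fetchTask = (FetchTask) TaskFactory.get(cachedWork);
    // Cached results produced by a Tez UNION may live in nested subdirectories,
    // so enable recursive input listing, as useCachedResult() does above.
    conf.setBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, true);
    return fetchTask;
  }
}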
Use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.
Class TaskCompiler, method compile.
@SuppressWarnings("nls")
public void compile(final ParseContext pCtx, final List<Task<?>> rootTasks, final Set<ReadEntity> inputs, final Set<WriteEntity> outputs) throws SemanticException {
Context ctx = pCtx.getContext();
GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
List<Task<MoveWork>> mvTask = new ArrayList<>();
List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();
boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();
boolean directInsertCtas = false;
if (pCtx.getCreateTable() != null && pCtx.getCreateTable().getStorageHandler() != null) {
try {
directInsertCtas = HiveUtils.getStorageHandler(conf, pCtx.getCreateTable().getStorageHandler()).directInsertCTAS();
} catch (HiveException e) {
throw new SemanticException("Failed to load storage handler: " + e.getMessage());
}
}
if (pCtx.getFetchTask() != null) {
if (pCtx.getFetchTask().getTblDesc() == null) {
return;
}
pCtx.getFetchTask().getWork().setHiveServerQuery(SessionState.get().isHiveServerQuery());
TableDesc resultTab = pCtx.getFetchTask().getTblDesc();
// If the result table's SerDe is not ThriftJDBCBinarySerDe, then either the ThriftFormatter or the DefaultFetchFormatter should be used.
if (!resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
if (SessionState.get().isHiveServerQuery()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, ThriftFormatter.class.getName());
} else {
String formatterName = conf.get(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER);
if (formatterName == null || formatterName.isEmpty()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, DefaultFetchFormatter.class.getName());
}
}
}
return;
}
if (pCtx.getQueryProperties().isAnalyzeCommand()) {
LOG.debug("Skipping optimize operator plan for analyze command.");
} else {
optimizeOperatorPlan(pCtx);
}
/*
* In case of a select, use a fetch task instead of a move task.
* If the select is from analyze table column rewrite, don't create a fetch task. Instead create
* a column stats task later.
*/
if (pCtx.getQueryProperties().isQuery() && !isCStats) {
if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
throw new SemanticException(ErrorMsg.INVALID_LOAD_TABLE_FILE_WORK.getMsg());
}
LoadFileDesc loadFileDesc = loadFileWork.get(0);
String cols = loadFileDesc.getColumns();
String colTypes = loadFileDesc.getColumnTypes();
TableDesc resultTab = pCtx.getFetchTableDesc();
boolean shouldSetOutputFormatter = false;
if (resultTab == null) {
ResultFileFormat resFileFormat = conf.getResultFileFormat();
String fileFormat;
Class<? extends Deserializer> serdeClass;
if (SessionState.get().getIsUsingThriftJDBCBinarySerDe() && resFileFormat == ResultFileFormat.SEQUENCEFILE) {
fileFormat = resFileFormat.toString();
serdeClass = ThriftJDBCBinarySerDe.class;
shouldSetOutputFormatter = true;
} else if (resFileFormat == ResultFileFormat.SEQUENCEFILE) {
// The file format is changed so that, if the file sink provides a list of files to fetch
// (instead of a whole directory), list status is done on those files (which is what
// HiveSequenceFileInputFormat does).
fileFormat = "HiveSequenceFile";
serdeClass = LazySimpleSerDe.class;
} else {
// All other cases we use the defined file format and LazySimpleSerde
fileFormat = resFileFormat.toString();
serdeClass = LazySimpleSerDe.class;
}
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, fileFormat, serdeClass);
} else {
shouldSetOutputFormatter = resultTab.getProperties().getProperty(serdeConstants.SERIALIZATION_LIB).equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName());
}
if (shouldSetOutputFormatter) {
// Set the fetch formatter to be a no-op for the ListSinkOperator, since we will
// read formatted thrift objects from the output SequenceFile written by Tasks.
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
}
FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
boolean isHiveServerQuery = SessionState.get().isHiveServerQuery();
fetch.setHiveServerQuery(isHiveServerQuery);
fetch.setSource(pCtx.getFetchSource());
fetch.setSink(pCtx.getFetchSink());
if (isHiveServerQuery && null != resultTab && resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName()) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
fetch.setIsUsingThriftJDBCBinarySerDe(true);
} else {
fetch.setIsUsingThriftJDBCBinarySerDe(false);
}
// The idea here is to keep an object reference both in FileSink and in FetchTask for list of files
// to be fetched. During Job close file sink will populate the list and fetch task later will use it
// to fetch the results.
Collection<Operator<?>> tableScanOps = Lists.<Operator<?>>newArrayList(pCtx.getTopOps().values());
Set<FileSinkOperator> fsOps = OperatorUtils.findOperators(tableScanOps, FileSinkOperator.class);
if (fsOps != null && fsOps.size() == 1) {
FileSinkOperator op = fsOps.iterator().next();
Set<FileStatus> filesToFetch = new HashSet<>();
op.getConf().setFilesToFetch(filesToFetch);
fetch.setFilesToFetch(filesToFetch);
}
pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch));
// For the FetchTask, the limit optimization requires we fetch all the rows
// in memory and count how many rows we get. It's not practical if the
// limit factor is too big
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit + ". Doesn't qualify limit optimization.");
globalLimitCtx.disableOpt();
}
if (outerQueryLimit == 0) {
// Believe it or not, some tools do generate queries with limit 0 and then expect
// the query to run quickly. Let's meet their requirement.
LOG.info("Limit 0. No query execution needed.");
return;
}
} else if (!isCStats) {
for (LoadTableDesc ltd : loadTableWork) {
Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
mvTask.add(tsk);
}
boolean oneLoadFileForCtas = true;
for (LoadFileDesc lfd : loadFileWork) {
if (pCtx.getQueryProperties().isCTAS() || pCtx.getQueryProperties().isMaterializedView()) {
if (!oneLoadFileForCtas) {
// should not have more than 1 load file for CTAS.
throw new SemanticException("One query is not expected to contain multiple CTAS loads statements");
}
setLoadFileLocation(pCtx, lfd);
oneLoadFileForCtas = false;
}
mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false)));
}
}
generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);
// For each task, set the key descriptor for the reducer
for (Task<?> rootTask : rootTasks) {
GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
}
// If a task contains an operator which requires a specific input format to be used, set it now.
for (Task<?> rootTask : rootTasks) {
setInputFormat(rootTask);
}
optimizeTaskPlan(rootTasks, pCtx, ctx);
/*
* If the query was the result of analyze table column compute statistics rewrite, create
* a column stats task instead of a fetch task to persist stats to the metastore.
* As per HIVE-15903, we will also collect table stats when user computes column stats.
* That means, if isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty(),
* we need to collect table stats:
* if isCStats, we need to include a basic stats task
* else it is ColumnStatsAutoGather, which should have a move task with a stats task already.
*/
if (isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty()) {
// map from tablename to task (ColumnStatsTask which includes a BasicStatsTask)
Map<String, StatsTask> map = new LinkedHashMap<>();
if (isCStats) {
if (rootTasks == null || rootTasks.size() != 1 || pCtx.getTopOps() == null || pCtx.getTopOps().size() != 1) {
throw new SemanticException("Can not find correct root task!");
}
try {
Task<?> root = rootTasks.iterator().next();
StatsTask tsk = (StatsTask) genTableStats(pCtx, pCtx.getTopOps().values().iterator().next(), root, outputs);
root.addDependentTask(tsk);
map.put(extractTableFullName(tsk), tsk);
} catch (HiveException e) {
throw new SemanticException(e);
}
genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, map, outerQueryLimit, 0);
} else {
Set<Task<?>> leafTasks = new LinkedHashSet<Task<?>>();
getLeafTasks(rootTasks, leafTasks);
List<Task<?>> nonStatsLeafTasks = new ArrayList<>();
for (Task<?> tsk : leafTasks) {
// map table name to the correct ColumnStatsTask
if (tsk instanceof StatsTask) {
map.put(extractTableFullName((StatsTask) tsk), (StatsTask) tsk);
} else {
nonStatsLeafTasks.add(tsk);
}
}
// add cStatsTask as a dependent of all the nonStatsLeafTasks
for (Task<?> tsk : nonStatsLeafTasks) {
for (Task<?> cStatsTask : map.values()) {
tsk.addDependentTask(cStatsTask);
}
}
for (ColumnStatsAutoGatherContext columnStatsAutoGatherContext : pCtx.getColumnStatsAutoGatherContexts()) {
if (!columnStatsAutoGatherContext.isInsertInto()) {
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), map, outerQueryLimit, 0);
} else {
int numBitVector;
try {
numBitVector = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
} catch (Exception e) {
throw new SemanticException(e.getMessage());
}
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), map, outerQueryLimit, numBitVector);
}
}
}
}
decideExecMode(rootTasks, ctx, globalLimitCtx);
// For CTAS, generate the DDL task here unless this is a direct-insert CTAS, where the non-native table is created ahead of time by its storage handler.
if (pCtx.getQueryProperties().isCTAS() && !pCtx.getCreateTable().isMaterialization() && !directInsertCtas) {
// generate a DDL task and make it a dependent task of the leaf
CreateTableDesc crtTblDesc = pCtx.getCreateTable();
crtTblDesc.validate(conf);
Task<?> crtTblTask = TaskFactory.get(new DDLWork(inputs, outputs, crtTblDesc));
patchUpAfterCTASorMaterializedView(rootTasks, inputs, outputs, crtTblTask, CollectionUtils.isEmpty(crtTblDesc.getPartColNames()));
} else if (pCtx.getQueryProperties().isMaterializedView()) {
// generate a DDL task and make it a dependent task of the leaf
CreateMaterializedViewDesc viewDesc = pCtx.getCreateViewDesc();
Task<?> crtViewTask = TaskFactory.get(new DDLWork(inputs, outputs, viewDesc));
patchUpAfterCTASorMaterializedView(rootTasks, inputs, outputs, crtViewTask, CollectionUtils.isEmpty(viewDesc.getPartColNames()));
} else if (pCtx.getMaterializedViewUpdateDesc() != null) {
// If there is a materialized view update desc, we introduce it at the end
// of the tree.
MaterializedViewUpdateDesc materializedViewDesc = pCtx.getMaterializedViewUpdateDesc();
DDLWork ddlWork = new DDLWork(inputs, outputs, materializedViewDesc);
Set<Task<?>> leafTasks = new LinkedHashSet<Task<?>>();
getLeafTasks(rootTasks, leafTasks);
Task<?> materializedViewTask = TaskFactory.get(ddlWork, conf);
for (Task<?> task : leafTasks) {
task.addDependentTask(materializedViewTask);
}
}
if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
}
if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
}
Interner<TableDesc> interner = Interners.newStrongInterner();
// Perform Final chores on generated Map works
// 1. Intern the table descriptors
// 2. Derive final explain attributes based on previous compilation.
GenMapRedUtils.finalMapWorkChores(rootTasks, pCtx.getConf(), interner);
}
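Stripped of the HiveServer2 and Thrift SerDe special cases, the FetchTask construction in compile() reduces to the sketch below: build a result TableDesc, wrap the result directory in a FetchWork with the outer query limit, and hand it to TaskFactory. The class name, method name, and the "SequenceFile" format string are illustrative assumptions, not Hive API.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

// Illustrative sketch, not part of Hive.
public final class SimpleFetchPlanSketch {

  // Builds a FetchTask that reads query results from resultDir, capped at
  // outerQueryLimit rows -- the default path taken by compile() above.
  public static FetchTask buildFetchTask(Path resultDir, String cols, String colTypes,
      int outerQueryLimit) {
    // Default result table description: columns, types, file format and SerDe
    // (the "SequenceFile" format name here is illustrative).
    TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(
        cols, colTypes, "SequenceFile", LazySimpleSerDe.class);
    // FetchWork ties the result location, its schema and the row limit together.
    FetchWork fetch = new FetchWork(resultDir, resultTab, outerQueryLimit);
    return (FetchTask) TaskFactory.get(fetch);
  }
}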
Use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.
Class GenSparkUtils, method processFileSink.
public void processFileSink(GenSparkProcContext context, FileSinkOperator fileSink) throws SemanticException {
ParseContext parseContext = context.parseContext;
// is INSERT OVERWRITE TABLE
boolean isInsertTable = GenMapRedUtils.isInsertInto(parseContext, fileSink);
HiveConf hconf = parseContext.getConf();
boolean chDir = GenMapRedUtils.isMergeRequired(context.moveTask, hconf, fileSink, context.currentTask, isInsertTable);
// Set stats config for FileSinkOperators which are cloned from the fileSink
List<FileSinkOperator> fileSinkList = context.fileSinkMap.get(fileSink);
if (fileSinkList != null) {
for (FileSinkOperator fsOp : fileSinkList) {
fsOp.getConf().setGatherStats(fileSink.getConf().isGatherStats());
fsOp.getConf().setStatsReliable(fileSink.getConf().isStatsReliable());
}
}
Path finalName = createMoveTask(context.currentTask, chDir, fileSink, parseContext, context.moveTask, hconf, context.dependencyTask);
if (chDir) {
// Merge the files in the destination table/partitions by creating Map-only merge job
// If underlying data is RCFile a RCFileBlockMerge task would be created.
LOG.info("using CombineHiveInputformat for the merge job");
GenMapRedUtils.createMRWorkForMergingFiles(fileSink, finalName, context.dependencyTask, context.moveTask, hconf, context.currentTask, parseContext.getQueryState().getLineageState());
}
FetchTask fetchTask = parseContext.getFetchTask();
if (fetchTask != null && context.currentTask.getNumChild() == 0) {
if (fetchTask.isFetchFrom(fileSink.getConf())) {
context.currentTask.setFetchSource(true);
}
}
}
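The final check in processFileSink() is how a leaf Spark task gets flagged as the source of the query's FetchTask. A self-contained version of just that check, with a made-up helper name:
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.Task;

// Illustrative helper, not part of Hive.
public final class FetchSourceMarker {

  // Marks 'task' as the fetch source when it is a leaf task and the query's
  // FetchTask reads the output written by 'fileSink' -- the same condition
  // checked at the end of processFileSink() above.
  public static void markIfFetchSource(Task<?> task, FileSinkOperator fileSink,
      FetchTask fetchTask) {
    if (fetchTask != null && task.getNumChild() == 0
        && fetchTask.isFetchFrom(fileSink.getConf())) {
      task.setFetchSource(true);
    }
  }
}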
Use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.
Class Driver, method releasePlan.
private void releasePlan() {
try {
if (driverContext.getPlan() != null) {
FetchTask fetchTask = driverContext.getPlan().getFetchTask();
if (fetchTask != null) {
fetchTask.setTaskQueue(null);
fetchTask.setQueryPlan(null);
}
driverContext.setFetchTask(fetchTask);
}
driverContext.setPlan(null);
} catch (Exception e) {
LOG.debug("Exception while clearing the Fetch task", e);
}
}
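A minimal sketch of the same cleanup, using only the two setters called above: it detaches the FetchTask from the executed plan so the plan can be released while the results stay fetchable.
import org.apache.hadoop.hive.ql.exec.FetchTask;

// Illustrative helper, not part of Hive.
public final class FetchTaskCleanup {

  // Null out the references a FetchTask holds into the executed plan so the
  // plan can be garbage collected, mirroring Driver.releasePlan() above.
  public static FetchTask detachFromPlan(FetchTask fetchTask) {
    if (fetchTask != null) {
      fetchTask.setTaskQueue(null);
      fetchTask.setQueryPlan(null);
    }
    return fetchTask;
  }
}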
Use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.
Class GenMRFileSink1, method process.
/**
* File Sink Operator encountered.
*
* @param nd
* the file sink operator encountered
* @param opProcCtx
* context
*/
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
ParseContext parseCtx = ctx.getParseCtx();
boolean chDir = false;
// We should take the parent of fsOp's task as the current task.
FileSinkOperator fsOp = (FileSinkOperator) nd;
Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
GenMapRedCtx mapredCtx = mapCurrCtx.get(fsOp.getParentOperators().get(0));
Task<?> currTask = mapredCtx.getCurrTask();
ctx.setCurrTask(currTask);
ctx.addRootIfPossible(currTask);
// is INSERT OVERWRITE TABLE
boolean isInsertTable = GenMapRedUtils.isInsertInto(parseCtx, fsOp);
HiveConf hconf = parseCtx.getConf();
// Mark this task as a final map reduce task (ignoring the optional merge task)
((MapredWork) currTask.getWork()).setFinalMapRed(true);
// If this file sink desc has been processed due to a linked file sink desc,
// use that task
Map<FileSinkDesc, Task<?>> fileSinkDescs = ctx.getLinkedFileDescTasks();
if (fileSinkDescs != null) {
Task<?> childTask = fileSinkDescs.get(fsOp.getConf());
processLinkedFileDesc(ctx, childTask);
return true;
}
// If this file sink has already been seen, there is no need to attempt to merge the files again.
if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) {
chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
}
Path finalName = processFS(fsOp, stack, opProcCtx, chDir);
if (chDir) {
// Merge the files in the destination table/partitions by creating Map-only merge job
// If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
// OrcFileStripeMerge task would be created.
LOG.info("using CombineHiveInputformat for the merge job");
GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName, ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(), hconf, currTask, parseCtx.getQueryState().getLineageState());
}
FileSinkDesc fileSinkDesc = fsOp.getConf();
// There are linked file sink operators and child tasks are present
if (fileSinkDesc.isLinkedFileSink() && (currTask.getChildTasks() != null) && (currTask.getChildTasks().size() == 1)) {
Map<FileSinkDesc, Task<?>> linkedFileDescTasks = ctx.getLinkedFileDescTasks();
if (linkedFileDescTasks == null) {
linkedFileDescTasks = new HashMap<FileSinkDesc, Task<?>>();
ctx.setLinkedFileDescTasks(linkedFileDescTasks);
}
for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) {
linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0));
}
}
FetchTask fetchTask = parseCtx.getFetchTask();
if (fetchTask != null && currTask.getNumChild() == 0) {
if (fetchTask.isFetchFrom(fileSinkDesc)) {
currTask.setFetchSource(true);
}
}
return true;
}
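The linked-file-sink bookkeeping in the middle of process() can be read as the small helper below (class and method names are made up): every linked FileSinkDesc is mapped to the single child task of the current task, so later sinks sharing the same destination reuse it.
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;

// Illustrative helper, not part of Hive.
public final class LinkedFileSinkRegistry {

  // Registers each linked FileSinkDesc against the single child task of
  // currTask, mirroring the bookkeeping done in GenMRFileSink1.process() above.
  // Returns the (possibly newly created) map.
  public static Map<FileSinkDesc, Task<?>> register(FileSinkDesc fileSinkDesc,
      Task<?> currTask, Map<FileSinkDesc, Task<?>> linkedFileDescTasks) {
    if (fileSinkDesc.isLinkedFileSink() && currTask.getChildTasks() != null
        && currTask.getChildTasks().size() == 1) {
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<>();
      }
      for (FileSinkDesc linked : fileSinkDesc.getLinkedFileSinkDesc()) {
        linkedFileDescTasks.put(linked, currTask.getChildTasks().get(0));
      }
    }
    return linkedFileDescTasks;
  }
}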