use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class MapredLocalTask method initializeOperators.
private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : work.getAliasToWork().entrySet()) {
LOG.debug("initializeOperators: " + entry.getKey() + ", children = " + entry.getValue().getChildOperators());
}
// this mapper operator is used to initialize all the operators
for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
if (entry.getValue() == null) {
continue;
}
JobConf jobClone = new JobConf(job);
TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
// push down projections
ColumnProjectionUtils.appendReadColumns(jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
// push down filters
HiveInputFormat.pushFilters(jobClone, ts);
AcidUtils.setTransactionalTableScan(jobClone, ts.getConf().isAcidTable());
AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().getAcidOperationalProperties());
// create a fetch operator
FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
fetchOpJobConfMap.put(fetchOp, jobClone);
fetchOperators.put(entry.getKey(), fetchOp);
l4j.info("fetchoperator for " + entry.getKey() + " created");
}
// initialize all forward operators
for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
// get the forward op
String alias = entry.getKey();
Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
// put the exe context into all the operators
forwardOp.passExecContext(execContext);
// All the operators need to be initialized before processing
FetchOperator fetchOp = entry.getValue();
JobConf jobConf = fetchOpJobConfMap.get(fetchOp);
if (jobConf == null) {
jobConf = job;
}
// initialize the forward operator
ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
l4j.info("fetchoperator for " + entry.getKey() + " initialized");
}
}
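For orientation, here is a condensed sketch (not Hive's code) of the per-alias wiring performed above: a FetchWork is paired with a cloned JobConf inside a FetchOperator, and the downstream operator is initialized with the fetcher's output ObjectInspector. The class and method names are hypothetical, and the FetchWork, JobConf and forward Operator are assumed to come from an existing plan.
import org.apache.hadoop.hive.ql.exec.FetchOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.mapred.JobConf;

public class FetchOperatorWiringSketch {
  // Hypothetical helper mirroring the per-alias wiring in initializeOperators.
  static FetchOperator wireAlias(FetchWork fetchWork, JobConf job,
      Operator<? extends OperatorDesc> forwardOp) throws HiveException {
    // clone the job conf so per-alias settings (projections, filters) stay isolated
    JobConf jobClone = new JobConf(job);
    FetchOperator fetchOp = new FetchOperator(fetchWork, jobClone);
    // the forward operator is initialized with the fetcher's output ObjectInspector
    ObjectInspector oi = fetchOp.getOutputObjectInspector();
    forwardOp.initialize(jobClone, new ObjectInspector[] { oi });
    return fetchOp;
  }
}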
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class ExecDriver method handleSampling.
private void handleSampling(Context context, MapWork mWork, JobConf job) throws Exception {
assert mWork.getAliasToWork().keySet().size() == 1;
String alias = mWork.getAliases().get(0);
Operator<?> topOp = mWork.getAliasToWork().get(alias);
PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);
ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
List<Path> inputPaths = mWork.getPaths();
Path tmpPath = context.getExternalTmpPath(inputPaths.get(0));
Path partitionFile = new Path(tmpPath, ".partitions");
ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
PartitionKeySampler sampler = new PartitionKeySampler();
if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
console.printInfo("Use sampling data created in previous MR");
// merge sampling data from the previous MR and make partition keys for the total sort
for (Path path : inputPaths) {
FileSystem fs = path.getFileSystem(job);
for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
sampler.addSampleFile(status.getPath(), job);
}
}
} else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
console.printInfo("Creating sampling data..");
assert topOp instanceof TableScanOperator;
TableScanOperator ts = (TableScanOperator) topOp;
FetchWork fetchWork;
if (!partDesc.isPartitioned()) {
assert inputPaths.size() == 1;
fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
} else {
fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
}
fetchWork.setSource(ts);
// random sampling
FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, job, ts);
try {
ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
while (fetcher.pushRow()) {
}
} finally {
fetcher.clearFetchContext();
}
} else {
throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
}
sampler.writePartitionKeys(partitionFile, job);
}
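A minimal sketch of the FetchWork construction choice made above, assuming the input paths, partition descriptors and table descriptor come from an existing MapWork; the class and method names are hypothetical.
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class SamplingFetchWorkSketch {
  static FetchWork forSampling(boolean partitioned, List<Path> inputPaths,
      ArrayList<PartitionDesc> parts, TableDesc tableDesc) {
    if (!partitioned) {
      // non-partitioned source: a single table directory plus its TableDesc
      return new FetchWork(inputPaths.get(0), tableDesc);
    }
    // partitioned source: parallel lists of partition directories and PartitionDescs
    return new FetchWork(inputPaths, parts, tableDesc);
  }
}
In the original code, the resulting FetchWork is then handed to a FetchOperator via PartitionKeySampler.createSampler, and pushRow() drives the sampled rows through the table scan into the sampler.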
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class PostExecOrcFileDump method run.
@Override
public void run(HookContext hookContext) throws Exception {
assert (hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK);
HiveConf conf = hookContext.getConf();
LOG.info("Executing post execution hook to print orc file dump..");
QueryPlan plan = hookContext.getQueryPlan();
if (plan == null) {
return;
}
FetchTask fetchTask = plan.getFetchTask();
if (fetchTask != null) {
SessionState ss = SessionState.get();
SessionState.LogHelper console = ss.getConsole();
// file dump should write to session state console's error stream
PrintStream old = System.out;
System.setOut(console.getErrStream());
FetchWork fetchWork = fetchTask.getWork();
boolean partitionedTable = fetchWork.isPartitioned();
List<Path> directories;
if (partitionedTable) {
LOG.info("Printing orc file dump for files from partitioned directory..");
directories = fetchWork.getPartDir();
} else {
LOG.info("Printing orc file dump for files from table directory..");
directories = Lists.newArrayList();
directories.add(fetchWork.getTblDir());
}
for (Path dir : directories) {
FileSystem fs = dir.getFileSystem(conf);
List<FileStatus> fileList = HdfsUtils.listLocatedStatus(fs, dir, hiddenFileFilter);
for (FileStatus fileStatus : fileList) {
LOG.info("Printing orc file dump for " + fileStatus.getPath());
if (fileStatus.getLen() > 0) {
try {
// just creating the ORC reader performs sanity checks to make sure it is a valid ORC file
OrcFile.createReader(fs, fileStatus.getPath());
console.printError("-- BEGIN ORC FILE DUMP --");
FileDump.main(new String[] { fileStatus.getPath().toString(), "--rowindex=*" });
console.printError("-- END ORC FILE DUMP --");
} catch (FileFormatException e) {
LOG.warn("File " + fileStatus.getPath() + " is not ORC. Skip printing orc file dump");
} catch (IOException e) {
LOG.warn("Skip printing orc file dump. Exception: " + e.getMessage());
}
} else {
LOG.warn("Zero length file encountered. Skip printing orc file dump.");
}
}
}
// restore the old out stream
System.out.flush();
System.setOut(old);
}
}
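The directory resolution in this hook reduces to the following sketch (hypothetical class and method names), assuming the FetchWork is taken from the plan's FetchTask as above:
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.FetchWork;

public class FetchWorkDirectoriesSketch {
  static List<Path> directoriesOf(FetchWork fetchWork) {
    if (fetchWork.isPartitioned()) {
      // one directory per partition
      return fetchWork.getPartDir();
    }
    // single table directory for a non-partitioned fetch
    List<Path> dirs = new ArrayList<>();
    dirs.add(fetchWork.getTblDir());
    return dirs;
  }
}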
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class GenMapRedUtils method setMapWork.
/**
* initialize MapWork
*
* @param alias_id
* current alias
* @param topOp
* the top operator of the stack
* @param plan
* map work to initialize
* @param local
* whether you need to add to map-reduce or local work
* @param partsList
* pruned partition list. If it is null it will be computed on-the-fly.
* @param inputs
* read entities for the map work
* @param conf
* current instance of hive conf
*/
public static void setMapWork(MapWork plan, ParseContext parseCtx, Set<ReadEntity> inputs, PrunedPartitionList partsList, TableScanOperator tsOp, String alias_id, HiveConf conf, boolean local) throws SemanticException {
ArrayList<Path> partDir = new ArrayList<Path>();
ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
boolean isAcidTable = false;
Path tblDir = null;
plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
if (partsList == null) {
try {
partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
isAcidTable = tsOp.getConf().isAcidTable();
} catch (SemanticException e) {
throw e;
}
}
// Generate the map work for this alias_id
// pass both confirmed and unknown partitions through the map-reduce
// framework
Set<Partition> parts = partsList.getPartitions();
PartitionDesc aliasPartnDesc = null;
try {
if (!parts.isEmpty()) {
aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// The table does not have any partitions
if (aliasPartnDesc == null) {
aliasPartnDesc = new PartitionDesc(Utilities.getTableDesc(tsOp.getConf().getTableMetadata()), null);
}
Map<String, String> props = tsOp.getConf().getOpProps();
if (props != null) {
Properties target = aliasPartnDesc.getProperties();
target.putAll(props);
}
plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);
long sizeNeeded = Integer.MAX_VALUE;
int fileLimit = -1;
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isAcidTable) {
LOG.info("Skip Global Limit optimization for ACID table");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
sizeNeeded = (parseCtx.getGlobalLimitCtx().getGlobalOffset() + parseCtx.getGlobalLimitCtx().getGlobalLimit()) * sizePerRow;
// For the optimization that reduces the number of input files, we limit the
// number of files allowed. If more than a specific number of files would have
// to be selected, we skip this optimization, since having too many files as
// inputs can cause unpredictable latency and is not necessarily cheaper.
fileLimit = HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
if (sizePerRow <= 0 || fileLimit <= 0) {
LOG.info("Skip optimization to reduce input size of 'limit'");
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (parts.isEmpty()) {
LOG.info("Empty input: skip limit optimization");
} else {
LOG.info("Try to reduce input size for 'limit' " + "sizeNeeded: " + sizeNeeded + " file limit : " + fileLimit);
}
}
}
boolean isFirstPart = true;
boolean emptyInput = true;
boolean singlePartition = (parts.size() == 1);
// Track the dependencies for the view. Consider a query like: select * from V;
// where V is a view of the form: select * from T
// The dependencies should include V at depth 0, and T at depth 1 (inferred).
Map<String, ReadEntity> viewToInput = parseCtx.getViewAliasToInput();
ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(alias_id, viewToInput);
// The table should also be considered part of the inputs, even if it is a
// partitioned table and regardless of whether any partition is selected.
// This read entity is a direct read entity and not an indirect read (an
// indirect read happens when a table is read only because it is a dependency of a view).
boolean isDirectRead = (parentViewInfo == null);
TableDesc tblDesc = null;
boolean initTableDesc = false;
PlanUtils.addPartitionInputs(parts, inputs, parentViewInfo, isDirectRead);
for (Partition part : parts) {
// Later the properties have to come from the partition as opposed
// to from the table in order to support versioning.
Path[] paths = null;
SampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(tsOp);
// Lookup list bucketing pruner
Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(tsOp);
ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName()) : null;
if (sampleDescr != null) {
assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexist.";
paths = SamplePruner.prune(part, sampleDescr);
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (listBucketingPruner != null) {
assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
/* Use the list bucketing pruner's path. */
paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
} else {
// With the global limit optimization enabled, only the first partition is
// limit-pruned; if its first files do not contain enough data, we change back to normal mode.
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isFirstPart) {
long sizeLeft = sizeNeeded;
ArrayList<Path> retPathList = new ArrayList<Path>();
SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft, fileLimit, retPathList);
if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
continue;
} else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
LOG.info("Use full input -- first " + fileLimit + " files are more than " + sizeNeeded + " bytes");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
emptyInput = false;
paths = new Path[retPathList.size()];
int index = 0;
for (Path path : retPathList) {
paths[index++] = path;
}
if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
// if all files are needed to meet the size limit, we disable
// optimization. It usually happens for empty table/partition or
// table/partition with only one file. By disabling this
// optimization, we can avoid retrying the query if there are
// not enough rows.
parseCtx.getGlobalLimitCtx().disableOpt();
}
}
isFirstPart = false;
} else {
paths = new Path[0];
}
}
if (!parseCtx.getGlobalLimitCtx().isEnable()) {
paths = part.getPath();
}
}
// is it a partitioned table?
if (!part.getTable().isPartitioned()) {
assert (tblDir == null);
tblDir = paths[0];
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
} else if (tblDesc == null) {
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
}
if (props != null) {
Properties target = tblDesc.getProperties();
target.putAll(props);
}
for (Path p : paths) {
if (p == null) {
continue;
}
String path = p.toString();
if (LOG.isDebugEnabled()) {
LOG.debug("Adding " + path + " of table " + alias_id);
}
partDir.add(p);
try {
if (part.getTable().isPartitioned()) {
partDesc.add(Utilities.getPartitionDesc(part));
} else {
partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part, false));
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
}
}
if (emptyInput) {
parseCtx.getGlobalLimitCtx().disableOpt();
}
Utilities.addSchemaEvolutionToTableScanOperator(partsList.getSourceTable(), tsOp);
Iterator<Path> iterPath = partDir.iterator();
Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();
if (!local) {
while (iterPath.hasNext()) {
assert iterPartnDesc.hasNext();
Path path = iterPath.next();
PartitionDesc prtDesc = iterPartnDesc.next();
// Add the path to alias mapping
plan.addPathToAlias(path, alias_id);
plan.addPathToPartitionInfo(path, prtDesc);
if (LOG.isDebugEnabled()) {
LOG.debug("Information added for path " + path);
}
}
assert plan.getAliasToWork().get(alias_id) == null;
plan.getAliasToWork().put(alias_id, tsOp);
} else {
// populate local work if needed
MapredLocalWork localPlan = plan.getMapRedLocalWork();
if (localPlan == null) {
localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
}
assert localPlan.getAliasToWork().get(alias_id) == null;
assert localPlan.getAliasToFetchWork().get(alias_id) == null;
localPlan.getAliasToWork().put(alias_id, tsOp);
if (tblDir == null) {
tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(partDir, partDesc, tblDesc));
} else {
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir, tblDesc));
}
plan.setMapRedLocalWork(localPlan);
}
}
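In the local branch, the essential FetchWork bookkeeping is the alias-to-FetchWork mapping inside MapredLocalWork. A condensed sketch of that step (hypothetical class and method names; the alias, directories and descriptors are assumed to come from an existing plan):
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class LocalFetchWorkSketch {
  static MapredLocalWork registerAlias(String alias, Path tblDir, TableDesc tblDesc,
      ArrayList<Path> partDir, ArrayList<PartitionDesc> partDesc) {
    MapredLocalWork localPlan = new MapredLocalWork(
        new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
        new LinkedHashMap<String, FetchWork>());
    // non-partitioned tables get a single-directory FetchWork, partitioned
    // tables get the parallel partition-directory/PartitionDesc form
    FetchWork fetchWork = (tblDir != null)
        ? new FetchWork(tblDir, tblDesc)
        : new FetchWork(partDir, partDesc, tblDesc);
    localPlan.getAliasToFetchWork().put(alias, fetchWork);
    return localPlan;
  }
}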
use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.
the class SimpleFetchOptimizer method optimize.
// returns a non-null FetchTask instance on success
@SuppressWarnings("unchecked")
private FetchTask optimize(ParseContext pctx, String alias, TableScanOperator source) throws Exception {
String mode = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION);
boolean aggressive = "more".equals(mode);
final int limit = pctx.getQueryProperties().getOuterQueryLimit();
// limit = 0 means that we do not need any task.
if (limit == 0) {
return null;
}
FetchData fetch = checkTree(aggressive, pctx, alias, source);
if (fetch != null && checkThreshold(fetch, limit, pctx)) {
FetchWork fetchWork = fetch.convertToWork();
FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, pctx.getConf());
fetchWork.setSink(fetch.completed(pctx, fetchWork));
fetchWork.setSource(source);
fetchWork.setLimit(limit);
return fetchTask;
}
return null;
}
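Stripped of the threshold and plan checks, the conversion above amounts to the following sketch (hypothetical class and method names; fetchWork, source, limit and conf are assumed to come from the optimizer's context):
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.plan.FetchWork;

public class FetchTaskConversionSketch {
  static FetchTask toFetchTask(FetchWork fetchWork, TableScanOperator source,
      int limit, HiveConf conf) {
    // wrap the FetchWork in a FetchTask and record source and row limit on the work
    FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, conf);
    fetchWork.setSource(source);
    fetchWork.setLimit(limit);
    return fetchTask;
  }
}
The setSink call is omitted here because it needs the ListSinkOperator produced by fetch.completed(pctx, fetchWork) in the original code.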