use of org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc in project hive by apache.
the class GenMapRedUtils method setMapWork.
/**
* initialize MapWork
*
* @param alias_id
* current alias
* @param topOp
* the top operator of the stack
* @param plan
* map work to initialize
* @param local
* whether you need to add to map-reduce or local work
* @param pList
* pruned partition list. If it is null it will be computed on-the-fly.
* @param inputs
* read entities for the map work
* @param conf
* current instance of hive conf
*/
public static void setMapWork(MapWork plan, ParseContext parseCtx, Set<ReadEntity> inputs, PrunedPartitionList partsList, TableScanOperator tsOp, String alias_id, HiveConf conf, boolean local) throws SemanticException {
ArrayList<Path> partDir = new ArrayList<Path>();
ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
boolean isFullAcidTable = false;
Path tblDir = null;
plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
// we also collect table stats while collecting column stats.
if (parseCtx.getAnalyzeRewrite() != null) {
plan.setGatheringStats(true);
}
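// If no pruned partition list was handed in, compute it now with the partition pruner.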
if (partsList == null) {
try {
partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
isFullAcidTable = tsOp.getConf().isFullAcidTable();
} catch (SemanticException e) {
throw e;
}
}
// Generate the map work for this alias_id
// pass both confirmed and unknown partitions through the map-reduce
// framework
Set<Partition> parts = partsList.getPartitions();
TableDesc tableSpec = Utilities.getTableDesc(tsOp.getConf().getTableMetadata());
PartitionDesc aliasPartnDesc = null;
try {
if (!parts.isEmpty()) {
aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next(), tableSpec);
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// The table does not have any partitions
if (aliasPartnDesc == null) {
aliasPartnDesc = new PartitionDesc(tableSpec, null);
}
Map<String, String> props = tsOp.getConf().getOpProps();
if (props != null) {
Properties target = aliasPartnDesc.getProperties();
target.putAll(props);
}
plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);
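// Defaults for the global 'limit' optimization; they are only refined below when that optimization is enabled.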
long sizeNeeded = Integer.MAX_VALUE;
int fileLimit = -1;
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isFullAcidTable) {
LOG.info("Skipping Global Limit optimization for an ACID table");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
sizeNeeded = (parseCtx.getGlobalLimitCtx().getGlobalOffset() + parseCtx.getGlobalLimitCtx().getGlobalLimit()) * sizePerRow;
// For the optimization that reduces the number of input files, we limit the
// number of files allowed. If more than a specific number of files would have
// to be selected, we skip this optimization, since having too many files as
// inputs can cause unpredictable latency and is not necessarily cheaper.
fileLimit = HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
if (sizePerRow <= 0 || fileLimit <= 0) {
LOG.info("Skip optimization to reduce input size of 'limit'");
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (parts.isEmpty()) {
LOG.info("Empty input: skip limit optimization");
} else {
LOG.info("Try to reduce input size for 'limit' " + "sizeNeeded: " + sizeNeeded + " file limit : " + fileLimit);
}
}
}
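// State tracked while walking the pruned partitions below.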
boolean isFirstPart = true;
boolean emptyInput = true;
boolean singlePartition = (parts.size() == 1);
// Track the dependencies for the view. Consider a query like: select * from V;
// where V is a view of the form: select * from T
// The dependencies should include V at depth 0, and T at depth 1 (inferred).
Map<String, ReadEntity> viewToInput = parseCtx.getViewAliasToInput();
ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(alias_id, viewToInput);
// The table should also be considered a part of inputs, even if it is a
// partitioned table and regardless of whether any partition is selected.
// This read entity is a direct read entity and not an indirect read (that is, when
// the table is read only because it is a dependency of a view).
boolean isDirectRead = (parentViewInfo == null);
TableDesc tblDesc = null;
boolean initTableDesc = false;
PlanUtils.addPartitionInputs(parts, inputs, parentViewInfo, isDirectRead);
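// For each pruned partition, work out which paths survive sampling, list-bucketing or limit pruning,
// and record a PartitionDesc per surviving path.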
for (Partition part : parts) {
// Later, the properties will have to come from the partition rather than from
// the table, in order to support versioning.
Path[] paths = null;
SampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(tsOp);
// Lookup list bucketing pruner
Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(tsOp);
ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName()) : null;
if (sampleDescr != null) {
assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexist.";
paths = SamplePruner.prune(part, sampleDescr);
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (listBucketingPruner != null) {
assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
/* Use the list bucketing pruner's paths. */
paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
} else {
// Only try the first partition; if it doesn't contain enough data to satisfy
// the limit, we change to normal (full-input) mode.
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isFirstPart) {
long sizeLeft = sizeNeeded;
ArrayList<Path> retPathList = new ArrayList<Path>();
SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft, fileLimit, retPathList);
if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
continue;
} else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
LOG.info("Use full input -- first " + fileLimit + " files are more than " + sizeNeeded + " bytes");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
emptyInput = false;
paths = new Path[retPathList.size()];
int index = 0;
for (Path path : retPathList) {
paths[index++] = path;
}
if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
// if all files are needed to meet the size limit, we disable the
// optimization. It usually happens for an empty table/partition or a
// table/partition with only one file. By disabling this
// optimization, we can avoid retrying the query when there are
// not sufficient rows.
parseCtx.getGlobalLimitCtx().disableOpt();
}
}
isFirstPart = false;
} else {
paths = new Path[0];
}
}
if (!parseCtx.getGlobalLimitCtx().isEnable()) {
paths = part.getPath();
}
}
// is it a partitioned table ?
if (!part.getTable().isPartitioned()) {
assert (tblDir == null);
tblDir = paths[0];
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
} else if (tblDesc == null) {
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
}
if (props != null) {
Properties target = tblDesc.getProperties();
target.putAll(props);
}
for (Path p : paths) {
if (p == null) {
continue;
}
String path = p.toString();
if (LOG.isDebugEnabled()) {
LOG.debug("Adding " + path + " of table " + alias_id);
}
partDir.add(p);
try {
if (part.getTable().isPartitioned()) {
partDesc.add(Utilities.getPartitionDesc(part, tblDesc));
} else {
partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part, false));
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
}
}
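// If limit pruning selected no files at all, disable the optimization so the query falls back to reading the full input.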
if (emptyInput) {
parseCtx.getGlobalLimitCtx().disableOpt();
}
Utilities.addSchemaEvolutionToTableScanOperator(partsList.getSourceTable(), tsOp);
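// Wire the collected paths and partition descriptors into either the map work or the local (fetch) work.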
Iterator<Path> iterPath = partDir.iterator();
Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();
if (!local) {
while (iterPath.hasNext()) {
assert iterPartnDesc.hasNext();
Path path = iterPath.next();
PartitionDesc prtDesc = iterPartnDesc.next();
// Add the path to alias mapping
plan.addPathToAlias(path, alias_id);
plan.addPathToPartitionInfo(path, prtDesc);
if (LOG.isDebugEnabled()) {
LOG.debug("Information added for path " + path);
}
}
assert plan.getAliasToWork().get(alias_id) == null;
plan.getAliasToWork().put(alias_id, tsOp);
} else {
// populate local work if needed
MapredLocalWork localPlan = plan.getMapRedLocalWork();
if (localPlan == null) {
localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
}
assert localPlan.getAliasToWork().get(alias_id) == null;
assert localPlan.getAliasToFetchWork().get(alias_id) == null;
localPlan.getAliasToWork().put(alias_id, tsOp);
if (tblDir == null) {
tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(partDir, partDesc, tblDesc));
} else {
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir, tblDesc));
}
plan.setMapRedLocalWork(localPlan);
}
}
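For reference, the SampleDesc that setMapWork pulls out of opToSamplePruner is the same descriptor the analyzer attaches to the generated sampling FilterDesc in genTablePlan below. Here is a minimal, hedged sketch of that construction, using only the constructor signatures visible in these snippets; the class name, the bucket column and the null predicate placeholder are illustrative, not part of Hive:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc;

public class SampleDescSketch {
    // Mirrors how genTablePlan wraps a TABLESAMPLE(num OUT OF den ON bucketCol) clause:
    // the SampleDesc records the ratio and the bucket columns, and the FilterDesc is
    // marked as a generated sampling predicate.
    static FilterDesc buildSamplingFilter(ExprNodeDesc samplePredicate, int numerator, int denominator, List<String> bucketCols) {
        // 'true' marks the sample as answerable by input pruning (selecting whole buckets).
        SampleDesc sampleDesc = new SampleDesc(numerator, denominator, bucketCols, true);
        FilterDesc filterDesc = new FilterDesc(samplePredicate, true, sampleDesc);
        filterDesc.setGenerated(true);
        return filterDesc;
    }

    public static void main(String[] args) {
        // Illustrative values for TABLESAMPLE(1 OUT OF 32 ON user_id); in SemanticAnalyzer the
        // predicate comes from genSamplePredicate, so a null placeholder stands in here.
        FilterDesc fd = buildSamplingFilter(null, 1, 32, Arrays.asList("user_id"));
        System.out.println("Built generated sampling FilterDesc with a 1/32 SampleDesc: " + (fd != null));
    }
}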
use of org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc in project hive by apache.
the class SemanticAnalyzer method genTablePlan.
@SuppressWarnings("nls")
private Operator genTablePlan(String alias, QB qb) throws SemanticException {
String alias_id = getAliasId(alias, qb);
Table tab = qb.getMetaData().getSrcForAlias(alias);
RowResolver rwsch;
// is the table already present
TableScanOperator top = topOps.get(alias_id);
// Obtain table props in query
Map<String, String> properties = qb.getTabPropsForAlias(alias);
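// Only build a new TableScanOperator if this alias has not been planned yet; otherwise reuse the existing one.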
if (top == null) {
// Determine row schema for TSOP.
// Include column names from SerDe, the partition and virtual columns.
rwsch = new RowResolver();
try {
// Including parameters passed in the query
if (properties != null) {
for (Entry<String, String> prop : properties.entrySet()) {
if (tab.getSerdeParam(prop.getKey()) != null) {
LOG.warn("SerDe property in input query overrides stored SerDe property");
}
tab.setSerdeParam(prop.getKey(), prop.getValue());
}
}
// Obtain inspector for schema
StructObjectInspector rowObjectInspector = (StructObjectInspector) tab.getDeserializer().getObjectInspector();
List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
for (int i = 0; i < fields.size(); i++) {
/**
* if the column is a skewed column, use ColumnInfo accordingly
*/
ColumnInfo colInfo = new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), alias, false);
colInfo.setSkewedCol(isSkewedCol(alias, qb, fields.get(i).getFieldName()));
rwsch.put(alias, fields.get(i).getFieldName(), colInfo);
}
} catch (SerDeException e) {
throw new RuntimeException(e);
}
// Finally add the partitioning columns
for (FieldSchema part_col : tab.getPartCols()) {
LOG.trace("Adding partition col: " + part_col);
rwsch.put(alias, part_col.getName(), new ColumnInfo(part_col.getName(), TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), alias, true));
}
// put all virtual columns in RowResolver.
Iterator<VirtualColumn> vcs = VirtualColumn.getRegistry(conf).iterator();
// use a list for easy customization
List<VirtualColumn> vcList = new ArrayList<VirtualColumn>();
while (vcs.hasNext()) {
VirtualColumn vc = vcs.next();
rwsch.put(alias, vc.getName().toLowerCase(), new ColumnInfo(vc.getName(), vc.getTypeInfo(), alias, true, vc.getIsHidden()));
vcList.add(vc);
}
// Create the root of the operator tree
TableScanDesc tsDesc = new TableScanDesc(alias, vcList, tab);
setupStats(tsDesc, qb.getParseInfo(), tab, alias, rwsch);
SplitSample sample = nameToSplitSample.get(alias_id);
if (sample != null && sample.getRowCount() != null) {
tsDesc.setRowLimit(sample.getRowCount());
nameToSplitSample.remove(alias_id);
}
top = (TableScanOperator) putOpInsertMap(OperatorFactory.get(getOpContext(), tsDesc, new RowSchema(rwsch.getColumnInfos())), rwsch);
// Set insideView so that we can skip column authorization for this.
top.setInsideView(qb.isInsideView() || qb.getAliasInsideView().contains(alias.toLowerCase()));
// Add this to the list of top operators - we always start from a table
// scan
topOps.put(alias_id, top);
// Add a mapping from the table scan operator to Table
topToTable.put(top, tab);
if (properties != null) {
topToTableProps.put(top, properties);
tsDesc.setOpProps(properties);
}
} else {
rwsch = opParseCtx.get(top).getRowResolver();
top.setChildOperators(null);
}
// check if this table is sampled and needs more than input pruning
Operator<? extends OperatorDesc> op = top;
TableSample ts = qb.getParseInfo().getTabSample(alias);
if (ts != null) {
TableScanOperator tableScanOp = top;
tableScanOp.getConf().setTableSample(ts);
int num = ts.getNumerator();
int den = ts.getDenominator();
ArrayList<ASTNode> sampleExprs = ts.getExprs();
// TODO: Do the type checking of the expressions
List<String> tabBucketCols = tab.getBucketCols();
int numBuckets = tab.getNumBuckets();
// If there are no sample cols and no bucket cols then throw an error
if (tabBucketCols.size() == 0 && sampleExprs.size() == 0) {
throw new SemanticException(ErrorMsg.NON_BUCKETED_TABLE.getMsg() + " " + tab.getTableName());
}
if (num > den) {
throw new SemanticException(ErrorMsg.BUCKETED_NUMERATOR_BIGGER_DENOMINATOR.getMsg() + " " + tab.getTableName());
}
// check if a predicate is needed
// predicate is needed if either input pruning is not enough
// or if input pruning is not possible
// check if the sample columns are the same as the table bucket columns
boolean colsEqual = true;
if ((sampleExprs.size() != tabBucketCols.size()) && (sampleExprs.size() != 0)) {
colsEqual = false;
}
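// The sample qualifies for pruning only if every sample expression is a plain column reference
// that matches one of the table's bucket columns.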
for (int i = 0; i < sampleExprs.size() && colsEqual; i++) {
boolean colFound = false;
for (int j = 0; j < tabBucketCols.size() && !colFound; j++) {
if (sampleExprs.get(i).getToken().getType() != HiveParser.TOK_TABLE_OR_COL) {
break;
}
if (((ASTNode) sampleExprs.get(i).getChild(0)).getText().equalsIgnoreCase(tabBucketCols.get(j))) {
colFound = true;
}
}
colsEqual = (colsEqual && colFound);
}
// Check if input can be pruned
ts.setInputPruning((sampleExprs == null || sampleExprs.size() == 0 || colsEqual));
// check if input pruning is enough
if ((sampleExprs == null || sampleExprs.size() == 0 || colsEqual) && (num == den || (den % numBuckets == 0 || numBuckets % den == 0))) {
// input pruning is enough; add the filter for the optimizer to use it
// later
LOG.info("No need for sample filter");
ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePredicate, true, new SampleDesc(ts.getNumerator(), ts.getDenominator(), tabBucketCols, true));
filterDesc.setGenerated(true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
} else {
// need to add filter
// create tableOp to be filterDesc and set as child to 'top'
LOG.info("Need sample filter");
ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePredicate, true);
filterDesc.setGenerated(true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
}
} else {
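// No explicit TABLESAMPLE clause: in Hive test mode an implicit sample may be injected unless the table is excluded.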
boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
if (testMode) {
String tabName = tab.getTableName();
// has the user explicitly asked not to sample this table
String unSampleTblList = conf.getVar(HiveConf.ConfVars.HIVETESTMODENOSAMPLE);
String[] unSampleTbls = unSampleTblList.split(",");
boolean unsample = false;
for (String unSampleTbl : unSampleTbls) {
if (tabName.equalsIgnoreCase(unSampleTbl)) {
unsample = true;
}
}
if (!unsample) {
int numBuckets = tab.getNumBuckets();
// If the input table is bucketed, choose the first bucket
if (numBuckets > 0) {
TableSample tsSample = new TableSample(1, numBuckets);
tsSample.setInputPruning(true);
qb.getParseInfo().setTabSample(alias, tsSample);
ExprNodeDesc samplePred = genSamplePredicate(tsSample, tab.getBucketCols(), true, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePred, true, new SampleDesc(tsSample.getNumerator(), tsSample.getDenominator(), tab.getBucketCols(), true));
filterDesc.setGenerated(true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
LOG.info("No need for sample filter");
} else {
// The table is not bucketed, add a dummy filter :: rand()
int freq = conf.getIntVar(HiveConf.ConfVars.HIVETESTMODESAMPLEFREQ);
TableSample tsSample = new TableSample(1, freq);
tsSample.setInputPruning(false);
qb.getParseInfo().setTabSample(alias, tsSample);
LOG.info("Need sample filter");
ExprNodeDesc randFunc = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand", new ExprNodeConstantDesc(Integer.valueOf(460476415)));
ExprNodeDesc samplePred = genSamplePredicate(tsSample, null, false, alias, rwsch, qb.getMetaData(), randFunc);
FilterDesc filterDesc = new FilterDesc(samplePred, true);
filterDesc.setGenerated(true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
}
}
}
}
Operator output = putOpInsertMap(op, rwsch);
if (LOG.isDebugEnabled()) {
LOG.debug("Created Table Plan for " + alias + " " + op.toString());
}
return output;
}
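The sampling branch above turns on the arithmetic of the condition near its top: input pruning alone is enough only when the sample columns line up with the bucket columns and the sample denominator divides the bucket count (or vice versa). A small self-contained sketch of that check with a few worked cases; the class and method names are illustrative, and the "no sample expressions or columns equal" test is collapsed into a single flag:

public class InputPruningCheck {
    // Mirrors the condition in genTablePlan that decides whether a TABLESAMPLE clause can be
    // satisfied purely by selecting whole buckets, without an extra sample filter operator.
    static boolean inputPruningIsEnough(boolean colsEqual, int num, int den, int numBuckets) {
        return colsEqual && (num == den || den % numBuckets == 0 || numBuckets % den == 0);
    }

    public static void main(String[] args) {
        // 32-bucket table, TABLESAMPLE(1 OUT OF 32): denominator equals the bucket count -> prune to one bucket.
        System.out.println(inputPruningIsEnough(true, 1, 32, 32)); // true
        // 32-bucket table, 1 OUT OF 8: 32 % 8 == 0, the sample maps onto whole buckets -> pruning suffices.
        System.out.println(inputPruningIsEnough(true, 1, 8, 32)); // true
        // 32-bucket table, 1 OUT OF 5: 5 divides neither way, so a residual sample filter is required.
        System.out.println(inputPruningIsEnough(true, 1, 5, 32)); // false
        // Sample columns differ from the bucket columns: pruning alone can never be enough.
        System.out.println(inputPruningIsEnough(false, 1, 32, 32)); // false
    }
}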