Use of org.apache.hadoop.hive.ql.parse.SemanticException in project hive by apache.
From the class GenMapRedUtils, the method setMapWork:
/**
 * Initialize MapWork.
 *
 * @param plan
 *          map work to initialize
 * @param parseCtx
 *          current parse context
 * @param inputs
 *          read entities for the map work
 * @param partsList
 *          pruned partition list; if it is null it will be computed on the fly
 * @param tsOp
 *          the top (table scan) operator of the stack
 * @param alias_id
 *          current alias
 * @param conf
 *          current instance of hive conf
 * @param local
 *          whether to add the work to map-reduce work or to local work
 */
public static void setMapWork(MapWork plan, ParseContext parseCtx, Set<ReadEntity> inputs, PrunedPartitionList partsList, TableScanOperator tsOp, String alias_id, HiveConf conf, boolean local) throws SemanticException {
ArrayList<Path> partDir = new ArrayList<Path>();
ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
boolean isFullAcidTable = false;
Path tblDir = null;
plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
// we also collect table stats while collecting column stats.
if (parseCtx.getAnalyzeRewrite() != null) {
plan.setGatheringStats(true);
}
if (partsList == null) {
try {
partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
isFullAcidTable = tsOp.getConf().isFullAcidTable();
} catch (SemanticException e) {
throw e;
}
}
// Generate the map work for this alias_id
// pass both confirmed and unknown partitions through the map-reduce
// framework
Set<Partition> parts = partsList.getPartitions();
TableDesc tableSpec = Utilities.getTableDesc(tsOp.getConf().getTableMetadata());
PartitionDesc aliasPartnDesc = null;
try {
if (!parts.isEmpty()) {
aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next(), tableSpec);
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// The table does not have any partitions
if (aliasPartnDesc == null) {
aliasPartnDesc = new PartitionDesc(tableSpec, null);
}
Map<String, String> props = tsOp.getConf().getOpProps();
if (props != null) {
Properties target = aliasPartnDesc.getProperties();
target.putAll(props);
}
plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);
long sizeNeeded = Integer.MAX_VALUE;
int fileLimit = -1;
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isFullAcidTable) {
LOG.info("Skipping Global Limit optimization for an ACID table");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
sizeNeeded = (parseCtx.getGlobalLimitCtx().getGlobalOffset() + parseCtx.getGlobalLimitCtx().getGlobalLimit()) * sizePerRow;
// For the optimization that reduces the number of input files, we limit the
// number of files allowed. If more than that number of files would have to be
// selected, we skip the optimization, since having too many input files can
// cause unpredictable latency and is not necessarily cheaper.
fileLimit = HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
if (sizePerRow <= 0 || fileLimit <= 0) {
LOG.info("Skip optimization to reduce input size of 'limit'");
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (parts.isEmpty()) {
LOG.info("Empty input: skip limit optimization");
} else {
LOG.info("Try to reduce input size for 'limit' " + "sizeNeeded: " + sizeNeeded + " file limit : " + fileLimit);
}
}
}
boolean isFirstPart = true;
boolean emptyInput = true;
boolean singlePartition = (parts.size() == 1);
// Track the dependencies for the view. Consider a query like: select * from V;
// where V is a view of the form: select * from T
// The dependencies should include V at depth 0, and T at depth 1 (inferred).
Map<String, ReadEntity> viewToInput = parseCtx.getViewAliasToInput();
ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(alias_id, viewToInput);
// The table should also be considered a part of inputs, even if the table is a
// partitioned table and whether any partition is selected or not
// This read entity is a direct read entity and not an indirect read (that is when
// this is being read because it is a dependency of a view).
boolean isDirectRead = (parentViewInfo == null);
TableDesc tblDesc = null;
boolean initTableDesc = false;
PlanUtils.addPartitionInputs(parts, inputs, parentViewInfo, isDirectRead);
for (Partition part : parts) {
// Later the properties have to come from the partition as opposed
// to from the table in order to support versioning.
Path[] paths = null;
SampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(tsOp);
// Lookup list bucketing pruner
Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(tsOp);
ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName()) : null;
if (sampleDescr != null) {
assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexist.";
paths = SamplePruner.prune(part, sampleDescr);
parseCtx.getGlobalLimitCtx().disableOpt();
} else if (listBucketingPruner != null) {
assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
/* Use the list bucketing pruner's paths. */
paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
} else {
// Now we only try the first partition; if the first partition doesn't
// contain enough size, we change to normal mode.
if (parseCtx.getGlobalLimitCtx().isEnable()) {
if (isFirstPart) {
long sizeLeft = sizeNeeded;
ArrayList<Path> retPathList = new ArrayList<Path>();
SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft, fileLimit, retPathList);
if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
continue;
} else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
LOG.info("Use full input -- first " + fileLimit + " files are more than " + sizeNeeded + " bytes");
parseCtx.getGlobalLimitCtx().disableOpt();
} else {
emptyInput = false;
paths = new Path[retPathList.size()];
int index = 0;
for (Path path : retPathList) {
paths[index++] = path;
}
if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
// If all files are needed to meet the size limit, we disable the
// optimization. This usually happens for an empty table/partition or a
// table/partition with only one file. By disabling the optimization we
// avoid retrying the query if there are not sufficient rows.
parseCtx.getGlobalLimitCtx().disableOpt();
}
}
isFirstPart = false;
} else {
paths = new Path[0];
}
}
if (!parseCtx.getGlobalLimitCtx().isEnable()) {
paths = part.getPath();
}
}
// Is it a partitioned table?
if (!part.getTable().isPartitioned()) {
assert (tblDir == null);
tblDir = paths[0];
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
} else if (tblDesc == null) {
if (!initTableDesc) {
tblDesc = Utilities.getTableDesc(part.getTable());
initTableDesc = true;
}
}
if (props != null) {
Properties target = tblDesc.getProperties();
target.putAll(props);
}
for (Path p : paths) {
if (p == null) {
continue;
}
String path = p.toString();
if (LOG.isDebugEnabled()) {
LOG.debug("Adding " + path + " of table " + alias_id);
}
partDir.add(p);
try {
if (part.getTable().isPartitioned()) {
partDesc.add(Utilities.getPartitionDesc(part, tblDesc));
} else {
partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part, false));
}
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
}
}
if (emptyInput) {
parseCtx.getGlobalLimitCtx().disableOpt();
}
Utilities.addSchemaEvolutionToTableScanOperator(partsList.getSourceTable(), tsOp);
Iterator<Path> iterPath = partDir.iterator();
Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();
if (!local) {
while (iterPath.hasNext()) {
assert iterPartnDesc.hasNext();
Path path = iterPath.next();
PartitionDesc prtDesc = iterPartnDesc.next();
// Add the path to alias mapping
plan.addPathToAlias(path, alias_id);
plan.addPathToPartitionInfo(path, prtDesc);
if (LOG.isDebugEnabled()) {
LOG.debug("Information added for path " + path);
}
}
assert plan.getAliasToWork().get(alias_id) == null;
plan.getAliasToWork().put(alias_id, tsOp);
} else {
// populate local work if needed
MapredLocalWork localPlan = plan.getMapRedLocalWork();
if (localPlan == null) {
localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
}
assert localPlan.getAliasToWork().get(alias_id) == null;
assert localPlan.getAliasToFetchWork().get(alias_id) == null;
localPlan.getAliasToWork().put(alias_id, tsOp);
if (tblDir == null) {
tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(partDir, partDesc, tblDesc));
} else {
localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir, tblDesc));
}
plan.setMapRedLocalWork(localPlan);
}
}
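The SemanticException usage in this method is the wrapping pattern in the two catch blocks above: the checked HiveException thrown by the metadata utilities is logged and rethrown as a SemanticException, so query compilation fails with a planner-level error instead of a raw runtime failure. A minimal sketch of that pattern, pulled out into a hypothetical helper (buildPartitionDesc is illustrative and not part of GenMapRedUtils):

// Illustrative helper only: shows the HiveException -> SemanticException
// wrapping used in setMapWork above.
private static PartitionDesc buildPartitionDesc(Partition part, TableDesc tblDesc)
    throws SemanticException {
  try {
    return Utilities.getPartitionDesc(part, tblDesc);
  } catch (HiveException e) {
    // Log the full stack trace, then surface the failure as a compile-time
    // error, keeping the original exception as the cause.
    LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    throw new SemanticException(e.getMessage(), e);
  }
}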
Use of org.apache.hadoop.hive.ql.parse.SemanticException in project hive by apache.
From the class SamplePruner, the method limitPrune:
/**
 * Try to generate a subset of the files in the partition that reaches the
 * given size limit while using fewer than fileLimit files.
 *
 * @param part the partition to prune
 * @param sizeLimit total size (in bytes) that the selected files need to cover
 * @param fileLimit maximum number of files that may be selected
 * @param retPathList list of Paths returned
 * @return the result of the attempt
 * @throws SemanticException
 */
public static LimitPruneRetStatus limitPrune(Partition part, long sizeLimit, int fileLimit, Collection<Path> retPathList) throws SemanticException {
try {
FileSystem fs = part.getDataLocation().getFileSystem(Hive.get().getConf());
String pathPattern = part.getDataLocation().toString() + "/*";
AddPathReturnStatus ret = addPath(fs, pathPattern, sizeLimit, fileLimit, retPathList);
if (ret == null) {
return LimitPruneRetStatus.NotQualify;
} else if (!ret.hasFile) {
return LimitPruneRetStatus.NoFile;
} else if (ret.sizeLeft > 0) {
return LimitPruneRetStatus.NotQualify;
} else if (ret.allFile) {
return LimitPruneRetStatus.NeedAllFiles;
} else {
return LimitPruneRetStatus.NeedSomeFiles;
}
} catch (Exception e) {
throw new RuntimeException("Cannot get path", e);
}
}
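Callers react to each LimitPruneRetStatus differently, as setMapWork does above: NoFile skips the partition, NotQualify disables the global-limit optimization, and NeedSomeFiles/NeedAllFiles use the returned subset of paths. A condensed, hypothetical caller sketch (globalLimitCtx, part, sizeNeeded, fileLimit and paths are assumed to exist in the surrounding context):

// Hypothetical caller of SamplePruner.limitPrune; only the status handling
// mirrors the real code in GenMapRedUtils.setMapWork.
List<Path> picked = new ArrayList<>();
SamplePruner.LimitPruneRetStatus status =
    SamplePruner.limitPrune(part, sizeNeeded, fileLimit, picked);
switch (status) {
  case NoFile:
    // The partition has no files; nothing to scan for this alias.
    break;
  case NotQualify:
    // The first fileLimit files do not cover sizeNeeded bytes; use the full input.
    globalLimitCtx.disableOpt();
    break;
  case NeedSomeFiles:
  case NeedAllFiles:
    // Scan only the selected subset of files.
    paths = picked.toArray(new Path[0]);
    break;
}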
Use of org.apache.hadoop.hive.ql.parse.SemanticException in project hive by apache.
From the class SparkMapJoinProcessor, the method convertMapJoin:
/**
 * Convert a regular join to a map-side join.
 *
 * @param conf current hive conf
 * @param op the join operator to convert
 * @param leftSrc
 * @param baseSrc
 * @param mapAliases
 * @param bigTablePos position of the source to be read as part of the
 *          map-reduce framework; all other sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
@Override
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftSrc, String[] baseSrc, List<String> mapAliases, int bigTablePos, boolean noCheckOuterJoin, boolean validateMapJoinTree) throws SemanticException {
// outer join cannot be performed on a table which is being cached
JoinCondDesc[] condns = op.getConf().getConds();
if (!noCheckOuterJoin) {
if (checkMapJoin(bigTablePos, condns) < 0) {
throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
}
}
// create the map-join operator
MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, op.getConf().isLeftInputJoin(), op.getConf().getBaseSrc(), op.getConf().getMapAliases(), bigTablePos, noCheckOuterJoin);
// 1. remove RS as parent for the big table branch
// 2. remove old join op from child set of all the RSs
List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
for (int i = 0; i < parentOps.size(); i++) {
Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
parentOp.getChildOperators().remove(op);
if (i == bigTablePos) {
List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
Preconditions.checkArgument(grandParentOps.size() == 1, "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
grandParentOp.replaceChild(parentOp, mapJoinOp);
mapJoinOp.replaceParent(parentOp, grandParentOp);
}
}
return mapJoinOp;
}
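Here SemanticException acts as a validation failure: if the requested big-table position conflicts with an outer join condition, conversion is rejected with ErrorMsg.NO_OUTER_MAPJOIN and compilation stops. A hypothetical invocation sketch, passing the same values the method itself reads from the join's configuration (joinOp and hiveConf are assumed to come from the surrounding optimizer context):

// Hypothetical call site; in Hive these values are supplied by an optimizer rule.
SparkMapJoinProcessor processor = new SparkMapJoinProcessor();
JoinDesc desc = joinOp.getConf();
try {
  MapJoinOperator mapJoinOp = processor.convertMapJoin(
      hiveConf, joinOp,
      desc.isLeftInputJoin(), desc.getBaseSrc(), desc.getMapAliases(),
      /* bigTablePos */ 0,
      /* noCheckOuterJoin */ false,
      /* validateMapJoinTree */ true);
} catch (SemanticException e) {
  // NO_OUTER_MAPJOIN: the chosen big-table position is not allowed for this
  // outer join, so the caller falls back to a regular shuffle join plan.
}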
Use of org.apache.hadoop.hive.ql.parse.SemanticException in project hive by apache.
From the class HiveIntersectRewriteRule, the method onMatch:
// ~ Methods ----------------------------------------------------------------
public void onMatch(RelOptRuleCall call) {
final HiveIntersect hiveIntersect = call.rel(0);
final RelOptCluster cluster = hiveIntersect.getCluster();
final RexBuilder rexBuilder = cluster.getRexBuilder();
int numOfBranch = hiveIntersect.getInputs().size();
Builder<RelNode> bldr = new ImmutableList.Builder<RelNode>();
// 1st level GB: create a GB (col0, col1, count(1) as c) for each branch
for (int index = 0; index < numOfBranch; index++) {
RelNode input = hiveIntersect.getInputs().get(index);
final List<RexNode> gbChildProjLst = Lists.newArrayList();
final List<Integer> groupSetPositions = Lists.newArrayList();
for (int cInd = 0; cInd < input.getRowType().getFieldList().size(); cInd++) {
gbChildProjLst.add(rexBuilder.makeInputRef(input, cInd));
groupSetPositions.add(cInd);
}
gbChildProjLst.add(rexBuilder.makeBigintLiteral(new BigDecimal(1)));
// Create the project before the GB because we need a new project with the extra column '1'.
RelNode gbInputRel = null;
try {
gbInputRel = HiveProject.create(input, gbChildProjLst, null);
} catch (CalciteSemanticException e) {
LOG.debug(e.toString());
throw new RuntimeException(e);
}
// groupSetPosition includes all the positions
final ImmutableBitSet groupSet = ImmutableBitSet.of(groupSetPositions);
List<AggregateCall> aggregateCalls = Lists.newArrayList();
RelDataType aggFnRetType = TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory());
// count(1), 1's position is input.getRowType().getFieldList().size()
AggregateCall aggregateCall = HiveCalciteUtil.createSingleArgAggCall("count", cluster, TypeInfoFactory.longTypeInfo, input.getRowType().getFieldList().size(), aggFnRetType);
aggregateCalls.add(aggregateCall);
HiveRelNode aggregateRel = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel, groupSet, null, aggregateCalls);
bldr.add(aggregateRel);
}
// create a union above all the branches
HiveRelNode union = new HiveUnion(cluster, TraitsUtil.getDefaultTraitSet(cluster), bldr.build());
// 2nd level GB: create a GB (col0, col1, count(c)) over the union of all branches
final List<Integer> groupSetPositions = Lists.newArrayList();
// the index of c
int cInd = union.getRowType().getFieldList().size() - 1;
for (int index = 0; index < union.getRowType().getFieldList().size(); index++) {
if (index != cInd) {
groupSetPositions.add(index);
}
}
List<AggregateCall> aggregateCalls = Lists.newArrayList();
RelDataType aggFnRetType = TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory());
AggregateCall aggregateCall = HiveCalciteUtil.createSingleArgAggCall("count", cluster, TypeInfoFactory.longTypeInfo, cInd, aggFnRetType);
aggregateCalls.add(aggregateCall);
if (hiveIntersect.all) {
aggregateCall = HiveCalciteUtil.createSingleArgAggCall("min", cluster, TypeInfoFactory.longTypeInfo, cInd, aggFnRetType);
aggregateCalls.add(aggregateCall);
}
final ImmutableBitSet groupSet = ImmutableBitSet.of(groupSetPositions);
HiveRelNode aggregateRel = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), union, groupSet, null, aggregateCalls);
// add a filter count(c) = #branches
int countInd = cInd;
List<RexNode> childRexNodeLst = new ArrayList<RexNode>();
RexInputRef ref = rexBuilder.makeInputRef(aggregateRel, countInd);
RexLiteral literal = rexBuilder.makeBigintLiteral(new BigDecimal(numOfBranch));
childRexNodeLst.add(ref);
childRexNodeLst.add(literal);
ImmutableList.Builder<RelDataType> calciteArgTypesBldr = new ImmutableList.Builder<RelDataType>();
calciteArgTypesBldr.add(TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory()));
calciteArgTypesBldr.add(TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory()));
RexNode factoredFilterExpr = null;
try {
factoredFilterExpr = rexBuilder.makeCall(SqlFunctionConverter.getCalciteFn("=", calciteArgTypesBldr.build(), TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory()), true, false), childRexNodeLst);
} catch (CalciteSemanticException e) {
LOG.debug(e.toString());
throw new RuntimeException(e);
}
RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), aggregateRel, factoredFilterExpr);
if (!hiveIntersect.all) {
// the schema for intersect distinct is like this
// R3 on all attributes + count(c) as cnt
// finally add a project to project out the last column
Set<Integer> projectOutColumnPositions = new HashSet<>();
projectOutColumnPositions.add(filterRel.getRowType().getFieldList().size() - 1);
try {
call.transformTo(HiveCalciteUtil.createProjectWithoutColumn(filterRel, projectOutColumnPositions));
} catch (CalciteSemanticException e) {
LOG.debug(e.toString());
throw new RuntimeException(e);
}
} else {
// the schema for intersect all is like this:
// R3 + count(c) as cnt + min(c) as m
// we create an input project for the UDTF whose schema is like this:
// min(c) as m + R3
List<RexNode> originalInputRefs = Lists.transform(filterRel.getRowType().getFieldList(), new Function<RelDataTypeField, RexNode>() {
@Override
public RexNode apply(RelDataTypeField input) {
return new RexInputRef(input.getIndex(), input.getType());
}
});
List<RexNode> copyInputRefs = new ArrayList<>();
copyInputRefs.add(originalInputRefs.get(originalInputRefs.size() - 1));
for (int i = 0; i < originalInputRefs.size() - 2; i++) {
copyInputRefs.add(originalInputRefs.get(i));
}
RelNode srcRel = null;
try {
srcRel = HiveProject.create(filterRel, copyInputRefs, null);
HiveTableFunctionScan udtf = HiveCalciteUtil.createUDTFForSetOp(cluster, srcRel);
// finally add a project to project out the 1st column
Set<Integer> projectOutColumnPositions = new HashSet<>();
projectOutColumnPositions.add(0);
call.transformTo(HiveCalciteUtil.createProjectWithoutColumn(udtf, projectOutColumnPositions));
} catch (SemanticException e) {
LOG.debug(e.toString());
throw new RuntimeException(e);
}
}
}
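The plan built above is the classic count-based rewrite of INTERSECT. As a point of reference, for a two-branch INTERSECT DISTINCT the generated plan corresponds to the following hand-written HiveQL (schematic only; r1, r2 and the column names are hypothetical):

SELECT col0, col1
FROM (SELECT col0, col1, count(c) AS cnt
      FROM (SELECT col0, col1, count(1) AS c FROM r1 GROUP BY col0, col1
            UNION ALL
            SELECT col0, col1, count(1) AS c FROM r2 GROUP BY col0, col1) u
      GROUP BY col0, col1) g
WHERE cnt = 2;   -- 2 = number of INTERSECT branches

For INTERSECT ALL, the second-level aggregate additionally computes min(c), and the UDTF created by HiveCalciteUtil.createUDTFForSetOp replicates each qualifying row that many times before the final project drops the helper column.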
Use of org.apache.hadoop.hive.ql.parse.SemanticException in project hive by apache.
From the class ConvertJoinMapJoin, the method checkAndConvertSMBJoin:
@SuppressWarnings("unchecked")
private Object checkAndConvertSMBJoin(OptimizeTezProcContext context, JoinOperator joinOp, TezBucketJoinProcCtx tezBucketJoinProcCtx, final long maxSize) throws SemanticException {
// We cannot convert to a bucket map join, and we cannot convert to a map join
// either, based on the size. Check if we can convert to an SMB join.
if (!(HiveConf.getBoolVar(context.conf, ConfVars.HIVE_AUTO_SORTMERGE_JOIN)) || ((!HiveConf.getBoolVar(context.conf, ConfVars.HIVE_AUTO_SORTMERGE_JOIN_REDUCE)) && joinOp.getOpTraits().getNumReduceSinks() >= 2)) {
fallbackToReduceSideJoin(joinOp, context, maxSize);
return null;
}
Class<? extends BigTableSelectorForAutoSMJ> bigTableMatcherClass = null;
try {
String selector = HiveConf.getVar(context.parseContext.getConf(), HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR);
bigTableMatcherClass = JavaUtils.loadClass(selector);
} catch (ClassNotFoundException e) {
throw new SemanticException(e.getMessage());
}
BigTableSelectorForAutoSMJ bigTableMatcher = ReflectionUtils.newInstance(bigTableMatcherClass, null);
JoinDesc joinDesc = joinOp.getConf();
JoinCondDesc[] joinCondns = joinDesc.getConds();
Set<Integer> joinCandidates = MapJoinProcessor.getBigTableCandidates(joinCondns);
if (joinCandidates.isEmpty()) {
// There is no valid big table candidate (e.g. the join is a full outer join),
// so this can never be a map-join of any type. So return false.
return false;
}
int mapJoinConversionPos = bigTableMatcher.getBigTablePosition(context.parseContext, joinOp, joinCandidates);
if (mapJoinConversionPos < 0) {
// Contains aliases from a sub-query: we are just converting to a common
// merge join operator (the shuffle join in the map-reduce case).
fallbackToReduceSideJoin(joinOp, context, maxSize);
return null;
}
if (checkConvertJoinSMBJoin(joinOp, context, mapJoinConversionPos, tezBucketJoinProcCtx)) {
convertJoinSMBJoin(joinOp, context, mapJoinConversionPos, tezBucketJoinProcCtx.getNumBuckets(), true);
} else {
// We are just converting to a common merge join operator (the shuffle
// join in the map-reduce case).
fallbackToReduceSideJoin(joinOp, context, maxSize);
}
return null;
}
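The SemanticException in this method wraps a configuration problem: the big-table selector class is named by HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR, and if that class cannot be loaded, planning aborts. A minimal sketch of the load-and-wrap pattern (variable names are illustrative; the conf var, JavaUtils.loadClass and ReflectionUtils.newInstance calls are the same ones used above):

// Sketch: resolve the configured big-table selector and surface a load
// failure as a compile-time SemanticException.
String selector = HiveConf.getVar(conf,
    HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR);
Class<? extends BigTableSelectorForAutoSMJ> selectorClass;
try {
  selectorClass = JavaUtils.loadClass(selector);
} catch (ClassNotFoundException e) {
  // Unlike the original, this sketch also keeps the cause so the
  // misconfigured class name stays visible in the stack trace.
  throw new SemanticException(e.getMessage(), e);
}
BigTableSelectorForAutoSMJ bigTableMatcher =
    ReflectionUtils.newInstance(selectorClass, null);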