use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class GenMapRedUtils method createMRWorkForMergingFiles.
/**
 * Create the MapWork for a file-merge task, based on the input path, the top operator and
 * the input table descriptor.
 *
 * @param conf
 *          the Hive configuration
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork for the merge task
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
  ArrayList<String> aliases = new ArrayList<String>();
  Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
  String inputDirStr = inputDir.toString().intern();
  TableDesc tblDesc = fsDesc.getTableInfo();
  // dummy alias: just use the input path
  aliases.add(inputDirStr);
  // constructing the default MapredWork
  MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
  MapWork cplan = cMrPlan.getMapWork();
  cplan.addPathToAlias(inputDir, aliases);
  cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
  cplan.getAliasToWork().put(inputDirStr, topOp);
  cplan.setMapperCannotSpanPartns(true);
  return cplan;
}
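For orientation, the essential wiring this helper performs can be sketched in isolation: the merge input directory is mapped to a dummy alias and to a PartitionDesc that carries only the table descriptor (the partition spec is null). The sketch below is illustrative, not part of the Hive sources; it assumes the same Hive classes imported by the snippet above and uses MapWork's no-arg constructor instead of getMapRedWorkFromConf.

// Illustrative sketch (not Hive source): wire one merge input directory into a MapWork
// with a PartitionDesc built from the table descriptor alone (partSpec == null).
static MapWork sketchMergeMapWork(Path inputDir, TableDesc tblDesc, TableScanOperator topOp) {
  MapWork work = new MapWork();
  String alias = inputDir.toString();              // dummy alias: just the input path
  ArrayList<String> aliases = new ArrayList<String>();
  aliases.add(alias);
  work.addPathToAlias(inputDir, aliases);
  work.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
  work.getAliasToWork().put(alias, topOp);
  work.setMapperCannotSpanPartns(true);
  return work;
}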
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class GenMapRedUtils method setTaskPlan.
/**
 * Set the current task's input path, alias and operator tree in the given MapWork plan.
 *
 * @param path
 *          the current input path
 * @param alias
 *          current alias
 * @param topOp
 *          the top operator of the stack
 * @param plan
 *          current plan
 * @param local
 *          whether you need to add to map-reduce or local work
 * @param ttDesc
 *          table descriptor
 * @throws SemanticException
 */
public static void setTaskPlan(Path path, String alias, Operator<? extends OperatorDesc> topOp,
    MapWork plan, boolean local, TableDesc ttDesc) throws SemanticException {
  if (path == null || alias == null) {
    return;
  }
  if (topOp instanceof TableScanOperator) {
    try {
      Utilities.addSchemaEvolutionToTableScanOperator(
          (StructObjectInspector) ttDesc.getSerDe().getObjectInspector(), (TableScanOperator) topOp);
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }
  if (!local) {
    plan.addPathToAlias(path, alias);
    plan.addPathToPartitionInfo(path, new PartitionDesc(ttDesc, null));
    plan.getAliasToWork().put(alias, topOp);
  } else {
    // populate local work if needed
    MapredLocalWork localPlan = plan.getMapRedLocalWork();
    if (localPlan == null) {
      localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
          new LinkedHashMap<String, FetchWork>());
    }
    assert localPlan.getAliasToWork().get(alias) == null;
    assert localPlan.getAliasToFetchWork().get(alias) == null;
    localPlan.getAliasToWork().put(alias, topOp);
    localPlan.getAliasToFetchWork().put(alias, new FetchWork(new Path(alias), ttDesc));
    plan.setMapRedLocalWork(localPlan);
  }
}
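A hedged sketch of how this helper might be invoked when registering a table scan on the map-reduce side (local == false); the path and alias below are placeholders, not taken from the Hive sources.

// Illustrative caller (not Hive source): register a scan of a hypothetical path under alias "t".
static void sketchRegisterScan(TableScanOperator scanOp, TableDesc tblDesc) throws SemanticException {
  MapWork plan = new MapWork();
  Path path = new Path("/warehouse/t");            // hypothetical input path
  GenMapRedUtils.setTaskPlan(path, "t", scanOp, plan, false, tblDesc);
  // plan.getPathToPartitionInfo().get(path) now holds a PartitionDesc
  // wrapping tblDesc with a null partition spec.
}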
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class SplitGrouper method generateGroupedSplits.
/**
 * Generate groups of splits, separated by schema evolution boundaries.
 * When used from the compactor, splits are instead grouped by the bucket number of the
 * input files (in this case, splits for the same logical bucket but different schemas
 * end up in the same group).
 */
public Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf, Configuration conf,
    InputSplit[] splits, float waves, int availableSlots, String inputName,
    boolean groupAcrossFiles, SplitLocationProvider locationProvider) throws Exception {
  boolean isMinorCompaction = true;
  MapWork mapWork = populateMapWork(jobConf, inputName);
  // ArrayListMultimap is important here to retain the ordering for the splits.
  Multimap<Integer, InputSplit> schemaGroupedSplitMultiMap =
      ArrayListMultimap.<Integer, InputSplit>create();
  if (HiveConf.getVar(jobConf, HiveConf.ConfVars.SPLIT_GROUPING_MODE).equalsIgnoreCase("compactor")) {
    List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
    for (Path path : paths) {
      List<String> aliases = mapWork.getPathToAliases().get(path);
      if ((aliases != null) && (aliases.size() == 1)) {
        Operator<? extends OperatorDesc> op = mapWork.getAliasToWork().get(aliases.get(0));
        if ((op != null) && (op instanceof TableScanOperator)) {
          TableScanOperator tableScan = (TableScanOperator) op;
          PartitionDesc partitionDesc = mapWork.getAliasToPartnInfo().get(aliases.get(0));
          isMinorCompaction &= AcidUtils.isCompactionTable(partitionDesc.getTableDesc().getProperties());
          if (!tableScan.getConf().isTranscationalTable() && !isMinorCompaction) {
            String splitPath = getFirstSplitPath(splits);
            String errorMessage = "Compactor split grouping is enabled only for transactional tables. "
                + "Please check the path: " + splitPath;
            LOG.error(errorMessage);
            throw new RuntimeException(errorMessage);
          }
        }
      }
    }
    /**
     * The expectation is that each InputSplit is a {@link org.apache.hadoop.hive.ql.io.HiveInputFormat.HiveInputSplit}
     * wrapping an OrcSplit. So group these splits by bucketId and within each bucketId, sort by writeId, stmtId,
     * rowIdOffset or splitStart. For 'original' splits (w/o acid meta cols in the file) SyntheticBucketProperties
     * should always be there and so rowIdOffset is there. For 'native' acid files, OrcSplit doesn't have
     * the 1st rowid in the split, so splitStart is used to sort. This should achieve the required sorting invariance
     * (sort by: writeId, stmtId, rowIdOffset within each bucket) needed for Acid tables.
     * See: {@link org.apache.hadoop.hive.ql.io.AcidInputFormat}
     * Create a TezGroupedSplit for each bucketId and return.
     * TODO: Are there any other config values (split size etc) that can override this per writer split grouping?
     */
    return getCompactorSplitGroups(splits, conf, isMinorCompaction);
  }
  int i = 0;
  InputSplit prevSplit = null;
  for (InputSplit s : splits) {
    // this is the bit where we make sure we don't group across partition schema boundaries
    if (schemaEvolved(s, prevSplit, groupAcrossFiles, mapWork)) {
      ++i;
      prevSplit = s;
    }
    schemaGroupedSplitMultiMap.put(i, s);
  }
  LOG.info("# Src groups for split generation: " + (i + 1));
  // group them into the chunks we want
  Multimap<Integer, InputSplit> groupedSplits =
      this.group(jobConf, schemaGroupedSplitMultiMap, availableSlots, waves, locationProvider);
  return groupedSplits;
}
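The compactor branch above is only taken when the split grouping mode in the job configuration equals "compactor". A minimal, hedged sketch of how a caller might opt in before invoking the grouper (the helper name is illustrative):

// Illustrative sketch (not Hive source): enable compactor-style split grouping
// before calling generateGroupedSplits().
static JobConf sketchCompactorGroupingConf() {
  JobConf jobConf = new JobConf();
  HiveConf.setVar(jobConf, HiveConf.ConfVars.SPLIT_GROUPING_MODE, "compactor");
  // With this mode set, the grouper groups splits by bucket number and rejects
  // non-transactional inputs; otherwise it groups along schema evolution boundaries.
  return jobConf;
}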
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class MapJoinProcessor method genMapJoinLocalWork.
/**
 * Generate the MapredLocalWork for the given map-join operator.
 *
 * @param newWork
 *          the MapredWork being rewritten for the map join
 * @param mapJoinOp
 *          map-join operator for which local work needs to be generated.
 * @param bigTablePos
 *          position of the big (streamed) table among the join inputs
 * @throws SemanticException
 */
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp,
    int bigTablePos) throws SemanticException {
  // keep the small table aliases to avoid a concurrent modification exception
  ArrayList<String> smallTableAliasList = new ArrayList<String>();
  // create a new MapredLocalWork
  MapredLocalWork newLocalWork = new MapredLocalWork(
      new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
      new LinkedHashMap<String, FetchWork>());
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
      newWork.getMapWork().getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    // if the table scan is for the big table, skip it;
    // trace down the operator tree from the table scan operator
    Operator<? extends OperatorDesc> parentOp = op;
    Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
    while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
      parentOp = childOp;
      assert parentOp.getChildOperators().size() == 1;
      childOp = parentOp.getChildOperators().get(0);
    }
    if (childOp == null) {
      throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
    }
    // skip the big table pos
    int i = childOp.getParentOperators().indexOf(parentOp);
    if (i == bigTablePos) {
      continue;
    }
    // set alias to work and put into smallTableAliasList
    newLocalWork.getAliasToWork().put(alias, op);
    smallTableAliasList.add(alias);
    // get the input path and remove this alias from pathToAlias
    // because this file will be fetched by the fetch operator
    Map<Path, List<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
    // record all the input paths for this alias
    HashSet<Path> pathSet = new HashSet<>();
    HashSet<Path> emptyPath = new HashSet<>();
    for (Map.Entry<Path, List<String>> entry2 : pathToAliases.entrySet()) {
      Path path = entry2.getKey();
      List<String> list = entry2.getValue();
      if (list.contains(alias)) {
        // add to path set
        pathSet.add(path);
        // remove this alias from the alias list
        list.remove(alias);
        if (list.size() == 0) {
          emptyPath.add(path);
        }
      }
    }
    // remove the paths with which no alias is associated
    for (Path path : emptyPath) {
      newWork.getMapWork().removePathToAlias(path);
    }
    // create fetch work
    FetchWork fetchWork = null;
    List<Path> partDir = new ArrayList<Path>();
    List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    for (Path tablePath : pathSet) {
      PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
      // create FetchWork for a non-partitioned table
      if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
        fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
        break;
      }
      // if the table is partitioned, add partDir and partitionDesc
      partDir.add(tablePath);
      partDesc.add(partitionDesc);
    }
    // create FetchWork for a partitioned table
    if (fetchWork == null) {
      TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
      fetchWork = new FetchWork(partDir, partDesc, table);
    }
    // set alias to fetch work
    newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
  }
  // remove small table aliases from aliasToWork; avoid concurrent modification
  for (String alias : smallTableAliasList) {
    newWork.getMapWork().getAliasToWork().remove(alias);
  }
  // set up local work
  newWork.getMapWork().setMapRedLocalWork(newLocalWork);
  // remove reducer
  newWork.setReduceWork(null);
}
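The per-alias FetchWork construction above hinges on PartitionDesc.getPartSpec(): a null or empty spec means the table is unpartitioned and a single table-level FetchWork suffices, otherwise the partition directories and descriptors are collected. A condensed, hedged sketch of just that decision (the helper name is illustrative, not Hive API):

// Sketch (not Hive source): choose between a table-level and a partitioned FetchWork.
static FetchWork sketchFetchWork(Map<Path, PartitionDesc> pathToPartitionInfo, TableDesc table) {
  List<Path> partDirs = new ArrayList<Path>();
  List<PartitionDesc> partDescs = new ArrayList<PartitionDesc>();
  for (Map.Entry<Path, PartitionDesc> e : pathToPartitionInfo.entrySet()) {
    PartitionDesc pd = e.getValue();
    if (pd.getPartSpec() == null || pd.getPartSpec().isEmpty()) {
      // unpartitioned table: a single path and the table descriptor are enough
      return new FetchWork(e.getKey(), pd.getTableDesc());
    }
    partDirs.add(e.getKey());
    partDescs.add(pd);
  }
  return new FetchWork(partDirs, partDescs, table);
}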
use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
the class SparkDynamicPartitionPruner method applyFilterToPartitions.
private void applyFilterToPartitions(MapWork work, ObjectInspectorConverters.Converter converter,
    ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
  Object[] row = new Object[1];
  Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
  while (it.hasNext()) {
    Path p = it.next();
    PartitionDesc desc = work.getPathToPartitionInfo().get(p);
    Map<String, String> spec = desc.getPartSpec();
    Preconditions.checkNotNull(spec, "No partition spec found in dynamic pruning");
    String partValueString = spec.get(columnName);
    Preconditions.checkNotNull(partValueString, "Could not find partition value for column: " + columnName);
    Object partValue = converter.convert(partValueString);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
    }
    row[0] = partValue;
    partValue = eval.evaluate(row);
    if (LOG.isDebugEnabled()) {
      LOG.debug("part key expr applied: " + partValue);
    }
    if (!values.contains(partValue)) {
      LOG.info("Pruning path: " + p);
      it.remove();
      work.removePathToAlias(p);
      work.removePathToPartitionInfo(p);
    }
  }
}
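The pruning above is driven entirely by PartitionDesc.getPartSpec(). A reduced, hedged sketch of the same removal pattern, dropping the converter and expression evaluation and keeping only the spec lookup (the method name and the pre-computed set of retained values are illustrative):

// Sketch (not Hive source): drop every path whose partition-column value is not retained.
static void sketchPrunePaths(MapWork work, String columnName, Set<String> retainedValues) {
  Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
  while (it.hasNext()) {
    Path p = it.next();
    PartitionDesc desc = work.getPathToPartitionInfo().get(p);
    String partValue = desc.getPartSpec().get(columnName);
    if (!retainedValues.contains(partValue)) {
      it.remove();                       // removes the pathToPartitionInfo entry via the iterator
      work.removePathToAlias(p);         // keep the path-to-alias map consistent
    }
  }
}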