Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class ProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  GenTezProcContext context = (GenTezProcContext) procContext;
  TableScanOperator tableScan = (TableScanOperator) nd;
  ParseContext parseContext = context.parseContext;
  Table table = tableScan.getConf().getTableMetadata();
  Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    assert alias != null;
    TezWork tezWork = context.currentTask.getWork();
    if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
      // For ORC & Parquet, all the following statements are the same
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Tez job above this task
      StatsWork statWork = new StatsWork(table, parseContext.getConf());
      statWork.setFooterScan();
      // If partition is specified, get pruned partition list
      Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (confirmedParts.size() > 0) {
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
        statWork.addInputPartitions(partList.getPartitions());
      }
      Task<StatsWork> snjTask = TaskFactory.get(statWork);
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple TezTask followed by a StatsTask.
      // The Tez task is just a simple TableScanOperator
      BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
      basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
      StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
      columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
      columnStatsWork.setSourceTask(context.currentTask);
      Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
      context.currentTask.addDependentTask(statsTask);
      // For a noscan command the plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }
      // NOTE: here we should use the new partition predicate pushdown API to
      // get the list of pruned partitions,
      // and pass it to setTaskPlan as the last parameter
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
      }
      MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
      w.setGatheringStats(true);
      return true;
    }
  }
  return null;
}
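To summarize the two branches above, here is a rough sketch of the task graphs this processor leaves behind (added here for orientation; it is not taken from the Hive source):

// ORC/Parquet (footer scan), with or without noscan:
//   rootTasks: [ StatsTask(statWork with footer scan) ]   // the original TezTask is removed
//
// Any other input format:
//   rootTasks: [ TezTask(MapWork w: a single TableScanOperator, gathering stats) ]
//                  +-- StatsTask(columnStatsWork wrapping basicStatsWork)
//   noscan variant: the StatsTask is promoted to a root task and the TezTask is dropped.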
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class GenSparkUtils, method processPartitionPruningSink.
/**
 * Populate partition pruning information from the pruning sink operator to the
 * target MapWork (the MapWork for the big table side). The information includes the source table
 * name, column name, and partition key expression. It also sets up the temporary path used to
 * communicate between the target MapWork and source BaseWork.
 *
 * Here "source" refers to the small table side, while "target" refers to the big
 * table side.
 *
 * @param context the Spark context.
 * @param pruningSink the pruning sink operator being processed.
 */
public void processPartitionPruningSink(GenSparkProcContext context, SparkPartitionPruningSinkOperator pruningSink) {
  SparkPartitionPruningSinkDesc desc = pruningSink.getConf();
  final Path outputBase = getDPPOutputPath(context.parseContext.getContext());
  final String sourceId = pruningSink.getUniqueId();
  desc.setPath(new Path(outputBase, sourceId));
  for (SparkPartitionPruningSinkDesc.DPPTargetInfo targetInfo : desc.getTargetInfos()) {
    TableScanOperator ts = targetInfo.tableScan;
    MapWork targetWork = (MapWork) context.rootToWorkMap.get(ts);
    Preconditions.checkNotNull(targetWork, "No targetWork found for tablescan " + ts);
    // set up temporary path to communicate between the small/big table
    if (targetWork.getTmpPathForPartitionPruning() == null) {
      targetWork.setTmpPathForPartitionPruning(outputBase);
      LOG.info("Setting tmp path between source work and target work:\n" + outputBase);
    }
    targetInfo.work = targetWork;
    targetInfo.columnName = SparkUtilities.getWorkId(targetWork) + ":" + targetInfo.columnName;
    pruningSink.addAsSourceEvent(targetWork, targetInfo.partKey, targetInfo.columnName, targetInfo.columnType);
  }
}
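The path wiring above is easier to follow as a sketch of the resulting layout (illustrative only; the concrete directory names come from getDPPOutputPath and getUniqueId and are not spelled out in the snippet):

// outputBase/                      <- shared tmp path, set on every target MapWork
//   <pruningSink uniqueId>/        <- desc.getPath(), where the small-table side writes
//                                     the pruning values for its targets
// Each target column is registered as "<workId>:<columnName>", so several target
// MapWorks (and several partition columns per MapWork) can share one base directory.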
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class GenSparkUtils, method createMapWork.
public MapWork createMapWork(GenSparkProcContext context, Operator<?> root, SparkWork sparkWork, PrunedPartitionList partitions, boolean deferSetup) throws SemanticException {
  Preconditions.checkArgument(root.getParentOperators().isEmpty(), "AssertionError: expected root.getParentOperators() to be empty");
  MapWork mapWork = new MapWork("Map " + (++sequenceNumber));
  LOG.debug("Adding map work (" + mapWork.getName() + ") for " + root);
  // map work starts with table scan operators
  Preconditions.checkArgument(root instanceof TableScanOperator, "AssertionError: expected root to be an instance of TableScanOperator, but was " + root.getClass().getName());
  String alias_id = null;
  if (context.parseContext != null && context.parseContext.getTopOps() != null) {
    for (String currentAliasID : context.parseContext.getTopOps().keySet()) {
      Operator<? extends OperatorDesc> currOp = context.parseContext.getTopOps().get(currentAliasID);
      if (currOp == root) {
        alias_id = currentAliasID;
        break;
      }
    }
  }
  if (alias_id == null)
    alias_id = ((TableScanOperator) root).getConf().getAlias();
  if (!deferSetup) {
    setupMapWork(mapWork, context, partitions, (TableScanOperator) root, alias_id);
  }
  // add new item to the Spark work
  sparkWork.add(mapWork);
  return mapWork;
}
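A hedged usage sketch (the variables below are assumed to be in scope inside a GenSpark processor; they are not part of the snippet): most callers create the MapWork and let the table-scan setup run immediately by passing deferSetup = false.

// `context` is the GenSparkProcContext, `sparkWork` the work graph being built,
// `tableScan` the root TableScanOperator, and `partitions` a possibly-null
// PrunedPartitionList; deferSetup = false triggers setupMapWork right away.
MapWork mapWork = createMapWork(context, tableScan, sparkWork, partitions, false);

The Tez analyze path in the first snippet does the analogous thing through its own utils.createMapWork call and then flags the resulting MapWork for stats gathering.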
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class TestInputOutputFormat, method createMockExecutionEnvironment.
/**
 * Create a mock execution environment that has enough detail that
 * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
 * explode.
 * @param workDir a local filesystem work directory
 * @param warehouseDir a mock filesystem warehouse directory
 * @param tableName the table name
 * @param objectInspector object inspector for the row
 * @param isVectorized should run vectorized
 * @param partitions the number of partitions to create
 * @return a JobConf that contains the necessary information
 * @throws IOException
 * @throws HiveException
 */
JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions) throws IOException, HiveException {
  JobConf conf = new JobConf();
  Utilities.clearWorkMap(conf);
  conf.set("hive.exec.plan", workDir.toString());
  conf.set("mapred.job.tracker", "local");
  String isVectorizedString = Boolean.toString(isVectorized);
  conf.set("hive.vectorized.execution.enabled", isVectorizedString);
  conf.set(Utilities.VECTOR_MODE, isVectorizedString);
  conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  conf.set("mapred.mapper.class", ExecMapper.class.getName());
  Path root = new Path(warehouseDir, tableName);
  // clean out previous contents
  ((MockFileSystem) root.getFileSystem(conf)).clear();
  // build partition strings
  String[] partPath = new String[partitions];
  StringBuilder buffer = new StringBuilder();
  for (int p = 0; p < partitions; ++p) {
    partPath[p] = new Path(root, "p=" + p).toString();
    if (p != 0) {
      buffer.append(',');
    }
    buffer.append(partPath[p]);
  }
  conf.set("mapred.input.dir", buffer.toString());
  StringBuilder columnIds = new StringBuilder();
  StringBuilder columnNames = new StringBuilder();
  StringBuilder columnTypes = new StringBuilder();
  StructObjectInspector structOI = (StructObjectInspector) objectInspector;
  List<? extends StructField> fields = structOI.getAllStructFieldRefs();
  int numCols = fields.size();
  for (int i = 0; i < numCols; ++i) {
    if (i != 0) {
      columnIds.append(',');
      columnNames.append(',');
      columnTypes.append(',');
    }
    columnIds.append(i);
    columnNames.append(fields.get(i).getFieldName());
    columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName());
  }
  conf.set("hive.io.file.readcolumn.ids", columnIds.toString());
  conf.set("partition_columns", "p");
  conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
  conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
  MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);
  fs.clear();
  Properties tblProps = new Properties();
  tblProps.put("name", tableName);
  tblProps.put("serialization.lib", OrcSerde.class.getName());
  tblProps.put("columns", columnNames.toString());
  tblProps.put("columns.types", columnTypes.toString());
  TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(isVectorized);
  if (isVectorized) {
    VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
    vectorizedRowBatchCtx.init(structOI, new String[0]);
    mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
  }
  mapWork.setUseBucketizedHiveInputFormat(false);
  LinkedHashMap<Path, ArrayList<String>> aliasMap = new LinkedHashMap<>();
  ArrayList<String> aliases = new ArrayList<String>();
  aliases.add(tableName);
  LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
  for (int p = 0; p < partitions; ++p) {
    Path path = new Path(partPath[p]);
    aliasMap.put(path, aliases);
    LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
    PartitionDesc part = new PartitionDesc(tbl, partSpec);
    if (isVectorized) {
      part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
    }
    partMap.put(path, part);
  }
  mapWork.setPathToAliases(aliasMap);
  mapWork.setPathToPartitionInfo(partMap);
  // write the plan out
  FileSystem localFs = FileSystem.getLocal(conf).getRaw();
  Path mapXml = new Path(workDir, "map.xml");
  localFs.delete(mapXml, true);
  FSDataOutputStream planStream = localFs.create(mapXml);
  SerializationUtilities.serializePlan(mapWork, planStream);
  conf.setBoolean(Utilities.HAS_MAP_WORK, true);
  planStream.close();
  return conf;
}
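A hedged sketch of how a test might consume the returned JobConf (the call below is illustrative, not part of the original test class; `workDir` and an ObjectInspector `inspector` are assumed to be prepared elsewhere):

JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///warehouse"),
    "mocktable", inspector, false, 2);
// a real test would first write ORC data under the two mock partition directories,
// e.g. via OrcFile.createWriter, before asking the input format for splits
OrcInputFormat orcInputFormat = new OrcInputFormat();
InputSplit[] splits = orcInputFormat.getSplits(conf, 10);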