Use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
The class AbstractSMBJoinProc, method isEligibleForBucketSortMergeJoin.
/**
 * Whether this table is eligible for a sort-merge join.
 *
 * @param smbJoinContext the sort-merge join context being built up
 * @param keys join key expressions for this table
 * @param aliasToOpInfo map from table alias to the top operator for that alias
 * @param aliases table aliases in the join tree
 * @param pos position of the table in the join
 * @param sortColumnsFirstTable the names and order of the sorted columns of the
 *        first table; not yet populated when pos = 0.
 * @return true if the table is eligible for a sort-merge join, false otherwise
 * @throws SemanticException
 */
private boolean isEligibleForBucketSortMergeJoin(SortBucketJoinProcCtx smbJoinContext,
    List<ExprNodeDesc> keys,
    Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
    String[] aliases,
    int pos,
    List<Order> sortColumnsFirstTable) throws SemanticException {
  String alias = aliases[pos];
  /*
   * Consider a query like:
   *
   * select -- mapjoin(subq1) -- * from
   * (select a.key, a.value from tbl1 a) subq1
   * join
   * (select a.key, a.value from tbl2 a) subq2
   * on subq1.key = subq2.key;
   *
   * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
   * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
   * table. If the object being map-joined is a base table, then aliasToOpInfo
   * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
   */
  Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
  if (topOp == null) {
    return false;
  }
  // get all join columns from join keys
  List<String> joinCols = toColumns(keys);
  if (joinCols == null || joinCols.isEmpty()) {
    return false;
  }
  TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
  if (tso == null) {
    return false;
  }
  /*
   * Consider a query like:
   *
   * select count(*) from
   *   (
   *     select key, count(*) from
   *       (
   *         select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
   *         from tbl1 a join tbl2 b on a.key = b.key
   *       ) subq1
   *     group by key
   *   ) subq2;
   *
   * The table alias should be subq2:subq1:a, which needs to be fetched from topOps.
   */
  if (pGraphContext.getTopOps().containsValue(tso)) {
    for (Map.Entry<String, TableScanOperator> topOpEntry : this.pGraphContext.getTopOps().entrySet()) {
      if (topOpEntry.getValue() == tso) {
        alias = topOpEntry.getKey();
        aliases[pos] = alias;
        break;
      }
    }
  } else {
    // Ideally, this should never happen, and this should be an assert.
    return false;
  }
  Table tbl = tso.getConf().getTableMetadata();
  if (tbl.isPartitioned()) {
    PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
    List<Partition> partitions = prunedParts.getNotDeniedPartns();
    // first table
    if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
      Partition firstPartition = partitions.get(0);
      sortColumnsFirstTable.addAll(firstPartition.getSortCols());
    }
    for (Partition partition : prunedParts.getNotDeniedPartns()) {
      if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols, sortColumnsFirstTable)) {
        return false;
      }
    }
    return true;
  }
  // Populate the names and order of columns for the first table
  if (pos == 0) {
    sortColumnsFirstTable.addAll(tbl.getSortCols());
  }
  return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols, sortColumnsFirstTable);
}
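checkSortColsAndJoinCols is not shown above; the contract implied by its use is that the join columns must line up with a leading prefix of the table's (or partition's) SORTED BY columns, with each column's sort direction agreeing with the first table's. A minimal sketch of such a check under those assumptions (illustrative only, not the Hive implementation; the helper name is invented):

private static boolean joinColsArePrefixOfSortCols(List<Order> sortCols,
    List<String> joinCols, List<Order> sortColumnsFirstTable) {
  if (sortCols == null || sortCols.size() < joinCols.size()) {
    return false;
  }
  for (int i = 0; i < joinCols.size(); i++) {
    Order sortCol = sortCols.get(i);
    // the i-th sort column must be the i-th join column...
    if (!sortCol.getCol().equals(joinCols.get(i))) {
      return false;
    }
    // ...and its ASC/DESC direction must agree with the first table's,
    // so both sides of the join stream their rows in the same order
    if (i < sortColumnsFirstTable.size()
        && sortCol.getOrder() != sortColumnsFirstTable.get(i).getOrder()) {
      return false;
    }
  }
  return true;
}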
Use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
The class ExportSemanticAnalyzer, method prepareExport.
// FIXME : Move to EximUtil - it's okay for this to stay here for a little while more till we finalize the statics
public static void prepareExport(ASTNode ast, URI toURI, TableSpec ts,
    ReplicationSpec replicationSpec, Hive db, HiveConf conf, Context ctx,
    List<Task<? extends Serializable>> rootTasks, HashSet<ReadEntity> inputs,
    HashSet<WriteEntity> outputs, Logger LOG) throws SemanticException {
  if (ts != null) {
    try {
      EximUtil.validateTable(ts.tableHandle);
      if (replicationSpec.isInReplicationScope() && ts.tableHandle.isTemporary()) {
        // No replication for temporary tables either
        ts = null;
      } else if (ts.tableHandle.isView()) {
        replicationSpec.setIsMetadataOnly(true);
      }
    } catch (SemanticException e) {
      // Ignore the error if this is a replication export; otherwise rethrow it.
      if (replicationSpec.isInReplicationScope()) {
        // null out ts so we can't use it.
        ts = null;
      } else {
        throw e;
      }
    }
  }
  try {
    FileSystem fs = FileSystem.get(toURI, conf);
    Path toPath = new Path(toURI.getScheme(), toURI.getAuthority(), toURI.getPath());
    try {
      FileStatus tgt = fs.getFileStatus(toPath);
      // target exists
      if (!tgt.isDir()) {
        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast, "Target is not a directory : " + toURI));
      } else {
        FileStatus[] files = fs.listStatus(toPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
        if (files != null && files.length != 0) {
          throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast, "Target is not an empty directory : " + toURI));
        }
      }
    } catch (FileNotFoundException e) {
      // The target does not exist yet; that is fine - it will be created.
    }
  } catch (IOException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast), e);
  }
  PartitionIterable partitions = null;
  try {
    replicationSpec.setCurrentReplicationState(
        String.valueOf(db.getMSC().getCurrentNotificationEventId().getEventId()));
    if ((ts != null) && (ts.tableHandle.isPartitioned())) {
      if (ts.specType == TableSpec.SpecType.TABLE_ONLY) {
        // TABLE-ONLY: fetch partitions for a regular export, but not for metadata-only
        if (replicationSpec.isMetadataOnly()) {
          partitions = null;
        } else {
          partitions = new PartitionIterable(db, ts.tableHandle, null,
              conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
        }
      } else {
        // PARTITIONS specified - partitions inside tableSpec
        partitions = new PartitionIterable(ts.partitions);
      }
    } else {
      // Either the table is not partitioned, or ts was nulled out above for a
      // replication export, so there are no partitions to look at.
      partitions = null;
    }
    Path path = new Path(ctx.getLocalTmpPath(), EximUtil.METADATA_NAME);
    EximUtil.createExportDump(FileSystem.getLocal(conf), path,
        (ts != null ? ts.tableHandle : null), partitions, replicationSpec);
    Task<? extends Serializable> rTask =
        ReplCopyTask.getDumpCopyTask(replicationSpec, path, new Path(toURI), conf);
    rootTasks.add(rTask);
    LOG.debug("_metadata file written into " + path.toString() + " and then copied to " + toURI.toString());
  } catch (Exception e) {
    throw new SemanticException(ErrorMsg.IO_ERROR.getMsg("Exception while writing out the local file"), e);
  }
  if (!(replicationSpec.isMetadataOnly() || (ts == null))) {
    Path parentPath = new Path(toURI);
    if (ts.tableHandle.isPartitioned()) {
      for (Partition partition : partitions) {
        Path fromPath = partition.getDataLocation();
        Path toPartPath = new Path(parentPath, partition.getName());
        Task<? extends Serializable> rTask =
            ReplCopyTask.getDumpCopyTask(replicationSpec, fromPath, toPartPath, conf);
        rootTasks.add(rTask);
        inputs.add(new ReadEntity(partition));
      }
    } else {
      Path fromPath = ts.tableHandle.getDataLocation();
      Path toDataPath = new Path(parentPath, EximUtil.DATA_PATH_NAME);
      Task<? extends Serializable> rTask =
          ReplCopyTask.getDumpCopyTask(replicationSpec, fromPath, toDataPath, conf);
      rootTasks.add(rTask);
      inputs.add(new ReadEntity(ts.tableHandle));
    }
    outputs.add(toWriteEntity(parentPath, conf));
  }
}
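PartitionIterable, constructed above with METASTORE_BATCH_RETRIEVE_MAX, keeps the per-partition copy loop from materializing every Partition object at once. A simplified sketch of the batching pattern it stands in for, assuming the usual Hive metastore accessors (process is a hypothetical placeholder for the per-partition work):

// Fetch partition names eagerly (cheap), then hydrate full Partition
// objects in fixed-size batches so memory stays bounded for wide tables.
List<String> names = db.getPartitionNames(table.getDbName(), table.getTableName(), (short) -1);
for (int i = 0; i < names.size(); i += batchSize) {
  List<String> slice = names.subList(i, Math.min(i + batchSize, names.size()));
  for (Partition p : db.getPartitionsByNames(table, slice)) {
    process(p); // e.g. schedule one ReplCopyTask, as the export loop above does
  }
}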
Use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
The class LoadSemanticAnalyzer, method analyzeInternal.
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
  boolean isLocal = false;
  boolean isOverWrite = false;
  Tree fromTree = ast.getChild(0);
  Tree tableTree = ast.getChild(1);
  // 4 children: both LOCAL and OVERWRITE present; 3 children: exactly one of them
  if (ast.getChildCount() == 4) {
    isLocal = true;
    isOverWrite = true;
  }
  if (ast.getChildCount() == 3) {
    if (ast.getChild(2).getText().toLowerCase().equals("local")) {
      isLocal = true;
    } else {
      isOverWrite = true;
    }
  }
  // initialize load path
  URI fromURI;
  try {
    String fromPath = stripQuotes(fromTree.getText());
    fromURI = initializeFromURI(fromPath, isLocal);
  } catch (IOException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  } catch (URISyntaxException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  }
  // initialize destination table/partition
  TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
  if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
    throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
  }
  if (ts.tableHandle.isNonNative()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
  }
  if (ts.tableHandle.isStoredAsSubDirectories()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
  }
  List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
  if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
    throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
  }
  List<String> bucketCols = ts.tableHandle.getBucketCols();
  if (bucketCols != null && !bucketCols.isEmpty()) {
    String error = StrictChecks.checkBucketing(conf);
    if (error != null) {
      throw new SemanticException("Please load into an intermediate table"
          + " and use 'insert... select' to allow Hive to enforce bucketing. " + error);
    }
  }
  // make sure the arguments make sense
  List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, fromTree, isLocal);
  // for managed tables, make sure the file formats match
  if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
      && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
    ensureFileFormatsMatch(ts, files, fromURI);
  }
  inputs.add(toReadEntity(new Path(fromURI)));
  Task<? extends Serializable> rTask = null;
  // create final load/move work
  boolean preservePartitionSpecs = false;
  Map<String, String> partSpec = ts.getPartSpec();
  if (partSpec == null) {
    partSpec = new LinkedHashMap<String, String>();
    outputs.add(new WriteEntity(ts.tableHandle,
        (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
  } else {
    try {
      Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
      if (part != null) {
        if (isOverWrite) {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
        } else {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
          // If the partition already exists and we aren't overwriting it, respect
          // its current location info rather than picking it up from the parent TableDesc
          preservePartitionSpecs = true;
        }
      } else {
        outputs.add(new WriteEntity(ts.tableHandle,
            (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI),
      Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite);
  if (preservePartitionSpecs) {
    // Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
    // preservePartitionSpecs=false (the default) is not enough information
    // to conclude inheritTableSpecs=true.
    loadTableWork.setInheritTableSpecs(false);
  }
  Task<? extends Serializable> childTask = TaskFactory.get(
      new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal), conf);
  if (rTask != null) {
    rTask.addDependentTask(childTask);
  } else {
    rTask = childTask;
  }
  rootTasks.add(rTask);
  // The user asked for stats to be collected.
  // Some stats, like the number of rows, require a scan of the data;
  // others, like the number of files, do not require a complete scan.
  // Update the stats which do not require a complete scan.
  Task<? extends Serializable> statTask = null;
  if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
    StatsWork statDesc = new StatsWork(loadTableWork);
    statDesc.setNoStatsAggregator(true);
    statDesc.setClearAggregatorStats(true);
    statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    statTask = TaskFactory.get(statDesc, conf);
  }
  // HIVE-3334 has been filed for load file with index auto update
  if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
    IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
    try {
      List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
      for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
        // LOAD DATA will either have a copy & move or just a move;
        // we always want the update to be dependent on the move
        childTask.addDependentTask(updateTask);
        if (statTask != null) {
          updateTask.addDependentTask(statTask);
        }
      }
    } catch (HiveException e) {
      console.printInfo("WARNING: could not auto-update stale indexes, which are now out of sync with the base table");
    }
  } else if (statTask != null) {
    childTask.addDependentTask(statTask);
  }
}
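The tail of analyzeInternal wires a small task DAG around the move. A condensed restatement, assuming the usual addDependentTask semantics (the argument runs after the receiver):

// HIVESTATSAUTOGATHER only:
//   moveTask -> statTask
// HIVEINDEXAUTOUPDATE as well (one update task per generated index update):
//   moveTask -> updateTask_i -> statTask
// i.e. stats are refreshed only after every index has been brought up to
// date, and the index updates themselves wait for the loaded data to land.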
Use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
The class ProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext,
    Object... nodeOutputs) throws SemanticException {
  GenTezProcContext context = (GenTezProcContext) procContext;
  TableScanOperator tableScan = (TableScanOperator) nd;
  ParseContext parseContext = context.parseContext;
  Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    assert alias != null;
    TezWork tezWork = context.currentTask.getWork();
    if (inputFormat.equals(OrcInputFormat.class)) {
      // For ORC, all the following statements are equivalent:
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Tez job above this task.
      StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      // If a partition is specified, get the pruned partition list
      Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (confirmedParts.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
        snjWork.setPrunedPartitionList(partList);
      }
      Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple TezTask followed by a StatsTask.
      // The Tez task is just a simple TableScanOperator.
      StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
      statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
      statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
      statsWork.setSourceTask(context.currentTask);
      statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
      context.currentTask.addDependentTask(statsTask);
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // The plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        statsWork.setNoScanAnalyzeCommand(true);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
        handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
      }
      // NOTE: here we should use the new partition predicate pushdown API to
      // get the list of pruned partitions, and pass it to setTaskPlan as the
      // last parameter.
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
      }
      MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
      w.setGatheringStats(true);
      return true;
    }
  } else if (parseContext.getAnalyzeRewrite() != null) {
    // we need to collect table stats while collecting column stats
    try {
      context.currentTask.addDependentTask(genTableStats(context, tableScan));
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  return null;
}
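The ORC branch above can skip the Tez job because basic statistics are already present in ORC file footers. A hedged sketch of that idea (older-style OrcFile reader API assumed; this is not the StatsNoJobTask code):

// org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} assumed. Summing
// footer-level counts across a table's or partition's files yields row
// counts and raw data size with file-level reads only - no cluster job.
Reader reader = OrcFile.createReader(fs, orcFilePath); // assumed API shape
long numRows = reader.getNumberOfRows();
long rawDataSize = reader.getRawDataSize();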
Use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.
The class ProcessAnalyzeTable, method genTableStats.
private Task<?> genTableStats(GenTezProcContext context, TableScanOperator tableScan) throws HiveException {
  Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
  ParseContext parseContext = context.parseContext;
  Table table = tableScan.getConf().getTableMetadata();
  List<Partition> partitions = new ArrayList<>();
  if (table.isPartitioned()) {
    partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions());
    for (Partition partn : partitions) {
      LOG.debug("XXX: adding part: " + partn);
      context.outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
    }
  }
  TableSpec tableSpec = new TableSpec(table, partitions);
  tableScan.getConf().getTableMetadata().setTableSpec(tableSpec);
  if (inputFormat.equals(OrcInputFormat.class)) {
    // For ORC, there is no Tez job for table stats.
    StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
    snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    // If partition is specified, get pruned partition list
    if (partitions.size() > 0) {
      snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan));
    }
    return TaskFactory.get(snjWork, parseContext.getConf());
  } else {
    StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
    statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
    statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
    statsWork.setSourceTask(context.currentTask);
    statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    return TaskFactory.get(statsWork, parseContext.getConf());
  }
}
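A summary of the two stats styles genTableStats chooses between, with responsibilities inferred from the class names and the setters used here rather than from the task implementations:

// StatsNoJobWork -> stats computed client-side from file metadata (possible
//   for ORC, whose footers carry row counts and sizes); no Tez job is
//   launched, so the resulting task needs no parent.
// StatsWork -> a preceding scan task publishes partial stats under aggKey
//   into statsTmpDir; the StatsTask then aggregates and persists them,
//   which is why setSourceTask(context.currentTask) is required.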