use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method createMRWorkForMergingFiles.
/**
* @param fsInput The FileSink operator.
* @param ctx The MR processing context.
* @param finalName the final destination path the merge job should output.
* @param dependencyTask
* @param mvTasks
* @param conf
* @param currTask
* @throws SemanticException
* create a Map-only merge job using CombineHiveInputFormat for all partitions with
* following operators:
* MR job J0:
* ...
* |
* v
* FileSinkOperator_1 (fsInput)
* |
* v
* Merge job J1:
* |
* v
* TableScan (using CombineHiveInputFormat) (tsMerge)
* |
* v
* FileSinkOperator (fsMerge)
*
* Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
* do
* not contain the dynamic partitions (their parent). So after the dynamic partitions are
* created (after the first job finished before the moveTask or ConditionalTask start),
* we need to change the pathToPartitionInfo & pathToAlias to include the dynamic
* partition
* directories.
*
*/
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<? extends Serializable> currTask) throws SemanticException {
//
// 1. create the operator tree
//
FileSinkDesc fsInputDesc = fsInput.getConf();
// Create a TableScan operator
RowSchema inputRS = fsInput.getSchema();
TableScanOperator tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
// Create a FileSink operator
TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
// If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
// needs to include the partition column, and the fsOutput should have
// a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// adding DP ColumnInfo to the RowSchema signature
ArrayList<ColumnInfo> signature = inputRS.getSignature();
String tblAlias = fsInputDesc.getTableInfo().getTableName();
for (String dpCol : dpCtx.getDPColNames()) {
ColumnInfo colInfo = new ColumnInfo(dpCol, // all partition column type should be string
TypeInfoFactory.stringTypeInfo, tblAlias, // partition column is virtual column
true);
signature.add(colInfo);
}
inputRS.setSignature(signature);
// create another DynamicPartitionCtx, which has a different input-to-DP column mapping
DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
fsOutputDesc.setDynPartCtx(dpCtx2);
// update the FileSinkOperator to include partition columns
usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), dpCtx.getDPColNames());
} else {
// non-partitioned table
fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
}
//
// 2. Constructing a conditional task consisting of a move task and a map reduce task
//
MoveWork dummyMv = new MoveWork(null, null, null, new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null), false);
MapWork cplan;
Serializable work;
if ((conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class))) {
cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = cplan;
}
} else {
cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = new MapredWork();
((MapredWork) work).setMapWork(cplan);
}
}
// use CombineHiveInputFormat for map-only merging
cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
// NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
// know if merge MR2 will be triggered at execution time
Task<MoveWork> mvTask = GenMapRedUtils.findMoveTask(mvTasks, fsOutput);
ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getFinalDirName(), finalName, mvTask, dependencyTask);
// keep the dynamic partition context in conditional task resolver context
ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class ImportSemanticAnalyzer method addSinglePartition.
private static Task<?> addSinglePartition(URI fromURI, FileSystem fs, ImportTableDesc tblDesc, Table table, Warehouse wh, AddPartitionDesc addPartitionDesc, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x) throws MetaException, IOException, HiveException {
AddPartitionDesc.OnePartitionDesc partSpec = addPartitionDesc.getPartition(0);
if (tblDesc.isExternal() && tblDesc.getLocation() == null) {
x.getLOG().debug("Importing in-place: adding AddPart for partition " + partSpecToString(partSpec.getPartSpec()));
// addPartitionDesc already has the right partition location
Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc), x.getConf());
return addPartTask;
} else {
String srcLocation = partSpec.getLocation();
fixLocationInPartSpec(fs, tblDesc, table, wh, replicationSpec, partSpec, x);
x.getLOG().debug("adding dependent CopyWork/AddPart/MoveWork for partition " + partSpecToString(partSpec.getPartSpec()) + " with source location: " + srcLocation);
Path tgtLocation = new Path(partSpec.getLocation());
Path tmpPath = x.getCtx().getExternalTmpPath(tgtLocation);
Task<?> copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, new Path(srcLocation), tmpPath, x.getConf());
Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc), x.getConf());
LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), partSpec.getPartSpec(), true);
loadTableWork.setInheritTableSpecs(false);
Task<?> loadPartTask = TaskFactory.get(new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false), x.getConf());
copyTask.addDependentTask(loadPartTask);
addPartTask.addDependentTask(loadPartTask);
x.getTasks().add(copyTask);
return addPartTask;
}
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class ImportSemanticAnalyzer method loadTable.
private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x) {
Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
Path tmpPath = x.getCtx().getExternalTmpPath(tgtPath);
Task<?> copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, tmpPath, x.getConf());
LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), new TreeMap<String, String>(), replace);
Task<?> loadTableTask = TaskFactory.get(new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false), x.getConf());
copyTask.addDependentTask(loadTableTask);
x.getTasks().add(copyTask);
return loadTableTask;
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class DDLTask method generateAddMmTasks.
private List<Task<?>> generateAddMmTasks(Table tbl) throws HiveException {
// We will move all the files in the table/partition directories into the first MM
// directory, then commit the first write ID.
List<Path> srcs = new ArrayList<>(), tgts = new ArrayList<>();
long mmWriteId = 0;
try {
HiveTxnManager txnManager = SessionState.get().getTxnMgr();
if (txnManager.isTxnOpen()) {
mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
} else {
txnManager.openTxn(new Context(conf), conf.getUser());
mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
txnManager.commitTxn();
}
} catch (Exception e) {
String errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
}
int stmtId = 0;
String mmDir = AcidUtils.deltaSubdir(mmWriteId, mmWriteId, stmtId);
Hive db = getHive();
if (tbl.getPartitionKeys().size() > 0) {
PartitionIterable parts = new PartitionIterable(db, tbl, null, HiveConf.getIntVar(conf, ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
Iterator<Partition> partIter = parts.iterator();
while (partIter.hasNext()) {
Partition part = partIter.next();
checkMmLb(part);
Path src = part.getDataLocation(), tgt = new Path(src, mmDir);
srcs.add(src);
tgts.add(tgt);
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
}
}
} else {
checkMmLb(tbl);
Path src = tbl.getDataLocation(), tgt = new Path(src, mmDir);
srcs.add(src);
tgts.add(tgt);
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
}
}
// Don't set inputs and outputs - the locks have already been taken so it's pointless.
MoveWork mw = new MoveWork(null, null, null, null, false);
mw.setMultiFilesDesc(new LoadMultiFilesDesc(srcs, tgts, true, null, null));
ImportCommitWork icw = new ImportCommitWork(tbl.getDbName(), tbl.getTableName(), mmWriteId, stmtId);
Task<?> mv = TaskFactory.get(mw), ic = TaskFactory.get(icw);
mv.addDependentTask(ic);
return Lists.<Task<?>>newArrayList(mv);
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class DDLSemanticAnalyzer method analyzeAlterTablePartMergeFiles.
private void analyzeAlterTablePartMergeFiles(ASTNode ast, String tableName, HashMap<String, String> partSpec) throws SemanticException {
AlterTablePartMergeFilesDesc mergeDesc = new AlterTablePartMergeFilesDesc(tableName, partSpec);
List<Path> inputDir = new ArrayList<Path>();
Path oldTblPartLoc = null;
Path newTblPartLoc = null;
Table tblObj = null;
ListBucketingCtx lbCtx = null;
try {
tblObj = getTable(tableName);
// TODO: we should probably block all ACID tables here.
if (AcidUtils.isInsertOnlyTable(tblObj.getParameters())) {
throw new SemanticException("Merge is not supported for MM tables");
}
mergeDesc.setTableDesc(Utilities.getTableDesc(tblObj));
List<String> bucketCols = null;
Class<? extends InputFormat> inputFormatClass = null;
boolean isArchived = false;
if (tblObj.isPartitioned()) {
if (partSpec == null) {
throw new SemanticException("source table " + tableName + " is partitioned but no partition desc found.");
} else {
Partition part = getPartition(tblObj, partSpec, false);
if (part == null) {
throw new SemanticException("source table " + tableName + " is partitioned but partition not found.");
}
bucketCols = part.getBucketCols();
inputFormatClass = part.getInputFormatClass();
isArchived = ArchiveUtils.isArchived(part);
Path tabPath = tblObj.getPath();
Path partPath = part.getDataLocation();
// if the table is in a different dfs than the partition,
// replace the partition's dfs with the table's dfs.
newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
oldTblPartLoc = partPath;
lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(), part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
}
} else {
inputFormatClass = tblObj.getInputFormatClass();
bucketCols = tblObj.getBucketCols();
// input and output are the same
oldTblPartLoc = tblObj.getPath();
newTblPartLoc = tblObj.getPath();
lbCtx = constructListBucketingCtx(tblObj.getSkewedColNames(), tblObj.getSkewedColValues(), tblObj.getSkewedColValueLocationMaps(), tblObj.isStoredAsSubDirectories(), conf);
}
// throw a HiveException for other than rcfile and orcfile.
if (!((inputFormatClass.equals(RCFileInputFormat.class) || (inputFormatClass.equals(OrcInputFormat.class))))) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_FILE_FORMAT.getMsg());
}
mergeDesc.setInputFormatClass(inputFormatClass);
// throw a HiveException if the table/partition is bucketized
if (bucketCols != null && bucketCols.size() > 0) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_BUCKETED.getMsg());
}
// throw a HiveException if the table/partition is archived
if (isArchived) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_PARTITION_ARCHIVED.getMsg());
}
// violating which can cause data loss
if (tblObj.isNonNative()) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_NON_NATIVE.getMsg());
}
if (tblObj.getTableType() != TableType.MANAGED_TABLE) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_NOT_MANAGED.getMsg());
}
// transactional tables are compacted and no longer needs to be bucketed, so not safe for merge/concatenation
boolean isAcid = AcidUtils.isTransactionalTable(tblObj);
if (isAcid) {
throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_TRANSACTIONAL.getMsg());
}
inputDir.add(oldTblPartLoc);
mergeDesc.setInputDir(inputDir);
mergeDesc.setLbCtx(lbCtx);
addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.MERGEFILES);
DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), mergeDesc);
ddlWork.setNeedLock(true);
Task<? extends Serializable> mergeTask = TaskFactory.get(ddlWork);
TableDesc tblDesc = Utilities.getTableDesc(tblObj);
Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
mergeDesc.setOutputDir(queryTmpdir);
// No need to handle MM tables - unsupported path.
LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc, partSpec == null ? new HashMap<>() : partSpec);
ltd.setLbCtx(lbCtx);
Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
mergeTask.addDependentTask(moveTsk);
if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
BasicStatsWork basicStatsWork;
if (oldTblPartLoc.equals(newTblPartLoc)) {
// If we're merging to the same location, we can avoid some metastore calls
TableSpec tableSpec = new TableSpec(db, tableName, partSpec);
basicStatsWork = new BasicStatsWork(tableSpec);
} else {
basicStatsWork = new BasicStatsWork(ltd);
}
basicStatsWork.setNoStatsAggregator(true);
basicStatsWork.setClearAggregatorStats(true);
StatsWork columnStatsWork = new StatsWork(tblObj, basicStatsWork, conf);
Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork);
moveTsk.addDependentTask(statTask);
}
rootTasks.add(mergeTask);
} catch (Exception e) {
throw new SemanticException(e);
}
}
Aggregations