Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in project hive by apache.
The class ImportSemanticAnalyzer, method loadTable.
private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, String dumpRoot, ReplicationMetricCollector metricCollector) throws HiveException {
assert table != null;
assert table.getParameters() != null;
Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
Path destPath = null, loadPath = null;
LoadFileType lft;
boolean isSkipTrash = false;
boolean needRecycle = false;
if (replicationSpec.isInReplicationScope()) {
isSkipTrash = MetaStoreUtils.isSkipTrash(table.getParameters());
if (table.isTemporary()) {
needRecycle = false;
} else {
org.apache.hadoop.hive.metastore.api.Database db = x.getHive().getDatabase(table.getDbName());
needRecycle = db != null && ReplChangeManager.shouldEnableCm(db, table.getTTable());
}
}
if (AcidUtils.isTransactionalTable(table)) {
String mmSubdir = replace ? AcidUtils.baseDir(writeId) : AcidUtils.deltaSubdir(writeId, writeId, stmtId);
destPath = new Path(tgtPath, mmSubdir);
/**
* CopyTask below will copy files from the 'archive' to a delta_x_x in the table/partition
* directory, i.e. the final destination for these files. This has to be a copy to preserve
* the archive. MoveTask is optimized to do a 'rename' if files are on the same FileSystem.
* So setting 'loadPath' this way will make
* {@link Hive#loadTable(Path, String, LoadFileType, boolean, boolean, boolean,
* boolean, Long, int)}
* skip the unnecessary file (rename) operation but it will perform other things.
*/
loadPath = tgtPath;
lft = LoadFileType.KEEP_EXISTING;
} else {
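// Non-transactional tables: stage the data in a temporary external path; the MoveTask created below moves/renames it into the final location.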
destPath = loadPath = x.getCtx().getExternalTmpPath(tgtPath);
lft = replace ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
}
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("adding import work for table with source location: " + dataPath + "; table: " + tgtPath + "; copy destination " + destPath + "; mm " + writeId + " for " + table.getTableName() + ": " + (AcidUtils.isFullAcidTable(table) ? "acid" : (AcidUtils.isInsertOnlyTable(table) ? "mm" : "flat")));
}
Task<?> copyTask = null;
if (replicationSpec.isInReplicationScope()) {
boolean copyAtLoad = x.getConf().getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, destPath, x.getConf(), isSkipTrash, needRecycle, copyAtLoad, dumpRoot, metricCollector);
} else {
copyTask = TaskFactory.get(new CopyWork(dataPath, destPath, false, dumpRoot, metricCollector, true));
}
MoveWork moveWork = new MoveWork(x.getInputs(), x.getOutputs(), null, null, false, dumpRoot, metricCollector, true);
if (replicationSpec.isInReplicationScope() && AcidUtils.isTransactionalTable(table)) {
LoadMultiFilesDesc loadFilesWork = new LoadMultiFilesDesc(Collections.singletonList(destPath), Collections.singletonList(tgtPath), true, null, null);
moveWork.setMultiFilesDesc(loadFilesWork);
moveWork.setNeedCleanTarget(replace);
} else {
LoadTableDesc loadTableWork = new LoadTableDesc(loadPath, Utilities.getTableDesc(table), new TreeMap<>(), lft, writeId);
loadTableWork.setStmtId(stmtId);
moveWork.setLoadTableWork(loadTableWork);
}
// If importing into an existing table, the file format is checked by
// ImportSemanticAnalyzer.checkTable().
Task<?> loadTableTask = TaskFactory.get(moveWork, x.getConf());
copyTask.addDependentTask(loadTableTask);
x.getTasks().add(copyTask);
return loadTableTask;
}
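Stripped of the replication and ACID branches, the pattern above boils down to: copy the exported data somewhere the table can load from, describe the load with a LoadTableDesc, wrap it in a MoveWork, and chain the resulting move task after the copy task. A minimal sketch of that wiring for a non-partitioned, non-transactional table, assuming conf, table, stagingPath, writeId, and an already-created copyTask are in scope (illustrative names only, not from the method above):
// Minimal sketch; variable names are illustrative.
LoadTableDesc loadTableWork = new LoadTableDesc(stagingPath, Utilities.getTableDesc(table),
    new TreeMap<>(), LoadFileType.REPLACE_ALL, writeId);
// MoveWork carries the LoadTableDesc; the resulting MoveTask performs the actual load.
MoveWork moveWork = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
Task<?> loadTableTask = TaskFactory.get(moveWork, conf);
copyTask.addDependentTask(loadTableTask);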
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in project hive by apache.
The class GenMapRedUtils, method mergeMovePaths.
/**
* Merges the given Conditional input path and the linked MoveWork into a single MoveWork.
* This is an optimization for BlobStore systems to avoid doing two renames or copies that are not necessary.
*
* @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
* @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
* @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
*/
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork) {
MoveWork newWork = new MoveWork(linkedMoveWork);
LoadFileDesc fileDesc = null;
LoadTableDesc tableDesc = null;
if (linkedMoveWork.getLoadFileWork() != null) {
fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
fileDesc.setSourcePath(condInputPath);
} else if (linkedMoveWork.getLoadTableWork() != null) {
tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
tableDesc.setSourcePath(condInputPath);
} else {
throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
}
newWork.setLoadFileWork(fileDesc);
newWork.setLoadTableWork(tableDesc);
return newWork;
}
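Since mergeMovePaths is @VisibleForTesting, its natural caller outside the planner is a unit test. A rough sketch of exercising the LoadTableDesc branch, assuming a linkedMoveWork whose load-table work already exists (and noting that the method's signature may differ slightly across Hive versions):
// Hypothetical test-style usage; linkedMoveWork is assumed to carry a LoadTableDesc.
Path condInputPath = new Path("/tmp/hive-staging/cond-input");
MoveWork merged = GenMapRedUtils.mergeMovePaths(condInputPath, linkedMoveWork);
LoadTableDesc mergedTableWork = merged.getLoadTableWork();
// The merged work keeps the original target table but now reads from condInputPath.
assert mergedTableWork != null && condInputPath.equals(mergedTableWork.getSourcePath());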
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in project hive by apache.
The class LoadPartitions, method movePartitionTask.
/**
* Creates the task that moves partition data from the temp path to the actual path.
*/
private Task<?> movePartitionTask(Table table, AddPartitionDesc.OnePartitionDesc partSpec, Path tmpPath) {
// Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
// See setLoadFileType and setIsAcidIow calls elsewhere for an example.
LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), partSpec.getPartSpec(), event.replicationSpec().isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING, SessionState.get().getTxnMgr().getCurrentTxnId());
loadTableWork.setInheritTableSpecs(false);
MoveWork work = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
return TaskFactory.get(work, context.hiveConf);
}
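Compared with the table-level load in the first example, the partition variant passes the partition spec map to the LoadTableDesc constructor and calls setInheritTableSpecs(false), which (roughly) keeps the partition's own storage specs rather than inheriting the table's. A condensed sketch, assuming table, partSpecMap, tmpPath, txnId, and conf are already available:
// Condensed sketch; variable names are illustrative.
LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table),
    partSpecMap, LoadFileType.OVERWRITE_EXISTING, txnId);
loadTableWork.setInheritTableSpecs(false); // keep the partition's own specs, not the table's
MoveWork work = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
Task<?> moveTask = TaskFactory.get(work, conf);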
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in project hive by apache.
The class MoveTask, method execute.
@Override
public int execute(DriverContext driverContext) {
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Executing MoveWork " + System.identityHashCode(work) + " with " + work.getLoadFileWork() + "; " + work.getLoadTableWork() + "; " + work.getLoadMultiFilesWork());
}
try {
if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
return 0;
}
Hive db = getHive();
// Do any hive related operations like moving tables and files
// to appropriate locations
LoadFileDesc lfd = work.getLoadFileWork();
if (lfd != null) {
Path targetPath = lfd.getTargetDir();
Path sourcePath = lfd.getSourcePath();
if (targetPath.equals(sourcePath)) {
Utilities.FILE_OP_LOGGER.debug("MoveTask not moving " + sourcePath);
} else {
Utilities.FILE_OP_LOGGER.debug("MoveTask moving " + sourcePath + " to " + targetPath);
if (lfd.getWriteType() == AcidUtils.Operation.INSERT) {
// 'sourcePath' is the result of the 'select ...' part of the CTAS statement
assert lfd.getIsDfsDir();
FileSystem srcFs = sourcePath.getFileSystem(conf);
FileStatus[] srcs = srcFs.globStatus(sourcePath);
if (srcs != null) {
List<Path> newFiles = new ArrayList<>();
Hive.moveAcidFiles(srcFs, srcs, targetPath, newFiles);
} else {
LOG.debug("No files found to move from " + sourcePath + " to " + targetPath);
}
} else {
moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
}
}
}
// Multi-file load is for dynamic partitions when some partitions do not
// need to merge and they can simply be moved to the target directory.
// This is also used for MM table conversion.
LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
if (lmfd != null) {
boolean isDfsDir = lmfd.getIsDfsDir();
List<String> targetPrefixes = lmfd.getTargetPrefixes();
for (int i = 0; i < lmfd.getSourceDirs().size(); ++i) {
Path srcPath = lmfd.getSourceDirs().get(i);
Path destPath = lmfd.getTargetDirs().get(i);
String filePrefix = targetPrefixes == null ? null : targetPrefixes.get(i);
FileSystem destFs = destPath.getFileSystem(conf);
if (filePrefix == null) {
if (!destFs.exists(destPath.getParent())) {
destFs.mkdirs(destPath.getParent());
}
Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + srcPath + " to " + destPath);
moveFile(srcPath, destPath, isDfsDir);
} else {
if (!destFs.exists(destPath)) {
destFs.mkdirs(destPath);
}
FileSystem srcFs = srcPath.getFileSystem(conf);
FileStatus[] children = srcFs.listStatus(srcPath);
if (children != null) {
for (FileStatus child : children) {
Path childSrc = child.getPath();
Path childDest = new Path(destPath, filePrefix + childSrc.getName());
Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + childSrc + " to " + childDest);
moveFile(childSrc, childDest, isDfsDir);
}
} else {
Utilities.FILE_OP_LOGGER.debug("MoveTask skipping empty directory (multi-file) " + srcPath);
}
if (!srcFs.delete(srcPath, false)) {
throw new IOException("Couldn't delete " + srcPath + " after moving all the files");
}
}
}
}
// Next we do this for tables and partitions
LoadTableDesc tbd = work.getLoadTableWork();
if (tbd != null) {
logMessage(tbd);
Table table = db.getTable(tbd.getTable().getTableName());
checkFileFormats(db, tbd, table);
// it seems that LoadTableDesc has Operation.INSERT only for CTAS...
boolean isFullAcidOp = work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID && !tbd.isMmTable();
// Create a data container
DataContainer dc = null;
if (tbd.getPartitionSpec().size() == 0) {
dc = new DataContainer(table.getTTable());
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("loadTable called from " + tbd.getSourcePath() + " into " + tbd.getTable().getTableName());
}
db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getLoadFileType(), work.isSrcLocal(), isSkewedStoredAsDirs(tbd), isFullAcidOp, hasFollowingStatsTask(), tbd.getWriteId(), tbd.getStmtId());
if (work.getOutputs() != null) {
DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
}
} else {
LOG.info("Partition is: {}", tbd.getPartitionSpec());
// Check if the bucketing and/or sorting columns were inferred
TaskInformation ti = new TaskInformation(this, tbd.getSourcePath().toUri().toString());
inferTaskInformation(ti);
// deal with dynamic partitions
DynamicPartitionCtx dpCtx = tbd.getDPCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// dynamic partitions
dc = handleDynParts(db, table, tbd, ti, dpCtx);
} else {
// static partitions
dc = handleStaticParts(db, table, tbd, ti);
}
}
if (dc != null) {
// If we are doing an update or a delete the number of columns in the table will not
// match the number of columns in the file sink. For update there will be one too many
// (because of the ROW__ID), and in the case of the delete there will be just the
// ROW__ID, which we don't need to worry about from a lineage perspective.
List<FieldSchema> tableCols = null;
switch(work.getLoadTableWork().getWriteType()) {
case DELETE:
case UPDATE:
// Pass an empty list as no columns will be written to the file.
// TODO I should be able to make this work for update
tableCols = new ArrayList<>();
break;
default:
tableCols = table.getCols();
break;
}
queryState.getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
}
releaseLocks(tbd);
}
return 0;
} catch (HiveException he) {
int errorCode = 1;
if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) {
errorCode = he.getCanonicalErrorMsg().getErrorCode();
if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) {
console.printError("Failed with exception " + he.getMessage(), "\n" + StringUtils.stringifyException(he));
} else {
console.printError("Failed with exception " + he.getMessage() + "\nRemote Exception: " + he.getRemoteErrorMsg());
console.printInfo("\n", StringUtils.stringifyException(he), false);
}
}
setException(he);
return errorCode;
} catch (Exception e) {
console.printError("Failed with exception " + e.getMessage(), "\n" + StringUtils.stringifyException(e));
setException(e);
return (1);
}
}
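MoveTask is never constructed directly; the analyzers above hand a MoveWork to TaskFactory, and the returned task runs this execute method. A minimal sketch of that hand-off, reusing a loadTableWork like the ones built in the earlier examples (rootTasks and conf assumed to be in scope):
// Sketch only: wiring a table load into the task graph.
MoveWork moveWork = new MoveWork(null, null, loadTableWork, null, false);
Task<?> moveTask = TaskFactory.get(moveWork, conf); // resolves to a MoveTask
rootTasks.add(moveTask);
// When the driver runs moveTask, execute() takes the getLoadTableWork() branch above:
// an empty partition spec leads to db.loadTable, otherwise the static/dynamic partition handlers run.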
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in project hive by apache.
The class DDLSemanticAnalyzer, method analyzeTruncateTable.
private void analyzeTruncateTable(ASTNode ast) throws SemanticException {
// TOK_TABLE_PARTITION
ASTNode root = (ASTNode) ast.getChild(0);
String tableName = getUnescapedName((ASTNode) root.getChild(0));
Table table = getTable(tableName, true);
if (table.getTableType() != TableType.MANAGED_TABLE) {
throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_MANAGED_TABLE.format(tableName));
}
if (table.isNonNative()) {
// TODO
throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_NATIVE_TABLE.format(tableName));
}
if (!table.isPartitioned() && root.getChildCount() > 1) {
throw new SemanticException(ErrorMsg.PARTSPEC_FOR_NON_PARTITIONED_TABLE.format(tableName));
}
Map<String, String> partSpec = getPartSpec((ASTNode) root.getChild(1));
if (partSpec == null) {
if (!table.isPartitioned()) {
outputs.add(new WriteEntity(table, WriteEntity.WriteType.DDL_EXCLUSIVE));
} else {
for (Partition partition : getPartitions(table, null, false)) {
outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
}
}
} else {
if (isFullSpec(table, partSpec)) {
validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, true);
Partition partition = getPartition(table, partSpec, true);
outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
} else {
validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, false);
for (Partition partition : getPartitions(table, partSpec, false)) {
outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
}
}
}
TruncateTableDesc truncateTblDesc = new TruncateTableDesc(tableName, partSpec, null);
DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), truncateTblDesc);
Task<? extends Serializable> truncateTask = TaskFactory.get(ddlWork);
// Is this a truncate column command
List<String> columnNames = null;
if (ast.getChildCount() == 2) {
try {
columnNames = getColumnNames((ASTNode) ast.getChild(1));
// It would be possible to support this, but this is such a pointless command.
if (AcidUtils.isInsertOnlyTable(table.getParameters())) {
throw new SemanticException("Truncating MM table columns not presently supported");
}
List<String> bucketCols = null;
Class<? extends InputFormat> inputFormatClass = null;
boolean isArchived = false;
Path newTblPartLoc = null;
Path oldTblPartLoc = null;
List<FieldSchema> cols = null;
ListBucketingCtx lbCtx = null;
boolean isListBucketed = false;
List<String> listBucketColNames = null;
if (table.isPartitioned()) {
Partition part = db.getPartition(table, partSpec, false);
Path tabPath = table.getPath();
Path partPath = part.getDataLocation();
// if the table is in a different dfs than the partition,
// replace the partition's dfs with the table's dfs.
newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
oldTblPartLoc = partPath;
cols = part.getCols();
bucketCols = part.getBucketCols();
inputFormatClass = part.getInputFormatClass();
isArchived = ArchiveUtils.isArchived(part);
lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(), part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
isListBucketed = part.isStoredAsSubDirectories();
listBucketColNames = part.getSkewedColNames();
} else {
// input and output are the same
oldTblPartLoc = table.getPath();
newTblPartLoc = table.getPath();
cols = table.getCols();
bucketCols = table.getBucketCols();
inputFormatClass = table.getInputFormatClass();
lbCtx = constructListBucketingCtx(table.getSkewedColNames(), table.getSkewedColValues(), table.getSkewedColValueLocationMaps(), table.isStoredAsSubDirectories(), conf);
isListBucketed = table.isStoredAsSubDirectories();
listBucketColNames = table.getSkewedColNames();
}
// throw a HiveException for non-rcfile.
if (!inputFormatClass.equals(RCFileInputFormat.class)) {
throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_NOT_RC.getMsg());
}
// throw a HiveException if the table/partition is archived
if (isArchived) {
throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_ARCHIVED.getMsg());
}
Set<Integer> columnIndexes = new HashSet<Integer>();
for (String columnName : columnNames) {
boolean found = false;
for (int columnIndex = 0; columnIndex < cols.size(); columnIndex++) {
if (columnName.equalsIgnoreCase(cols.get(columnIndex).getName())) {
columnIndexes.add(columnIndex);
found = true;
break;
}
}
// Throw an exception if the user is trying to truncate a column which doesn't exist
if (!found) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(columnName));
}
// Throw an exception if the table/partition is bucketed on one of the columns
for (String bucketCol : bucketCols) {
if (bucketCol.equalsIgnoreCase(columnName)) {
throw new SemanticException(ErrorMsg.TRUNCATE_BUCKETED_COLUMN.getMsg(columnName));
}
}
if (isListBucketed) {
for (String listBucketCol : listBucketColNames) {
if (listBucketCol.equalsIgnoreCase(columnName)) {
throw new SemanticException(ErrorMsg.TRUNCATE_LIST_BUCKETED_COLUMN.getMsg(columnName));
}
}
}
}
truncateTblDesc.setColumnIndexes(new ArrayList<Integer>(columnIndexes));
truncateTblDesc.setInputDir(oldTblPartLoc);
truncateTblDesc.setLbCtx(lbCtx);
addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.TRUNCATE);
ddlWork.setNeedLock(true);
TableDesc tblDesc = Utilities.getTableDesc(table);
// Write the output to temporary directory and move it to the final location at the end
// so the operation is atomic.
Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
truncateTblDesc.setOutputDir(queryTmpdir);
LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc, partSpec == null ? new HashMap<>() : partSpec);
ltd.setLbCtx(lbCtx);
@SuppressWarnings("unchecked") Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
truncateTask.addDependentTask(moveTsk);
// Recalculate the HDFS stats if auto gather stats is set
if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
BasicStatsWork basicStatsWork;
if (oldTblPartLoc.equals(newTblPartLoc)) {
// If we're merging to the same location, we can avoid some metastore calls
TableSpec tablepart = new TableSpec(this.db, conf, root);
basicStatsWork = new BasicStatsWork(tablepart);
} else {
basicStatsWork = new BasicStatsWork(ltd);
}
basicStatsWork.setNoStatsAggregator(true);
basicStatsWork.setClearAggregatorStats(true);
StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, conf);
Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork);
moveTsk.addDependentTask(statTask);
}
} catch (HiveException e) {
throw new SemanticException(e);
}
}
rootTasks.add(truncateTask);
}
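Boiled down, the truncate-column path builds a chain of three tasks: the DDL truncate that rewrites the selected columns into a temporary directory, a MoveTask driven by the LoadTableDesc above that swaps the rewritten data into the final location, and optionally a stats task. A condensed sketch of that chain, using the descriptors constructed in the method above:
// Condensed sketch of the task chain (ltd, truncateTblDesc, basicStatsWork as built earlier).
Task<?> truncateTask = TaskFactory.get(new DDLWork(getInputs(), getOutputs(), truncateTblDesc));
Task<?> moveTask = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
truncateTask.addDependentTask(moveTask); // move only after the column rewrite succeeds
if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
  moveTask.addDependentTask(TaskFactory.get(new StatsWork(table, basicStatsWork, conf)));
}
rootTasks.add(truncateTask);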