Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class DDLSemanticAnalyzer, method analyzeTruncateTable.
private void analyzeTruncateTable(ASTNode ast) throws SemanticException {
  // TOK_TABLE_PARTITION
  ASTNode root = (ASTNode) ast.getChild(0);
  String tableName = getUnescapedName((ASTNode) root.getChild(0));
  Table table = getTable(tableName, true);
  if (table.getTableType() != TableType.MANAGED_TABLE) {
    throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_MANAGED_TABLE.format(tableName));
  }
  if (table.isNonNative()) {
    // TODO
    throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_NATIVE_TABLE.format(tableName));
  }
  if (!table.isPartitioned() && root.getChildCount() > 1) {
    throw new SemanticException(ErrorMsg.PARTSPEC_FOR_NON_PARTITIONED_TABLE.format(tableName));
  }
  Map<String, String> partSpec = getPartSpec((ASTNode) root.getChild(1));
  if (partSpec == null) {
    if (!table.isPartitioned()) {
      outputs.add(new WriteEntity(table, WriteEntity.WriteType.DDL_EXCLUSIVE));
    } else {
      for (Partition partition : getPartitions(table, null, false)) {
        outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
      }
    }
  } else {
    if (isFullSpec(table, partSpec)) {
      validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, true);
      Partition partition = getPartition(table, partSpec, true);
      outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
    } else {
      validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, false);
      for (Partition partition : getPartitions(table, partSpec, false)) {
        outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
      }
    }
  }
  TruncateTableDesc truncateTblDesc = new TruncateTableDesc(tableName, partSpec, null);
  DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), truncateTblDesc);
  Task<? extends Serializable> truncateTask = TaskFactory.get(ddlWork);
  // Is this a truncate column command
  List<String> columnNames = null;
  if (ast.getChildCount() == 2) {
    try {
      columnNames = getColumnNames((ASTNode) ast.getChild(1));
      // It would be possible to support this, but this is such a pointless command.
      if (AcidUtils.isInsertOnlyTable(table.getParameters())) {
        throw new SemanticException("Truncating MM table columns not presently supported");
      }
      List<String> bucketCols = null;
      Class<? extends InputFormat> inputFormatClass = null;
      boolean isArchived = false;
      Path newTblPartLoc = null;
      Path oldTblPartLoc = null;
      List<FieldSchema> cols = null;
      ListBucketingCtx lbCtx = null;
      boolean isListBucketed = false;
      List<String> listBucketColNames = null;
      if (table.isPartitioned()) {
        Partition part = db.getPartition(table, partSpec, false);
        Path tabPath = table.getPath();
        Path partPath = part.getDataLocation();
        // if the table is in a different dfs than the partition,
        // replace the partition's dfs with the table's dfs.
        newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(),
            partPath.toUri().getPath());
        oldTblPartLoc = partPath;
        cols = part.getCols();
        bucketCols = part.getBucketCols();
        inputFormatClass = part.getInputFormatClass();
        isArchived = ArchiveUtils.isArchived(part);
        lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(),
            part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
        isListBucketed = part.isStoredAsSubDirectories();
        listBucketColNames = part.getSkewedColNames();
      } else {
        // input and output are the same
        oldTblPartLoc = table.getPath();
        newTblPartLoc = table.getPath();
        cols = table.getCols();
        bucketCols = table.getBucketCols();
        inputFormatClass = table.getInputFormatClass();
        lbCtx = constructListBucketingCtx(table.getSkewedColNames(), table.getSkewedColValues(),
            table.getSkewedColValueLocationMaps(), table.isStoredAsSubDirectories(), conf);
        isListBucketed = table.isStoredAsSubDirectories();
        listBucketColNames = table.getSkewedColNames();
      }
      // throw a HiveException for non-rcfile.
      if (!inputFormatClass.equals(RCFileInputFormat.class)) {
        throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_NOT_RC.getMsg());
      }
      // throw a HiveException if the table/partition is archived
      if (isArchived) {
        throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_ARCHIVED.getMsg());
      }
      Set<Integer> columnIndexes = new HashSet<Integer>();
      for (String columnName : columnNames) {
        boolean found = false;
        for (int columnIndex = 0; columnIndex < cols.size(); columnIndex++) {
          if (columnName.equalsIgnoreCase(cols.get(columnIndex).getName())) {
            columnIndexes.add(columnIndex);
            found = true;
            break;
          }
        }
        // Throw an exception if the user is trying to truncate a column which doesn't exist
        if (!found) {
          throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(columnName));
        }
        // Throw an exception if the table/partition is bucketed on one of the columns
        for (String bucketCol : bucketCols) {
          if (bucketCol.equalsIgnoreCase(columnName)) {
            throw new SemanticException(ErrorMsg.TRUNCATE_BUCKETED_COLUMN.getMsg(columnName));
          }
        }
        if (isListBucketed) {
          for (String listBucketCol : listBucketColNames) {
            if (listBucketCol.equalsIgnoreCase(columnName)) {
              throw new SemanticException(ErrorMsg.TRUNCATE_LIST_BUCKETED_COLUMN.getMsg(columnName));
            }
          }
        }
      }
      truncateTblDesc.setColumnIndexes(new ArrayList<Integer>(columnIndexes));
      truncateTblDesc.setInputDir(oldTblPartLoc);
      truncateTblDesc.setLbCtx(lbCtx);
      addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.TRUNCATE);
      ddlWork.setNeedLock(true);
      TableDesc tblDesc = Utilities.getTableDesc(table);
      // Write the output to temporary directory and move it to the final location at the end
      // so the operation is atomic.
      Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
      truncateTblDesc.setOutputDir(queryTmpdir);
      LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc,
          partSpec == null ? new HashMap<>() : partSpec);
      ltd.setLbCtx(lbCtx);
      @SuppressWarnings("unchecked")
      Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
      truncateTask.addDependentTask(moveTsk);
      // Recalculate the HDFS stats if auto gather stats is set
      if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
        BasicStatsWork basicStatsWork;
        if (oldTblPartLoc.equals(newTblPartLoc)) {
          // If we're merging to the same location, we can avoid some metastore calls
          TableSpec tablepart = new TableSpec(this.db, conf, root);
          basicStatsWork = new BasicStatsWork(tablepart);
        } else {
          basicStatsWork = new BasicStatsWork(ltd);
        }
        basicStatsWork.setNoStatsAggregator(true);
        basicStatsWork.setClearAggregatorStats(true);
        StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, conf);
        Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork);
        moveTsk.addDependentTask(statTask);
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  rootTasks.add(truncateTask);
}
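The core of this example is the tail end of the method: the truncate task writes its output under a scratch directory obtained from ctx.getExternalTmpPath, and a dependent MoveWork task publishes that directory into the table or partition location so the operation appears atomic. The sketch below restates that write-to-scratch-then-publish idea with plain Hadoop FileSystem calls only; the class name, the paths, and the delete-plus-rename publish step are illustrative assumptions, not Hive's actual MoveTask implementation.

// Illustrative sketch only: the "write to scratch, then move into place" idea behind
// the DDL task + MoveWork pair above, expressed with plain Hadoop FileSystem calls.
// The paths and the delete-plus-rename publish step are assumptions for the sketch.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ScratchThenPublishSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path finalDir = new Path("/warehouse/demo_table/part=1");   // hypothetical partition location
    Path scratchDir = new Path("/tmp/hive-scratch/-ext-10000"); // hypothetical scratch directory
    FileSystem fs = finalDir.getFileSystem(conf);

    // 1. Produce the new contents in the scratch directory (the truncate task's role).
    fs.mkdirs(scratchDir);
    // ... write the truncated files into scratchDir ...

    // 2. Publish: drop the old contents and move the staged directory into place (MoveWork's role).
    fs.delete(finalDir, true);
    fs.rename(scratchDir, finalDir);
  }
}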
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class ImportSemanticAnalyzer, method addSinglePartition.
private static Task<?> addSinglePartition(URI fromURI, FileSystem fs, ImportTableDesc tblDesc,
    Table table, Warehouse wh, AddPartitionDesc addPartitionDesc, ReplicationSpec replicationSpec,
    EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isSourceMm,
    Task<?> commitTask) throws MetaException, IOException, HiveException {
  AddPartitionDesc.OnePartitionDesc partSpec = addPartitionDesc.getPartition(0);
  if (tblDesc.isExternal() && tblDesc.getLocation() == null) {
    x.getLOG().debug("Importing in-place: adding AddPart for partition "
        + partSpecToString(partSpec.getPartSpec()));
    // addPartitionDesc already has the right partition location
    @SuppressWarnings("unchecked")
    Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc));
    return addPartTask;
  } else {
    String srcLocation = partSpec.getLocation();
    fixLocationInPartSpec(fs, tblDesc, table, wh, replicationSpec, partSpec, x);
    x.getLOG().debug("adding dependent CopyWork/AddPart/MoveWork for partition "
        + partSpecToString(partSpec.getPartSpec()) + " with source location: " + srcLocation);
    Path tgtLocation = new Path(partSpec.getLocation());
    Path destPath = !AcidUtils.isInsertOnlyTable(table.getParameters())
        ? x.getCtx().getExternalTmpPath(tgtLocation)
        : new Path(tgtLocation, AcidUtils.deltaSubdir(writeId, writeId, stmtId));
    Path moveTaskSrc = !AcidUtils.isInsertOnlyTable(table.getParameters()) ? destPath : tgtLocation;
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("adding import work for partition with source location: "
          + srcLocation + "; target: " + tgtLocation + "; copy dest " + destPath + "; mm "
          + writeId + " (src " + isSourceMm + ") for " + partSpecToString(partSpec.getPartSpec()));
    }
    Task<?> copyTask = null;
    if (replicationSpec.isInReplicationScope()) {
      if (isSourceMm || isAcid(writeId)) {
        // Note: this is replication gap, not MM gap... Repl V2 is not ready yet.
        throw new RuntimeException("Replicating MM and ACID tables is not supported");
      }
      copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, new Path(srcLocation), destPath, x.getConf());
    } else {
      CopyWork cw = new CopyWork(new Path(srcLocation), destPath, false);
      cw.setSkipSourceMmDirs(isSourceMm);
      copyTask = TaskFactory.get(cw);
    }
    Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc));
    // Note: this sets LoadFileType incorrectly for ACID; is that relevant for import?
    // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
    LoadTableDesc loadTableWork = new LoadTableDesc(moveTaskSrc, Utilities.getTableDesc(table),
        partSpec.getPartSpec(),
        replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
        writeId);
    loadTableWork.setStmtId(stmtId);
    loadTableWork.setInheritTableSpecs(false);
    Task<?> loadPartTask = TaskFactory.get(new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false));
    copyTask.addDependentTask(loadPartTask);
    addPartTask.addDependentTask(loadPartTask);
    x.getTasks().add(copyTask);
    if (commitTask != null) {
      loadPartTask.addDependentTask(commitTask);
    }
    return addPartTask;
  }
}
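addSinglePartition wires a small task DAG: the copy task and the add-partition DDL task both feed the MoveWork ("load partition") task, and an optional commit task runs after the move. The sketch below models only that dependency shape with a stand-in SimpleTask class; this is an assumption made for illustration, not Hive's Task/TaskFactory API, but it makes the execution order easy to see.

// Minimal sketch of the dependency shape built above: a child task runs only
// once all of its parents have finished, so the MoveWork stand-in waits for
// both the copy and the add-partition stand-ins.
import java.util.ArrayList;
import java.util.List;

public class ImportPartitionDagSketch {
  static class SimpleTask {
    final String name;
    final List<SimpleTask> dependents = new ArrayList<>();
    int pendingParents = 0;
    SimpleTask(String name) { this.name = name; }
    void addDependentTask(SimpleTask t) {
      dependents.add(t);
      t.pendingParents++;
    }
    void run() {
      System.out.println("running " + name);
      for (SimpleTask d : dependents) {
        if (--d.pendingParents == 0) {
          d.run(); // a child runs only after all of its parents have completed
        }
      }
    }
  }

  public static void main(String[] args) {
    SimpleTask copyTask = new SimpleTask("CopyWork: export dir -> staging/delta dir");
    SimpleTask addPartTask = new SimpleTask("DDLWork: add partition to metastore");
    SimpleTask loadPartTask = new SimpleTask("MoveWork: move staged files into partition");
    SimpleTask commitTask = new SimpleTask("commit (optional)");

    copyTask.addDependentTask(loadPartTask);
    addPartTask.addDependentTask(loadPartTask);
    loadPartTask.addDependentTask(commitTask);

    copyTask.run();
    addPartTask.run();
  }
}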
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class LoadSemanticAnalyzer, method analyzeInternal.
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
  boolean isLocal = false;
  boolean isOverWrite = false;
  Tree fromTree = ast.getChild(0);
  Tree tableTree = ast.getChild(1);
  if (ast.getChildCount() == 4) {
    isLocal = true;
    isOverWrite = true;
  }
  if (ast.getChildCount() == 3) {
    if (ast.getChild(2).getText().toLowerCase().equals("local")) {
      isLocal = true;
    } else {
      isOverWrite = true;
    }
  }
  // initialize load path
  URI fromURI;
  try {
    String fromPath = stripQuotes(fromTree.getText());
    fromURI = initializeFromURI(fromPath, isLocal);
  } catch (IOException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  } catch (URISyntaxException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  }
  // initialize destination table/partition
  TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
  if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
    throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
  }
  if (ts.tableHandle.isNonNative()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
  }
  if (ts.tableHandle.isStoredAsSubDirectories()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
  }
  List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
  if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
    throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
  }
  List<String> bucketCols = ts.tableHandle.getBucketCols();
  if (bucketCols != null && !bucketCols.isEmpty()) {
    String error = StrictChecks.checkBucketing(conf);
    if (error != null) {
      throw new SemanticException("Please load into an intermediate table"
          + " and use 'insert... select' to allow Hive to enforce bucketing. " + error);
    }
  }
  // make sure the arguments make sense
  List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, fromTree, isLocal, ts.tableHandle);
  // for managed tables, make sure the file formats match
  if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
      && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
    ensureFileFormatsMatch(ts, files, fromURI);
  }
  inputs.add(toReadEntity(new Path(fromURI)));
  Task<? extends Serializable> rTask = null;
  // create final load/move work
  boolean preservePartitionSpecs = false;
  Map<String, String> partSpec = ts.getPartSpec();
  if (partSpec == null) {
    partSpec = new LinkedHashMap<String, String>();
    outputs.add(new WriteEntity(ts.tableHandle,
        (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
  } else {
    try {
      Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
      if (part != null) {
        if (isOverWrite) {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
        } else {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
          // If partition already exists and we aren't overwriting it, then respect
          // its current location info rather than picking it from the parent TableDesc
          preservePartitionSpecs = true;
        }
      } else {
        outputs.add(new WriteEntity(ts.tableHandle,
            (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  Long writeId = null;
  int stmtId = -1;
  if (AcidUtils.isTransactionalTable(ts.tableHandle)) {
    try {
      writeId = SessionState.get().getTxnMgr().getTableWriteId(ts.tableHandle.getDbName(),
          ts.tableHandle.getTableName());
    } catch (LockException ex) {
      throw new SemanticException("Failed to allocate the write id", ex);
    }
    stmtId = SessionState.get().getTxnMgr().getStmtIdAndIncrement();
  }
  // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
  // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
  LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI),
      Utilities.getTableDesc(ts.tableHandle), partSpec,
      isOverWrite ? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING, writeId);
  loadTableWork.setStmtId(stmtId);
  if (preservePartitionSpecs) {
    // Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
    // preservePartitionSpecs=false (the default) is not sufficient information
    // to set inheritTableSpecs=true.
    loadTableWork.setInheritTableSpecs(false);
  }
  Task<? extends Serializable> childTask = TaskFactory.get(
      new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal));
  if (rTask != null) {
    rTask.addDependentTask(childTask);
  } else {
    rTask = childTask;
  }
  rootTasks.add(rTask);
  // The user asked for stats to be collected.
  // Some stats, like number of rows, require a scan of the data.
  // However, other stats, like number of files, do not require a complete scan.
  // Update the stats which do not require a complete scan.
  Task<? extends Serializable> statTask = null;
  if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
    BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork);
    basicStatsWork.setNoStatsAggregator(true);
    basicStatsWork.setClearAggregatorStats(true);
    StatsWork columnStatsWork = new StatsWork(ts.tableHandle, basicStatsWork, conf);
    statTask = TaskFactory.get(columnStatsWork);
  }
  if (statTask != null) {
    childTask.addDependentTask(statTask);
  }
}
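The first few lines of analyzeInternal derive the isLocal and isOverWrite flags from how many optional keywords follow the path and table children of the LOAD statement's AST. The sketch below restates that mapping with a plain list of keyword strings standing in for the extra AST children; the helper name and the assumption that at most two extra keywords appear are illustrative, not Hive's parser API.

// Sketch only: how the optional LOCAL / OVERWRITE keywords map to the two flags
// computed at the top of analyzeInternal. A List<String> stands in for the extra
// AST children; the real code inspects ASTNode child counts and text instead.
import java.util.Arrays;
import java.util.List;

public class LoadFlagsSketch {
  static boolean[] flags(List<String> extraKeywords) {
    boolean isLocal = false;
    boolean isOverWrite = false;
    if (extraKeywords.size() == 2) {          // both keywords present (child count 4 in the AST)
      isLocal = true;
      isOverWrite = true;
    } else if (extraKeywords.size() == 1) {   // exactly one keyword present (child count 3)
      if (extraKeywords.get(0).equalsIgnoreCase("local")) {
        isLocal = true;
      } else {
        isOverWrite = true;
      }
    }
    return new boolean[] { isLocal, isOverWrite };
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(flags(Arrays.asList("local"))));              // [true, false]
    System.out.println(Arrays.toString(flags(Arrays.asList("local", "overwrite")))); // [true, true]
  }
}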
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class GenSparkUtils, method createMoveTask.
/**
* Create and add any dependent move tasks.
*
* This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
* 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
*/
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir,
    FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf,
    DependencyCollectionTask dependencyTask) {
  Path dest = null;
  FileSinkDesc fileSinkDesc = fsOp.getConf();
  if (chDir) {
    dest = fsOp.getConf().getFinalDirName();
    // generate the temporary file
    // it must be on the same file system as the current destination
    Context baseCtx = parseCtx.getContext();
    Path tmpDir = baseCtx.getExternalTmpPath(dest);
    // Change all the linked file sink descriptors
    if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
      for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
        fsConf.setDirName(tmpDir);
      }
    } else {
      fileSinkDesc.setDirName(tmpDir);
    }
  }
  Task<MoveWork> mvTask = null;
  if (!chDir) {
    mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fileSinkDesc.getFinalDirName(), false);
  }
  // Set the move task to be dependent on the current task
  if (mvTask != null) {
    GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
  }
  return dest;
}
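When chDir is false, createMoveTask looks up an already-generated move task whose source directory matches the file sink's final directory and chains it behind the current task. The sketch below shows only that lookup-by-path idea with a stand-in MoveSpec class; it is an illustrative assumption, not GenMapRedUtils.findMoveTaskForFsopOutput itself.

// Sketch (not GenMapRedUtils' actual helper): pick the move whose source directory
// is the directory the file sink writes to. MoveSpec is a stand-in for MoveWork.
import java.util.Arrays;
import java.util.List;

public class FindMoveTaskSketch {
  static class MoveSpec {
    final String sourceDir;
    final String targetDir;
    MoveSpec(String sourceDir, String targetDir) {
      this.sourceDir = sourceDir;
      this.targetDir = targetDir;
    }
  }

  static MoveSpec findMoveForSinkOutput(List<MoveSpec> moves, String fileSinkDir) {
    for (MoveSpec m : moves) {
      if (m.sourceDir.equals(fileSinkDir)) {
        return m; // this move consumes the file sink's output, so chain it behind the sink's task
      }
    }
    return null;
  }

  public static void main(String[] args) {
    List<MoveSpec> moves = Arrays.asList(
        new MoveSpec("/tmp/hive/-ext-10001", "/warehouse/t1"),
        new MoveSpec("/tmp/hive/-ext-10002", "/warehouse/t2"));
    MoveSpec match = findMoveForSinkOutput(moves, "/tmp/hive/-ext-10002");
    System.out.println(match == null ? "no move task found" : "move to " + match.targetDir);
  }
}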
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testMovePathsThatCanBeMerged.
@Test
public void testMovePathsThatCanBeMerged() {
  final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
  final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
  final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003");
  final MoveWork mockWork = mock(MoveWork.class);
  when(mockWork.getLoadFileWork()).thenReturn(
      new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "", false));
  assertTrue("Merging BlobStore paths should be allowed.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
}
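The test stubs the MoveWork so that its load source equals the conditional task's output path and asserts that the two moves can be merged. The sketch below captures one reading of that merge condition: when the intermediate output is exactly the move's source, the hop can collapse into a single move from condInputPath to the final target. This is an assumption for illustration; the real GenMapRedUtils.shouldMergeMovePaths also consults blob-store related settings in HiveConf, which the sketch omits.

// Hedged sketch of the optimization the test exercises: collapse
// condInput -> condOutput -> moveTarget into condInput -> moveTarget
// when the intermediate directory is exactly the move's source.
import org.apache.hadoop.fs.Path;

public class MergeMovePathsSketch {
  static Path[] mergedMoveOrNull(Path condInput, Path condOutput, Path moveSource, Path moveTarget) {
    if (condOutput.equals(moveSource)) {
      return new Path[] { condInput, moveTarget }; // single move: condInput -> moveTarget
    }
    return null; // keep the two-step move
  }

  public static void main(String[] args) {
    Path condInput = new Path("s3a://bucket/scratch/-ext-10000");
    Path condOutput = new Path("s3a://bucket/scratch/-ext-10002");
    Path moveTarget = new Path("s3a://bucket/scratch/-ext-10003");
    Path[] merged = mergedMoveOrNull(condInput, condOutput, condOutput, moveTarget);
    System.out.println(merged == null ? "not mergeable" : merged[0] + " -> " + merged[1]);
  }
}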