Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in the Apache Hive project.
From the class ImportSemanticAnalyzer, method addSinglePartition.
/**
 * Builds the tasks that import one partition, namely the first partition held by
 * {@code addPartitionDesc}.
 *
 * <p>When the target table is external and has no explicit location, the data is used in place:
 * only a DDL task that adds the partition is created and returned. In every other case three
 * tasks are created - a copy task, an add-partition DDL task and a move (load) task. The move
 * task is registered as a dependent of both the copy task and the add-partition task, the copy
 * task is appended to {@code x.getTasks()}, and the add-partition task is returned.
 *
 * @param fromURI not read by this method
 * @param fs forwarded to {@code fixLocationInPartSpec}
 * @param tblDesc descriptor of the table being imported
 * @param table target table; its parameters decide whether it is treated as insert-only
 * @param wh forwarded to {@code fixLocationInPartSpec}
 * @param addPartitionDesc add-partition descriptor; only partition 0 is inspected
 * @param replicationSpec decides between a replication copy and a plain copy, and the load type
 * @param x analyzer context supplying the logger, conf, inputs/outputs and task list
 * @param writeId write id used for the delta directory name and passed to the load descriptor
 * @param stmtId statement id used for the delta directory name and set on the load descriptor
 * @param isSourceMm whether the source is an insert-only (MM) table
 * @param commitTask optional task made dependent on the move task; may be {@code null}
 * @return the add-partition task
 * @throws RuntimeException if in replication scope and the source is MM or {@code writeId} is ACID
 */
private static Task<?> addSinglePartition(URI fromURI, FileSystem fs, ImportTableDesc tblDesc, Table table, Warehouse wh, AddPartitionDesc addPartitionDesc, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isSourceMm, Task<?> commitTask) throws MetaException, IOException, HiveException {
  AddPartitionDesc.OnePartitionDesc onePart = addPartitionDesc.getPartition(0);

  // In-place import: addPartitionDesc already has the right partition location,
  // so adding the partition is all that is required.
  if (tblDesc.isExternal() && tblDesc.getLocation() == null) {
    x.getLOG().debug("Importing in-place: adding AddPart for partition " + partSpecToString(onePart.getPartSpec()));
    return TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc));
  }

  // Remember the source location before the partition spec's location is rewritten.
  final String sourceLocation = onePart.getLocation();
  fixLocationInPartSpec(fs, tblDesc, table, wh, replicationSpec, onePart, x);
  x.getLOG().debug("adding dependent CopyWork/AddPart/MoveWork for partition " + partSpecToString(onePart.getPartSpec()) + " with source location: " + sourceLocation);

  final Path tgtLocation = new Path(onePart.getLocation());
  final Path destPath;
  final Path moveTaskSrc;
  if (AcidUtils.isInsertOnlyTable(table.getParameters())) {
    // Insert-only table: copy straight into a delta directory under the target;
    // the move task is then pointed at the target location itself.
    destPath = new Path(tgtLocation, AcidUtils.deltaSubdir(writeId, writeId, stmtId));
    moveTaskSrc = tgtLocation;
  } else {
    // Otherwise copy into an external temporary path and move from there.
    destPath = x.getCtx().getExternalTmpPath(tgtLocation);
    moveTaskSrc = destPath;
  }

  if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
    Utilities.FILE_OP_LOGGER.trace("adding import work for partition with source location: " + sourceLocation + "; target: " + tgtLocation + "; copy dest " + destPath + "; mm " + writeId + " (src " + isSourceMm + ") for " + partSpecToString(onePart.getPartSpec()));
  }

  final Task<?> copyTask;
  if (replicationSpec.isInReplicationScope()) {
    if (isSourceMm || isAcid(writeId)) {
      // Note: this is replication gap, not MM gap... Repl V2 is not ready yet.
      throw new RuntimeException("Replicating MM and ACID tables is not supported");
    }
    copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, new Path(sourceLocation), destPath, x.getConf());
  } else {
    CopyWork copyWork = new CopyWork(new Path(sourceLocation), destPath, false);
    copyWork.setSkipSourceMmDirs(isSourceMm);
    copyTask = TaskFactory.get(copyWork);
  }

  Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc));

  // Note: this sets LoadFileType incorrectly for ACID; is that relevant for import?
  // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
  LoadTableDesc loadDesc = new LoadTableDesc(
      moveTaskSrc,
      Utilities.getTableDesc(table),
      onePart.getPartSpec(),
      replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
      writeId);
  loadDesc.setStmtId(stmtId);
  loadDesc.setInheritTableSpecs(false);
  Task<?> loadPartTask = TaskFactory.get(new MoveWork(x.getInputs(), x.getOutputs(), loadDesc, null, false));

  // The load runs only after both the copy and the add-partition tasks.
  copyTask.addDependentTask(loadPartTask);
  addPartTask.addDependentTask(loadPartTask);
  x.getTasks().add(copyTask);
  if (commitTask != null) {
    loadPartTask.addDependentTask(commitTask);
  }
  return addPartTask;
}
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in the Apache Hive project.
From the class LoadSemanticAnalyzer, method analyzeInternal.
/**
 * Analyzes a LOAD statement and registers the move (and optional statistics) tasks for it.
 *
 * <p>Child 0 of {@code ast} is the source path and child 1 is the destination table. With four
 * children both the local and overwrite flags are set; with three children the third child's
 * text selects local (when it equals "local", ignoring case) or overwrite.
 *
 * <p>The method validates the source path and the destination table, records the read and
 * write entities, allocates a write id and statement id for transactional tables, and adds a
 * move task to {@code rootTasks}. When {@code HIVESTATSAUTOGATHER} is enabled a statistics task
 * is attached as a dependent of the move task.
 *
 * @param ast root node of the LOAD statement
 * @throws SemanticException if the source path is invalid, the target is a view, non-native or
 *         stored as sub-directories, a partition spec is missing for a partitioned table,
 *         strict bucketing checks reject the load, partition lookup fails, or the write id
 *         cannot be allocated
 */
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
  boolean isLocal = false;
  boolean isOverWrite = false;
  Tree fromTree = ast.getChild(0);
  Tree tableTree = ast.getChild(1);
  if (ast.getChildCount() == 4) {
    isLocal = true;
    isOverWrite = true;
  }
  if (ast.getChildCount() == 3) {
    // equalsIgnoreCase avoids the default-locale dependence of toLowerCase()
    if (ast.getChild(2).getText().equalsIgnoreCase("local")) {
      isLocal = true;
    } else {
      isOverWrite = true;
    }
  }
  // initialize load path
  URI fromURI;
  try {
    String fromPath = stripQuotes(fromTree.getText());
    fromURI = initializeFromURI(fromPath, isLocal);
  } catch (IOException | URISyntaxException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  }
  // initialize destination table/partition
  TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
  if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
    throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
  }
  if (ts.tableHandle.isNonNative()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
  }
  if (ts.tableHandle.isStoredAsSubDirectories()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
  }
  // a partitioned table requires a partition spec
  List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
  if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
    throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
  }
  List<String> bucketCols = ts.tableHandle.getBucketCols();
  if (bucketCols != null && !bucketCols.isEmpty()) {
    String error = StrictChecks.checkBucketing(conf);
    if (error != null) {
      throw new SemanticException("Please load into an intermediate table" + " and use 'insert... select' to allow Hive to enforce bucketing. " + error);
    }
  }
  // make sure the arguments make sense
  List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, fromTree, isLocal, ts.tableHandle);
  // for managed tables, make sure the file formats match
  if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType()) && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
    ensureFileFormatsMatch(ts, files, fromURI);
  }
  inputs.add(toReadEntity(new Path(fromURI)));
  // create final load/move work
  boolean preservePartitionSpecs = false;
  final WriteEntity.WriteType writeType = isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT;
  Map<String, String> partSpec = ts.getPartSpec();
  if (partSpec == null) {
    partSpec = new LinkedHashMap<>();
    outputs.add(new WriteEntity(ts.tableHandle, writeType));
  } else {
    try {
      Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
      if (part != null) {
        outputs.add(new WriteEntity(part, writeType));
        // If partition already exists and we aren't overwriting it, then respect
        // its current location info rather than picking it from the parent TableDesc
        preservePartitionSpecs = !isOverWrite;
      } else {
        outputs.add(new WriteEntity(ts.tableHandle, writeType));
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  // transactional tables need a write id and a statement id; others keep null / -1
  Long writeId = null;
  int stmtId = -1;
  if (AcidUtils.isTransactionalTable(ts.tableHandle)) {
    try {
      writeId = SessionState.get().getTxnMgr().getTableWriteId(ts.tableHandle.getDbName(), ts.tableHandle.getTableName());
    } catch (LockException ex) {
      throw new SemanticException("Failed to allocate the write id", ex);
    }
    stmtId = SessionState.get().getTxnMgr().getStmtIdAndIncrement();
  }
  // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
  // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
  LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI), Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite ? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING, writeId);
  loadTableWork.setStmtId(stmtId);
  if (preservePartitionSpecs) {
    // Note : preservePartitionSpecs=true implies inheritTableSpecs=false but
    // but preservePartitionSpecs=false(default) here is not sufficient enough
    // info to set inheritTableSpecs=true
    loadTableWork.setInheritTableSpecs(false);
  }
  // The move task is the only root task produced by this analyzer.
  Task<? extends Serializable> childTask = TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal));
  rootTasks.add(childTask);
  // The user asked for stats to be collected.
  // Some stats like number of rows require a scan of the data
  // However, some other stats, like number of files, do not require a complete scan
  // Update the stats which do not require a complete scan.
  if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
    BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork);
    basicStatsWork.setNoStatsAggregator(true);
    basicStatsWork.setClearAggregatorStats(true);
    StatsWork columnStatsWork = new StatsWork(ts.tableHandle, basicStatsWork, conf);
    Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork);
    if (statTask != null) {
      childTask.addDependentTask(statTask);
    }
  }
}
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in the Apache Hive project.
From the class MoveTask, method acquireLockForFileMove.
/**
 * Decides whether an implicit table lock is needed for the file move described by
 * {@code loadTableWork} and returns the matching {@code LocalTableLock}.
 *
 * <p>A {@code LocalTableLock} created without a lock object is returned when:
 * <ul>
 *   <li>the configured move-lock mode is {@code NONE};</li>
 *   <li>the mode is {@code DP} and the load has no dynamic-partition context;</li>
 *   <li>no lock objects are recorded in the context for this load's output;</li>
 *   <li>the load has no table descriptor; or</li>
 *   <li>a recorded lock on the same paths is already {@code EXCLUSIVE} or
 *       {@code SEMI_SHARED}.</li>
 * </ul>
 * Otherwise a {@code LocalTableLock} wrapping a new lock object on the base table is returned.
 *
 * @param loadTableWork the load descriptor whose target table may need locking
 * @return the lock wrapper described above
 * @throws HiveException declared for the Hive/metastore calls made here
 */
private LocalTableLock acquireLockForFileMove(LoadTableDesc loadTableWork) throws HiveException {
  final LockFileMoveMode moveMode = LockFileMoveMode.fromConf(conf);
  final boolean lockingOff = moveMode == LockFileMoveMode.NONE;
  if (lockingOff || (moveMode == LockFileMoveMode.DP && loadTableWork.getDPCtx() == null)) {
    return new LocalTableLock();
  }

  final WriteEntity writeEntity = context.getLoadTableOutputMap().get(loadTableWork);
  final List<HiveLockObj> heldLocks = context.getOutputLockObjects().get(writeEntity);
  if (heldLocks == null) {
    return new LocalTableLock();
  }
  final TableDesc tableDesc = loadTableWork.getTable();
  if (tableDesc == null) {
    return new LocalTableLock();
  }

  // Build the lock object this move would need on the base table.
  Table baseTable = getHive().getTable(loadTableWork.getTable().getTableName());
  HiveLockObject.HiveLockObjectData lockData = new HiveLockObject.HiveLockObjectData(
      queryPlan.getQueryId(),
      String.valueOf(System.currentTimeMillis()),
      "IMPLICIT",
      queryPlan.getQueryStr(),
      conf);
  HiveLockObject wantedLock = new HiveLockObject(baseTable, lockData);

  for (HiveLockObj held : heldLocks) {
    if (!Arrays.equals(held.getObj().getPaths(), wantedLock.getPaths())) {
      continue;
    }
    HiveLockMode heldMode = held.getMode();
    if (heldMode == HiveLockMode.EXCLUSIVE || heldMode == HiveLockMode.SEMI_SHARED) {
      // no need to lock ; already owns a more powerful one
      return new LocalTableLock();
    }
  }
  return new LocalTableLock(wantedLock);
}
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in the Apache Hive project.
From the class BasicStatsTask, method getPartitionsList.
/**
 * Determines which partitions need their statistics updated.
 *
 * <p>Result by case:
 * <ul>
 *   <li>a load-file descriptor is present (CTAS): {@code null};</li>
 *   <li>table specs are present (ANALYZE): {@code null} for an unpartitioned table, otherwise
 *       an unmodifiable view of the spec's partitions (empty list if it has none);</li>
 *   <li>a load-table descriptor is present (INSERT OVERWRITE): {@code null} for an
 *       unpartitioned table; for dynamic partitions the partitions named by
 *       {@code dpPartSpecs}, re-read from {@code db} (empty list if {@code dpPartSpecs} is
 *       unset); for a static partition a single-element list with that partition;</li>
 *   <li>none of the above: an empty list.</li>
 * </ul>
 * As a side effect {@code table} is set in the ANALYZE and INSERT OVERWRITE cases.
 *
 * TODO: we should reuse the Partitions generated at compile time
 * since getting the list of partitions is quite expensive.
 *
 * @param db Hive handle used to look up the table and its partitions
 * @return the partitions to update, possibly empty, or {@code null} as described above
 * @throws HiveException if a table or partition lookup fails
 */
private List<Partition> getPartitionsList(Hive db) throws HiveException {
  if (work.getLoadFileDesc() != null) {
    // we are in CTAS, so we know there are no partitions
    return null;
  }

  if (work.getTableSpecs() != null) {
    // ANALYZE command
    TableSpec spec = work.getTableSpecs();
    table = spec.tableHandle;
    if (!table.isPartitioned()) {
      return null;
    }
    // get all partitions that matches with the partition spec
    if (spec.partitions == null) {
      return emptyList();
    }
    return unmodifiableList(spec.partitions);
  }

  if (work.getLoadTableDesc() == null) {
    return emptyList();
  }

  // INSERT OVERWRITE command
  LoadTableDesc loadDesc = work.getLoadTableDesc();
  table = db.getTable(loadDesc.getTable().getTableName());
  if (!table.isPartitioned()) {
    return null;
  }

  DynamicPartitionCtx dynamicCtx = loadDesc.getDPCtx();
  boolean usesDynamicPartitions = dynamicCtx != null && dynamicCtx.getNumDPCols() > 0;
  if (!usesDynamicPartitions) {
    // static partition
    return singletonList(db.getPartition(table, loadDesc.getPartitionSpec(), false));
  }

  // If no dynamic partitions are generated, dpPartSpecs may not be initialized
  if (dpPartSpecs == null) {
    return emptyList();
  }
  // Reload partition metadata because another BasicStatsTask instance may have updated the stats.
  List<String> names = dpPartSpecs.stream().map(Partition::getName).collect(Collectors.toList());
  return db.getPartitionsByNames(table, names);
}
Use of org.apache.hadoop.hive.ql.plan.LoadTableDesc in the Apache Hive project.
From the class Hive, method loadDynamicPartitions.
/**
 * Loads all dynamically generated partitions found under the load path into the table named
 * by {@code tbd} and returns the loaded partitions keyed by their partition spec.
 *
 * <p>One loading task is created per entry of {@code partitionDetailsMap} and all tasks are run
 * on a fixed-size thread pool. Afterwards partitions that did not exist before are added to the
 * metastore in one batch, write notifications are sent for transactional tables, and already
 * existing partitions are altered. For transactional tables the loaded partition names are
 * finally registered via {@code addDynamicPartitions}.
 *
 * @param tbd load table descriptor (source path, target table, partition spec, load file type)
 * @param numLB list bucketing level (logged as {@code listBucketingLevel}); only whether it is
 *        greater than zero is forwarded to the per-partition load
 * @param isAcid true if this is an ACID operation
 * @param writeId writeId, can be 0 unless isAcid == true
 * @param stmtId statementId
 * @param resetStatistics if true, reset statistics. Do not reset statistics otherwise.
 * @param operation ACID operation type
 * @param partitionDetailsMap full dynamic partition specification, keyed by partition path;
 *        its values are updated in place with the resolved partition, new files and snapshot
 * @return partition map details (PartitionSpec and Partition), in task order
 * @throws HiveException if loading any partition or updating the metastore fails, or if the
 *         test-only failure switch {@code HIVETESTMODEFAILLOADDYNAMICPARTITION} is enabled
 */
public Map<Map<String, String>, Partition> loadDynamicPartitions(final LoadTableDesc tbd, final int numLB, final boolean isAcid, final long writeId, final int stmtId, final boolean resetStatistics, final AcidUtils.Operation operation, Map<Path, PartitionDetails> partitionDetailsMap) throws HiveException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.perfLogBegin("MoveTask", PerfLogger.LOAD_DYNAMIC_PARTITIONS);
  final Path loadPath = tbd.getSourcePath();
  final Table tbl = getTable(tbd.getTable().getTableName());
  final Map<String, String> partSpec = tbd.getPartitionSpec();
  final AtomicInteger partitionsLoaded = new AtomicInteger(0);
  final boolean inPlaceEligible = conf.getLong("fs.trash.interval", 0) <= 0 && InPlaceUpdate.canRenderInPlace(conf) && !SessionState.getConsole().getIsSilent();
  final PrintStream ps = (inPlaceEligible) ? SessionState.getConsole().getInfoStream() : null;
  final SessionState parentSession = SessionState.get();
  List<Callable<Partition>> tasks = Lists.newLinkedList();
  boolean fetchPartitionInfo = true;
  final boolean scanPartitionsByName = HiveConf.getBoolVar(conf, HIVE_LOAD_DYNAMIC_PARTITIONS_SCAN_SPECIFIC_PARTITIONS);
  if (scanPartitionsByName && !tbd.isDirectInsert() && !AcidUtils.isTransactionalTable(tbl)) {
    // Fetch only relevant partitions from HMS for checking old partitions
    List<String> partitionNames = new LinkedList<>();
    for (PartitionDetails details : partitionDetailsMap.values()) {
      if (details.fullSpec != null && !details.fullSpec.isEmpty()) {
        partitionNames.add(Warehouse.makeDynamicPartNameNoTrailingSeperator(details.fullSpec));
      }
    }
    List<Partition> partitions = Hive.get().getPartitionsByNames(tbl, partitionNames);
    for (Partition partition : partitions) {
      LOG.debug("HMS partition spec: {}", partition.getSpec());
      // Entries without a full spec were never requested above, so skip them here too
      // instead of dereferencing a null spec.
      partitionDetailsMap.entrySet().parallelStream()
          .filter(entry -> entry.getValue().fullSpec != null && entry.getValue().fullSpec.equals(partition.getSpec()))
          .findAny()
          .ifPresent(entry -> {
            entry.getValue().partition = partition;
            entry.getValue().hasOldPartition = true;
          });
    }
    // no need to fetch partition again in tasks since we have already fetched partitions
    // info in getPartitionsByNames()
    fetchPartitionInfo = false;
  }
  boolean isTxnTable = AcidUtils.isTransactionalTable(tbl);
  AcidUtils.TableSnapshot tableSnapshot = isTxnTable ? getTableSnapshot(tbl, writeId) : null;
  // for every dynamic partition, create one loading task
  for (Entry<Path, PartitionDetails> entry : partitionDetailsMap.entrySet()) {
    boolean getPartitionFromHms = fetchPartitionInfo;
    tasks.add(() -> {
      PartitionDetails partitionDetails = entry.getValue();
      Map<String, String> fullPartSpec = partitionDetails.fullSpec;
      try {
        // Worker threads run with the session of the thread that submitted the load.
        SessionState.setCurrentSessionState(parentSession);
        if (getPartitionFromHms) {
          // didn't fetch partition info from HMS. Getting from HMS now.
          Partition existing = getPartition(tbl, fullPartSpec, false);
          if (existing != null) {
            partitionDetails.partition = existing;
            partitionDetails.hasOldPartition = true;
          }
        }
        LOG.info("New loading path = " + entry.getKey() + " withPartSpec " + fullPartSpec);
        Partition oldPartition = partitionDetails.partition;
        List<FileStatus> newFiles = null;
        if (partitionDetails.newFiles != null) {
          // If we already know the files from the direct insert manifest, use them
          newFiles = partitionDetails.newFiles;
        } else if (conf.getBoolVar(ConfVars.FIRE_EVENTS_FOR_DML) && !tbl.isTemporary() && oldPartition == null) {
          // Otherwise only collect them, if we are going to fire write notifications
          newFiles = Collections.synchronizedList(new ArrayList<>());
        }
        // load the partition
        Partition partition = loadPartitionInternal(entry.getKey(), tbl, fullPartSpec, oldPartition, tbd.getLoadFileType(), true, false, numLB > 0, false, isAcid, resetStatistics, writeId, stmtId, tbd.isInsertOverwrite(), isTxnTable, newFiles, tbd.isDirectInsert());
        // When a table snapshot is available, stamp the partition with its write id.
        if (tableSnapshot != null) {
          partition.getTPartition().setWriteId(tableSnapshot.getWriteId());
        }
        partitionDetails.tableSnapshot = tableSnapshot;
        // Only partitions that did not exist before are recorded for the batch add below.
        if (oldPartition == null) {
          partitionDetails.newFiles = newFiles;
          partitionDetails.partition = partition;
        }
        if (inPlaceEligible) {
          synchronized (ps) {
            InPlaceUpdate.rePositionCursor(ps);
            partitionsLoaded.incrementAndGet();
            InPlaceUpdate.reprintLine(ps, "Loaded : " + partitionsLoaded.get() + "/" + partitionDetailsMap.size() + " partitions.");
          }
        }
        return partition;
      } catch (Exception e) {
        LOG.error("Exception when loading partition with parameters " + " partPath=" + entry.getKey() + ", " + " table=" + tbl.getTableName() + ", " + " partSpec=" + fullPartSpec + ", " + " loadFileType=" + tbd.getLoadFileType().toString() + ", " + " listBucketingLevel=" + numLB + ", " + " isAcid=" + isAcid + ", " + " resetStatistics=" + resetStatistics, e);
        throw e;
      } finally {
        // get(conf).getMSC can be called in this task, Close the HMS connection right after use, do not wait for finalizer to close it.
        closeCurrent();
      }
    });
  }
  int poolSize = conf.getInt(ConfVars.HIVE_LOAD_DYNAMIC_PARTITIONS_THREAD_COUNT.varname, 1);
  ExecutorService executor = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("load-dynamic-partitionsToAdd-%d").build());
  List<Future<Partition>> futures = Lists.newLinkedList();
  Map<Map<String, String>, Partition> result = Maps.newLinkedHashMap();
  try {
    futures = executor.invokeAll(tasks);
    LOG.info("Number of partitionsToAdd to be added is " + futures.size());
    for (Future<Partition> future : futures) {
      Partition partition = future.get();
      result.put(partition.getSpec(), partition);
    }
    // add new partitions in batch
    addPartitionsToMetastore(partitionDetailsMap.entrySet().stream().filter(entry -> !entry.getValue().hasOldPartition).map(entry -> entry.getValue().partition).collect(Collectors.toList()), resetStatistics, tbl, partitionDetailsMap.entrySet().stream().filter(entry -> !entry.getValue().hasOldPartition).map(entry -> entry.getValue().tableSnapshot).collect(Collectors.toList()));
    // For acid table, add the acid_write event with file list at the time of load itself. But
    // it should be done after partition is created.
    List<WriteNotificationLogRequest> requestList = new ArrayList<>();
    int maxBatchSize = conf.getIntVar(HIVE_WRITE_NOTIFICATION_MAX_BATCH_SIZE);
    for (Entry<Path, PartitionDetails> entry : partitionDetailsMap.entrySet()) {
      PartitionDetails partitionDetails = entry.getValue();
      if (isTxnTable && partitionDetails.newFiles != null) {
        addWriteNotificationLog(tbl, partitionDetails.fullSpec, partitionDetails.newFiles, writeId, requestList);
        if (requestList != null && requestList.size() >= maxBatchSize) {
          // If the first call returns that the HMS does not supports batching, avoid batching
          // for later requests.
          boolean batchSupported = addWriteNotificationLogInBatch(tbl, requestList);
          if (batchSupported) {
            requestList.clear();
          } else {
            requestList = null;
          }
        }
      }
    }
    // flush whatever is left in the last, partially filled batch
    if (requestList != null && requestList.size() > 0) {
      addWriteNotificationLogInBatch(tbl, requestList);
    }
    setStatsPropAndAlterPartitions(resetStatistics, tbl, partitionDetailsMap.entrySet().stream().filter(entry -> entry.getValue().hasOldPartition).map(entry -> entry.getValue().partition).collect(Collectors.toList()), tableSnapshot);
  } catch (InterruptedException | ExecutionException e) {
    if (e instanceof InterruptedException) {
      // Restore the interrupt flag so callers further up can still observe the interruption.
      Thread.currentThread().interrupt();
    }
    throw new HiveException("Exception when loading " + partitionDetailsMap.size() + " partitions" + " in table " + tbl.getTableName() + " with loadPath=" + loadPath, e);
  } catch (TException e) {
    LOG.error("Failed loadDynamicPartitions", e);
    throw new HiveException(e);
  } catch (Exception e) {
    StringBuilder logMsg = new StringBuilder();
    logMsg.append("Exception when loading partitionsToAdd with parameters ");
    logMsg.append("partPaths=");
    partitionDetailsMap.keySet().forEach(path -> logMsg.append(path + ", "));
    logMsg.append("table=" + tbl.getTableName() + ", ").append("partSpec=" + partSpec + ", ").append("loadFileType=" + tbd.getLoadFileType().toString() + ", ").append("listBucketingLevel=" + numLB + ", ").append("isAcid=" + isAcid + ", ").append("resetStatistics=" + resetStatistics);
    LOG.error(logMsg.toString(), e);
    throw e;
  } finally {
    LOG.debug("Cancelling " + futures.size() + " dynamic loading tasks");
    executor.shutdownNow();
  }
  // Test-only switch that forces a failure after the partitions have been loaded.
  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST) && HiveConf.getBoolVar(conf, ConfVars.HIVETESTMODEFAILLOADDYNAMICPARTITION)) {
    throw new HiveException(HiveConf.ConfVars.HIVETESTMODEFAILLOADDYNAMICPARTITION.name() + "=true");
  }
  try {
    if (isTxnTable) {
      List<String> partNames = result.values().stream().map(Partition::getName).collect(Collectors.toList());
      getMSC().addDynamicPartitions(parentSession.getTxnMgr().getCurrentTxnId(), writeId, tbl.getDbName(), tbl.getTableName(), partNames, AcidUtils.toDataOperationType(operation));
    }
    LOG.info("Loaded " + result.size() + " partitionsToAdd");
    perfLogger.perfLogEnd("MoveTask", PerfLogger.LOAD_DYNAMIC_PARTITIONS);
    return result;
  } catch (TException te) {
    LOG.error("Failed loadDynamicPartitions", te);
    throw new HiveException("Exception updating metastore for acid table " + tbd.getTable().getTableName() + " with partitions " + result.values(), te);
  }
}
Aggregations