Use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.
The class PartitionPruner, method getAllPartitions.
private static Set<Partition> getAllPartitions(Table tab) throws HiveException {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
Set<Partition> result = Hive.get().getAllPartitionsOf(tab);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
return result;
}
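Every snippet on this page follows the same pattern: fetch the session's PerfLogger, call PerfLogBegin before the measured work, and call PerfLogEnd when it finishes. Below is a minimal sketch of that idiom; the phase name "MY_PHASE", the MyOperator class, and doExpensiveWork() are placeholders rather than Hive APIs, and the try/finally mirrors how the longer methods below guarantee the end call fires even on failure.
private static final String CLASS_NAME = MyOperator.class.getName();

private void timedPhase() throws HiveException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  // open the named phase before the work being measured
  perfLogger.PerfLogBegin(CLASS_NAME, "MY_PHASE");
  try {
    doExpensiveWork();  // placeholder for the operation being timed
  } finally {
    // always close the phase so the timing report stays balanced
    perfLogger.PerfLogEnd(CLASS_NAME, "MY_PHASE");
  }
}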
Use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.
The class Driver, method acquireLocksAndOpenTxn.
/**
* Acquire read and write locks needed by the statement. The list of objects to be locked is
* obtained from the inputs and outputs populated by the compiler. The lock acquisition scheme is
* pretty simple: if all the locks cannot be obtained, error out. Deadlock is avoided by making
* sure that the locks are lexicographically sorted (a minimal illustration of this sorted
* acquisition order follows the method).
*
* This method also records the list of valid transactions. This must be done after any
* transactions have been opened and locks acquired.
* @param startTxnImplicitly in autoCommit=false mode, the first DML statement starts a transaction
**/
private int acquireLocksAndOpenTxn(boolean startTxnImplicitly) {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ACQUIRE_READ_WRITE_LOCKS);
SessionState ss = SessionState.get();
HiveTxnManager txnMgr = ss.getTxnMgr();
if (startTxnImplicitly) {
assert !txnMgr.getAutoCommit();
}
try {
// Don't use the userName member, as it may or may not have been set. Get the value from
// conf, which calls into getUGI to figure out who the process is running as.
String userFromUGI;
try {
userFromUGI = conf.getUser();
} catch (IOException e) {
errorMessage = "FAILED: Error in determining user while acquiring locks: " + e.getMessage();
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
return 10;
}
boolean initiatingTransaction = false;
boolean readOnlyQueryInAutoCommit = false;
if ((txnMgr.getAutoCommit() && haveAcidWrite()) || plan.getOperation() == HiveOperation.START_TRANSACTION || (!txnMgr.getAutoCommit() && startTxnImplicitly)) {
if (txnMgr.isTxnOpen()) {
throw new RuntimeException("Already have an open transaction txnid:" + txnMgr.getCurrentTxnId());
}
// We are writing to tables in an ACID compliant way, so we need to open a transaction
txnMgr.openTxn(ctx, userFromUGI);
initiatingTransaction = true;
} else {
readOnlyQueryInAutoCommit = txnMgr.getAutoCommit() && plan.getOperation() == HiveOperation.QUERY && !haveAcidWrite();
}
// Set the transaction id in all of the acid file sinks
if (haveAcidWrite()) {
for (FileSinkDesc desc : acidSinks) {
desc.setTransactionId(txnMgr.getCurrentTxnId());
//it's possible to have > 1 FileSink writing to the same table/partition
//e.g. Merge stmt, multi-insert stmt when mixing DP and SP writes
desc.setStatementId(txnMgr.getWriteIdAndIncrement());
}
}
/*Note: we have to record the snapshot after lock acquisition to prevent the lost update problem.
Consider 2 concurrent "update table T set x = x + 1" statements: the 1st will get the locks and the
2nd will block until the 1st one commits, and only then lock in the snapshot, i.e. it will
see the changes made by the 1st one. This takes care of the autoCommit=true case.
For multi-stmt txns this is not sufficient and will be managed via WriteSet tracking
in the lock manager.*/
txnMgr.acquireLocks(plan, ctx, userFromUGI, lDrvState);
if (initiatingTransaction || (readOnlyQueryInAutoCommit && acidInQuery)) {
//For multi-stmt txns we should record the snapshot when txn starts but
// don't update it after that until txn completes. Thus the check for {@code initiatingTransaction}
//For autoCommit=true, Read-only statements, txn is implicit, i.e. lock in the snapshot
//for each statement.
recordValidTxns();
}
return 0;
} catch (Exception e) {
errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
return 10;
} finally {
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ACQUIRE_READ_WRITE_LOCKS);
}
}
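The javadoc above attributes deadlock avoidance to acquiring locks in lexicographic order. The sketch below is illustrative only, not Hive's lock manager code: if every statement requests its lock entries in the same sorted order, two statements can never each hold one lock while waiting on the other's.
// Illustrative sketch of ordered lock acquisition; acquire() is a placeholder.
List<String> lockNames = new ArrayList<>(Arrays.asList("db/tblB", "db/tblA"));
Collections.sort(lockNames);   // every caller sees the same order: db/tblA, then db/tblB
for (String name : lockNames) {
  acquire(name);               // placeholder for the real lock request
}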
Use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.
The class Hive, method loadPartition.
/**
* Load a directory into a Hive table partition.
* - Alters existing content of the partition with the contents of loadPath.
* - If the partition does not exist, one is created.
* - Files in loadPath are moved into Hive, but the directory itself is not removed.
*
* @param loadPath
* Directory containing files to load into Table
* @param tbl
* table to be loaded.
* @param partSpec
* defines which partition needs to be loaded
* @param loadFileType
* if REPLACE_ALL - replace files in the table,
* otherwise add files to table (KEEP_EXISTING, OVERWRITE_EXISTING)
* @param inheritTableSpecs if true, on [re]creating the partition, take the
* location/inputformat/outputformat/serde details from table spec
* @param isSrcLocal
* If the source directory is LOCAL
* @param isAcidIUDoperation
* true if this is an ACID Insert/Update/Delete operation
* @param hasFollowingStatsTask
* true if there is a following task which updates the stats, so this method need not update them.
* @param writeId write ID allocated for the current load operation
* @param stmtId statement ID of the current load statement
* @return Partition object being loaded with data
*/
public Partition loadPartition(Path loadPath, Table tbl, Map<String, String> partSpec, LoadFileType loadFileType, boolean inheritTableSpecs, boolean isSkewedStoreAsSubdir, boolean isSrcLocal, boolean isAcidIUDoperation, boolean hasFollowingStatsTask, Long writeId, int stmtId) throws HiveException {
Path tblDataLocationPath = tbl.getDataLocation();
boolean isMmTableWrite = AcidUtils.isInsertOnlyTable(tbl.getParameters());
assert tbl.getPath() != null : "null==getPath() for " + tbl.getTableName();
boolean isFullAcidTable = AcidUtils.isFullAcidTable(tbl);
try {
// Get the partition object if it already exists
Partition oldPart = getPartition(tbl, partSpec, false);
/**
* Move files before creating the partition since downstream processes
* check for existence of the partition in metadata before accessing the data.
* If the partition is created before data is moved, downstream waiting
* processes might move forward with partial data.
*/
Path oldPartPath = (oldPart != null) ? oldPart.getDataLocation() : null;
Path newPartPath = null;
if (inheritTableSpecs) {
Path partPath = new Path(tbl.getDataLocation(), Warehouse.makePartPath(partSpec));
newPartPath = new Path(tblDataLocationPath.toUri().getScheme(), tblDataLocationPath.toUri().getAuthority(), partPath.toUri().getPath());
if (oldPart != null) {
/*
* If we are moving the partition across filesystem boundaries
* inherit from the table properties. Otherwise (same filesystem) use the
* original partition location.
*
* See: HIVE-1707 and HIVE-2117 for background
*/
FileSystem oldPartPathFS = oldPartPath.getFileSystem(getConf());
FileSystem loadPathFS = loadPath.getFileSystem(getConf());
if (FileUtils.equalsFileSystem(oldPartPathFS, loadPathFS)) {
newPartPath = oldPartPath;
}
}
} else {
newPartPath = oldPartPath;
}
List<Path> newFiles = null;
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin("MoveTask", "FileMoves");
// If DML events are enabled, the table is not temporary, and the partition already exists, capture the list of files added; for a new partition (or dynamic partition inserts), the add partition event will capture the list of files added.
if (conf.getBoolVar(ConfVars.FIRE_EVENTS_FOR_DML) && !tbl.isTemporary() && (null != oldPart)) {
newFiles = Collections.synchronizedList(new ArrayList<Path>());
}
// Note: this assumes both paths are qualified; which they are, currently.
if (isMmTableWrite && loadPath.equals(newPartPath)) {
// MM insert query, move itself is a no-op.
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("not moving " + loadPath + " to " + newPartPath + " (MM)");
}
assert !isAcidIUDoperation;
if (areEventsForDmlNeeded(tbl, oldPart)) {
newFiles = listFilesCreatedByQuery(loadPath, writeId, stmtId);
}
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("maybe deleting stuff from " + oldPartPath + " (new " + newPartPath + ") for replace");
}
} else {
// Either a non-MM query, or a load into MM table from an external source.
PathFilter filter = FileUtils.HIDDEN_FILES_PATH_FILTER;
Path destPath = newPartPath;
if (isMmTableWrite) {
// We will load into MM directory, and delete from the parent if needed.
// TODO: this looks invalid after ACID integration. What about base dirs?
destPath = new Path(destPath, AcidUtils.deltaSubdir(writeId, writeId, stmtId));
// TODO: loadFileType for MM table will no longer be REPLACE_ALL
filter = (loadFileType == LoadFileType.REPLACE_ALL) ? new JavaUtils.IdPathFilter(writeId, stmtId, false, true) : filter;
} else if (!isAcidIUDoperation && isFullAcidTable) {
destPath = fixFullAcidPathForLoadData(loadFileType, destPath, writeId, stmtId, tbl);
}
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("moving " + loadPath + " to " + destPath);
}
// todo: why is "&& !isAcidIUDoperation" needed here?
if (!isFullAcidTable && ((loadFileType == LoadFileType.REPLACE_ALL) || (oldPart == null && !isAcidIUDoperation))) {
// for fullAcid tables we don't delete files for commands with OVERWRITE - we create a new
// base_x. (there is Insert Overwrite and Load Data Overwrite)
boolean isAutoPurge = "true".equalsIgnoreCase(tbl.getProperty("auto.purge"));
// TODO: this should never run for MM tables anymore. Remove the flag, and maybe the filter?
replaceFiles(tbl.getPath(), loadPath, destPath, oldPartPath, getConf(), isSrcLocal, isAutoPurge, newFiles, filter, isMmTableWrite, !tbl.isTemporary());
} else {
FileSystem fs = tbl.getDataLocation().getFileSystem(conf);
copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation, (loadFileType == LoadFileType.OVERWRITE_EXISTING), newFiles, tbl.getNumBuckets() > 0, isFullAcidTable);
}
}
perfLogger.PerfLogEnd("MoveTask", "FileMoves");
Partition newTPart = oldPart != null ? oldPart : new Partition(tbl, partSpec, newPartPath);
alterPartitionSpecInMemory(tbl, partSpec, newTPart.getTPartition(), inheritTableSpecs, newPartPath.toString());
validatePartition(newTPart);
// When inserting into a new partition, the add partition event takes care of insert event
if ((null != oldPart) && (null != newFiles)) {
fireInsertEvent(tbl, partSpec, (loadFileType == LoadFileType.REPLACE_ALL), newFiles);
} else {
LOG.debug("No new files were created, and is not a replace, or we're inserting into a " + "partition that does not exist yet. Skipping generating INSERT event.");
}
// column stats will be inaccurate after the load, so clear them unless a following stats task will update them
if (!hasFollowingStatsTask) {
StatsSetupConst.clearColumnStatsState(newTPart.getParameters());
}
// recreate the partition if it existed before
if (isSkewedStoreAsSubdir) {
org.apache.hadoop.hive.metastore.api.Partition newCreatedTpart = newTPart.getTPartition();
SkewedInfo skewedInfo = newCreatedTpart.getSd().getSkewedInfo();
/* Construct list bucketing location mappings from sub-directory name. */
Map<List<String>, String> skewedColValueLocationMaps = constructListBucketingLocationMap(newPartPath, skewedInfo);
/* Add list bucketing location mappings. */
skewedInfo.setSkewedColValueLocationMaps(skewedColValueLocationMaps);
newCreatedTpart.getSd().setSkewedInfo(skewedInfo);
}
if (!this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
StatsSetupConst.setBasicStatsState(newTPart.getParameters(), StatsSetupConst.FALSE);
}
if (oldPart == null) {
newTPart.getTPartition().setParameters(new HashMap<String, String>());
if (this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
StatsSetupConst.setStatsStateForCreateTable(newTPart.getParameters(), MetaStoreUtils.getColumnNames(tbl.getCols()), StatsSetupConst.TRUE);
}
// Note: we are creating a brand new partition, so this is going to be valid for ACID.
List<FileStatus> filesForStats = null;
if (isFullAcidTable || isMmTableWrite) {
filesForStats = AcidUtils.getAcidFilesForStats(newTPart.getTable(), newPartPath, conf, null);
} else {
filesForStats = HiveStatsUtils.getFileStatusRecurse(newPartPath, -1, newPartPath.getFileSystem(conf));
}
if (filesForStats != null) {
MetaStoreUtils.populateQuickStats(filesForStats, newTPart.getParameters());
} else {
// The ACID state is probably absent. Warning is logged in the get method.
MetaStoreUtils.clearQuickStats(newTPart.getParameters());
}
try {
LOG.debug("Adding new partition " + newTPart.getSpec());
getSynchronizedMSC().add_partition(newTPart.getTPartition());
} catch (AlreadyExistsException aee) {
// When multiple users concurrently issue insert statements on the same partition, a query may
// not see the partition at the time it is issued, but then find the partition already there
// when it tries to add it to the metastore, and thus get an AlreadyExistsException, because
// some earlier query just created it (race condition; a condensed sketch of this create-then-alter fallback follows the method).
// For example, imagine such a table is created:
// create table T (name char(50)) partitioned by (ds string);
// and the following two queries are launched at the same time, from different sessions:
// insert into table T partition (ds) values ('Bob', 'today'); -- creates the partition 'today'
// insert into table T partition (ds) values ('Joe', 'today'); -- will fail with AlreadyExistsException
// In that case, we want to retry with alterPartition.
LOG.debug("Caught AlreadyExistsException, trying to alter partition instead");
setStatsPropAndAlterPartition(hasFollowingStatsTask, tbl, newTPart);
} catch (Exception e) {
try {
final FileSystem newPathFileSystem = newPartPath.getFileSystem(this.getConf());
boolean isAutoPurge = "true".equalsIgnoreCase(tbl.getProperty("auto.purge"));
final FileStatus status = newPathFileSystem.getFileStatus(newPartPath);
Hive.trashFiles(newPathFileSystem, new FileStatus[] { status }, this.getConf(), isAutoPurge);
} catch (IOException io) {
LOG.error("Could not delete partition directory contents after failed partition creation: ", io);
}
throw e;
}
} else {
setStatsPropAndAlterPartition(hasFollowingStatsTask, tbl, newTPart);
}
return newTPart;
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw new HiveException(e);
} catch (MetaException e) {
LOG.error(StringUtils.stringifyException(e));
throw new HiveException(e);
} catch (InvalidOperationException e) {
LOG.error(StringUtils.stringifyException(e));
throw new HiveException(e);
} catch (TException e) {
LOG.error(StringUtils.stringifyException(e));
throw new HiveException(e);
}
}
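The AlreadyExistsException handling above deals with the race where another session creates the same partition first. A condensed sketch of that create-then-alter fallback is below; metastoreClient and alterExistingPartition are placeholder names, and only the exception handling mirrors the method above.
try {
  // optimistically try to create the partition we just loaded
  metastoreClient.add_partition(newTPart.getTPartition());
} catch (AlreadyExistsException e) {
  // another session created it between our existence check and now;
  // fall back to altering the existing partition instead of failing the load
  alterExistingPartition(tbl, newTPart);
}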
Use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.
The class CombineHiveInputFormat, method getSplits.
/**
* Create Hive splits based on CombineFileSplit.
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
init(job);
ArrayList<InputSplit> result = new ArrayList<InputSplit>();
Path[] paths = getInputPaths(job);
List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM, (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
// paths may be empty, in which case numThreads is 0 and Executors.newFixedThreadPool would fail.
if (numThreads > 0) {
try {
Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
for (int i = 0; i < paths.length; i++) {
if (nonCombinablePathIndices.contains(i)) {
nonCombinablePaths.add(paths[i]);
} else {
combinablePaths.add(paths[i]);
}
}
} catch (Exception e) {
LOG.error("Error checking non-combinable path", e);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
throw new IOException(e);
}
}
// Store the previous value for the path specification
String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
if (LOG.isDebugEnabled()) {
LOG.debug("The received input paths are: [" + oldPaths + "] against the property " + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
}
// Process the normal splits
if (nonCombinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
InputSplit[] splits = super.getSplits(job, numSplits);
for (InputSplit split : splits) {
result.add(split);
}
}
// Process the combine splits
if (combinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
for (InputSplit split : splits) {
result.add(split);
}
}
// Restore the previous path specification in case some application depends on the original value being set (a sketch of this save-and-restore pattern follows the method).
if (oldPaths != null) {
job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
}
// clear work from ThreadLocal after splits generated in case of thread is reused in pool.
Utilities.clearWorkMapForConf(job);
if (result.isEmpty() && paths.length > 0 && job.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
// If there are no inputs, the execution engine skips the operator tree.
// To prevent that, an opaque ZeroRows input is added here when needed.
result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(paths[0]), ZeroRowsInputFormat.class.getName()));
}
LOG.info("Number of all splits " + result.size());
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return result.toArray(new InputSplit[result.size()]);
}
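getSplits temporarily repoints the job's input-path property for each pass and then puts the original value back. A minimal sketch of that save-and-restore pattern is below; somePaths is a placeholder for the subset of paths being processed, and the try/finally is a hedged variant that restores the value even if split computation throws, whereas the method above restores it after both passes complete.
String key = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR;
String oldPaths = job.get(key);                  // remember the caller's path specification
try {
  FileInputFormat.setInputPaths(job, somePaths); // temporarily point the job at a subset of paths
  // ... compute splits for that subset ...
} finally {
  if (oldPaths != null) {
    job.set(key, oldPaths);                      // put the original specification back
  }
}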
Use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.
The class Driver, method acquireLocks.
/**
* Acquire read and write locks needed by the statement. The list of objects to be locked is
* obtained from the inputs and outputs populated by the compiler. The locking strategy depends on
* the HiveTxnManager and HiveLockManager configured.
*
* This method also records the list of valid transactions. This must be done after any
* transactions have been opened.
* @throws CommandProcessorResponse
*/
private void acquireLocks() throws CommandProcessorResponse {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ACQUIRE_READ_WRITE_LOCKS);
if (!queryTxnMgr.isTxnOpen() && queryTxnMgr.supportsAcid()) {
/*Non-ACID txn managers don't support txns but forward lock requests to lock managers;
the ACID txn manager requires all locks to be associated with a txn, so if we
end up here w/o an open txn it's because we are processing something like "use <database>",
which by definition needs no locks*/
return;
}
try {
String userFromUGI = getUserFromUGI();
if (userFromUGI == null) {
throw createProcessorResponse(10);
}
// Set the table write id in all of the acid file sinks
if (haveAcidWrite()) {
List<FileSinkDesc> acidSinks = new ArrayList<>(plan.getAcidSinks());
// Sorting makes tests easier to write since file names and ROW__IDs depend on statementId,
// so this makes the (file name -> data) mapping stable (a toy illustration follows the method).
acidSinks.sort((FileSinkDesc fsd1, FileSinkDesc fsd2) -> fsd1.getDirName().compareTo(fsd2.getDirName()));
for (FileSinkDesc desc : acidSinks) {
TableDesc tableInfo = desc.getTableInfo();
long writeId = queryTxnMgr.getTableWriteId(Utilities.getDatabaseName(tableInfo.getTableName()), Utilities.getTableName(tableInfo.getTableName()));
desc.setTableWriteId(writeId);
// it's possible to have > 1 FileSink writing to the same table/partition
// e.g. Merge stmt, multi-insert stmt when mixing DP and SP writes
desc.setStatementId(queryTxnMgr.getStmtIdAndIncrement());
}
}
/*It's imperative that {@code acquireLocks()} is called for all commands so that
HiveTxnManager can transition its state machine correctly*/
queryTxnMgr.acquireLocks(plan, ctx, userFromUGI, lDrvState);
if (queryTxnMgr.recordSnapshot(plan)) {
recordValidTxns(queryTxnMgr);
}
if (plan.hasAcidResourcesInQuery()) {
recordValidWriteIds(queryTxnMgr);
}
} catch (Exception e) {
errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
throw createProcessorResponse(10);
} finally {
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ACQUIRE_READ_WRITE_LOCKS);
}
}
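The sort above exists so that the statement ids handed out by getStmtIdAndIncrement() map to the same sink directories on every run. The toy illustration below shows the effect with plain strings and a counter; the directory names are made up.
// Stable order in, stable ids out: each directory gets the same id on every run.
List<String> sinkDirs = new ArrayList<>(Arrays.asList("/warehouse/t/b", "/warehouse/t/a"));
sinkDirs.sort(String::compareTo);        // deterministic order: .../a then .../b
Map<String, Integer> stmtIds = new LinkedHashMap<>();
int nextStmtId = 0;
for (String dir : sinkDirs) {
  stmtIds.put(dir, nextStmtId++);        // .../a -> 0, .../b -> 1, every run
}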