Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.
Class CombineHiveInputFormat, method getCombineSplits.
/**
 * Create Hive splits based on CombineFileSplit.
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  InputSplit[] splits = null;
  if (combine == null) {
    splits = super.getSplits(job, numSplits);
    return splits;
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // Combine splits only from the same table and the same partition. Do not combine splits
  // across multiple tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
  for (Path path : paths) {
    if (lDrvStat != null && lDrvStat.isAborted()) {
      throw new IOException("Operation is Canceled. ");
    }
    PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      return super.getSplits(job, numSplits);
    }
    // Use HiveInputFormat if any of the paths is not splittable
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore
    }
    FileSystem inpFs = path.getFileSystem(job);
    // Don't combine if the input format is a SymlinkTextInputFormat
    if (inputFormat instanceof SymlinkTextInputFormat) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }
    Path filterPath = path;
    // Does a pool already exist for this path?
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // If the mapper can span partitions, make sure a split does not contain multiple
      // opList + inputFormatClassName + deserializerClassName combinations.
      // This is done using the map of CombinePathInputFormat to PathFilter.
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // The mapper cannot span partitions: group files by their parent directory so that a
      // split combines files within one partition but does not cross multiple partitions.
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
        // path is not a directory
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Processing directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // The mapper can span partitions:
    // combine into as few as one split, subject to the PathFilters set
    // using combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Processing files
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new InputSplit[result.size()]);
}
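
All of the snippets on this page share the same cooperative-cancellation idiom: a long-running loop polls LockedDriverState.isAborted() and bails out with an exception once the driver has been aborted. Below is a minimal, self-contained sketch of that idiom, using only the constructor, abort() and isAborted() calls that appear in the code on this page; the per-path work is a placeholder.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.Driver.LockedDriverState;

public class AbortCheckSketch {

  // Poll the driver state on every iteration so a cancelled query stops promptly
  // instead of finishing a potentially long listing or split computation.
  static void processPaths(LockedDriverState lDrvStat, List<Path> paths) throws IOException {
    for (Path path : paths) {
      if (lDrvStat != null && lDrvStat.isAborted()) {
        throw new IOException("Operation is Canceled.");
      }
      // ... per-path work (listing, split computation, ...) would go here ...
    }
  }
}

A caller that wants to cancel the work simply invokes abort() on the shared LockedDriverState from another thread, exactly as the test near the end of this page does with lDrvInp.abort().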
Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.
Class Utilities, method getInputPathsWithPool.
@VisibleForTesting
static List<Path> getInputPathsWithPool(JobConf job, MapWork work, Path hiveScratchDir, Context ctx, boolean skipDummy, List<Path> pathsToAdd, ExecutorService pool) throws IOException, ExecutionException, InterruptedException {
  LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
  List<Path> finalPathsToAdd = new ArrayList<>();
  try {
    Map<GetInputPathsCallable, Future<Path>> getPathsCallableToFuture = new LinkedHashMap<>();
    for (final Path path : pathsToAdd) {
      if (lDrvStat != null && lDrvStat.isAborted()) {
        throw new IOException("Operation is Canceled.");
      }
      GetInputPathsCallable callable = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy);
      getPathsCallableToFuture.put(callable, pool.submit(callable));
    }
    pool.shutdown();
    for (Map.Entry<GetInputPathsCallable, Future<Path>> future : getPathsCallableToFuture.entrySet()) {
      if (lDrvStat != null && lDrvStat.isAborted()) {
        throw new IOException("Operation is Canceled.");
      }
      Path newPath = future.getValue().get();
      updatePathForMapWork(newPath, work, future.getKey().path);
      finalPathsToAdd.add(newPath);
    }
  } finally {
    pool.shutdownNow();
  }
  return finalPathsToAdd;
}
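
A note on the design: keying the LinkedHashMap by the callable itself, rather than keeping a plain list of futures, lets the result loop preserve submission order and recover the original input through future.getKey().path. The following stripped-down sketch shows the same map-of-callable-to-future pattern with a hypothetical callable; it is not the real GetInputPathsCallable.

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OrderedFutureSketch {

  // Hypothetical callable that carries its input alongside the computation.
  static class UpperCaseCallable implements Callable<String> {
    final String input;
    UpperCaseCallable(String input) { this.input = input; }
    @Override
    public String call() { return input.toUpperCase(); }
  }

  public static void main(String[] args) throws InterruptedException, ExecutionException {
    ExecutorService pool = Executors.newFixedThreadPool(2);
    try {
      // LinkedHashMap keeps submission order; the key keeps the original input.
      Map<UpperCaseCallable, Future<String>> futures = new LinkedHashMap<>();
      for (String s : new String[] { "a", "b", "c" }) {
        UpperCaseCallable c = new UpperCaseCallable(s);
        futures.put(c, pool.submit(c));
      }
      for (Map.Entry<UpperCaseCallable, Future<String>> e : futures.entrySet()) {
        System.out.println(e.getKey().input + " -> " + e.getValue().get());
      }
    } finally {
      pool.shutdownNow();
    }
  }
}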
Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.
Class Utilities, method getInputPaths.
/**
 * Computes a list of all input paths needed to compute the given MapWork. All aliases
 * are considered and a merged list of input paths is returned. If any input path points
 * to an empty table or partition a dummy file in the scratch dir is instead created and
 * added to the list. This is needed to avoid special casing the operator pipeline for
 * these cases.
 *
 * @param job JobConf used to run the job
 * @param work MapWork encapsulating the info about the task
 * @param hiveScratchDir The tmp dir used to create dummy files if needed
 * @param ctx Context object
 * @return List of paths to process for the given MapWork
 * @throws Exception
 */
public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx, boolean skipDummy) throws Exception {
  Set<Path> pathsProcessed = new HashSet<Path>();
  List<Path> pathsToAdd = new LinkedList<Path>();
  LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
  // AliasToWork contains all the aliases
  Collection<String> aliasToWork = work.getAliasToWork().keySet();
  if (!skipDummy) {
    // ConcurrentModification otherwise if adding dummy.
    aliasToWork = new ArrayList<>(aliasToWork);
  }
  for (String alias : aliasToWork) {
    LOG.info("Processing alias {}", alias);
    // The alias may not have any path
    Collection<Map.Entry<Path, ArrayList<String>>> pathToAliases = work.getPathToAliases().entrySet();
    if (!skipDummy) {
      // ConcurrentModification otherwise if adding dummy.
      pathToAliases = new ArrayList<>(pathToAliases);
    }
    boolean isEmptyTable = true;
    boolean hasLogged = false;
    for (Map.Entry<Path, ArrayList<String>> e : pathToAliases) {
      if (lDrvStat != null && lDrvStat.isAborted()) {
        throw new IOException("Operation is Canceled.");
      }
      Path file = e.getKey();
      List<String> aliases = e.getValue();
      if (aliases.contains(alias)) {
        if (file != null) {
          isEmptyTable = false;
        } else {
          LOG.warn("Found a null path for alias {}", alias);
          continue;
        }
        // processed only once
        if (pathsProcessed.contains(file)) {
          continue;
        }
        StringInternUtils.internUriStringsInPath(file);
        pathsProcessed.add(file);
        LOG.debug("Adding input file {}", file);
        if (!hasLogged) {
          hasLogged = true;
          LOG.info("Adding {} inputs; the first input is {}", work.getPathToAliases().size(), file);
        }
        pathsToAdd.add(file);
      }
    }
    // For an empty table or partition, add a dummy file so the operator pipeline
    // does not need to be special-cased.
    if (isEmptyTable && !skipDummy) {
      pathsToAdd.add(createDummyFileForEmptyTable(job, work, hiveScratchDir, alias));
    }
  }
  List<Path> finalPathsToAdd = new LinkedList<>();
  int numExecutors = getMaxExecutorsForInputListing(job, pathsToAdd.size());
  if (numExecutors > 1) {
    ExecutorService pool = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Paths-%d").build());
    finalPathsToAdd.addAll(getInputPathsWithPool(job, work, hiveScratchDir, ctx, skipDummy, pathsToAdd, pool));
  } else {
    for (final Path path : pathsToAdd) {
      if (lDrvStat != null && lDrvStat.isAborted()) {
        throw new IOException("Operation is Canceled.");
      }
      Path newPath = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy).call();
      updatePathForMapWork(newPath, work, path);
      finalPathsToAdd.add(newPath);
    }
  }
  return finalPathsToAdd;
}
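
A hedged sketch of how a caller might drive getInputPaths when configuring a MapReduce job follows. The surrounding objects (jobConf, mapWork, ctx, scratch directory) are assumed to be prepared elsewhere by the caller, and the final call is the standard org.apache.hadoop.mapred.FileInputFormat API rather than anything specific to this method.

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsUsageSketch {

  // Assumes jobConf, mapWork, scratchDir and ctx have already been set up.
  static void configureInputs(JobConf jobConf, MapWork mapWork, Path scratchDir, Context ctx)
      throws Exception {
    // skipDummy = false: empty tables/partitions get a dummy file so the
    // operator pipeline does not need special casing.
    List<Path> inputPaths = Utilities.getInputPaths(jobConf, mapWork, scratchDir, ctx, false);
    FileInputFormat.setInputPaths(jobConf, inputPaths.toArray(new Path[0]));
  }
}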
Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.
Class TestDummyTxnManager, method testSingleReadTable.
/**
 * Verifies that the current database object is not locked when the table read is against a different database.
 * @throws Exception
 */
@Test
public void testSingleReadTable() throws Exception {
  // Setup
  SessionState.get().setCurrentDatabase("db1");
  List<HiveLock> expectedLocks = new ArrayList<HiveLock>();
  expectedLocks.add(new ZooKeeperHiveLock("default", new HiveLockObject(), HiveLockMode.SHARED));
  expectedLocks.add(new ZooKeeperHiveLock("default.table1", new HiveLockObject(), HiveLockMode.SHARED));
  LockedDriverState lDrvState = new LockedDriverState();
  LockedDriverState lDrvInp = new LockedDriverState();
  lDrvInp.abort();
  LockException lEx = new LockException(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg());
  when(mockLockManager.lock(anyListOf(HiveLockObj.class), eq(false), eq(lDrvState))).thenReturn(expectedLocks);
  when(mockLockManager.lock(anyListOf(HiveLockObj.class), eq(false), eq(lDrvInp))).thenThrow(lEx);
  doNothing().when(mockLockManager).setContext(any(HiveLockManagerCtx.class));
  doNothing().when(mockLockManager).close();
  ArgumentCaptor<List> lockObjsCaptor = ArgumentCaptor.forClass(List.class);
  when(mockQueryPlan.getInputs()).thenReturn(createReadEntities());
  when(mockQueryPlan.getOutputs()).thenReturn(new HashSet<WriteEntity>());
  // Execute
  txnMgr.acquireLocks(mockQueryPlan, ctx, "fred", lDrvState);
  // Verify
  Assert.assertEquals("db1", SessionState.get().getCurrentDatabase());
  List<HiveLock> resultLocks = ctx.getHiveLocks();
  Assert.assertEquals(expectedLocks.size(), resultLocks.size());
  Assert.assertEquals(expectedLocks.get(0).getHiveLockMode(), resultLocks.get(0).getHiveLockMode());
  Assert.assertEquals(expectedLocks.get(0).getHiveLockObject().getName(), resultLocks.get(0).getHiveLockObject().getName());
  Assert.assertEquals(expectedLocks.get(1).getHiveLockMode(), resultLocks.get(1).getHiveLockMode());
  Assert.assertEquals(expectedLocks.get(1).getHiveLockObject().getName(), resultLocks.get(1).getHiveLockObject().getName());
  verify(mockLockManager).lock(lockObjsCaptor.capture(), eq(false), eq(lDrvState));
  List<HiveLockObj> lockObjs = lockObjsCaptor.getValue();
  Assert.assertEquals(2, lockObjs.size());
  Assert.assertEquals("default", lockObjs.get(0).getName());
  Assert.assertEquals(HiveLockMode.SHARED, lockObjs.get(0).mode);
  Assert.assertEquals("default/table1", lockObjs.get(1).getName());
  Assert.assertEquals(HiveLockMode.SHARED, lockObjs.get(1).mode);
  // Execute: a second acquire against an aborted driver state must fail
  try {
    txnMgr.acquireLocks(mockQueryPlan, ctx, "fred", lDrvInp);
    Assert.fail();
  } catch (LockException le) {
    Assert.assertEquals(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg(), le.getMessage());
  }
}
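
The second acquireLocks call in the test expects a LockException once the LockedDriverState has been aborted. The sketch below illustrates the kind of abort check a lock manager could perform while retrying lock acquisition; the tryAcquire helper and the retry/back-off loop are hypothetical stand-ins, not the ZooKeeperHiveLockManager implementation.

import org.apache.hadoop.hive.ql.Driver.LockedDriverState;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.lockmgr.LockException;

public class LockRetrySketch {

  // Hypothetical retry loop: give up as soon as the driver state is aborted.
  static void acquireWithAbortCheck(LockedDriverState lDrvState, int maxRetries)
      throws LockException, InterruptedException {
    for (int attempt = 0; attempt < maxRetries; attempt++) {
      if (lDrvState != null && lDrvState.isAborted()) {
        throw new LockException(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg());
      }
      if (tryAcquire()) {   // hypothetical helper standing in for the real lock call
        return;
      }
      Thread.sleep(100L);   // back off before retrying
    }
    throw new LockException("Could not acquire lock after " + maxRetries + " attempts");
  }

  private static boolean tryAcquire() {
    return false;           // placeholder; a real implementation would contact the lock service
  }
}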