Example 1 with LockedDriverState

Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.

The class CombineHiveInputFormat, method getCombineSplits:

/**
 * Create Hive splits based on CombineFileSplit.
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
    InputSplit[] splits = null;
    if (combine == null) {
        splits = super.getSplits(job, numSplits);
        return splits;
    }
    if (combine.getInputPathsShim(job).length == 0) {
        throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // Combine splits only from the same table and the same partition. Do not combine
    // splits across multiple tables or partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    for (Path path : paths) {
        if (lDrvStat != null && lDrvStat.isAborted()) {
            throw new IOException("Operation is Canceled. ");
        }
        PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
        TableDesc tableDesc = part.getTableDesc();
        if ((tableDesc != null) && tableDesc.isNonNative()) {
            return super.getSplits(job, numSplits);
        }
        // Use HiveInputFormat if any of the paths is not splittable
        Class inputFormatClass = part.getInputFileFormatClass();
        String inputFormatClassName = inputFormatClass.getName();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        String deserializerClassName = null;
        try {
            deserializerClassName = part.getDeserializer(job).getClass().getName();
        } catch (Exception e) {
        // ignore
        }
        FileSystem inpFs = path.getFileSystem(job);
        // don't combine if inputformat is a SymlinkTextInputFormat
        if (inputFormat instanceof SymlinkTextInputFormat) {
            splits = super.getSplits(job, numSplits);
            return splits;
        }
        Path filterPath = path;
        // Does a pool already exist for this path?
        CombineFilter f = null;
        List<Operator<? extends OperatorDesc>> opList = null;
        if (!mrwork.isMapperCannotSpanPartns()) {
            // If the mapper can span partitions, make sure a single split does not mix
            // multiple (opList, inputFormatClassName, deserializerClassName) combinations.
            // This is enforced via the map from CombinePathInputFormat to PathFilter.
            opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
            CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
            f = poolMap.get(combinePathInputFormat);
            if (f == null) {
                f = new CombineFilter(filterPath);
                LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
                combine.createPool(job, f);
                poolMap.put(combinePathInputFormat, f);
            } else {
                LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
                f.addPath(filterPath);
            }
        } else {
            // The mapper cannot span partitions: pool paths so that a split never
            // crosses partition boundaries, as the user has requested.
            if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
                // path is a file, not a directory
                filterPath = path.getParent();
                inpFiles.add(path);
                poolSet.add(filterPath);
            } else {
                inpDirs.add(path);
            }
        }
    }
    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
        // mapper can span partitions
        // combine into as few as one split, subject to the PathFilters set
        // using combine.createPool.
        iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
        for (Path path : inpDirs) {
            processPaths(job, combine, iss, path);
        }
        if (inpFiles.size() > 0) {
            // Processing files
            for (Path filterPath : poolSet) {
                combine.createPool(job, new CombineFilter(filterPath));
            }
            processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
        }
    }
    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
        iss = sampleSplits(iss);
    }
    for (CombineFileSplit is : iss) {
        CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
        result.add(csplit);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new InputSplit[result.size()]);
}
Also used: Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapred.lib.CombineFileSplit) CombineFileInputFormatShim(org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) LockedDriverState(org.apache.hadoop.hive.ql.Driver.LockedDriverState) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
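The core pattern in this example is cooperative cancellation: fetch the thread-local LockedDriverState once, then poll isAborted() on every iteration of a long-running loop. A minimal sketch of just that idiom, with imports as in the example above; the per-path work is a hypothetical placeholder, not Hive code:

private static void processAllOrAbort(Path[] paths) throws IOException {
    // Thread-local state the Driver registers for the currently running query.
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    for (Path path : paths) {
        // Stop promptly once the driver marks the query as aborted.
        if (lDrvStat != null && lDrvStat.isAborted()) {
            throw new IOException("Operation is Canceled.");
        }
        // ... per-path work (e.g. split computation) would go here ...
    }
}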

Example 2 with LockedDriverState

Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.

The class Utilities, method getInputPathsWithPool:

@VisibleForTesting
static List<Path> getInputPathsWithPool(JobConf job, MapWork work, Path hiveScratchDir, Context ctx, boolean skipDummy, List<Path> pathsToAdd, ExecutorService pool) throws IOException, ExecutionException, InterruptedException {
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    List<Path> finalPathsToAdd = new ArrayList<>();
    try {
        Map<GetInputPathsCallable, Future<Path>> getPathsCallableToFuture = new LinkedHashMap<>();
        for (final Path path : pathsToAdd) {
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            GetInputPathsCallable callable = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy);
            getPathsCallableToFuture.put(callable, pool.submit(callable));
        }
        pool.shutdown();
        for (Map.Entry<GetInputPathsCallable, Future<Path>> future : getPathsCallableToFuture.entrySet()) {
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            Path newPath = future.getValue().get();
            updatePathForMapWork(newPath, work, future.getKey().path);
            finalPathsToAdd.add(newPath);
        }
    } finally {
        pool.shutdownNow();
    }
    return finalPathsToAdd;
}
Also used: Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) LockedDriverState(org.apache.hadoop.hive.ql.Driver.LockedDriverState) Future(java.util.concurrent.Future) IOException(java.io.IOException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
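The pooled variant checks the abort flag twice: while submitting callables and again while draining futures, so a cancellation is honored even after every task is queued; the finally block's shutdownNow() then interrupts any in-flight workers. A condensed, hypothetical restatement of that shape with generic tasks instead of GetInputPathsCallable (imports as above, plus java.util.concurrent.Callable and java.util.List):

static <T> List<T> runAllOrAbort(List<Callable<T>> tasks, ExecutorService pool) throws Exception {
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    List<T> results = new ArrayList<>();
    try {
        List<Future<T>> futures = new ArrayList<>();
        for (Callable<T> task : tasks) {
            // Abort check while submitting.
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            futures.add(pool.submit(task));
        }
        // No new tasks accepted; already-queued tasks still run.
        pool.shutdown();
        for (Future<T> future : futures) {
            // Abort check again while collecting results.
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            results.add(future.get()); // blocks until this task finishes
        }
    } finally {
        pool.shutdownNow(); // interrupt in-flight workers on any exit path
    }
    return results;
}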

Example 3 with LockedDriverState

Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.

The class Utilities, method getInputPaths:

/**
 * Computes a list of all input paths needed to compute the given MapWork. All aliases
 * are considered and a merged list of input paths is returned. If any input path points
 * to an empty table or partition, a dummy file in the scratch dir is created instead and
 * added to the list. This is needed to avoid special-casing the operator pipeline for
 * these cases.
 *
 * @param job JobConf used to run the job
 * @param work MapWork encapsulating the info about the task
 * @param hiveScratchDir The tmp dir used to create dummy files if needed
 * @param ctx Context object
 * @return List of paths to process for the given MapWork
 * @throws Exception
 */
public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx, boolean skipDummy) throws Exception {
    Set<Path> pathsProcessed = new HashSet<Path>();
    List<Path> pathsToAdd = new LinkedList<Path>();
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    // AliasToWork contains all the aliases
    Collection<String> aliasToWork = work.getAliasToWork().keySet();
    if (!skipDummy) {
        // Copy to avoid ConcurrentModificationException when dummy entries are added below.
        aliasToWork = new ArrayList<>(aliasToWork);
    }
    for (String alias : aliasToWork) {
        LOG.info("Processing alias {}", alias);
        // The alias may not have any path
        Collection<Map.Entry<Path, ArrayList<String>>> pathToAliases = work.getPathToAliases().entrySet();
        if (!skipDummy) {
            // Copy to avoid ConcurrentModificationException when dummy entries are added below.
            pathToAliases = new ArrayList<>(pathToAliases);
        }
        boolean isEmptyTable = true;
        boolean hasLogged = false;
        for (Map.Entry<Path, ArrayList<String>> e : pathToAliases) {
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            Path file = e.getKey();
            List<String> aliases = e.getValue();
            if (aliases.contains(alias)) {
                if (file != null) {
                    isEmptyTable = false;
                } else {
                    LOG.warn("Found a null path for alias {}", alias);
                    continue;
                }
                // processed only once
                if (pathsProcessed.contains(file)) {
                    continue;
                }
                StringInternUtils.internUriStringsInPath(file);
                pathsProcessed.add(file);
                LOG.debug("Adding input file {}", file);
                if (!hasLogged) {
                    hasLogged = true;
                    LOG.info("Adding {} inputs; the first input is {}", work.getPathToAliases().size(), file);
                }
                pathsToAdd.add(file);
            }
        }
        // Add a dummy file for an empty table/partition so downstream operators
        // still produce the expected rows (e.g. count(*) over an empty table returns 0).
        if (isEmptyTable && !skipDummy) {
            pathsToAdd.add(createDummyFileForEmptyTable(job, work, hiveScratchDir, alias));
        }
    }
    List<Path> finalPathsToAdd = new LinkedList<>();
    int numExecutors = getMaxExecutorsForInputListing(job, pathsToAdd.size());
    if (numExecutors > 1) {
        ExecutorService pool = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Paths-%d").build());
        finalPathsToAdd.addAll(getInputPathsWithPool(job, work, hiveScratchDir, ctx, skipDummy, pathsToAdd, pool));
    } else {
        for (final Path path : pathsToAdd) {
            if (lDrvStat != null && lDrvStat.isAborted()) {
                throw new IOException("Operation is Canceled.");
            }
            Path newPath = new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy).call();
            updatePathForMapWork(newPath, work, path);
            finalPathsToAdd.add(newPath);
        }
    }
    return finalPathsToAdd;
}
Also used: Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) LinkedList(java.util.LinkedList) ExecutorService(java.util.concurrent.ExecutorService) LockedDriverState(org.apache.hadoop.hive.ql.Driver.LockedDriverState) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet)
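A detail worth noting in this example is the pool construction: the worker threads are daemons with a recognizable name format, so a stuck input listing never blocks JVM shutdown and is easy to find in a thread dump. A minimal sketch of just the pool setup, assuming the thread count has already been computed by getMaxExecutorsForInputListing (imports as above, plus java.util.concurrent.Executors):

static ExecutorService newInputListingPool(int numExecutors) {
    return Executors.newFixedThreadPool(numExecutors,
        new ThreadFactoryBuilder()
            .setDaemon(true)                      // daemon threads never block JVM exit
            .setNameFormat("Get-Input-Paths-%d")  // numbered names for thread dumps
            .build());
}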

Example 4 with LockedDriverState

Use of org.apache.hadoop.hive.ql.Driver.LockedDriverState in project hive by apache.

The class TestDummyTxnManager, method testSingleReadTable:

/**
 * Verifies that the current database object is not locked when the table being read belongs to a different database.
 * @throws Exception
 */
@Test
public void testSingleReadTable() throws Exception {
    // Setup
    SessionState.get().setCurrentDatabase("db1");
    List<HiveLock> expectedLocks = new ArrayList<HiveLock>();
    expectedLocks.add(new ZooKeeperHiveLock("default", new HiveLockObject(), HiveLockMode.SHARED));
    expectedLocks.add(new ZooKeeperHiveLock("default.table1", new HiveLockObject(), HiveLockMode.SHARED));
    LockedDriverState lDrvState = new LockedDriverState();
    LockedDriverState lDrvInp = new LockedDriverState();
    lDrvInp.abort();
    LockException lEx = new LockException(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg());
    when(mockLockManager.lock(anyListOf(HiveLockObj.class), eq(false), eq(lDrvState))).thenReturn(expectedLocks);
    when(mockLockManager.lock(anyListOf(HiveLockObj.class), eq(false), eq(lDrvInp))).thenThrow(lEx);
    doNothing().when(mockLockManager).setContext(any(HiveLockManagerCtx.class));
    doNothing().when(mockLockManager).close();
    ArgumentCaptor<List> lockObjsCaptor = ArgumentCaptor.forClass(List.class);
    when(mockQueryPlan.getInputs()).thenReturn(createReadEntities());
    when(mockQueryPlan.getOutputs()).thenReturn(new HashSet<WriteEntity>());
    // Execute
    txnMgr.acquireLocks(mockQueryPlan, ctx, "fred", lDrvState);
    // Verify
    Assert.assertEquals("db1", SessionState.get().getCurrentDatabase());
    List<HiveLock> resultLocks = ctx.getHiveLocks();
    Assert.assertEquals(expectedLocks.size(), resultLocks.size());
    Assert.assertEquals(expectedLocks.get(0).getHiveLockMode(), resultLocks.get(0).getHiveLockMode());
    Assert.assertEquals(expectedLocks.get(0).getHiveLockObject().getName(), resultLocks.get(0).getHiveLockObject().getName());
    Assert.assertEquals(expectedLocks.get(1).getHiveLockMode(), resultLocks.get(1).getHiveLockMode());
    Assert.assertEquals(expectedLocks.get(1).getHiveLockObject().getName(), resultLocks.get(1).getHiveLockObject().getName());
    verify(mockLockManager).lock(lockObjsCaptor.capture(), eq(false), eq(lDrvState));
    List<HiveLockObj> lockObjs = lockObjsCaptor.getValue();
    Assert.assertEquals(2, lockObjs.size());
    Assert.assertEquals("default", lockObjs.get(0).getName());
    Assert.assertEquals(HiveLockMode.SHARED, lockObjs.get(0).mode);
    Assert.assertEquals("default/table1", lockObjs.get(1).getName());
    Assert.assertEquals(HiveLockMode.SHARED, lockObjs.get(1).mode);
    // Execute
    try {
        txnMgr.acquireLocks(mockQueryPlan, ctx, "fred", lDrvInp);
        Assert.fail();
    } catch (LockException le) {
        Assert.assertEquals(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg(), le.getMessage());
    }
}
Also used: ArrayList(java.util.ArrayList) ZooKeeperHiveLock(org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLock) LockedDriverState(org.apache.hadoop.hive.ql.Driver.LockedDriverState) List(java.util.List) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.junit.Test)
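On the test side, cancellation is driven simply by calling abort() on a LockedDriverState and wiring the mock lock manager to fail for that state. A condensed, hypothetical companion test that isolates just the cancellation path, reusing the mocks and fields from the test above:

@Test
public void testAbortedStateCancelsLockAcquire() throws Exception {
    LockedDriverState aborted = new LockedDriverState();
    aborted.abort(); // mark the query as cancelled before acquiring locks
    when(mockLockManager.lock(anyListOf(HiveLockObj.class), eq(false), eq(aborted)))
        .thenThrow(new LockException(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg()));
    try {
        txnMgr.acquireLocks(mockQueryPlan, ctx, "fred", aborted);
        Assert.fail("acquireLocks should throw for an aborted query");
    } catch (LockException le) {
        Assert.assertEquals(ErrorMsg.LOCK_ACQUIRE_CANCELLED.getMsg(), le.getMessage());
    }
}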

Aggregations

ArrayList (java.util.ArrayList): 4
LockedDriverState (org.apache.hadoop.hive.ql.Driver.LockedDriverState): 4
IOException (java.io.IOException): 3
HashMap (java.util.HashMap): 3
Path (org.apache.hadoop.fs.Path): 3
HashSet (java.util.HashSet): 2
LinkedHashMap (java.util.LinkedHashMap): 2
Map (java.util.Map): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1
LinkedList (java.util.LinkedList): 1
List (java.util.List): 1
ExecutionException (java.util.concurrent.ExecutionException): 1
ExecutorService (java.util.concurrent.ExecutorService): 1
Future (java.util.concurrent.Future): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Operator (org.apache.hadoop.hive.ql.exec.Operator): 1
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity): 1
ZooKeeperHiveLock (org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLock): 1