Search in sources :

Example 1 with EventDumpDirComparator

Use of org.apache.hadoop.hive.ql.parse.repl.load.EventDumpDirComparator in the Apache Hive project.

In the class ReplicationSemanticAnalyzer, the method analyzeReplLoad.

/*
   * Example dump dirs we need to be able to handle :
   *
   * for: hive.repl.rootdir = staging/
   * Then, repl dumps will be created in staging/<dumpdir>
   *
   * single-db-dump: staging/blah12345 will contain a db dir for the db specified
   *  blah12345/
   *   default/
   *    _metadata
   *    tbl1/
   *      _metadata
   *      dt=20160907/
   *        _files
   *    tbl2/
   *    tbl3/
   *    unptn_tbl/
   *      _metadata
   *      _files
   *
   * multi-db-dump: staging/bar12347 will contain dirs for each db covered
   * staging/
   *  bar12347/
   *   default/
   *     ...
   *   sales/
   *     ...
   *
   * single table-dump: staging/baz123 will contain a table object dump inside
   * staging/
   *  baz123/
   *    _metadata
   *    dt=20150931/
   *      _files
   *
   * incremental dump : staging/blue123 will contain dirs for each event inside.
   * staging/
   *  blue123/
   *    34/
   *    35/
   *    36/
   */
/**
 * Plans a REPL LOAD of the dump located at the {@code path} field and registers the
 * resulting tasks in {@code rootTasks}.
 *
 * <p>The dump's {@link DumpMetaData} decides which of three plans is built:
 * <ul>
 *   <li>bootstrap dump with a non-empty {@code tblNameOrPattern}: a single
 *       {@link ReplLoadWork} task for that table is queued and the method returns;</li>
 *   <li>bootstrap dump otherwise: a single database-level {@link ReplLoadWork} task is
 *       queued; if {@code dbNameOrPattern} is set, the dump must contain exactly one
 *       directory;</li>
 *   <li>incremental dump: the event sub-directories are sorted with
 *       {@link EventDumpDirComparator} and chained so that each event's tasks run strictly
 *       after the previous event's tasks.</li>
 * </ul>
 *
 * <p>If {@code LoadSemanticAnalyzer.matchFilesOrDir} finds nothing under the path, a warning
 * is logged and no task is queued.
 *
 * @param ast the REPL LOAD syntax tree node; not read by this method, which works from the
 *            {@code path}, {@code dbNameOrPattern} and {@code tblNameOrPattern} fields
 * @throws SemanticException wrapping any failure raised while planning, including a
 *         {@link FileNotFoundException} when the dump path does not exist and an
 *         {@link IllegalArgumentException} when the path holds no directories or holds
 *         several directories while a single target database was requested
 */
private void analyzeReplLoad(ASTNode ast) throws SemanticException {
    // Parameterized form: renders the same text as the former concatenation (null -> "null")
    // but skips building the string when debug logging is disabled.
    LOG.debug("ReplSemanticAnalyzer.analyzeReplLoad: {}.{} from {}", dbNameOrPattern, tblNameOrPattern, path);
    try {
        Path loadPath = new Path(path);
        final FileSystem fs = loadPath.getFileSystem(conf);
        if (!fs.exists(loadPath)) {
            // supposed dump path does not exist.
            throw new FileNotFoundException(loadPath.toUri().toString());
        }
        // Now, the dumped path can be one of three things:
        // a) It can be a db dump, in which case we expect a set of dirs, each with a
        // db name, and with a _metadata file in each, and table dirs inside that.
        // b) It can be a table dump dir, in which case we expect a _metadata dump of
        // a table in question in the dir, and individual ptn dir hierarchy.
        // c) A dump can be an incremental dump, which means we have several subdirs
        // each of which have the evid as the dir name, and each of which correspond
        // to a event-level dump. Currently, only CREATE_TABLE and ADD_PARTITION are
        // handled, so all of these dumps will be at a table/ptn level.
        // For incremental repl, we will have individual events which can
        // be other things like roles and fns as well.
        // At this point, all dump dirs should contain a _dumpmetadata file that
        // tells us what is inside that dumpdir.
        DumpMetaData dmd = new DumpMetaData(loadPath, conf);
        final boolean evDump = dmd.isIncrementalDump();
        if (evDump) {
            LOG.debug("{} contains an incremental dump", loadPath);
        } else {
            LOG.debug("{} contains an bootstrap dump", loadPath);
        }
        if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
            // Bootstrap dump restricted to a table name/pattern: one table-level load task.
            ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern, tblNameOrPattern, queryState.getLineageState(), SessionState.get().getTxnMgr().getCurrentTxnId());
            rootTasks.add(TaskFactory.get(replLoadWork, conf));
            return;
        }
        FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
        if (srcs == null || (srcs.length == 0)) {
            LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
            return;
        }
        FileStatus[] dirsInLoadPath = fs.listStatus(loadPath, EximUtil.getDirectoryFilter(fs));
        if ((dirsInLoadPath == null) || (dirsInLoadPath.length == 0)) {
            throw new IllegalArgumentException("No data to load in path " + loadPath.toUri().toString());
        }
        if (!evDump) {
            // not an event dump, not a table dump - thus, a db dump
            if ((dbNameOrPattern != null) && (dirsInLoadPath.length > 1)) {
                LOG.debug("Found multiple dirs when we expected 1:");
                for (FileStatus d : dirsInLoadPath) {
                    LOG.debug("> {}", d.getPath().toUri());
                }
                throw new IllegalArgumentException("Multiple dirs in " + loadPath.toUri().toString() + " does not correspond to REPL LOAD expecting to load to a singular destination point.");
            }
            ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern, queryState.getLineageState(), SessionState.get().getTxnMgr().getCurrentTxnId());
            rootTasks.add(TaskFactory.get(replLoadWork, conf));
        } else {
            // Event dump, each sub-dir is an individual event dump.
            // We need to guarantee that the directory listing we got is in order of evid.
            Arrays.sort(dirsInLoadPath, new EventDumpDirComparator());
            Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
            Task<? extends Serializable> taskChainTail = evTaskRoot;
            ReplLogger replLogger = new IncrementalLoadLogger(dbNameOrPattern, loadPath.toString(), dirsInLoadPath.length);
            for (FileStatus dir : dirsInLoadPath) {
                LOG.debug("Loading event from {} to {}.{}", dir.getPath().toUri(), dbNameOrPattern, tblNameOrPattern);
                // event loads will behave similar to table loads, with one crucial difference
                // precursor order is strict, and each event must be processed after the previous one.
                // The way we handle this strict order is as follows:
                // First, we start with a taskChainTail which is a dummy noop task (built from a
                // DependencyCollectionWork) at the head of our event chain. For each event we
                // process, we hand the current taskChainTail to analyzeEventLoad through the
                // MessageHandler.Context. Then, we collect all the tasks it returns and introduce
                // a new barrier task (built from a ReplStateLogWork) which depends on all these
                // tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a
                // set of tasks as follows:
                //
                //                  --->ev1.task1--                          --->ev2.task1--
                //                 /               \                        /               \
                //  evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
                //                 \               /
                //                  --->ev1.task3--
                //
                // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
                // entire chain
                String locn = dir.getPath().toUri().toString();
                DumpMetaData eventDmd = new DumpMetaData(new Path(locn), conf);
                MessageHandler.Context context = new MessageHandler.Context(dbNameOrPattern, tblNameOrPattern, locn, taskChainTail, eventDmd, conf, db, ctx, LOG);
                List<Task<? extends Serializable>> evTasks = analyzeEventLoad(context);
                // Events that yield no tasks leave the chain tail untouched.
                if ((evTasks != null) && (!evTasks.isEmpty())) {
                    ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dir.getPath().getName(), eventDmd.getDumpType().toString());
                    Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
                    for (Task<? extends Serializable> t : evTasks) {
                        t.addDependentTask(barrierTask);
                        LOG.debug("Added {}:{} as a precursor of barrier task {}:{}", t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
                    }
                    LOG.debug("Updated taskChainTail from {}:{} to {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
                    taskChainTail = barrierTask;
                }
            }
            // taskChainTail is only reassigned when an event produced tasks, so this branch runs
            // when at least one event was chained: append a final state-log task carrying the last
            // replicated event id, and emit the start log.
            if (!evTaskRoot.equals(taskChainTail)) {
                Map<String, String> dbProps = new HashMap<>();
                dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(), String.valueOf(dmd.getEventTo()));
                ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
                Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
                taskChainTail.addDependentTask(barrierTask);
                LOG.debug("Added {}:{} as a precursor of barrier task {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
                replLogger.startLog();
            }
            rootTasks.add(evTaskRoot);
        }
    } catch (Exception e) {
        // TODO : simple wrap & rethrow for now, clean up with error codes
        throw new SemanticException(e);
    }
}
Also used : Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) FileStatus(org.apache.hadoop.fs.FileStatus) MessageHandler(org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler) ReplStateLogWork(org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork) HashMap(java.util.HashMap) FileNotFoundException(java.io.FileNotFoundException) IncrementalLoadLogger(org.apache.hadoop.hive.ql.parse.repl.load.log.IncrementalLoadLogger) FileSystem(org.apache.hadoop.fs.FileSystem) ReplLogger(org.apache.hadoop.hive.ql.parse.repl.ReplLogger) Path(org.apache.hadoop.fs.Path) DumpMetaData(org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) DependencyCollectionWork(org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) ReplLoadWork(org.apache.hadoop.hive.ql.exec.repl.bootstrap.ReplLoadWork) EventDumpDirComparator(org.apache.hadoop.hive.ql.parse.repl.load.EventDumpDirComparator)

Aggregations

FileNotFoundException (java.io.FileNotFoundException)1 Serializable (java.io.Serializable)1 HashMap (java.util.HashMap)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 Task (org.apache.hadoop.hive.ql.exec.Task)1 ReplStateLogWork (org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork)1 ReplLoadWork (org.apache.hadoop.hive.ql.exec.repl.bootstrap.ReplLoadWork)1 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)1 ReplLogger (org.apache.hadoop.hive.ql.parse.repl.ReplLogger)1 DumpMetaData (org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData)1 EventDumpDirComparator (org.apache.hadoop.hive.ql.parse.repl.load.EventDumpDirComparator)1 IncrementalLoadLogger (org.apache.hadoop.hive.ql.parse.repl.load.log.IncrementalLoadLogger)1 MessageHandler (org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler)1 DependencyCollectionWork (org.apache.hadoop.hive.ql.plan.DependencyCollectionWork)1