Example 11 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class DDLTask, method generateAddMmTasks.

private List<Task<?>> generateAddMmTasks(Table tbl) throws HiveException {
    // We will move all the files in the table/partition directories into the first MM
    // directory, then commit the first write ID.
    List<Path> srcs = new ArrayList<>(), tgts = new ArrayList<>();
    long mmWriteId = 0;
    try {
        HiveTxnManager txnManager = SessionState.get().getTxnMgr();
        if (txnManager.isTxnOpen()) {
            mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
        } else {
            txnManager.openTxn(new Context(conf), conf.getUser());
            mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
            txnManager.commitTxn();
        }
    } catch (Exception e) {
        String errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
        console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    int stmtId = 0;
    String mmDir = AcidUtils.deltaSubdir(mmWriteId, mmWriteId, stmtId);
    Hive db = getHive();
    if (tbl.getPartitionKeys().size() > 0) {
        PartitionIterable parts = new PartitionIterable(db, tbl, null, HiveConf.getIntVar(conf, ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
        Iterator<Partition> partIter = parts.iterator();
        while (partIter.hasNext()) {
            Partition part = partIter.next();
            checkMmLb(part);
            Path src = part.getDataLocation(), tgt = new Path(src, mmDir);
            srcs.add(src);
            tgts.add(tgt);
            if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
            }
        }
    } else {
        checkMmLb(tbl);
        Path src = tbl.getDataLocation(), tgt = new Path(src, mmDir);
        srcs.add(src);
        tgts.add(tgt);
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
        }
    }
    // Don't set inputs and outputs - the locks have already been taken so it's pointless.
    MoveWork mw = new MoveWork(null, null, null, null, false);
    mw.setMultiFilesDesc(new LoadMultiFilesDesc(srcs, tgts, true, null, null));
    ImportCommitWork icw = new ImportCommitWork(tbl.getDbName(), tbl.getTableName(), mmWriteId, stmtId);
    Task<?> mv = TaskFactory.get(mw), ic = TaskFactory.get(icw);
    mv.addDependentTask(ic);
    return Lists.<Task<?>>newArrayList(mv);
}
Also used : Path(org.apache.hadoop.fs.Path) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) DriverContext(org.apache.hadoop.hive.ql.DriverContext) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) ColumnTruncateTask(org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask) ArrayList(java.util.ArrayList) AlreadyExistsException(org.apache.hadoop.hive.metastore.api.AlreadyExistsException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) URISyntaxException(java.net.URISyntaxException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) SQLException(java.sql.SQLException) FileNotFoundException(java.io.FileNotFoundException) HiveAuthzPluginException(org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzPluginException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) Hive(org.apache.hadoop.hive.ql.metadata.Hive) PartitionIterable(org.apache.hadoop.hive.ql.metadata.PartitionIterable) HiveTxnManager(org.apache.hadoop.hive.ql.lockmgr.HiveTxnManager)
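
The directory name built by AcidUtils.deltaSubdir above follows Hive's ACID delta naming convention, encoding the write ID range and the statement ID. Below is a minimal, self-contained sketch of the resulting layout; the zero-padding widths are an assumption about that convention, not something shown on this page:

import org.apache.hadoop.fs.Path;

public class DeltaDirSketch {
    public static void main(String[] args) {
        // Assumption: deltaSubdir(min, max, stmt) yields delta_<min>_<max>_<stmt>
        // with zero-padded fields, e.g. delta_0000001_0000001_0000.
        long mmWriteId = 1;
        int stmtId = 0;
        String mmDir = String.format("delta_%07d_%07d_%04d", mmWriteId, mmWriteId, stmtId);
        Path src = new Path("hdfs://nn/warehouse/db.db/tbl/p=1");
        Path tgt = new Path(src, mmDir);
        // Prints hdfs://nn/warehouse/db.db/tbl/p=1/delta_0000001_0000001_0000
        System.out.println(tgt);
    }
}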

Example 12 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class LoadTable, method loadTableTask.

private Task<?> loadTableTask(Table table, ReplicationSpec replicationSpec, Path tgtPath, Path fromURI) {
    Path dataPath = new Path(fromURI, EximUtil.DATA_PATH_NAME);
    Path tmpPath = PathUtils.getExternalTmpPath(tgtPath, context.pathInfo);
    Task<?> copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, tmpPath, context.hiveConf);
    LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), new TreeMap<>(),
            replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
            // TODO: what is the point of this? If this is for replication, who would have opened a txn?
            SessionState.get().getTxnMgr().getCurrentTxnId());
    MoveWork moveWork = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
    Task<?> loadTableTask = TaskFactory.get(moveWork, context.hiveConf);
    copyTask.addDependentTask(loadTableTask);
    return copyTask;
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc)
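
For reference, here is a self-contained sketch of the same MoveWork wiring outside the replication flow. The constructor shapes follow the example above; the table, configuration, and staging path are placeholder inputs, and the write ID is hardcoded for illustration:

import java.util.HashSet;
import java.util.TreeMap;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType;
import org.apache.hadoop.hive.ql.plan.MoveWork;

// Sketch only: the caller supplies the table, the staged data path, and the conf.
Task<?> makeLoadTask(Table table, Path stagedData, HiveConf hiveConf) {
    LoadTableDesc desc = new LoadTableDesc(stagedData, Utilities.getTableDesc(table),
            new TreeMap<>(), LoadFileType.REPLACE_ALL, /* writeId, illustrative */ 0L);
    // Empty read/write entity sets, as in the replication example above.
    MoveWork moveWork = new MoveWork(new HashSet<>(), new HashSet<>(), desc, null, false);
    return TaskFactory.get(moveWork, hiveConf);
}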

Example 13 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class TestGenMapRedUtilsCreateConditionalTask, method testMovePathsThatCannotBeMerged.

@Test
public void testMovePathsThatCannotBeMerged() {
    final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
    final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
    final MoveWork mockWork = mock(MoveWork.class);
    assertFalse("A MoveWork null object cannot be merged.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, null));
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
    assertFalse("Merging paths is not allowed when BlobStorage optimizations are disabled.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    // Enable BlobStore optimizations for the rest of tests
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    reset(mockWork);
    when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc());
    assertFalse("Merging paths is not allowed when MultiFileWork is found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(mock(LoadFileDesc.class));
    when(mockWork.getLoadTableWork()).thenReturn(mock(LoadTableDesc.class));
    assertFalse("Merging paths is not allowed when both LoadFileWork & LoadTableWork are found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condInputPath, condOutputPath, false, "", "", false));
    assertFalse("Merging paths is not allowed when both conditional output path is not equals to MoveWork input path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("unused"), false, "", "", false));
    assertFalse("Merging paths is not allowed when conditional input path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, new Path("hdfs://hdfs-path"), condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("hdfs://hdfs-path"), false, "", "", false));
    assertFalse("Merging paths is not allowed when MoveWork output path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) Test(org.junit.Test)
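
Taken together, these assertions outline the guard conditions that shouldMergeMovePaths appears to enforce. The sketch below is an inference from this test, not the actual Hive implementation; isBlobStoragePath stands in for whatever helper Hive uses to recognize blobstore schemes such as s3a:

// Inferred from the assertions above; GenMapRedUtils.shouldMergeMovePaths may differ.
boolean shouldMergeMovePathsSketch(HiveConf conf, Path condInput, Path condOutput, MoveWork work) {
    if (work == null) {
        return false; // a null MoveWork can never be merged
    }
    if (!conf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED)) {
        return false; // blobstore optimizations disabled
    }
    if (work.getLoadMultiFilesWork() != null) {
        return false; // multi-file moves are not mergeable
    }
    if (work.getLoadFileWork() != null && work.getLoadTableWork() != null) {
        return false; // ambiguous: both file and table loads present
    }
    LoadFileDesc loadFileWork = work.getLoadFileWork();
    if (loadFileWork == null || !condOutput.equals(loadFileWork.getSourcePath())) {
        return false; // the MoveWork must consume the conditional task's output
    }
    // Both ends of the collapsed move must live on a blobstore.
    return isBlobStoragePath(conf, condInput)
        && isBlobStoragePath(conf, loadFileWork.getTargetDir());
}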

Example 14 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsNotOptimized.

@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
    Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
    FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
    Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
    Path tableLocation = new Path("s3a://bucket/warehouse/table");
    Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
    List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
    ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
    Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
    Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
    // Verify moveOnlyTask is NOT optimized
    assertEquals(1, moveOnlyTask.getChildTasks().size());
    verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
    verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeOnlyTask is NOT optimized
    assertEquals(1, mergeOnlyTask.getChildTasks().size());
    verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeAndMoveTask is NOT optimized
    assertEquals(1, mergeAndMoveTask.getChildTasks().size());
    assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)
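
verifyMoveTask is a private helper of this test class that is not shown on this page. A hypothetical implementation consistent with how it is called (task under test, expected source, expected target) could look like this; the getter names are assumptions:

// Hypothetical helper; the real one in TestGenMapRedUtilsCreateConditionalTask may differ.
private void verifyMoveTask(Task<? extends Serializable> task, Path source, Path target) {
    MoveWork work = (MoveWork) task.getWork();
    LoadFileDesc loadFileWork = work.getLoadFileWork();
    assertEquals("Unexpected move source", source, loadFileWork.getSourcePath());
    assertEquals("Unexpected move target", target, loadFileWork.getTargetDir());
}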

Example 15 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsOptimized.

@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
    FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
    Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
    Path tableLocation = new Path("s3a://bucket/warehouse/table");
    Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
    List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
    ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
    Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
    Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
    /*
     * OPTIMIZATION
     * The ConditionalTask avoids chaining two MoveTasks, which is expensive on
     * blob storage systems. Instead, it creates a single MoveTask whose source
     * is the first MoveTask's source and whose target is the second MoveTask's
     * target.
     */
    // Verify moveOnlyTask is optimized
    assertNull(moveOnlyTask.getChildTasks());
    verifyMoveTask(moveOnlyTask, sinkDirName, tableLocation);
    // Verify mergeOnlyTask is NOT optimized (a merge task writes directly to finalDirName, then a MoveTask is executed)
    assertEquals(1, mergeOnlyTask.getChildTasks().size());
    verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeAndMoveTask is NOT optimized
    assertEquals(1, mergeAndMoveTask.getChildTasks().size());
    assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)
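
The collapse verified above can be pictured as merging two chained MoveWork objects into one. The sketch below is a hedged reconstruction for illustration, not the actual GenMapRedUtils code; the LoadFileDesc flags mirror the values used in the tests above, and setLoadFileWork is assumed to exist as the setter paired with getLoadFileWork:

// Illustrative only: keep the first move's source and the second move's target.
MoveWork mergeMovePathsSketch(MoveWork first, MoveWork second) {
    LoadFileDesc a = first.getLoadFileWork();
    LoadFileDesc b = second.getLoadFileWork();
    // Constructor shape copied from the tests above; the flag semantics are not
    // shown on this page, so they are carried over verbatim.
    LoadFileDesc merged = new LoadFileDesc(a.getSourcePath(), b.getTargetDir(), false, "", "", false);
    MoveWork result = new MoveWork(null, null, null, null, false);
    result.setLoadFileWork(merged);
    return result;
}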

Aggregations

MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork): 29
Path (org.apache.hadoop.fs.Path): 21
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 16
LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc): 9
Test (org.junit.Test): 7
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 6
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 6
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 6
Task (org.apache.hadoop.hive.ql.exec.Task): 5
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork): 5
Serializable (java.io.Serializable): 4
ArrayList (java.util.ArrayList): 4
Context (org.apache.hadoop.hive.ql.Context): 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 4
MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask): 4
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 4
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 4
BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork): 4
StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork): 4
URISyntaxException (java.net.URISyntaxException): 3