
Example 6 with LoadFileDesc

use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in project hive by apache.

the class GenMapRedUtils method mergeMovePaths.

/**
 * Merges the given Conditional input path and the linked MoveWork into a single MoveWork.
 * This is an optimization for BlobStore systems that avoids doing two renames or copies where only one is needed.
 *
 * @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
 * @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
 * @param lineageState A LineageState used to keep lineage tracking consistent after the source path is rewritten.
 * @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
 */
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork, LineageState lineageState) {
    MoveWork newWork = new MoveWork(linkedMoveWork);
    LoadFileDesc fileDesc = null;
    LoadTableDesc tableDesc = null;
    if (linkedMoveWork.getLoadFileWork() != null) {
        fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
        fileDesc.setSourcePath(condInputPath);
        lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadFileWork().getSourcePath());
    } else if (linkedMoveWork.getLoadTableWork() != null) {
        tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
        tableDesc.setSourcePath(condInputPath);
        lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadTableWork().getSourcePath());
    } else {
        throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
    }
    newWork.setLoadFileWork(fileDesc);
    newWork.setLoadTableWork(tableDesc);
    return newWork;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
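
For orientation, here is a minimal caller-side sketch of how this merge could be driven. It is not taken from the Hive sources: the LoadFileDesc constructor arguments mirror the tests shown below, and the no-argument MoveWork and LineageState constructors as well as same-package access to the protected mergeMovePaths are assumptions.

// Hypothetical helper, not present in GenMapRedUtils or its tests.
static MoveWork mergeForConditionalInput(Path condInputPath) {
    Path intermediatePath = new Path("s3a://bucket/scratch/-ext-10002");
    Path finalTargetPath = new Path("s3a://bucket/warehouse/t");
    // A MoveWork that would normally read the intermediate directory; the
    // LoadFileDesc constructor arguments follow the pattern used in the tests below.
    LoadFileDesc loadFileWork = new LoadFileDesc(intermediatePath, finalTargetPath, true, "", "", false);
    MoveWork linkedMoveWork = new MoveWork();
    linkedMoveWork.setLoadFileWork(loadFileWork);
    // After the merge, the returned MoveWork reads condInputPath directly, so the
    // extra rename or copy into -ext-10002 on the blobstore is skipped.
    return GenMapRedUtils.mergeMovePaths(condInputPath, linkedMoveWork, new LineageState());
}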

Example 7 with LoadFileDesc

use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testMovePathsThatCannotBeMerged.

@Test
public void testMovePathsThatCannotBeMerged() {
    final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
    final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
    final MoveWork mockWork = mock(MoveWork.class);
    assertFalse("A MoveWork null object cannot be merged.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, null));
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
    assertFalse("Merging paths is not allowed when BlobStorage optimizations are disabled.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    // Enable BlobStore optimizations for the rest of tests
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    reset(mockWork);
    when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc());
    assertFalse("Merging paths is not allowed when MultiFileWork is found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(mock(LoadFileDesc.class));
    when(mockWork.getLoadTableWork()).thenReturn(mock(LoadTableDesc.class));
    assertFalse("Merging paths is not allowed when both LoadFileWork & LoadTableWork are found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condInputPath, condOutputPath, false, "", "", false));
    assertFalse("Merging paths is not allowed when both conditional output path is not equals to MoveWork input path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("unused"), false, "", "", false));
    assertFalse("Merging paths is not allowed when conditional input path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, new Path("hdfs://hdfs-path"), condOutputPath, mockWork));
    reset(mockWork);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("hdfs://hdfs-path"), false, "", "", false));
    assertFalse("Merging paths is not allowed when MoveWork output path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) Test(org.junit.Test)
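
The hiveConf field used above is initialized outside of this snippet. A plausible JUnit fixture for it (an assumption, not the actual setup from TestGenMapRedUtilsCreateConditionalTask) would be:

private HiveConf hiveConf;

@Before
public void setUp() {
    // Assumed fixture: a fresh HiveConf per test, so the
    // HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED flag toggled above starts from its default.
    hiveConf = new HiveConf();
}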

Example 8 with LoadFileDesc

use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in project hive by apache.

the class GenMapRedUtils method mergeMovePaths.

/**
   * Merges the given Conditional input path and the linked MoveWork into a single MoveWork.
   * This is an optimization for BlobStore systems that avoids doing two renames or copies where only one is needed.
   *
   * @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
   * @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
   * @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
   */
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork) {
    MoveWork newWork = new MoveWork(linkedMoveWork);
    LoadFileDesc fileDesc = null;
    LoadTableDesc tableDesc = null;
    if (linkedMoveWork.getLoadFileWork() != null) {
        fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
        fileDesc.setSourcePath(condInputPath);
    } else if (linkedMoveWork.getLoadTableWork() != null) {
        tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
        tableDesc.setSourcePath(condInputPath);
    } else {
        throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
    }
    newWork.setLoadFileWork(fileDesc);
    newWork.setLoadTableWork(tableDesc);
    return newWork;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 9 with LoadFileDesc

use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in project hive by apache.

the class MoveTask method execute.

@Override
public int execute(DriverContext driverContext) {
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Executing MoveWork " + System.identityHashCode(work) + " with " + work.getLoadFileWork() + "; " + work.getLoadTableWork() + "; " + work.getLoadMultiFilesWork());
    }
    try {
        if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
            return 0;
        }
        Hive db = getHive();
        // Do any hive related operations like moving tables and files
        // to appropriate locations
        LoadFileDesc lfd = work.getLoadFileWork();
        if (lfd != null) {
            Path targetPath = lfd.getTargetDir();
            Path sourcePath = lfd.getSourcePath();
            if (targetPath.equals(sourcePath)) {
                Utilities.FILE_OP_LOGGER.debug("MoveTask not moving " + sourcePath);
            } else {
                Utilities.FILE_OP_LOGGER.debug("MoveTask moving " + sourcePath + " to " + targetPath);
                if (lfd.getWriteType() == AcidUtils.Operation.INSERT) {
                    // 'sourcePath' result of 'select ...' part of CTAS statement
                    assert lfd.getIsDfsDir();
                    FileSystem srcFs = sourcePath.getFileSystem(conf);
                    FileStatus[] srcs = srcFs.globStatus(sourcePath);
                    if (srcs != null) {
                        List<Path> newFiles = new ArrayList<>();
                        Hive.moveAcidFiles(srcFs, srcs, targetPath, newFiles);
                    } else {
                        LOG.debug("No files found to move from " + sourcePath + " to " + targetPath);
                    }
                } else {
                    moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
                }
            }
        }
        // Multi-file load is for dynamic partitions when some partitions do not
        // need to merge and they can simply be moved to the target directory.
        // This is also used for MM table conversion.
        LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
        if (lmfd != null) {
            boolean isDfsDir = lmfd.getIsDfsDir();
            List<String> targetPrefixes = lmfd.getTargetPrefixes();
            for (int i = 0; i < lmfd.getSourceDirs().size(); ++i) {
                Path srcPath = lmfd.getSourceDirs().get(i);
                Path destPath = lmfd.getTargetDirs().get(i);
                String filePrefix = targetPrefixes == null ? null : targetPrefixes.get(i);
                FileSystem destFs = destPath.getFileSystem(conf);
                if (filePrefix == null) {
                    if (!destFs.exists(destPath.getParent())) {
                        destFs.mkdirs(destPath.getParent());
                    }
                    Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + srcPath + " to " + destPath);
                    moveFile(srcPath, destPath, isDfsDir);
                } else {
                    if (!destFs.exists(destPath)) {
                        destFs.mkdirs(destPath);
                    }
                    FileSystem srcFs = srcPath.getFileSystem(conf);
                    FileStatus[] children = srcFs.listStatus(srcPath);
                    if (children != null) {
                        for (FileStatus child : children) {
                            Path childSrc = child.getPath();
                            Path childDest = new Path(destPath, filePrefix + childSrc.getName());
                            Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + childSrc + " to " + childDest);
                            moveFile(childSrc, childDest, isDfsDir);
                        }
                    } else {
                        Utilities.FILE_OP_LOGGER.debug("MoveTask skipping empty directory (multi-file) " + srcPath);
                    }
                    if (!srcFs.delete(srcPath, false)) {
                        throw new IOException("Couldn't delete " + srcPath + " after moving all the files");
                    }
                }
            }
        }
        // Next we do this for tables and partitions
        LoadTableDesc tbd = work.getLoadTableWork();
        if (tbd != null) {
            logMessage(tbd);
            Table table = db.getTable(tbd.getTable().getTableName());
            checkFileFormats(db, tbd, table);
            // it seems that LoadTableDesc has Operation.INSERT only for CTAS...
            boolean isFullAcidOp = work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID
                    && !tbd.isMmTable();
            // Create a data container
            DataContainer dc = null;
            if (tbd.getPartitionSpec().size() == 0) {
                dc = new DataContainer(table.getTTable());
                if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                    Utilities.FILE_OP_LOGGER.trace("loadTable called from " + tbd.getSourcePath() + " into " + tbd.getTable().getTableName());
                }
                db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getLoadFileType(), work.isSrcLocal(), isSkewedStoredAsDirs(tbd), isFullAcidOp, hasFollowingStatsTask(), tbd.getWriteId(), tbd.getStmtId());
                if (work.getOutputs() != null) {
                    DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
                }
            } else {
                LOG.info("Partition is: {}", tbd.getPartitionSpec());
                // Check if the bucketing and/or sorting columns were inferred
                TaskInformation ti = new TaskInformation(this, tbd.getSourcePath().toUri().toString());
                inferTaskInformation(ti);
                // deal with dynamic partitions
                DynamicPartitionCtx dpCtx = tbd.getDPCtx();
                if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
                    // dynamic partitions
                    dc = handleDynParts(db, table, tbd, ti, dpCtx);
                } else {
                    // static partitions
                    dc = handleStaticParts(db, table, tbd, ti);
                }
            }
            if (dc != null) {
                // If we are doing an update or a delete the number of columns in the table will not
                // match the number of columns in the file sink.  For update there will be one too many
                // (because of the ROW__ID), and in the case of the delete there will be just the
                // ROW__ID, which we don't need to worry about from a lineage perspective.
                List<FieldSchema> tableCols = null;
                switch(work.getLoadTableWork().getWriteType()) {
                    case DELETE:
                    case UPDATE:
                        // Pass an empty list as no columns will be written to the file.
                        // TODO I should be able to make this work for update
                        tableCols = new ArrayList<>();
                        break;
                    default:
                        tableCols = table.getCols();
                        break;
                }
                queryState.getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
            }
            releaseLocks(tbd);
        }
        return 0;
    } catch (HiveException he) {
        int errorCode = 1;
        if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) {
            errorCode = he.getCanonicalErrorMsg().getErrorCode();
            if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) {
                console.printError("Failed with exception " + he.getMessage(), "\n" + StringUtils.stringifyException(he));
            } else {
                console.printError("Failed with exception " + he.getMessage() + "\nRemote Exception: " + he.getRemoteErrorMsg());
                console.printInfo("\n", StringUtils.stringifyException(he), false);
            }
        }
        setException(he);
        return errorCode;
    } catch (Exception e) {
        console.printError("Failed with exception " + e.getMessage(), "\n" + StringUtils.stringifyException(e));
        setException(e);
        return (1);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) DataContainer(org.apache.hadoop.hive.ql.hooks.LineageInfo.DataContainer) FileSystem(org.apache.hadoop.fs.FileSystem) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Path(org.apache.hadoop.fs.Path) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) IOException(java.io.IOException) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) Hive(org.apache.hadoop.hive.ql.metadata.Hive)
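
The LoadMultiFilesDesc branch with a target prefix is the least obvious step of this method. The standalone sketch below isolates it; it is a hypothetical helper rather than part of MoveTask, it uses a plain FileSystem.rename() instead of MoveTask.moveFile(), and it assumes the source and destination directories live on the same filesystem.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrefixMoveSketch {
    /**
     * Moves every file under srcDir into destDir, prepending filePrefix to each
     * file name, then deletes the emptied source directory. Mirrors the
     * filePrefix branch of MoveTask.execute() above.
     */
    static void movePrefixed(Configuration conf, Path srcDir, Path destDir, String filePrefix)
            throws IOException {
        FileSystem fs = srcDir.getFileSystem(conf);
        if (!fs.exists(destDir)) {
            fs.mkdirs(destDir);
        }
        FileStatus[] children = fs.listStatus(srcDir);
        if (children != null) {
            for (FileStatus child : children) {
                Path childSrc = child.getPath();
                Path childDest = new Path(destDir, filePrefix + childSrc.getName());
                if (!fs.rename(childSrc, childDest)) {
                    throw new IOException("Couldn't move " + childSrc + " to " + childDest);
                }
            }
        }
        if (!fs.delete(srcDir, false)) {
            throw new IOException("Couldn't delete " + srcDir + " after moving all the files");
        }
    }
}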

Example 10 with LoadFileDesc

use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method testMovePathsThatCanBeMerged.

@Test
public void testMovePathsThatCanBeMerged() {
    final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
    final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
    final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003");
    final MoveWork mockWork = mock(MoveWork.class);
    when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "", false));
    assertTrue("Merging BlobStore paths should be allowed.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) Test(org.junit.Test)
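
Once shouldMergeMovePaths returns true, the planner performs the actual merge with mergeMovePaths (Examples 6 and 8 above). A hypothetical continuation of this test, not present in TestGenMapRedUtilsCreateConditionalTask, could assert the rewritten source path:

// Hypothetical follow-up: the merged MoveWork now reads directly from the
// conditional input path. Depending on the Hive version, a LineageState
// argument (see Example 6) may also be required here.
MoveWork merged = GenMapRedUtils.mergeMovePaths(condInputPath, mockWork);
assertEquals(condInputPath, merged.getLoadFileWork().getSourcePath());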

Aggregations

LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc) 12
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc) 10
MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork) 9
Path (org.apache.hadoop.fs.Path) 6
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 5
DynamicPartitionCtx (org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) 4
Serializable (java.io.Serializable) 3
ArrayList (java.util.ArrayList) 3
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo) 3
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema) 3
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 3
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 3
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc) 3
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 2
IOException (java.io.IOException) 2
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 2
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 2
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 2
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 2
MaterializedViewDesc (org.apache.hadoop.hive.ql.exec.MaterializedViewDesc) 2