Search in sources:

Example 1 with LoadMultiFilesDesc

Use of org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc in project hive by apache.

From the class MoveTask, the method execute (a condensed usage sketch of the multi-file branch follows the listing below).

@Override
public int execute(DriverContext driverContext) {
    try {
        if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
            return 0;
        }
        Hive db = getHive();
        // Do any hive related operations like moving tables and files
        // to appropriate locations
        LoadFileDesc lfd = work.getLoadFileWork();
        if (lfd != null) {
            Path targetPath = lfd.getTargetDir();
            Path sourcePath = lfd.getSourcePath();
            moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
        }
        // Multi-file load is for dynamic partitions when some partitions do not
        // need to merge and they can simply be moved to the target directory.
        LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
        if (lmfd != null) {
            boolean isDfsDir = lmfd.getIsDfsDir();
            int i = 0;
            while (i < lmfd.getSourceDirs().size()) {
                Path srcPath = lmfd.getSourceDirs().get(i);
                Path destPath = lmfd.getTargetDirs().get(i);
                FileSystem fs = destPath.getFileSystem(conf);
                if (!fs.exists(destPath.getParent())) {
                    fs.mkdirs(destPath.getParent());
                }
                moveFile(srcPath, destPath, isDfsDir);
                i++;
            }
        }
        // Next we do this for tables and partitions
        LoadTableDesc tbd = work.getLoadTableWork();
        if (tbd != null) {
            StringBuilder mesg = new StringBuilder("Loading data to table ").append(tbd.getTable().getTableName());
            if (tbd.getPartitionSpec().size() > 0) {
                mesg.append(" partition (");
                Map<String, String> partSpec = tbd.getPartitionSpec();
                for (String key : partSpec.keySet()) {
                    mesg.append(key).append('=').append(partSpec.get(key)).append(", ");
                }
                mesg.setLength(mesg.length() - 2);
                mesg.append(')');
            }
            String mesg_detail = " from " + tbd.getSourcePath();
            console.printInfo(mesg.toString(), mesg_detail);
            Table table = db.getTable(tbd.getTable().getTableName());
            if (work.getCheckFileFormat()) {
                // Get all files from the src directory
                FileStatus[] dirs;
                ArrayList<FileStatus> files;
                // source filesystem
                FileSystem srcFs;
                try {
                    srcFs = tbd.getSourcePath().getFileSystem(conf);
                    dirs = srcFs.globStatus(tbd.getSourcePath());
                    files = new ArrayList<FileStatus>();
                    for (int i = 0; (dirs != null && i < dirs.length); i++) {
                        files.addAll(Arrays.asList(srcFs.listStatus(dirs[i].getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER)));
                        // We only check one file, so exit the loop as soon as we have at least one.
                        if (files.size() > 0) {
                            break;
                        }
                    }
                } catch (IOException e) {
                    throw new HiveException("addFiles: filesystem error in check phase", e);
                }
                // handle file format check for table level
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
                    boolean flag = true;
                    // The file format is only verified when there is no dynamic partition context;
                    // otherwise the check is skipped (see the warning below).
                    if (tbd.getDPCtx() == null) {
                        if (tbd.getPartitionSpec() == null || tbd.getPartitionSpec().isEmpty()) {
                            // Check if the file format of the file matches that of the table.
                            flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
                        } else {
                            // Check if the file format of the file matches that of the partition
                            Partition oldPart = db.getPartition(table, tbd.getPartitionSpec(), false);
                            if (oldPart == null) {
                                // this means we have just created a table and are specifying partition in the
                                // load statement (without pre-creating the partition), in which case lets use
                                // table input format class. inheritTableSpecs defaults to true so when a new
                                // partition is created later it will automatically inherit input format
                                // from table object
                                flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, tbd.getTable().getInputFileFormatClass(), files);
                            } else {
                                flag = HiveFileFormatUtils.checkInputFormat(srcFs, conf, oldPart.getInputFormatClass(), files);
                            }
                        }
                        if (!flag) {
                            throw new HiveException("Wrong file format. Please check the file's format.");
                        }
                    } else {
                        LOG.warn("Skipping file format check as dpCtx is not null");
                    }
                }
            }
            // Create a data container
            DataContainer dc = null;
            if (tbd.getPartitionSpec().size() == 0) {
                dc = new DataContainer(table.getTTable());
                db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getReplace(), work.isSrcLocal(), isSkewedStoredAsDirs(tbd), work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID, hasFollowingStatsTask());
                if (work.getOutputs() != null) {
                    DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
                }
            } else {
                LOG.info("Partition is: " + tbd.getPartitionSpec().toString());
                // Check if the bucketing and/or sorting columns were inferred
                List<BucketCol> bucketCols = null;
                List<SortCol> sortCols = null;
                int numBuckets = -1;
                Task task = this;
                String path = tbd.getSourcePath().toUri().toString();
                // Walk up to the first ancestor of this MoveTask that is some form of map reduce task
                // (either standard, local, or a merge).
                while (task.getParentTasks() != null && task.getParentTasks().size() == 1) {
                    task = (Task) task.getParentTasks().get(0);
                    // If it was a merge task or a local map reduce task, nothing can be inferred
                    if (task instanceof MergeFileTask || task instanceof MapredLocalTask) {
                        break;
                    }
                    // If it is a standard map reduce task, check what, if anything, it wrote to
                    // the directory this move task is moving.
                    if (task instanceof MapRedTask) {
                        MapredWork work = (MapredWork) task.getWork();
                        MapWork mapWork = work.getMapWork();
                        bucketCols = mapWork.getBucketedColsByDirectory().get(path);
                        sortCols = mapWork.getSortedColsByDirectory().get(path);
                        if (work.getReduceWork() != null) {
                            numBuckets = work.getReduceWork().getNumReduceTasks();
                        }
                        if (bucketCols != null || sortCols != null) {
                            // This must be the final map reduce task (the one containing the file sink
                            // operator that writes the final output).
                            assert work.isFinalMapRed();
                        }
                        break;
                    }
                    // If it was a MoveTask, follow the path the files were moved from, since that is
                    // what any preceding map reduce task inferred information about. This happens when
                    // a conditional merge was added before the final MoveTask but the
                    // condition for merging is not met, see GenMRFileSink1.
                    if (task instanceof MoveTask) {
                        if (((MoveTask) task).getWork().getLoadFileWork() != null) {
                            path = ((MoveTask) task).getWork().getLoadFileWork().getSourcePath().toUri().toString();
                        }
                    }
                }
                // deal with dynamic partitions
                DynamicPartitionCtx dpCtx = tbd.getDPCtx();
                if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
                    // dynamic partitions
                    List<LinkedHashMap<String, String>> dps = Utilities.getFullDPSpecs(conf, dpCtx);
                    console.printInfo(System.getProperty("line.separator"));
                    long startTime = System.currentTimeMillis();
                    // load the list of DP partitions and return the list of partition specs
                    // TODO: In a follow-up to HIVE-1361, we should refactor loadDynamicPartitions
                    // to use Utilities.getFullDPSpecs() to get the list of full partSpecs.
                    // After that check the number of DPs created to not exceed the limit and
                    // iterate over it and call loadPartition() here.
                    // The reason we don't do inside HIVE-1361 is the latter is large and we
                    // want to isolate any potential issue it may introduce.
                    Map<Map<String, String>, Partition> dp = db.loadDynamicPartitions(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getPartitionSpec(), tbd.getReplace(), dpCtx.getNumDPCols(), isSkewedStoredAsDirs(tbd), work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID, SessionState.get().getTxnMgr().getCurrentTxnId(), hasFollowingStatsTask(), work.getLoadTableWork().getWriteType());
                    // publish DP columns to its subscribers
                    if (dps != null && dps.size() > 0) {
                        pushFeed(FeedType.DYNAMIC_PARTITIONS, dp.values());
                    }
                    String loadTime = "\t Time taken to load dynamic partitions: " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds";
                    console.printInfo(loadTime);
                    LOG.info(loadTime);
                    if (dp.size() == 0 && conf.getBoolVar(HiveConf.ConfVars.HIVE_ERROR_ON_EMPTY_PARTITION)) {
                        throw new HiveException("This query creates no partitions." + " To turn off this error, set hive.error.on.empty.partition=false.");
                    }
                    startTime = System.currentTimeMillis();
                    // For each partition spec, get the partition
                    // and put it into a WriteEntity for the post-exec hooks.
                    for (Map.Entry<Map<String, String>, Partition> entry : dp.entrySet()) {
                        Partition partn = entry.getValue();
                        if (bucketCols != null || sortCols != null) {
                            updatePartitionBucketSortColumns(db, table, partn, bucketCols, numBuckets, sortCols);
                        }
                        WriteEntity enty = new WriteEntity(partn, getWriteType(tbd, work.getLoadTableWork().getWriteType()));
                        if (work.getOutputs() != null) {
                            DDLTask.addIfAbsentByName(enty, work.getOutputs());
                        }
                        // For dynamic partitions the WriteEntity is only created at execution time,
                        // so it must also be added to the queryPlan's outputs here for post-exec
                        // hooks to see it.
                        if (queryPlan.getOutputs() == null) {
                            queryPlan.setOutputs(new LinkedHashSet<WriteEntity>());
                        }
                        queryPlan.getOutputs().add(enty);
                        // update columnar lineage for each partition
                        dc = new DataContainer(table.getTTable(), partn.getTPartition());
                        // Don't set lineage on delete as we don't have all the columns
                        if (SessionState.get() != null && work.getLoadTableWork().getWriteType() != AcidUtils.Operation.DELETE && work.getLoadTableWork().getWriteType() != AcidUtils.Operation.UPDATE) {
                            SessionState.get().getLineageState().setLineage(tbd.getSourcePath(), dc, table.getCols());
                        }
                        LOG.info("\tLoading partition " + entry.getKey());
                    }
                    console.printInfo("\t Time taken for adding to write entity : " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
                    // reset data container to prevent it being added again.
                    dc = null;
                } else {
                    // static partitions
                    List<String> partVals = MetaStoreUtils.getPvals(table.getPartCols(), tbd.getPartitionSpec());
                    db.validatePartitionNameCharacters(partVals);
                    db.loadPartition(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getPartitionSpec(), tbd.getReplace(), tbd.getInheritTableSpecs(), isSkewedStoredAsDirs(tbd), work.isSrcLocal(), work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID, hasFollowingStatsTask());
                    Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
                    if (bucketCols != null || sortCols != null) {
                        updatePartitionBucketSortColumns(db, table, partn, bucketCols, numBuckets, sortCols);
                    }
                    dc = new DataContainer(table.getTTable(), partn.getTPartition());
                    // add this partition to post-execution hook
                    if (work.getOutputs() != null) {
                        DDLTask.addIfAbsentByName(new WriteEntity(partn, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
                    }
                }
            }
            if (SessionState.get() != null && dc != null) {
                // If we are doing an update or a delete the number of columns in the table will not
                // match the number of columns in the file sink.  For update there will be one too many
                // (because of the ROW__ID), and in the case of the delete there will be just the
                // ROW__ID, which we don't need to worry about from a lineage perspective.
                List<FieldSchema> tableCols = null;
                switch(work.getLoadTableWork().getWriteType()) {
                    case DELETE:
                    case UPDATE:
                        // Pass an empty list as no columns will be written to the file.
                        // TODO I should be able to make this work for update
                        tableCols = new ArrayList<FieldSchema>();
                        break;
                    default:
                        tableCols = table.getCols();
                        break;
                }
                SessionState.get().getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
            }
            releaseLocks(tbd);
        }
        return 0;
    } catch (Exception e) {
        console.printError("Failed with exception " + e.getMessage(), "\n" + StringUtils.stringifyException(e));
        setException(e);
        return (1);
    }
}
Also used: MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredLocalTask(org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask) FileStatus(org.apache.hadoop.fs.FileStatus) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) LinkedHashMap(java.util.LinkedHashMap) DataContainer(org.apache.hadoop.hive.ql.hooks.LineageInfo.DataContainer) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) FileSystem(org.apache.hadoop.fs.FileSystem) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) BucketCol(org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol) SortCol(org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol) IOException(java.io.IOException) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) Hive(org.apache.hadoop.hive.ql.metadata.Hive) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Map(java.util.Map)
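
Below is a minimal, self-contained sketch (not part of the Hive source) that mirrors the LoadMultiFilesDesc branch of execute above. The descriptor getters getIsDfsDir, getSourceDirs and getTargetDirs come straight from the listing; the MultiFileLoadSketch class and the FileMover callback are hypothetical stand-ins for MoveTask and its moveFile method, introduced only so the loop can be exercised in isolation.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc;

public class MultiFileLoadSketch {

    // Hypothetical stand-in for MoveTask.moveFile so the loop can be tested in isolation.
    interface FileMover {
        void moveFile(Path source, Path target, boolean isDfsDir) throws IOException;
    }

    // Pairs each source directory with its target directory by index, creates the target's
    // parent directory when it is missing, and delegates the actual move to the callback.
    static void loadMultiFiles(LoadMultiFilesDesc lmfd, Configuration conf, FileMover mover)
            throws IOException {
        boolean isDfsDir = lmfd.getIsDfsDir();
        List<Path> sources = lmfd.getSourceDirs();
        List<Path> targets = lmfd.getTargetDirs();
        for (int i = 0; i < sources.size(); i++) {
            Path srcPath = sources.get(i);
            Path destPath = targets.get(i);
            FileSystem fs = destPath.getFileSystem(conf);
            if (!fs.exists(destPath.getParent())) {
                fs.mkdirs(destPath.getParent());
            }
            mover.moveFile(srcPath, destPath, isDfsDir);
        }
    }
}

As in the original loop, the source and target directory lists are assumed to be index-aligned, and the parent of each target is created before the move is attempted.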
