Use of org.apache.hadoop.hive.ql.plan.RCFileMergeDesc in the Apache Hive project.
Class GenMapRedUtils, method createMergeTask.
/**
 * Builds the work for a file merge over the output of a file sink: a block
 * level merge for RCFile tables or a stripe level merge for ORC tables.
 *
 * @param fsInputDesc
 *          file sink descriptor supplying the directory to merge, the table
 *          info, the dynamic partition context and the list bucketing context
 * @param finalName
 *          output path handed to both the merge work and the merge operator
 * @param hasDynamicPartitions
 *          forwarded to the merge work; when true the source directory is not
 *          registered up front as a merge input
 * @param ctx
 *          compilation context used to create the merge operator
 * @return the merge work, wired with a single merge operator keyed by the
 *         source directory
 * @throws SemanticException
 *           if the table's input format is neither RCFile nor ORC
 */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName, boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
  final Path sourceDir = fsInputDesc.getFinalDirName();
  final TableDesc tableInfo = fsInputDesc.getTableInfo();

  // In case of dynamic partitioning or list bucketing (skewed values stored
  // as directories) the source directory is not added to the merge inputs.
  final List<Path> mergeInputs = new ArrayList<Path>(1);
  if (!(hasDynamicPartitions || GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc))) {
    mergeInputs.add(sourceDir);
  }

  // The alias list is always populated, regardless of the guard above.
  final ArrayList<String> aliases = new ArrayList<String>(1);
  aliases.add(sourceDir.toString());

  // Pick the internal input format class for CombineHiveInputFormat; any
  // format other than RCFile/ORC is rejected before work is created.
  final boolean rcFileTable = tableInfo.getInputFileFormatClass().equals(RCFileInputFormat.class);
  final Class<? extends InputFormat> mergeInputFormat;
  if (rcFileTable) {
    mergeInputFormat = RCFileBlockMergeInputFormat.class;
  } else if (tableInfo.getInputFileFormatClass().equals(OrcInputFormat.class)) {
    mergeInputFormat = OrcFileStripeMergeInputFormat.class;
  } else {
    throw new SemanticException("createMergeTask called on a table with file" + " format other than RCFile or ORCFile");
  }

  // Create the merge file work and map the source directory to its alias.
  final MergeFileWork mergeWork = new MergeFileWork(mergeInputs, finalName, hasDynamicPartitions, tableInfo.getInputFileFormatClass().getName());
  final LinkedHashMap<Path, ArrayList<String>> aliasesByPath = new LinkedHashMap<>();
  aliasesByPath.put(sourceDir, aliases);
  mergeWork.setMapperCannotSpanPartns(true);
  mergeWork.setPathToAliases(aliasesByPath);

  // The partition info reuses the table descriptor but reads through the
  // merge-specific input format chosen above.
  final PartitionDesc partitionInfo = new PartitionDesc(tableInfo, null);
  partitionInfo.setInputFileFormatClass(mergeInputFormat);
  mergeWork.addPathToPartitionInfo(sourceDir, partitionInfo);
  mergeWork.setListBucketingCtx(fsInputDesc.getLbCtx());

  // Describe the merge operator; the format was validated above, so anything
  // that is not RCFile is ORC at this point.
  final FileMergeDesc mergeDesc = rcFileTable ? new RCFileMergeDesc() : new OrcFileMergeDesc();
  mergeDesc.setDpCtx(fsInputDesc.getDynPartCtx());
  mergeDesc.setOutputPath(finalName);
  mergeDesc.setHasDynamicPartitions(mergeWork.hasDynamicPartitions());
  mergeDesc.setListBucketingAlterTableConcatenate(mergeWork.isListBucketingAlterTableConcatenate());
  int listBucketingDepth = 0;
  if (mergeWork.getListBucketingCtx() != null) {
    listBucketingDepth = mergeWork.getListBucketingCtx().calculateListBucketingLevel();
  }
  mergeDesc.setListBucketingDepth(listBucketingDepth);

  // Create the alias-to-work map containing the single merge operator.
  final Operator<? extends OperatorDesc> mergeOperator = OperatorFactory.get(ctx, mergeDesc);
  final LinkedHashMap<String, Operator<? extends OperatorDesc>> operatorsByAlias = new LinkedHashMap<>();
  operatorsByAlias.put(sourceDir.toString(), mergeOperator);
  mergeWork.setAliasToWork(operatorsByAlias);
  return mergeWork;
}
Use of org.apache.hadoop.hive.ql.plan.RCFileMergeDesc in the Apache Hive project.
Class DDLTask, method mergeFiles.
/**
 * Merges the files described by {@code mergeFilesDesc} by building a merge
 * file work plus merge operator, wrapping it in a task (Tez or a plain merge
 * file task, depending on the configured execution engine) and executing
 * that task synchronously.
 *
 * No validation of the source table/partition happens in this method; the
 * RCFile/ORC restriction is expected to have been enforced by the semantic
 * analyzer beforehand.
 *
 * @param db
 *          Hive handle whose configuration is used to resolve the
 *          concatenate merge
 * @param mergeFilesDesc
 *          descriptor carrying input dir(s), output dir, input format class
 *          and list bucketing context
 * @param driverContext
 *          driver context of the caller; supplies the operator compilation
 *          context
 * @return the value returned by the executed merge task
 * @throws HiveException
 */
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
  // Derive list bucketing settings; without a context both fall back to
  // "not list bucketed".
  final ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
  final boolean skewedStoredAsDir = lbCtx != null && lbCtx.isSkewedStoredAsDir();
  final int listBucketingDepth;
  if (lbCtx == null) {
    listBucketingDepth = 0;
  } else {
    listBucketingDepth = lbCtx.calculateListBucketingLevel();
  }

  final Path outputDir = mergeFilesDesc.getOutputDir();
  final String inputFormatName = mergeFilesDesc.getInputFormatClass().getName();

  // merge work only needs input and output.
  final MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), outputDir, inputFormatName);
  final ArrayList<String> aliases = new ArrayList<String>(1);
  aliases.add(mergeFilesDesc.getInputDir().toString());
  final LinkedHashMap<Path, ArrayList<String>> aliasesByPath = new LinkedHashMap<>();
  aliasesByPath.put(mergeFilesDesc.getInputDir().get(0), aliases);
  mergeWork.setPathToAliases(aliasesByPath);
  mergeWork.setListBucketingCtx(lbCtx);
  mergeWork.resolveConcatenateMerge(db.getConf());
  mergeWork.setMapperCannotSpanPartns(true);
  mergeWork.setSourceTableInputFormat(inputFormatName);

  // safe to assume anything that is not RCFile is ORC, as the semantic
  // analyzer will check for RC/ORC
  final FileMergeDesc mergeDesc = mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)
      ? new RCFileMergeDesc()
      : new OrcFileMergeDesc();
  mergeDesc.setDpCtx(null);
  mergeDesc.setHasDynamicPartitions(false);
  mergeDesc.setListBucketingAlterTableConcatenate(skewedStoredAsDir);
  mergeDesc.setListBucketingDepth(listBucketingDepth);
  mergeDesc.setOutputPath(outputDir);

  final CompilationOpContext opContext = driverContext.getCtx().getOpContext();
  final Operator<? extends OperatorDesc> mergeOperator = OperatorFactory.get(opContext, mergeDesc);
  final LinkedHashMap<String, Operator<? extends OperatorDesc>> operatorsByAlias = new LinkedHashMap<>();
  // NOTE(review): the input dir is deliberately re-read here, after
  // resolveConcatenateMerge(...), exactly as before; it is not visible from
  // this method whether that call can alter the input dir list, so do not
  // cache this value above without checking.
  operatorsByAlias.put(mergeFilesDesc.getInputDir().toString(), mergeOperator);
  mergeWork.setAliasToWork(operatorsByAlias);

  // Wrap the merge work in a task matching the configured execution engine.
  final boolean runOnTez = conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez");
  Task task;
  if (runOnTez) {
    TezWork tezWork = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
    mergeWork.setName("File Merge");
    tezWork.add(mergeWork);
    task = new TezTask();
    task.setWork(tezWork);
  } else {
    task = new MergeFileTask();
    task.setWork(mergeWork);
  }

  // initialize the task with a fresh driver context and execute it
  final DriverContext subtaskDriverContext = new DriverContext();
  task.initialize(queryState, getQueryPlan(), subtaskDriverContext, opContext);
  subtask = task;
  final int returnCode = task.execute(subtaskDriverContext);

  // Surface any exception recorded by the subtask on this task.
  if (subtask.getException() != null) {
    setException(subtask.getException());
  }
  return returnCode;
}
Aggregations