use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class SparkPlanGenerator method generate.
private SparkTran generate(BaseWork work, SparkWork sparkWork) throws Exception {
  initStatsPublisher(work);
  JobConf newJobConf = cloneJobConf(work);
  checkSpecs(work, newJobConf);
  byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
  boolean caching = isCachingWork(work, sparkWork);
  if (work instanceof MapWork) {
    // Create tmp dir for MergeFileWork
    if (work instanceof MergeFileWork) {
      Path outputPath = ((MergeFileWork) work).getOutputDir();
      Path tempOutPath = Utilities.toTempPath(outputPath);
      FileSystem fs = outputPath.getFileSystem(jobConf);
      try {
        if (!fs.exists(tempOutPath)) {
          fs.mkdirs(tempOutPath);
        }
      } catch (IOException e) {
        throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage());
      }
    }
    MapTran mapTran = new MapTran(caching);
    HiveMapFunction mapFunc = new HiveMapFunction(confBytes, sparkReporter);
    mapTran.setMapFunction(mapFunc);
    return mapTran;
  } else if (work instanceof ReduceWork) {
    ReduceTran reduceTran = new ReduceTran(caching);
    HiveReduceFunction reduceFunc = new HiveReduceFunction(confBytes, sparkReporter);
    reduceTran.setReduceFunction(reduceFunc);
    return reduceTran;
  } else {
    throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, but found " + work.getClass().getName());
  }
}
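For context, the MergeFileWork branch above only prepares a temporary output directory next to the final output path before the map-side merge runs. Below is a minimal standalone sketch of that preparation step using plain Hadoop FileSystem APIs; the ensureTempDir helper and the "_tmp." naming are illustrative assumptions mirroring Hive's convention (the real code uses Utilities.toTempPath), not Hive APIs.

  import java.io.IOException;

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;

  public class TempDirSketch {
    // Hypothetical helper: creates a temp directory alongside the final output path,
    // similar to the MergeFileWork handling in generate() above.
    static Path ensureTempDir(Configuration conf, Path outputPath) throws IOException {
      // Assumption: a "_tmp." prefix on the final directory name, in the spirit of Hive's convention.
      Path tempOutPath = new Path(outputPath.getParent(), "_tmp." + outputPath.getName());
      FileSystem fs = outputPath.getFileSystem(conf);
      if (!fs.exists(tempOutPath)) {
        fs.mkdirs(tempOutPath);
      }
      return tempOutPath;
    }

    public static void main(String[] args) throws IOException {
      Configuration conf = new Configuration();
      Path out = new Path("/tmp/merge-demo/final");
      System.out.println("temp dir: " + ensureTempDir(conf, out));
    }
  }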
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class SparkPlanGenerator method cloneJobConf.
@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
  if (workToJobConf.containsKey(work)) {
    return workToJobConf.get(work);
  }
  JobConf cloned = new JobConf(jobConf);
  // Make sure we'll use a different plan path from the original one
  HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
  try {
    cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
  } catch (ClassNotFoundException e) {
    String msg = "Could not find partitioner class: " + e.getMessage()
        + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
    throw new IllegalArgumentException(msg, e);
  }
  if (work instanceof MapWork) {
    cloned.setBoolean("mapred.task.is.map", true);
    List<Path> inputPaths = Utilities.getInputPaths(cloned, (MapWork) work, scratchDir, context, false);
    Utilities.setInputPaths(cloned, inputPaths);
    Utilities.setMapWork(cloned, (MapWork) work, scratchDir, false);
    Utilities.createTmpDirs(cloned, (MapWork) work);
    if (work instanceof MergeFileWork) {
      MergeFileWork mergeFileWork = (MergeFileWork) work;
      cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
      cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
      cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
    } else {
      cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    }
    if (((MapWork) work).getMinSplitSize() != null) {
      HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, ((MapWork) work).getMinSplitSize());
    }
    // remember the JobConf cloned for each MapWork, so we won't clone it again
    workToJobConf.put(work, cloned);
  } else if (work instanceof ReduceWork) {
    cloned.setBoolean("mapred.task.is.map", false);
    Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
    Utilities.createTmpDirs(cloned, (ReduceWork) work);
    cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
  }
  return cloned;
}
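The method above memoizes one cloned JobConf per MapWork so that later lookups reuse the same configuration instead of cloning again. A stripped-down sketch of the same memoization pattern, keyed by a plain String instead of a BaseWork instance (class and method names here are illustrative, not Hive APIs):

  import java.util.HashMap;
  import java.util.Map;

  import org.apache.hadoop.mapred.JobConf;

  public class JobConfCloneCache {
    private final JobConf baseConf;
    private final Map<String, JobConf> workToJobConf = new HashMap<>();

    public JobConfCloneCache(JobConf baseConf) {
      this.baseConf = baseConf;
    }

    // Returns a cached clone for the given work key, creating and customizing it on first use.
    public JobConf cloneFor(String workKey, boolean isMapSide) {
      return workToJobConf.computeIfAbsent(workKey, k -> {
        JobConf cloned = new JobConf(baseConf);   // copy-constructor clone, as in cloneJobConf()
        cloned.setBoolean("mapred.task.is.map", isMapSide);
        return cloned;
      });
    }
  }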
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class GenMapRedUtils method createMergeTask.
/**
 * Create a block-level merge task for RCFile tables or a stripe-level merge task
 * for ORC tables.
 *
 * @param fsInputDesc
 * @param finalName
 * @param hasDynamicPartitions
 * @param ctx
 * @return a MergeFileWork (as a MapWork) if the table is stored as RCFile or ORC;
 *         a SemanticException is thrown for any other file format
 */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName, boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
  Path inputDir = fsInputDesc.getFinalDirName();
  TableDesc tblDesc = fsInputDesc.getTableInfo();
  List<Path> inputDirs = new ArrayList<Path>(1);
  ArrayList<String> inputDirstr = new ArrayList<String>(1);
  // in case of dynamic partitioning and list bucketing
  if (!hasDynamicPartitions && !GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc)) {
    inputDirs.add(inputDir);
  }
  inputDirstr.add(inputDir.toString());
  // internal input format class for CombineHiveInputFormat
  final Class<? extends InputFormat> internalIFClass;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    internalIFClass = RCFileBlockMergeInputFormat.class;
  } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
    internalIFClass = OrcFileStripeMergeInputFormat.class;
  } else {
    throw new SemanticException("createMergeTask called on a table with file format other than RCFile or ORCFile");
  }
  // create the merge file work
  MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions, tblDesc.getInputFileFormatClass().getName());
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  pathToAliases.put(inputDir, inputDirstr);
  work.setMapperCannotSpanPartns(true);
  work.setPathToAliases(pathToAliases);
  PartitionDesc pDesc = new PartitionDesc(tblDesc, null);
  pDesc.setInputFileFormatClass(internalIFClass);
  work.addPathToPartitionInfo(inputDir, pDesc);
  work.setListBucketingCtx(fsInputDesc.getLbCtx());
  // create alias to work which contains the merge operator
  LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
  Operator<? extends OperatorDesc> mergeOp = null;
  final FileMergeDesc fmd;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    fmd = new RCFileMergeDesc();
  } else {
    fmd = new OrcFileMergeDesc();
  }
  fmd.setDpCtx(fsInputDesc.getDynPartCtx());
  fmd.setOutputPath(finalName);
  fmd.setHasDynamicPartitions(work.hasDynamicPartitions());
  fmd.setListBucketingAlterTableConcatenate(work.isListBucketingAlterTableConcatenate());
  int lbLevel = work.getListBucketingCtx() == null ? 0 : work.getListBucketingCtx().calculateListBucketingLevel();
  fmd.setListBucketingDepth(lbLevel);
  mergeOp = OperatorFactory.get(ctx, fmd);
  aliasToWork.put(inputDir.toString(), mergeOp);
  work.setAliasToWork(aliasToWork);
  return work;
}
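The key decision in createMergeTask is the mapping from the table's input format to the internal merge input format: block-level merge for RCFile, stripe-level merge for ORC, and a hard error for anything else. A hedged sketch of that dispatch, using class simple names so it stays free of Hive imports (the real code compares Class objects directly and throws a SemanticException):

  public class MergeInputFormatDispatch {
    // Mirrors the RCFile/ORC branch in createMergeTask(); any other format is rejected.
    static String internalMergeInputFormat(Class<?> tableInputFormat) {
      String name = tableInputFormat.getSimpleName();
      if ("RCFileInputFormat".equals(name)) {
        return "RCFileBlockMergeInputFormat";   // block-level merge for RCFile
      } else if ("OrcInputFormat".equals(name)) {
        return "OrcFileStripeMergeInputFormat"; // stripe-level merge for ORC
      }
      throw new IllegalArgumentException(
          "Merge is only supported for RCFile or ORC tables, got: " + name);
    }
  }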
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class DDLTask method mergeFiles.
/**
 * First, make sure the source table/partition is not archived, indexed, or stored
 * in a format other than RCFile/ORC. If any of these is true, an exception is thrown.
 *
 * The merge itself is performed by building a MergeFileWork from the mergeFilesDesc
 * and executing it as a MergeFileTask (or as a TezTask when Tez is the execution engine).
 *
 * @param db
 * @param mergeFilesDesc
 * @param driverContext
 * @return the result of executing the merge task (0 on success)
 * @throws HiveException
 */
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
  ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
  boolean lbatc = lbCtx == null ? false : lbCtx.isSkewedStoredAsDir();
  int lbd = lbCtx == null ? 0 : lbCtx.calculateListBucketingLevel();
  // merge work only needs input and output.
  MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName());
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  ArrayList<String> inputDirstr = new ArrayList<String>(1);
  inputDirstr.add(mergeFilesDesc.getInputDir().toString());
  pathToAliases.put(mergeFilesDesc.getInputDir().get(0), inputDirstr);
  mergeWork.setPathToAliases(pathToAliases);
  mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx());
  mergeWork.resolveConcatenateMerge(db.getConf());
  mergeWork.setMapperCannotSpanPartns(true);
  mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName());
  final FileMergeDesc fmd;
  if (mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)) {
    fmd = new RCFileMergeDesc();
  } else {
    // safe to assume else is ORC as semantic analyzer will check for RC/ORC
    fmd = new OrcFileMergeDesc();
  }
  fmd.setDpCtx(null);
  fmd.setHasDynamicPartitions(false);
  fmd.setListBucketingAlterTableConcatenate(lbatc);
  fmd.setListBucketingDepth(lbd);
  fmd.setOutputPath(mergeFilesDesc.getOutputDir());
  CompilationOpContext opContext = driverContext.getCtx().getOpContext();
  Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(opContext, fmd);
  LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
  aliasToWork.put(mergeFilesDesc.getInputDir().toString(), mergeOp);
  mergeWork.setAliasToWork(aliasToWork);
  DriverContext driverCxt = new DriverContext();
  Task task;
  if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
    TezWork tezWork = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
    mergeWork.setName("File Merge");
    tezWork.add(mergeWork);
    task = new TezTask();
    task.setWork(tezWork);
  } else {
    task = new MergeFileTask();
    task.setWork(mergeWork);
  }
  // initialize the task and execute
  task.initialize(queryState, getQueryPlan(), driverCxt, opContext);
  subtask = task;
  int ret = task.execute(driverCxt);
  if (subtask.getException() != null) {
    setException(subtask.getException());
  }
  return ret;
}
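Note how the method picks the task type from the configured execution engine: the same MergeFileWork is wrapped in a TezWork and run by a TezTask when the engine is tez, and handed to a MergeFileTask otherwise. A minimal sketch of that dispatch on the well-known hive.execution.engine property (the enum and helper names here are illustrative, not Hive APIs):

  import org.apache.hadoop.conf.Configuration;

  public class EngineDispatchSketch {
    enum MergeTaskKind { TEZ_TASK, MERGE_FILE_TASK }

    // Mirrors the tez-vs-default branch in mergeFiles(): either a TezTask wrapping a TezWork
    // that contains the MergeFileWork, or a MergeFileTask running the MergeFileWork directly.
    static MergeTaskKind chooseMergeTaskKind(Configuration conf) {
      String engine = conf.get("hive.execution.engine", "mr");
      return "tez".equalsIgnoreCase(engine) ? MergeTaskKind.TEZ_TASK : MergeTaskKind.MERGE_FILE_TASK;
    }

    public static void main(String[] args) {
      Configuration conf = new Configuration();
      conf.set("hive.execution.engine", "tez");
      System.out.println(chooseMergeTaskKind(conf));   // TEZ_TASK
    }
  }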
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class MergeFileRecordProcessor method init.
@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
  // TODO HIVE-14042. Abort handling.
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
  super.init(mrReporter, inputs, outputs);
  execContext = new ExecMapperContext(jconf);
  // Update JobConf using MRInput, info like filename comes via this
  mrInput = getMRInput(inputs);
  Configuration updatedConf = mrInput.getConfigUpdates();
  if (updatedConf != null) {
    for (Map.Entry<String, String> entry : updatedConf) {
      jconf.set(entry.getKey(), entry.getValue());
    }
  }
  createOutputMap();
  // Start all the Outputs.
  for (Map.Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
    outputEntry.getValue().start();
    ((TezProcessor.TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
  }
  String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID);
  cache = ObjectCacheFactory.getCache(jconf, queryId, true);
  try {
    execContext.setJc(jconf);
    cacheKey = MAP_PLAN_KEY;
    MapWork mapWork = (MapWork) cache.retrieve(cacheKey, new Callable<Object>() {
      @Override
      public Object call() {
        return Utilities.getMapWork(jconf);
      }
    });
    Utilities.setMapWork(jconf, mapWork);
    if (mapWork instanceof MergeFileWork) {
      mfWork = (MergeFileWork) mapWork;
    } else {
      throw new RuntimeException("MapWork should be an instance of MergeFileWork.");
    }
    String alias = mfWork.getAliasToWork().keySet().iterator().next();
    mergeOp = mfWork.getAliasToWork().get(alias);
    LOG.info(mergeOp.dump(0));
    MapredContext.init(true, new JobConf(jconf));
    ((TezContext) MapredContext.get()).setInputs(inputs);
    mergeOp.passExecContext(execContext);
    mergeOp.initializeLocalWork(jconf);
    mergeOp.initialize(jconf, null);
    OperatorUtils.setChildrenCollector(mergeOp.getChildOperators(), outMap);
    mergeOp.setReporter(reporter);
    MapredContext.get().setReporter(reporter);
  } catch (Throwable e) {
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else if (e instanceof InterruptedException) {
      l4j.info("Hit an interrupt while initializing MergeFileRecordProcessor. Message={}", e.getMessage());
      throw (InterruptedException) e;
    } else {
      throw new RuntimeException("Map operator initialization failed", e);
    }
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
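The init method retrieves the MapWork through an object cache keyed by query id, so the potentially expensive plan deserialization runs at most once per key. A self-contained sketch of that retrieve-with-Callable pattern, using a plain ConcurrentHashMap instead of Hive's ObjectCacheFactory (class and method names are illustrative):

  import java.util.concurrent.Callable;
  import java.util.concurrent.ConcurrentHashMap;
  import java.util.concurrent.ConcurrentMap;

  public class SimpleObjectCache {
    private final ConcurrentMap<String, Object> cache = new ConcurrentHashMap<>();

    // Returns the cached value for the key, computing it with the loader on first access.
    @SuppressWarnings("unchecked")
    public <T> T retrieve(String key, Callable<T> loader) {
      return (T) cache.computeIfAbsent(key, k -> {
        try {
          return loader.call();  // e.g. deserializing the MapWork plan from the JobConf
        } catch (Exception e) {
          throw new RuntimeException("Failed to load cached object for key " + key, e);
        }
      });
    }
  }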