use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class SparkMergeFileRecordHandler method init.
@SuppressWarnings("unchecked")
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
super.init(job, output, reporter);
try {
jc = job;
MapWork mapWork = Utilities.getMapWork(job);
if (mapWork instanceof MergeFileWork) {
MergeFileWork mergeFileWork = (MergeFileWork) mapWork;
String alias = mergeFileWork.getAliasToWork().keySet().iterator().next();
op = mergeFileWork.getAliasToWork().get(alias);
if (op instanceof AbstractFileMergeOperator) {
mergeOp = (AbstractFileMergeOperator<? extends FileMergeDesc>) op;
mergeOp.initializeOp(jc);
row = new Object[2];
abort = false;
} else {
abort = true;
throw new IllegalStateException("Merge file work's top operator should be an" + " instance of AbstractFileMergeOperator");
}
} else {
abort = true;
throw new IllegalStateException("Map work should be a merge file work.");
}
LOG.info(mergeOp.dump(0));
} catch (HiveException e) {
abort = true;
throw new RuntimeException(e);
}
}
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class DagUtils method createVertex.
/*
* Helper function to create Vertex from MapWork.
*/
private Vertex createVertex(JobConf conf, MapWork mapWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
// set up the operator plan
Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
// create the directories FileSinkOperators need
Utilities.createTmpDirs(conf, mapWork);
// finally create the vertex
Vertex map = null;
// use tez to combine splits
boolean groupSplitsInInputInitializer;
DataSourceDescriptor dataSource;
int numTasks = -1;
@SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
LOG.info("Vertex has custom input? " + vertexHasCustomInput);
if (vertexHasCustomInput) {
groupSplitsInInputInitializer = false;
// grouping happens in execution phase. The input payload should not enable grouping here,
// it will be enabled in the CustomVertex.
inputFormatClass = HiveInputFormat.class;
conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
// mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
// this plug-in to avoid getting a serialized event at run-time.
conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else {
// is HiveInputFormat
if (inputFormatClass == HiveInputFormat.class) {
groupSplitsInInputInitializer = true;
} else {
groupSplitsInInputInitializer = false;
}
}
if (mapWork instanceof MergeFileWork) {
Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
// prepare the tmp output directory. The output tmp directory should
// exist before jobClose (before renaming after job completion)
Path tempOutPath = Utilities.toTempPath(outputPath);
try {
FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
if (!tmpOutFS.exists(tempOutPath)) {
tmpOutFS.mkdirs(tempOutPath);
}
} catch (IOException e) {
throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
}
}
// remember mapping of plan to input
conf.set(Utilities.INPUT_NAME, mapWork.getName());
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
// set up the operator plan. (before setting up splits on the AM)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
// the correct plugin.
if (groupSplitsInInputInitializer) {
// Not setting a payload, since the MRInput payload is the same and can be accessed.
InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
} else {
// Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
// SMB Join.
dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
} else {
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
}
}
} else {
// Setup client side split generation.
// we need to set this, because with HS2 and client side split
// generation we end up not finding the map work. This is
// because of thread local madness (tez split generation is
// multi-threaded - HS2 plan cache uses thread locals). Setting
// VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
// of the map work.
conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(conf)).setSplits(inputSplitInfo.getSplitsProto()).build().toByteString().asReadOnlyByteBuffer()));
dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
numTasks = inputSplitInfo.getNumTasks();
// set up the operator plan. (after generating splits - that changes configs)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
}
UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
String procClassName = MapTezProcessor.class.getName();
if (mapWork instanceof MergeFileWork) {
procClassName = MergeFileTezProcessor.class.getName();
}
VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
map.setTaskEnvironment(getContainerEnvironment(conf, true));
map.setExecutionContext(executionContext);
map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
assert mapWork.getAliasToWork().keySet().size() == 1;
// Add the actual source input
String alias = mapWork.getAliasToWork().keySet().iterator().next();
map.addDataSource(alias, dataSource);
map.addTaskLocalFiles(localResources);
return map;
}
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class MergeFileRecordProcessor method init.
@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
// TODO HIVE-14042. Abort handling.
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
super.init(mrReporter, inputs, outputs);
execContext = new ExecMapperContext(jconf);
// Update JobConf using MRInput, info like filename comes via this
mrInput = getMRInput(inputs);
Configuration updatedConf = mrInput.getConfigUpdates();
if (updatedConf != null) {
for (Map.Entry<String, String> entry : updatedConf) {
jconf.set(entry.getKey(), entry.getValue());
}
}
createOutputMap();
// Start all the Outputs.
for (Map.Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
outputEntry.getValue().start();
((TezProcessor.TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
}
String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID);
cache = ObjectCacheFactory.getCache(jconf, queryId, true);
try {
execContext.setJc(jconf);
cacheKey = MAP_PLAN_KEY;
MapWork mapWork = (MapWork) cache.retrieve(cacheKey, new Callable<Object>() {
@Override
public Object call() {
return Utilities.getMapWork(jconf);
}
});
Utilities.setMapWork(jconf, mapWork);
if (mapWork instanceof MergeFileWork) {
mfWork = (MergeFileWork) mapWork;
} else {
throw new RuntimeException("MapWork should be an instance of MergeFileWork.");
}
String alias = mfWork.getAliasToWork().keySet().iterator().next();
mergeOp = mfWork.getAliasToWork().get(alias);
LOG.info(mergeOp.dump(0));
MapredContext.init(true, new JobConf(jconf));
((TezContext) MapredContext.get()).setInputs(inputs);
mergeOp.passExecContext(execContext);
mergeOp.initializeLocalWork(jconf);
mergeOp.initialize(jconf, null);
OperatorUtils.setChildrenCollector(mergeOp.getChildOperators(), outMap);
mergeOp.setReporter(reporter);
MapredContext.get().setReporter(reporter);
} catch (Throwable e) {
if (e instanceof OutOfMemoryError) {
// Don't create a new object if we are already out of memory
throw (OutOfMemoryError) e;
} else if (e instanceof InterruptedException) {
l4j.info("Hit an interrupt while initializing MergeFileRecordProcessor. Message={}", e.getMessage());
throw (InterruptedException) e;
} else {
throw new RuntimeException("Map operator initialization failed", e);
}
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.
the class DDLTask method mergeFiles.
/**
* First, make sure the source table/partition is not
* archived/indexes/non-rcfile. If either of these is true, throw an
* exception.
*
* The way how it does the merge is to create a BlockMergeTask from the
* mergeFilesDesc.
*
* @param db
* @param mergeFilesDesc
* @return
* @throws HiveException
*/
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
boolean lbatc = lbCtx == null ? false : lbCtx.isSkewedStoredAsDir();
int lbd = lbCtx == null ? 0 : lbCtx.calculateListBucketingLevel();
// merge work only needs input and output.
MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName(), mergeFilesDesc.getTableDesc());
LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
ArrayList<String> inputDirstr = new ArrayList<String>(1);
inputDirstr.add(mergeFilesDesc.getInputDir().toString());
pathToAliases.put(mergeFilesDesc.getInputDir().get(0), inputDirstr);
mergeWork.setPathToAliases(pathToAliases);
mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx());
mergeWork.resolveConcatenateMerge(db.getConf());
mergeWork.setMapperCannotSpanPartns(true);
mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName());
final FileMergeDesc fmd;
if (mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)) {
fmd = new RCFileMergeDesc();
} else {
// safe to assume else is ORC as semantic analyzer will check for RC/ORC
fmd = new OrcFileMergeDesc();
}
fmd.setDpCtx(null);
fmd.setHasDynamicPartitions(false);
fmd.setListBucketingAlterTableConcatenate(lbatc);
fmd.setListBucketingDepth(lbd);
fmd.setOutputPath(mergeFilesDesc.getOutputDir());
CompilationOpContext opContext = driverContext.getCtx().getOpContext();
Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(opContext, fmd);
LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
aliasToWork.put(mergeFilesDesc.getInputDir().toString(), mergeOp);
mergeWork.setAliasToWork(aliasToWork);
DriverContext driverCxt = new DriverContext();
Task<?> task;
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
TezWork tezWork = new TezWork(queryState.getQueryId(), conf);
mergeWork.setName("File Merge");
tezWork.add(mergeWork);
task = new TezTask();
((TezTask) task).setWork(tezWork);
} else {
task = new MergeFileTask();
((MergeFileTask) task).setWork(mergeWork);
}
// initialize the task and execute
task.initialize(queryState, getQueryPlan(), driverCxt, opContext);
subtask = task;
int ret = task.execute(driverCxt);
if (subtask.getException() != null) {
setException(subtask.getException());
}
return ret;
}
Aggregations