Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class SerDeEncodedDataReader, method startReadSplitFromFile.
public void startReadSplitFromFile(FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
  boolean maySplitTheSplit = slice == null;
  ReaderWithOffsets offsetReader = null;
  @SuppressWarnings("rawtypes")
  RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
  Path path = split.getPath().getFileSystem(daemonConf).makeQualified(split.getPath());
  PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
  try {
    offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc(), split);
    sourceReader = null;
  } finally {
    if (sourceReader != null) {
      try {
        sourceReader.close();
      } catch (Exception ex) {
        LlapIoImpl.LOG.error("Failed to close source reader", ex);
      }
    }
  }
  maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();
  try {
    StructObjectInspector originalOi = (StructObjectInspector) getOiFromSerDe();
    List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(schema, splitIncludes, false);
    // fileread writes to the writer, which writes to orcWriter, which writes to cacheWriter
    EncodingWriter writer = VectorDeserializeOrcWriter.create(sourceInputFormat, sourceSerDe, parts,
        daemonConf, jobConf, split.getPath(), originalOi, splitColumnIds, splitIncludes, allocSize,
        encodeExecutor);
    // TODO: move this into ctor? EW would need to create CacheWriter then
    List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
    writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes,
        writer.isOnlyWritingIncludedColumns(), bufferFactory, isStopped), daemonConf, split.getPath());
    if (writer instanceof VectorDeserializeOrcWriter) {
      VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter) writer;
      asyncWriter.startAsync(new AsyncCacheDataCallback());
      this.asyncWriters.add(asyncWriter);
    }
    currentFileRead = new FileReaderYieldReturn(offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
  } finally {
    // The assignment is the last thing in the try block, so if it happens we assume success.
    if (currentFileRead != null) {
      return;
    }
    if (offsetReader == null) {
      return;
    }
    try {
      offsetReader.close();
    } catch (Exception ex) {
      LlapIoImpl.LOG.error("Failed to close source reader", ex);
    }
  }
}
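For reference, the PartitionDesc lookup at the top of this method resolves the descriptor registered for the split's path (or one of its parent directories). A minimal standalone sketch of that lookup, reusing the classes imported by the snippet above; pathToPartInfo and split are assumed to be supplied by the caller:
Path dir = split.getPath();
// Matches the exact path first, then falls back to parent directories; the third argument
// is an optional lookup cache and may be null for a one-off call.
PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(pathToPartInfo, dir, null);
TableDesc tableDesc = partDesc.getTableDesc();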
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class Utilities, method createDummyFileForEmptyTable.
@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path hiveScratchDir, String alias) throws Exception {
  TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc();
  if (tableDesc.isNonNative()) {
    // If the table does not use native storage, we cannot create an empty file for it.
    return null;
  }
  Properties props = tableDesc.getProperties();
  HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc);
  Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false);
  LOG.info("Changed input file for alias {} to newPath {}", alias, newPath);
  // update the work
  Map<Path, List<String>> pathToAliases = work.getPathToAliases();
  List<String> newList = new ArrayList<String>(1);
  newList.add(alias);
  pathToAliases.put(newPath, newList);
  work.setPathToAliases(pathToAliases);
  PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone();
  work.addPathToPartitionInfo(newPath, pDesc);
  return newPath;
}
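The tail of this method shows the common pattern for wiring a new input path into a MapWork: map the path to the alias, then register a cloned PartitionDesc for it. A minimal sketch of the same pattern, assuming work is a MapWork, alias already has an entry in its alias-to-partition map, and dummyPath is a placeholder name introduced here:
Map<Path, List<String>> pathToAliases = work.getPathToAliases();
List<String> aliases = new ArrayList<String>(1);
aliases.add(alias);
pathToAliases.put(dummyPath, aliases);
work.setPathToAliases(pathToAliases);
// Clone the alias's existing descriptor so the new path inherits the same table and serde information.
work.addPathToPartitionInfo(dummyPath, work.getAliasToPartnInfo().get(alias).clone());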
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class ExecMapper, method configure.
@Override
public void configure(JobConf job) {
  execContext = new ExecMapperContext(job);
  Utilities.tryLoggingClassPaths(job, l4j);
  setDone(false);
  try {
    jc = job;
    execContext.setJc(jc);
    // create map and fetch operators
    MapWork mrwork = Utilities.getMapWork(job);
    for (PartitionDesc part : mrwork.getAliasToPartnInfo().values()) {
      TableDesc tableDesc = part.getTableDesc();
      Utilities.copyJobSecretToTableProperties(tableDesc);
    }
    CompilationOpContext runtimeCtx = new CompilationOpContext();
    if (mrwork.getVectorMode()) {
      mo = new VectorMapOperator(runtimeCtx);
    } else {
      mo = new MapOperator(runtimeCtx);
    }
    mo.setConf(mrwork);
    // initialize map operator
    mo.initialize(job, null);
    mo.setChildren(job);
    l4j.info(mo.dump(0));
    // initialize map local work
    localWork = mrwork.getMapRedLocalWork();
    execContext.setLocalWork(localWork);
    MapredContext.init(true, new JobConf(jc));
    mo.passExecContext(execContext);
    mo.initializeLocalWork(jc);
    mo.initializeMapOperator(jc);
    if (localWork == null) {
      return;
    }
    // The following code is for mapjoin
    // initialize all the dummy ops
    l4j.info("Initializing dummy operator");
    List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
    for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
      dummyOp.passExecContext(execContext);
      dummyOp.initialize(jc, null);
    }
  } catch (Throwable e) {
    abort = true;
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      throw new RuntimeException("Map operator initialization failed", e);
    }
  }
}
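The loop over mrwork.getAliasToPartnInfo().values() near the top is the standard way to reach every PartitionDesc in a MapWork. A minimal sketch of the same traversal, assuming mrwork was obtained via Utilities.getMapWork(job); the debug logging is illustrative only:
for (PartitionDesc part : mrwork.getAliasToPartnInfo().values()) {
  TableDesc tableDesc = part.getTableDesc();
  // getPartSpec() exposes the partition column/value map carried by the descriptor.
  l4j.debug("table " + tableDesc.getTableName() + ", partition spec " + part.getPartSpec());
}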
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class CombineHiveInputFormat, method getSplits.
/**
 * Create Hive splits based on CombineFileSplit.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
  init(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  Path[] paths = getInputPaths(job);
  List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
  List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
  int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
      (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
  // If there are no input paths, numThreads can be 0.
  // In that case, Executors.newFixedThreadPool would fail.
  if (numThreads > 0) {
    try {
      Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
      for (int i = 0; i < paths.length; i++) {
        if (nonCombinablePathIndices.contains(i)) {
          nonCombinablePaths.add(paths[i]);
        } else {
          combinablePaths.add(paths[i]);
        }
      }
    } catch (Exception e) {
      LOG.error("Error checking non-combinable path", e);
      perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
      throw new IOException(e);
    }
  }
  // Store the previous value for the path specification
  String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
  LOG.debug("The received input paths are: [{}] against the property {}", oldPaths,
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
  // Process the normal splits
  if (nonCombinablePaths.size() > 0) {
    FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
    InputSplit[] splits = super.getSplits(job, numSplits);
    for (InputSplit split : splits) {
      result.add(split);
    }
  }
  // Process the combine splits
  if (combinablePaths.size() > 0) {
    FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
    Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null
        ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
    InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
    for (InputSplit split : splits) {
      result.add(split);
    }
  }
  // Restore the original path specification in case some application depends on it being set.
  if (oldPaths != null) {
    job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
  }
  // Clear work from the ThreadLocal after splits are generated, in case the thread is reused in a pool.
  Utilities.clearWorkMapForConf(job);
  if (result.isEmpty() && paths.length > 0 && job.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
    // If there are no inputs, the execution engine skips the operator tree.
    // To prevent that, an opaque ZeroRows input is added here when needed.
    result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(paths[0]),
        ZeroRowsInputFormat.class.getName()));
  }
  LOG.info("Number of all splits " + result.size());
  perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
  return result.toArray(new InputSplit[result.size()]);
}
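The combine branch needs a path-to-PartitionDesc map, taken from the cached field if present or from the MapWork stored in the job configuration otherwise. A minimal sketch of resolving a single path against that map, using the same classes as the snippets above; somePath is a placeholder for one of the input paths:
Map<Path, PartitionDesc> pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
// A plain map lookup only matches exact keys; getFromPathRecursively also matches parent directories.
PartitionDesc desc = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, somePath, null);
TableDesc tableDesc = desc.getTableDesc();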
Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.
The class CombineHiveRecordReader, method extractSinglePartSpec.
private PartitionDesc extractSinglePartSpec(CombineHiveInputSplit hsplit) throws IOException {
  PartitionDesc part = null;
  Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = new HashMap<>();
  for (Path path : hsplit.getPaths()) {
    PartitionDesc otherPart = HiveFileFormatUtils.getFromPathRecursively(pathToPartInfo, path, cache);
    LOG.debug("Found spec for {} {} from {}", path, otherPart, pathToPartInfo);
    if (part == null) {
      part = otherPart;
    } else if (otherPart != part) {
      // Assume we should have the exact same object.
      // TODO: we could also compare the schema and SerDe, and pass only those to the call
      // instead; most of the time these would be the same and LLAP IO can handle that.
      LOG.warn("Multiple partitions found; not going to pass a part spec to LLAP IO: {"
          + part.getPartSpec() + "} and {" + otherPart.getPartSpec() + "}");
      return null;
    }
  }
  return part;
}
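The identity comparison (otherPart != part) intentionally assumes the same PartitionDesc instance is returned for every path in the combined split. If value equality were wanted instead, the partition specs could be compared directly; the helper below is a hypothetical sketch, not part of the Hive code:
private static boolean samePartitionSpec(PartitionDesc a, PartitionDesc b) {
  if (a == b) {
    return true;
  }
  // getPartSpec() returns the partition column/value map of the descriptor.
  return a != null && b != null && java.util.Objects.equals(a.getPartSpec(), b.getPartSpec());
}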