Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.
Class SMBMapJoinOperator, method setUpFetchContexts.
private void setUpFetchContexts(String alias, MergeQueue mergeQueue) throws HiveException {
  mergeQueue.clearFetchContext();
  Path currentInputPath = getExecContext().getCurrentInputPath();
  BucketMapJoinContext bucketMatcherCxt = localWork.getBucketMapjoinContext();
  Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
  BucketMatcher bucketMatcher = ReflectionUtil.newInstance(bucketMatcherCls, null);
  getExecContext().setFileId(bucketMatcherCxt.createFileId(currentInputPath.toString()));
  LOG.info("set task id: " + getExecContext().getFileId());
  bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());
  List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputPath.toString(),
      bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
  mergeQueue.setupContext(aliasFiles);
}
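For orientation, here is a minimal, self-contained sketch of the lookup the BucketMatcher performs above. The nested-map shape mirrors aliasBucketFileNameMapping (small-table alias -> big-table bucket file -> small-table bucket files) as used throughout these examples; the resolveBucketFiles helper, the alias "b", and the HDFS paths are hypothetical and not part of Hive:

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class BucketLookupSketch {
  // alias -> big-table bucket file -> small-table bucket files
  // (the same shape as aliasBucketFileNameMapping in the snippets above)
  static List<String> resolveBucketFiles(Map<String, Map<String, List<String>>> mapping,
                                         String smallAlias, String currentBigFile) {
    Map<String, List<String>> perBigFile = mapping.get(smallAlias);
    if (perBigFile == null) {
      return Collections.emptyList();
    }
    return perBigFile.getOrDefault(currentBigFile, Collections.emptyList());
  }

  public static void main(String[] args) {
    Map<String, Map<String, List<String>>> mapping = Map.of(
        "b", Map.of("hdfs://nn/tbl_a/000000_0", List.of("hdfs://nn/tbl_b/000000_0"),
                    "hdfs://nn/tbl_a/000001_0", List.of("hdfs://nn/tbl_b/000001_0")));
    // For the current big-table split, fetch only the matching small-table bucket file.
    System.out.println(resolveBucketFiles(mapping, "b", "hdfs://nn/tbl_a/000000_0"));
  }
}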
Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.
Class SparkHashTableSinkOperator, method flushToFile.
protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
  MapredLocalWork localWork = getExecContext().getLocalWork();
  BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
  Path inputPath = getExecContext().getCurrentInputPath();
  String bigInputPath = null;
  if (inputPath != null && mapJoinCtx != null) {
    Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
    bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
  }
  // get tmp file URI
  Path tmpURI = localWork.getTmpHDFSPath();
  LOG.info("Temp URI for side table: " + tmpURI);
  // get current bucket file name
  String fileName = localWork.getBucketFileName(bigInputPath);
  // get the tmp URI path; it will be a hdfs path if not local mode
  String dumpFilePrefix = conf.getDumpFilePrefix();
  Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
  FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
  // Create the folder and its parents if not there
  fs.mkdirs(path);
  while (true) {
    path = new Path(path, getOperatorId() + "-" + Math.abs(ThreadLocalRandom.current().nextInt()));
    try {
      // This will guarantee file name uniqueness.
      if (fs.createNewFile(path)) {
        break;
      }
    } catch (FileExistsException e) {
      // No problem, use a new name
    }
  }
  htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag
      + " with group count: " + tableContainer.size() + " into file: " + path);
  try {
    // get the hashtable file and path
    OutputStream os = null;
    ObjectOutputStream out = null;
    MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
    try {
      os = fs.create(path, numReplication);
      out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
      mapJoinTableSerde.persist(out, tableContainer);
    } finally {
      if (out != null) {
        out.close();
      } else if (os != null) {
        os.close();
      }
    }
    FileStatus status = fs.getFileStatus(path);
    htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path
        + " (" + status.getLen() + " bytes)");
  } catch (Exception e) {
    // Failed to dump the side-table, remove the partial file
    try {
      fs.delete(path, false);
    } catch (Exception ex) {
      LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag + ", file " + path, ex);
    }
    throw e;
  }
  tableContainer.clear();
}
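The retry loop above relies on an atomic create-new-file call to pick a unique dump file name. Below is a minimal local-filesystem analogue of that pattern, using java.nio as a stand-in for the HDFS calls; the operator id string and temp directory are made up for the example:

import java.io.IOException;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.ThreadLocalRandom;

public class UniqueDumpFileSketch {
  // Keep trying random suffixes until an atomic create succeeds, mirroring the
  // createNewFile() retry loop above (local filesystem stand-in for the HDFS API).
  static Path createUniqueDumpFile(Path dir, String operatorId) throws IOException {
    while (true) {
      Path candidate = dir.resolve(operatorId + "-" + Math.abs(ThreadLocalRandom.current().nextInt()));
      try {
        return Files.createFile(candidate); // atomic: fails if the name already exists
      } catch (FileAlreadyExistsException e) {
        // collision with a concurrent attempt; pick a new random name and retry
      }
    }
  }

  public static void main(String[] args) throws IOException {
    Path dir = Files.createTempDirectory("hashtable-dump");
    System.out.println("dump file: " + createUniqueDumpFile(dir, "HASHTABLESINK_7"));
  }
}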
Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.
Class MapredLocalTask, method setUpFetchOpContext.
private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile) throws Exception {
  BucketMapJoinContext bucketMatcherCxt = this.work.getBucketMapjoinContext();
  Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
  BucketMatcher bucketMatcher = ReflectionUtils.newInstance(bucketMatcherCls, null);
  bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());
  List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputFile,
      bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
  fetchOp.setupContext(aliasFiles);
}
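Note that the plan carries only the matcher's Class object; the runtime instantiates it reflectively (as ReflectionUtils.newInstance does above). A small stand-alone sketch of that pattern, with a hypothetical Matcher interface and EchoMatcher implementation standing in for BucketMatcher/DefaultBucketMatcher:

import java.lang.reflect.Constructor;
import java.util.List;

public class MatcherInstantiationSketch {
  // Stand-in for the BucketMatcher contract carried in the plan as a Class object.
  interface Matcher {
    List<String> match(String currentInputFile, String bigTableAlias, String smallAlias);
  }

  // Hypothetical trivial implementation used only for this sketch.
  static class EchoMatcher implements Matcher {
    @Override
    public List<String> match(String currentInputFile, String bigTableAlias, String smallAlias) {
      return List.of(currentInputFile);
    }
  }

  static Matcher newMatcher(Class<? extends Matcher> cls) throws ReflectiveOperationException {
    Constructor<? extends Matcher> ctor = cls.getDeclaredConstructor(); // requires a no-arg constructor
    ctor.setAccessible(true);
    return ctor.newInstance();
  }

  public static void main(String[] args) throws ReflectiveOperationException {
    Matcher m = newMatcher(EchoMatcher.class);
    System.out.println(m.match("hdfs://nn/tbl_a/000000_0", "a", "b"));
  }
}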
Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.
Class HashTableLoader, method load.
@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
  // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
  // currentInputPath will be null. But, since currentInputPath is only interesting
  // for the bucket join case, and for a bucket join the MJ operator will always be in
  // a MapWork, this should be OK.
  String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
  LOG.info("******* Load from HashTable for input file: " + currentInputPath);
  MapredLocalWork localWork = context.getLocalWork();
  try {
    if (localWork.getDirectFetchOp() != null) {
      loadDirectly(mapJoinTables, currentInputPath);
    }
    // All HashTables share the same base dir,
    // which is passed in as the tmp path
    Path baseDir = localWork.getTmpPath();
    if (baseDir == null) {
      return;
    }
    FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    boolean firstContainer = true;
    boolean useOptimizedContainer = !useFastContainer
        && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    for (int pos = 0; pos < mapJoinTables.length; pos++) {
      if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
        continue;
      }
      if (useOptimizedContainer) {
        MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
        ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
        if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
          if (firstContainer) {
            LOG.warn("Not using optimized table container. Only a subset of mapjoin keys is supported.");
            useOptimizedContainer = false;
            HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
          } else {
            throw new HiveException("Only a subset of mapjoin keys is supported.");
          }
        }
      }
      firstContainer = false;
      String bigInputPath = currentInputPath;
      if (currentInputPath != null && mapJoinCtx != null) {
        if (!desc.isBucketMapJoin()) {
          bigInputPath = null;
        } else {
          Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
          String alias = aliases.iterator().next();
          // Any one small table input path
          String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
          bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
        }
      }
      String fileName = localWork.getBucketFileName(bigInputPath);
      Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
      mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
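The loader has to reconstruct exactly the dump path that SparkHashTableSinkOperator wrote, from (baseDir, dumpFilePrefix, tag/position, bucketFileName). A toy sketch of that idea follows; the naming scheme here is made up (the real layout is whatever Utilities.generatePath produces), and the point is only that writer and reader must derive the same path from the same inputs:

import java.nio.file.Path;
import java.nio.file.Paths;

public class DumpPathSketch {
  // Hypothetical naming scheme illustrating the writer/reader agreement;
  // not the actual Utilities.generatePath layout.
  static Path dumpPath(Path baseDir, String dumpFilePrefix, byte tag, String bucketFileName) {
    return baseDir.resolve(dumpFilePrefix + "-" + tag).resolve(bucketFileName);
  }

  public static void main(String[] args) {
    Path base = Paths.get("/tmp/hive-local"); // stand-in for localWork.getTmpPath()
    System.out.println(dumpPath(base, "mapjoin", (byte) 1, "000000_0"));
  }
}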
Use of org.apache.hadoop.hive.ql.plan.BucketMapJoinContext in project hive by apache.
Class SparkSortMergeJoinFactory, method setupBucketMapJoinInfo.
private static void setupBucketMapJoinInfo(MapWork plan, SMBMapJoinOperator currMapJoinOp) {
  if (currMapJoinOp != null) {
    Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
        currMapJoinOp.getConf().getAliasBucketFileNameMapping();
    if (aliasBucketFileNameMapping != null) {
      MapredLocalWork localPlan = plan.getMapRedLocalWork();
      if (localPlan == null) {
        localPlan = currMapJoinOp.getConf().getLocalWork();
      } else {
        // local plan is not null, we want to merge it into SMBMapJoinOperator's local work
        MapredLocalWork smbLocalWork = currMapJoinOp.getConf().getLocalWork();
        if (smbLocalWork != null) {
          localPlan.getAliasToFetchWork().putAll(smbLocalWork.getAliasToFetchWork());
          localPlan.getAliasToWork().putAll(smbLocalWork.getAliasToWork());
        }
      }
      if (localPlan == null) {
        return;
      }
      plan.setMapRedLocalWork(null);
      currMapJoinOp.getConf().setLocalWork(localPlan);
      BucketMapJoinContext bucketMJCxt = new BucketMapJoinContext();
      localPlan.setBucketMapjoinContext(bucketMJCxt);
      bucketMJCxt.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
      bucketMJCxt.setBucketFileNameMapping(currMapJoinOp.getConf().getBigTableBucketNumMapping());
      localPlan.setInputFileChangeSensitive(true);
      bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias());
      bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class);
      bucketMJCxt.setBigTablePartSpecToFileMapping(currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());
      plan.setUseBucketizedHiveInputFormat(true);
    }
  }
}
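The merge near the top of this method folds the SMB operator's local work into the existing local plan via Map.putAll, so when an alias appears in both, the SMB operator's entry wins. A plain-map illustration of that semantics, where the alias names and the "TS[n]" operator labels are placeholders invented for the example:

import java.util.HashMap;
import java.util.Map;

public class LocalWorkMergeSketch {
  public static void main(String[] args) {
    // Stand-ins for localPlan.getAliasToWork() and smbLocalWork.getAliasToWork().
    Map<String, String> planAliasToWork = new HashMap<>(Map.of("a", "TS[0]"));
    Map<String, String> smbAliasToWork = new HashMap<>(Map.of("b", "TS[1]", "a", "TS[2]"));
    // putAll keeps existing keys unless the SMB map also has them, in which case it overwrites.
    planAliasToWork.putAll(smbAliasToWork);
    System.out.println(planAliasToWork); // alias "a" now maps to TS[2], and "b" was added
  }
}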