Use of org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext in project hive by apache.
From class SparkHashTableSinkOperator, method flushToFile:
protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
  MapredLocalWork localWork = getExecContext().getLocalWork();
  BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
  Path inputPath = getExecContext().getCurrentInputPath();
  String bigInputPath = null;
  if (inputPath != null && mapJoinCtx != null) {
    Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
    bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
  }
  // get tmp file URI
  Path tmpURI = localWork.getTmpHDFSPath();
  LOG.info("Temp URI for side table: " + tmpURI);
  // get current bucket file name
  String fileName = localWork.getBucketFileName(bigInputPath);
  // get the tmp URI path; it will be a hdfs path if not local mode
  String dumpFilePrefix = conf.getDumpFilePrefix();
  Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
  FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
  short replication = fs.getDefaultReplication(path);
  // Create the folder and its parents if not there
  fs.mkdirs(path);
  while (true) {
    path = new Path(path, getOperatorId() + "-" + Math.abs(Utilities.randGen.nextInt()));
    try {
      // This will guarantee file name uniqueness.
      if (fs.createNewFile(path)) {
        break;
      }
    } catch (FileExistsException e) {
      // No problem, use a new name
    }
  }
  // TODO find out numOfPartitions for the big table
  int numOfPartitions = replication;
  replication = (short) Math.max(minReplication, numOfPartitions);
  htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag
      + " with group count: " + tableContainer.size() + " into file: " + path);
  try {
    // get the hashtable file and path
    OutputStream os = null;
    ObjectOutputStream out = null;
    MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
    try {
      os = fs.create(path, replication);
      out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
      mapJoinTableSerde.persist(out, tableContainer);
    } finally {
      if (out != null) {
        out.close();
      } else if (os != null) {
        os.close();
      }
    }
    FileStatus status = fs.getFileStatus(path);
    htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path
        + " (" + status.getLen() + " bytes)");
  } catch (Exception e) {
    // Failed to dump the side-table, remove the partial file
    try {
      fs.delete(path, false);
    } catch (Exception ex) {
      LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag + ", file " + path, ex);
    }
    throw e;
  }
  tableContainer.clear();
}
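The only SparkBucketMapJoinContext-specific step above is the posToAliasMap lookup that turns the join tag into a small-table alias, which getMappingBigFile then maps to the corresponding big-table bucket file. Below is a minimal sketch that isolates this resolution in a standalone helper; the class and method names are hypothetical, and only calls already visible in the snippet are used.

import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext;

// Hypothetical helper, not part of Hive: isolates the bucket-file-name
// resolution that flushToFile performs before building the dump path.
public final class BucketFileNameResolver {

  private BucketFileNameResolver() {
  }

  public static String resolve(MapredLocalWork localWork, Path currentInputPath, byte tag) {
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    String bigInputPath = null;
    if (currentInputPath != null && mapJoinCtx != null) {
      // posToAliasMap: join position (tag) -> aliases of the table at that position
      Set<String> aliases =
          ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
      // map the current input file to the matching big-table bucket file
      bigInputPath =
          mapJoinCtx.getMappingBigFile(aliases.iterator().next(), currentInputPath.toString());
    }
    // bucket file name used as the last component of the dump location
    return localWork.getBucketFileName(bigInputPath);
  }
}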
Use of org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext in project hive by apache.
From class HashTableLoader, method load:
@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
  // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
  // currentInputPath will be null. But, since currentInputPath is only interesting
  // for the bucket join case, and for bucket join the MJ operator will always be in
  // a MapWork, this should be OK.
  String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
  LOG.info("******* Load from HashTable for input file: " + currentInputPath);
  MapredLocalWork localWork = context.getLocalWork();
  try {
    if (localWork.getDirectFetchOp() != null) {
      loadDirectly(mapJoinTables, currentInputPath);
    }
    // All HashTables share the same base dir,
    // which is passed in as the tmp path
    Path baseDir = localWork.getTmpPath();
    if (baseDir == null) {
      return;
    }
    FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    boolean firstContainer = true;
    boolean useOptimizedContainer = !useFastContainer
        && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    for (int pos = 0; pos < mapJoinTables.length; pos++) {
      if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
        continue;
      }
      if (useOptimizedContainer) {
        MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
        ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
        if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
          if (firstContainer) {
            LOG.warn("Not using optimized table container. " + "Only a subset of mapjoin keys is supported.");
            useOptimizedContainer = false;
            HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
          } else {
            throw new HiveException("Only a subset of mapjoin keys is supported.");
          }
        }
      }
      firstContainer = false;
      String bigInputPath = currentInputPath;
      if (currentInputPath != null && mapJoinCtx != null) {
        if (!desc.isBucketMapJoin()) {
          bigInputPath = null;
        } else {
          Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
          String alias = aliases.iterator().next();
          // Any one small table input path
          String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
          bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
        }
      }
      String fileName = localWork.getBucketFileName(bigInputPath);
      Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
      mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
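Note how the read side mirrors the write side: both build the dump location with Utilities.generatePath from a shared tmp directory, the dump-file prefix, the join position, and the bucket file name, so the loader can find the directory that flushToFile populated. Below is a sketch that pulls the read-side resolution for one small-table position into a standalone helper; the class and method names are hypothetical, and only calls shown in the two snippets above are reused.

import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext;

// Hypothetical helper, not part of Hive: reconstructs, for one small-table
// position, the dump location that the loader reads and the sink wrote.
public final class SmallTableDumpPathResolver {

  private SmallTableDumpPathResolver() {
  }

  public static Path resolve(MapredLocalWork localWork, MapJoinDesc desc,
      String currentInputPath, int pos) {
    Path baseDir = localWork.getTmpPath();
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    String bigInputPath = currentInputPath;
    if (currentInputPath != null && mapJoinCtx != null) {
      if (!desc.isBucketMapJoin()) {
        bigInputPath = null;
      } else {
        // aliases of the small table at this join position
        Set<String> aliases =
            ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
        String alias = aliases.iterator().next();
        // any one small-table bucket file paired with the current big-table file
        String smallInputPath =
            mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
        // normalize back to the big-table bucket file name the sink side used
        bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
      }
    }
    String fileName = localWork.getBucketFileName(bigInputPath);
    // same four components as the sink: base dir, dump-file prefix, position (tag), bucket file name
    return Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
  }
}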