Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method completeInitializationOp.
@SuppressWarnings("unchecked")
@Override
protected void completeInitializationOp(Object[] os) throws HiveException {
  if (os.length != 0) {
    Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair =
        (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0];
    boolean spilled = false;
    for (MapJoinTableContainer container : pair.getLeft()) {
      if (container != null) {
        spilled = spilled || container.hasSpill();
      }
    }
    if (spilled) {
      // we can't use the cached table because it has spilled.
      loadHashTable(getExecContext(), MapredContext.get());
    } else {
      if (LOG.isDebugEnabled()) {
        String s = "Using tables from cache: [";
        for (MapJoinTableContainer c : pair.getLeft()) {
          s += ((c == null) ? "null" : c.getClass().getSimpleName()) + ", ";
        }
        LOG.debug(s + "]");
      }
      // let's use the table from the cache.
      mapJoinTables = pair.getLeft();
      mapJoinTableSerdes = pair.getRight();
    }
    hashTblInitedOnce = true;
  }
  if (this.getExecContext() != null) {
    // reset exec context so that initialization of the map operator happens
    // properly
    this.getExecContext().setLastInputPath(null);
    this.getExecContext().setCurrentInputPath(null);
  }
}
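completeInitializationOp picks up the hash tables and their serdes that an earlier task left in the object cache, and falls back to loadHashTable if any cached container has spilled to disk. Below is a minimal sketch of that contract, assuming the Pair type is org.apache.commons.lang3.tuple.Pair; the helper class and method names are illustrative only.

import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class CachedTablesSketch {

  // Bundle the loaded tables with the serdes that can rehydrate them, so a later
  // task in the same container can reuse both from the cache.
  static Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> bundle(
      MapJoinTableContainer[] tables, MapJoinTableContainerSerDe[] serdes) {
    return Pair.of(tables, serdes);
  }

  // The reuse test mirrors completeInitializationOp: a single spilled container
  // invalidates the whole cached pair.
  static boolean reusable(Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair) {
    for (MapJoinTableContainer container : pair.getLeft()) {
      if (container != null && container.hasSpill()) {
        return false;
      }
    }
    return true;
  }
}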
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class SparkHashTableSinkOperator, method flushToFile.
protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
  MapredLocalWork localWork = getExecContext().getLocalWork();
  BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
  Path inputPath = getExecContext().getCurrentInputPath();
  String bigInputPath = null;
  if (inputPath != null && mapJoinCtx != null) {
    Set<String> aliases =
        ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
    bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
  }
  // get tmp file URI
  Path tmpURI = localWork.getTmpHDFSPath();
  LOG.info("Temp URI for side table: " + tmpURI);
  // get current bucket file name
  String fileName = localWork.getBucketFileName(bigInputPath);
  // get the tmp URI path; it will be a hdfs path if not local mode
  String dumpFilePrefix = conf.getDumpFilePrefix();
  Path path = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
  FileSystem fs = path.getFileSystem(htsOperator.getConfiguration());
  // Create the folder and its parents if not there
  fs.mkdirs(path);
  while (true) {
    path = new Path(path, getOperatorId() + "-" + Math.abs(Utilities.randGen.nextInt()));
    try {
      // This will guarantee file name uniqueness.
      if (fs.createNewFile(path)) {
        break;
      }
    } catch (FileExistsException e) {
      // No problem, use a new name
    }
  }
  htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag
      + " with group count: " + tableContainer.size() + " into file: " + path);
  try {
    // get the hashtable file and path
    OutputStream os = null;
    ObjectOutputStream out = null;
    MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
    try {
      os = fs.create(path, numReplication);
      out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
      mapJoinTableSerde.persist(out, tableContainer);
    } finally {
      if (out != null) {
        out.close();
      } else if (os != null) {
        os.close();
      }
    }
    FileStatus status = fs.getFileStatus(path);
    htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path
        + " (" + status.getLen() + " bytes)");
  } catch (Exception e) {
    // Failed to dump the side-table, remove the partial file
    try {
      fs.delete(path, false);
    } catch (Exception ex) {
      LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag
          + ", file " + path, ex);
    }
    throw e;
  }
  tableContainer.clear();
}
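The core of flushToFile is the single persist call: the MapJoinTableContainerSerDe writes the small-table container through an ObjectOutputStream onto the chosen filesystem path. Below is a minimal sketch of just that step, assuming the container and serde were built elsewhere; the helper name and buffer size are illustrative, not part of the Hive API.

import java.io.BufferedOutputStream;
import java.io.ObjectOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class PersistSketch {
  // Write the small-table container to an HDFS (or local) path so the map-join
  // side can load it later; try-with-resources replaces the manual close logic.
  static void dump(FileSystem fs, Path path, MapJoinTableContainerSerDe serde,
      MapJoinPersistableTableContainer tableContainer) throws Exception {
    try (ObjectOutputStream out =
        new ObjectOutputStream(new BufferedOutputStream(fs.create(path), 4096))) {
      serde.persist(out, tableContainer);
    }
  }
}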
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method generateMapMetaData.
public void generateMapMetaData() throws HiveException {
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    AbstractSerDe keySerializer =
        (AbstractSerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
    for (int pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      TableDesc valueTableDesc;
      if (conf.getNoOuterJoin()) {
        valueTableDesc = conf.getValueTblDescs().get(pos);
      } else {
        valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
      }
      AbstractSerDe valueSerDe =
          (AbstractSerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext valueContext =
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
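generateMapMetaData builds one MapJoinObjectSerDeContext for the join keys and one per small-table value side, then pairs them into a MapJoinTableContainerSerDe per position. Below is a standalone sketch of that wiring, assuming LazyBinarySerDe and hypothetical one-column schemas; in the operator the serde class and properties come from the key and value TableDescs.

import java.util.Properties;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;

public class SerDeWiringSketch {
  static MapJoinTableContainerSerDe build() throws Exception {
    // Key side: no filter tag is ever appended to join keys.
    AbstractSerDe keySerde = new LazyBinarySerDe();
    Properties keyProps = new Properties();
    keyProps.setProperty(serdeConstants.LIST_COLUMNS, "joinkey");      // hypothetical schema
    keyProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int");
    SerDeUtils.initializeSerDe(keySerde, null, keyProps, null);

    // Value side: the boolean flag marks whether rows carry a trailing filter tag.
    AbstractSerDe valueSerde = new LazyBinarySerDe();
    Properties valueProps = new Properties();
    valueProps.setProperty(serdeConstants.LIST_COLUMNS, "joinvalue");  // hypothetical schema
    valueProps.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string");
    SerDeUtils.initializeSerDe(valueSerde, null, valueProps, null);

    return new MapJoinTableContainerSerDe(
        new MapJoinObjectSerDeContext(keySerde, false),
        new MapJoinObjectSerDeContext(valueSerde, false));
  }
}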
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class HashTableSinkOperator, method initializeOp.
@Override
@SuppressWarnings("unchecked")
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  boolean isSilent = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESESSIONSILENT);
  console = new LogHelper(LOG, isSilent);
  memoryExhaustionHandler = new MapJoinMemoryExhaustionHandler(console, conf.getHashtableMemoryUsage());
  emptyRowContainer.addRow(emptyObjectArray);
  // for small tables only; so get the big table position first
  posBigTableAlias = conf.getPosBigTable();
  order = conf.getTagOrder();
  // initialize some variables, which used to be initialized in CommonJoinOperator
  this.hconf = hconf;
  filterMaps = conf.getFilterMap();
  int tagLen = conf.getTagLength();
  // process join keys
  joinKeys = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias, hconf);
  joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys,
      inputObjInspectors, posBigTableAlias, tagLen);
  // process join values
  joinValues = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias, hconf);
  joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues,
      inputObjInspectors, posBigTableAlias, tagLen);
  // process join filters
  joinFilters = new List[tagLen];
  JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias, hconf);
  joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters,
      inputObjInspectors, posBigTableAlias, tagLen);
  if (!conf.isNoOuterJoin()) {
    for (Byte alias : order) {
      if (alias == posBigTableAlias || joinValues[alias] == null) {
        continue;
      }
      List<ObjectInspector> rcOIs = joinValuesObjectInspectors[alias];
      if (filterMaps != null && filterMaps[alias] != null) {
        // for each alias, add object inspector for filter tag as the last element
        rcOIs = new ArrayList<ObjectInspector>(rcOIs);
        rcOIs.add(PrimitiveObjectInspectorFactory.writableShortObjectInspector);
      }
    }
  }
  mapJoinTables = new MapJoinPersistableTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTableScale = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVEHASHTABLESCALE);
  if (hashTableScale <= 0) {
    hashTableScale = 1;
  }
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    AbstractSerDe keySerde =
        (AbstractSerDe) ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerde, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerde, false);
    for (Byte pos : order) {
      if (pos == posBigTableAlias) {
        continue;
      }
      mapJoinTables[pos] = new HashMapWrapper(hconf, -1);
      TableDesc valueTableDesc = conf.getValueTblFilteredDescs().get(pos);
      AbstractSerDe valueSerDe =
          (AbstractSerDe) ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext,
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos)));
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
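HashTableSinkOperator keeps two arrays indexed by join tag: a HashMapWrapper container and a MapJoinTableContainerSerDe for each small table, with the big-table position deliberately left null. Below is a minimal sketch of that per-tag layout, assuming the key and value contexts are already initialized; the class and parameter names are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class PerTagLayoutSketch {
  final MapJoinPersistableTableContainer[] tables;
  final MapJoinTableContainerSerDe[] serdes;

  PerTagLayoutSketch(Configuration hconf, int tagLen, int posBigTable,
      MapJoinObjectSerDeContext keyContext, MapJoinObjectSerDeContext[] valueContexts) {
    tables = new MapJoinPersistableTableContainer[tagLen];
    serdes = new MapJoinTableContainerSerDe[tagLen];
    for (int pos = 0; pos < tagLen; pos++) {
      if (pos == posBigTable) {
        continue; // the big table is streamed, never materialized as a hash table
      }
      tables[pos] = new HashMapWrapper(hconf, -1);  // in-memory small-table container
      serdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContexts[pos]);
    }
  }
}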
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe in project hive by apache.
The class MapJoinOperator, method initializeOp.
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  this.hconf = hconf;
  unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
  super.initializeOp(hconf);
  int tagLen = conf.getTagLength();
  // On Tez only: The hash map might already be cached in the container we run
  // the task in. On MR: The cache is a no-op.
  String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
  cacheKey = "HASH_MAP_" + this.getOperatorId() + "_container";
  cache = ObjectCacheFactory.getCache(hconf, queryId, false);
  loader = getHashTableLoader(hconf);
  bucketId = hconf.getInt(Constants.LLAP_BUCKET_ID, -1);
  numBuckets = hconf.getInt(Constants.LLAP_NUM_BUCKETS, -1);
  hashMapRowGetters = null;
  mapJoinTables = new MapJoinTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTblInitedOnce = false;
  // Reset grace hashjoin context so that there is no state maintained when operator/work is
  // retrieved from object cache
  hybridMapJoinLeftover = false;
  firstSmallTable = null;
  generateMapMetaData();
  isTestingNoHashTableLoad =
      HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVE_MAPJOIN_TESTING_NO_HASH_TABLE_LOAD);
  if (isTestingNoHashTableLoad) {
    return;
  }
  final ExecMapperContext mapContext = getExecContext();
  final MapredContext mrContext = MapredContext.get();
  if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
    /*
     * The issue with caching in case of bucket map join is that different tasks
     * process different buckets and if the container is reused to join a different bucket,
     * join results can be incorrect. The cache is keyed on operator id and for bucket map join
     * the operator does not change but data needed is different. For a proper fix, this
     * requires changes in the Tez API with regard to finding bucket id and
     * also ability to schedule tasks to re-use containers that have cached the specific bucket.
     */
    if (LOG.isDebugEnabled()) {
      LOG.debug("This is not bucket map join, so cache");
    }
    Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
        cache.retrieveAsync(cacheKey, () -> loadHashTable(mapContext, mrContext));
    asyncInitOperations.add(future);
  } else if (!isInputFileChangeSensitive(mapContext)) {
    loadHashTable(mapContext, mrContext);
    hashTblInitedOnce = true;
  }
}
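On Tez, the operator does not load the hash tables directly; it asks the ObjectCache for them asynchronously and only computes them on a cache miss. Below is a minimal sketch of that pattern using the ObjectCacheFactory and retrieveAsync calls shown above; loadTables is a hypothetical stand-in for loadHashTable(mapContext, mrContext), and the cache-key format mirrors the snippet.

import java.util.concurrent.Future;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.ObjectCache;
import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class AsyncCacheSketch {
  static Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> schedule(
      Configuration hconf, String queryId, String operatorId) throws Exception {
    // Key the cache on the operator id so a reused Tez container can find the tables.
    String cacheKey = "HASH_MAP_" + operatorId + "_container";
    ObjectCache cache = ObjectCacheFactory.getCache(hconf, queryId, false);
    // The loader callable runs only on a cache miss; on a hit the cached pair is returned.
    return cache.retrieveAsync(cacheKey, AsyncCacheSketch::loadTables);
  }

  // Hypothetical stand-in for MapJoinOperator.loadHashTable(mapContext, mrContext).
  static Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> loadTables() {
    return Pair.of(new MapJoinTableContainer[0], new MapJoinTableContainerSerDe[0]);
  }
}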