Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in the Apache Hive project.
Class MapJoinOperator, method completeInitializationOp.
/**
 * Finishes operator initialization using the objects handed over in {@code os}.
 *
 * <p>When {@code os} is non-empty, its first element is cast to a pair of
 * (table containers, container serdes). If any non-null container in the pair
 * reports a spill, the pair is ignored and the hash tables are loaded again;
 * otherwise the pair's arrays become this operator's tables and serdes. In
 * both cases the hash table is then flagged as initialized.
 *
 * <p>Finally, if an exec context is present, its last and current input paths
 * are reset to {@code null}.
 *
 * @param os results to consume; only {@code os[0]} is read
 *           (NOTE(review): presumably the outcome of the async init operations
 *           registered by initializeOp - confirm against the caller)
 * @throws HiveException declared by the method; within this body only
 *         {@code loadHashTable} is a visible candidate source
 */
@SuppressWarnings("unchecked")
@Override
protected void completeInitializationOp(Object[] os) throws HiveException {
  if (os.length > 0) {
    Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> cached =
        (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0];
    MapJoinTableContainer[] cachedTables = cached.getLeft();

    // Stop at the first container that reports a spill; nothing after it is queried.
    boolean anySpilled = false;
    for (MapJoinTableContainer table : cachedTables) {
      if (table != null && table.hasSpill()) {
        anySpilled = true;
        break;
      }
    }

    if (!anySpilled) {
      if (LOG.isDebugEnabled()) {
        StringBuilder names = new StringBuilder("Using tables from cache: [");
        for (MapJoinTableContainer table : cachedTables) {
          names.append(table == null ? "null" : table.getClass().getSimpleName()).append(", ");
        }
        LOG.debug(names.append("]").toString());
      }
      // No spill anywhere: adopt the cached tables and their serdes as-is.
      mapJoinTables = cachedTables;
      mapJoinTableSerdes = cached.getRight();
    } else {
      // A spilled table cannot be taken from the cache, so load it again.
      loadHashTable(getExecContext(), MapredContext.get());
    }
    hashTblInitedOnce = true;
  }

  if (this.getExecContext() != null) {
    // Reset the exec context so that initialization of the map operator
    // happens properly.
    this.getExecContext().setLastInputPath(null);
    this.getExecContext().setCurrentInputPath(null);
  }
}
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in the Apache Hive project.
Class MapJoinOperator, method canSkipJoinProcessing.
/**
 * Reports whether processing of the big-table rows can be skipped because a
 * loaded small-table hash table is empty.
 *
 * <p>Skipping is only considered when {@code canSkipReload(mapContext)} holds
 * and every join condition is an inner join (with at least one condition
 * present). Under those conditions, a single empty small table means the
 * inner join yields no rows.
 *
 * <p>Slots of {@code mapJoinTables} that are {@code null} are ignored: a table
 * that is not loaded cannot be shown to be empty, so it never triggers a skip.
 *
 * @param mapContext the exec mapper context, passed through to {@code canSkipReload}
 * @return {@code true} if some table other than the big table has size 0 and
 *         all preconditions above hold; {@code false} otherwise
 */
protected boolean canSkipJoinProcessing(ExecMapperContext mapContext) {
  if (!canSkipReload(mapContext)) {
    return false;
  }

  JoinCondDesc[] joinConds = getConf().getConds();
  if (joinConds.length == 0) {
    return false;
  }
  for (JoinCondDesc joinCond : joinConds) {
    if (joinCond.getType() != JoinDesc.INNER_JOIN) {
      return false;
    }
  }

  final int posBigTable = getConf().getPosBigTable();
  for (int idx = 0; idx < mapJoinTables.length; ++idx) {
    if (idx == posBigTable) {
      continue;
    }
    MapJoinTableContainer mapJoinTable = mapJoinTables[idx];
    if (mapJoinTable == null) {
      // Other methods of this operator treat slots as nullable; an absent
      // table gives no evidence of emptiness, so do not dereference it.
      continue;
    }
    if (mapJoinTable.size() == 0) {
      // If any table is empty, an inner join involving the tables should yield 0 rows.
      LOG.info("Hash table number " + idx + " is empty");
      return true;
    }
  }
  return false;
}
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in the Apache Hive project.
Class MapJoinOperator, method initializeOp.
/**
 * Initializes the map-join operator: stores the configuration, allocates the
 * per-tag arrays, resets the hash-table and grace-hash-join state, and starts
 * (or schedules) the loading of the small-table hash tables.
 *
 * <p>For joins that are neither bucket map joins nor dynamic-partition hash
 * joins, loading is submitted through the object cache under {@code cacheKey}
 * and the resulting future is added to {@code asyncInitOperations}. For the
 * other joins the tables are loaded immediately, but only when
 * {@code isInputFileChangeSensitive(mapContext)} is false.
 *
 * @param hconf configuration stored on the operator and used to obtain the
 *              query id, the object cache and the hash table loader
 * @throws HiveException declared by the method; raised by the calls made here
 */
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  this.hconf = hconf;
  unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
  super.initializeOp(hconf);

  final int tagCount = conf.getTagLength();

  // On Tez only: The hash map might already be cached in the container we run
  // the task in. On MR: The cache is a no-op.
  String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
  cacheKey = "HASH_MAP_" + this.getOperatorId() + "_container";
  cache = ObjectCacheFactory.getCache(hconf, queryId, false);
  loader = getHashTableLoader(hconf);

  hashMapRowGetters = null;
  mapJoinTables = new MapJoinTableContainer[tagCount];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagCount];
  hashTblInitedOnce = false;

  // Reset grace hashjoin context so that no state is carried over when the
  // operator/work is retrieved from the object cache.
  hybridMapJoinLeftover = false;
  firstSmallTable = null;

  generateMapMetaData();

  final ExecMapperContext mapContext = getExecContext();
  final MapredContext mrContext = MapredContext.get();

  if (conf.isBucketMapJoin() || conf.isDynamicPartitionHashJoin()) {
    // These joins bypass the cache; load right away unless the input file matters.
    if (!isInputFileChangeSensitive(mapContext)) {
      loadHashTable(mapContext, mrContext);
      hashTblInitedOnce = true;
    }
    return;
  }

  /*
   * The issue with caching in case of bucket map join is that different tasks
   * process different buckets and if the container is reused to join a different bucket,
   * join results can be incorrect. The cache is keyed on operator id and for bucket map join
   * the operator does not change but data needed is different. For a proper fix, this
   * requires changes in the Tez API with regard to finding bucket id and
   * also ability to schedule tasks to re-use containers that have cached the specific bucket.
   */
  if (isLogDebugEnabled) {
    LOG.debug("This is not bucket map join, so cache");
  }

  Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> tableLoader =
      new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() {
        @Override
        public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call() throws HiveException {
          return loadHashTable(mapContext, mrContext);
        }
      };
  Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> pendingTables =
      cache.retrieveAsync(cacheKey, tableLoader);
  asyncInitOperations.add(pendingTables);
}
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in the Apache Hive project.
Class MapJoinOperator, method closeOp.
/**
 * Closes the operator. Dumps metrics for every non-null table container and,
 * if any container reports a spill, handles the spilled data before clearing
 * the containers and removing the cache entry.
 *
 * <p>When a spill is present and {@code abort} is false, the first table that
 * is not the big table is taken as {@code firstSmallTable}. If that table has
 * spilled, the in-memory hash maps of all hybrid containers are cleared and
 * each on-disk partition is reprocessed through {@code continueProcess}.
 *
 * <p>Independently of spilling, the containers are also cleared when the exec
 * context's local work is input-file-change sensitive, except when the
 * execution engine is "spark" on a dedicated cluster. The loader reference is
 * then dropped and the superclass close is invoked.
 *
 * @param abort whether the operator is being closed abnormally; spilled data
 *              is only reprocessed when this is false
 * @throws HiveException wrapping any exception raised while reprocessing a
 *         spilled partition, or raised by the calls made here
 */
@Override
public void closeOp(boolean abort) throws HiveException {
  // Metrics are dumped for every non-null container; hasSpill() is no longer
  // queried once a spill has been seen.
  boolean anySpilled = false;
  for (MapJoinTableContainer table : mapJoinTables) {
    if (table == null) {
      continue;
    }
    if (!anySpilled) {
      anySpilled = table.hasSpill();
    }
    table.dumpMetrics();
  }

  // For Hybrid Grace Hash Join, we need to see if there is any spilled data to be processed next
  if (anySpilled) {
    if (!abort) {
      if (hashMapRowGetters == null) {
        hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
      }

      // Find out number of partitions for each small table (should be same across tables)
      int partitionCount = 0;
      for (byte pos = 0; pos < mapJoinTables.length; pos++) {
        if (pos == conf.getPosBigTable()) {
          continue;
        }
        firstSmallTable = (HybridHashTableContainer) mapJoinTables[pos];
        partitionCount = firstSmallTable.getHashPartitions().length;
        break;
      }
      assert partitionCount != 0 : "Number of partitions must be greater than 0!";

      if (firstSmallTable.hasSpill()) {
        spilledMapJoinTables = new MapJoinBytesTableContainer[mapJoinTables.length];
        hybridMapJoinLeftover = true;

        // Clear all in-memory partitions first
        for (byte pos = 0; pos < mapJoinTables.length; pos++) {
          MapJoinTableContainer candidate = mapJoinTables[pos];
          // instanceof is false for null, so no separate null test is needed.
          if (!(candidate instanceof HybridHashTableContainer)) {
            continue;
          }
          HybridHashTableContainer hybrid = (HybridHashTableContainer) candidate;
          hybrid.dumpStats();
          for (HashPartition partition : hybrid.getHashPartitions()) {
            if (partition.isHashMapOnDisk()) {
              continue;
            }
            // Subtract the rows about to be dropped, then drop them.
            hybrid.setTotalInMemRowCount(hybrid.getTotalInMemRowCount() - partition.getHashMapFromMemory().getNumValues());
            partition.getHashMapFromMemory().clear();
          }
          assert hybrid.getTotalInMemRowCount() == 0;
        }

        // Reprocess the spilled data
        for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
          // Fetched anew on every pass, as in the original loop.
          HashPartition[] partitions = firstSmallTable.getHashPartitions();
          if (!partitions[partitionId].isHashMapOnDisk()) {
            continue;
          }
          try {
            // Re-process spilled data
            continueProcess(partitionId);
          } catch (KryoException ke) {
            LOG.error("Processing the spilled data failed due to Kryo error!");
            LOG.error("Cleaning up all spilled data!");
            cleanupGraceHashJoin();
            throw new HiveException(ke);
          } catch (Exception e) {
            throw new HiveException(e);
          }
          // Drop the references for every position except the big table.
          for (byte pos = 0; pos < order.length; pos++) {
            if (pos != conf.getPosBigTable()) {
              spilledMapJoinTables[pos] = null;
            }
          }
        }
      }
    }

    if (isLogInfoEnabled) {
      LOG.info("spilled: " + anySpilled + " abort: " + abort + ". Clearing spilled partitions.");
    }

    // spilled tables are loaded always (no sharing), so clear it
    clearAllTableContainers();
    cache.remove(cacheKey);
  }

  // in mapreduce case, we need to always clear up as mapreduce doesn't have object registry.
  boolean clearForFileSensitiveWork = false;
  if (this.getExecContext() != null
      && this.getExecContext().getLocalWork() != null
      && this.getExecContext().getLocalWork().getInputFileChangeSensitive()) {
    boolean dedicatedSpark =
        HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")
            && SparkUtilities.isDedicatedCluster(hconf);
    clearForFileSensitiveWork = !dedicatedSpark;
  }
  if (clearForFileSensitiveWork) {
    if (isLogInfoEnabled) {
      LOG.info("MR: Clearing all map join table containers.");
    }
    clearAllTableContainers();
  }

  this.loader = null;
  super.closeOp(abort);
}
Use of org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer in the Apache Hive project.
Class HashTableLoader, method loadDirectly.
/**
 * Fills {@code mapJoinTables} by running the direct-fetch operators registered
 * for {@code joinOp} into a temporary hash sink.
 *
 * <p>Does nothing when the local work has no direct-fetch operators for the
 * join operator. Otherwise the operators are wired as parents of a
 * {@code TemporaryHashSinkOperator}, a local task forwards rows for
 * {@code inputFileName}, and the sink's tables are copied into
 * {@code mapJoinTables} at every position whose parent operator is non-null.
 * The sink's own table array is nulled out afterwards.
 *
 * @param mapJoinTables destination array, written in place
 * @param inputFileName input file name handed to {@code startForward}
 * @throws Exception declared by the method; raised by the calls made here
 */
private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName) throws Exception {
  MapredLocalWork localWork = context.getLocalWork();
  List<Operator<?>> fetchOps = localWork.getDirectFetchOp().get(joinOp);
  boolean nothingToFetch = (fetchOps == null) || fetchOps.isEmpty();
  if (nothingToFetch) {
    return;
  }

  JobConf job = new JobConf(hconf);
  MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);

  // Wire every direct-fetch operator into a temporary sink that collects the tables.
  HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
  List<Operator<? extends OperatorDesc>> sinkParents =
      new ArrayList<Operator<? extends OperatorDesc>>(fetchOps);
  sink.setParentOperators(sinkParents);
  for (Operator<?> fetchOp : fetchOps) {
    if (fetchOp == null) {
      continue;
    }
    // Each operator receives its own single-element child list.
    fetchOp.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
  }

  localTask.setExecContext(context);
  localTask.startForward(inputFileName);

  // Copy the collected tables for every position that has a parent operator.
  MapJoinTableContainer[] collected = sink.getMapJoinTables();
  final int parentCount = sink.getNumParent();
  List<Operator<? extends OperatorDesc>> parents = sink.getParentOperators();
  for (int pos = 0; pos < parentCount; pos++) {
    if (parents.get(pos) != null) {
      mapJoinTables[pos] = collected[pos];
    }
  }

  // Release the sink's references to the tables.
  Arrays.fill(collected, null);
}
Aggregations