Example 1 with HashPartition

Use of org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition in project hive by apache.

From class MapJoinOperator, method spillBigTableRow:

/**
   * Postpone processing the big table row temporarily by spilling it to a row container
   * @param hybridHtContainer Hybrid hashtable container
   * @param row big table row
   */
protected void spillBigTableRow(MapJoinTableContainer hybridHtContainer, Object row) throws HiveException {
    HybridHashTableContainer ht = (HybridHashTableContainer) hybridHtContainer;
    int partitionId = ht.getToSpillPartitionId();
    HashPartition hp = ht.getHashPartitions()[partitionId];
    ObjectContainer bigTable = hp.getMatchfileObjContainer();
    bigTable.add(row);
}
Also used: HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition), ObjectContainer(org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer), HybridHashTableContainer(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer)
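
This spill pattern is easy to see in isolation. Below is a minimal, self-contained stand-in (hypothetical names throughout, not Hive API): each hash partition owns a row container, and a spilled big-table row is appended to the container of the partition currently chosen for spilling, just as spillBigTableRow appends to getMatchfileObjContainer().

import java.util.ArrayDeque;
import java.util.Deque;

// Hypothetical stand-in for the spill side of hybrid grace hash join:
// one row queue per hash partition, playing the role of
// HashPartition.getMatchfileObjContainer().
class SpillSketch {
    private final Deque<Object>[] partitions;

    @SuppressWarnings("unchecked")
    SpillSketch(int numPartitions) {
        partitions = new Deque[numPartitions];
        for (int i = 0; i < numPartitions; i++) {
            partitions[i] = new ArrayDeque<>();
        }
    }

    // Analogous to spillBigTableRow: park the row in the container of the
    // partition currently being spilled.
    void spillRow(int partitionId, Object row) {
        partitions[partitionId].add(row);
    }

    Deque<Object> partition(int partitionId) {
        return partitions[partitionId];
    }
}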

Example 2 with HashPartition

Use of org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition in project hive by apache.

From class MapJoinOperator, method reProcessBigTable:

/**
   * Iterate over the big table row container and feed process() with leftover rows
   * @param partitionId the partition from which to take out spilled big table rows
   * @throws HiveException
   */
protected void reProcessBigTable(int partitionId) throws HiveException {
    // For a binary join, firstSmallTable is the only small table, so it holds the
    // references to the spilled big table rows.
    // For an n-way join, we spill only once (while processing the first small table),
    // so again only firstSmallTable holds references to the spilled big table rows.
    HashPartition partition = firstSmallTable.getHashPartitions()[partitionId];
    ObjectContainer bigTable = partition.getMatchfileObjContainer();
    LOG.info("Hybrid Grace Hash Join: Going to process spilled big table rows in partition " + partitionId + ". Number of rows: " + bigTable.size());
    while (bigTable.hasNext()) {
        Object row = bigTable.next();
        process(row, conf.getPosBigTable());
    }
    bigTable.clear();
}
Also used: HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition), ObjectContainer(org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer)
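
Replaying a spilled partition is then a drain-and-clear loop. Continuing the hypothetical SpillSketch stand-in from Example 1 (again, not Hive API), the shape of reProcessBigTable is:

import java.util.Deque;
import java.util.function.Consumer;

// Hypothetical replay helper mirroring the hasNext()/next()/clear() loop
// in reProcessBigTable.
class ReplaySketch {
    static void replayPartition(SpillSketch sketch, int partitionId, Consumer<Object> process) {
        Deque<Object> rows = sketch.partition(partitionId);
        while (!rows.isEmpty()) {
            // Analogous to process(row, conf.getPosBigTable()) on each spilled row.
            process.accept(rows.poll());
        }
        rows.clear();  // free the spilled rows once they have been re-joined
    }
}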

Example 3 with HashPartition

Use of org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition in project hive by apache.

From class VectorMapJoinGenerateResultOperator, method reProcessBigTable:

@Override
protected void reProcessBigTable(int partitionId) throws HiveException {
    if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " reProcessBigTable enter...");
    }
    if (spillReplayBatch == null) {
        // The process method was not called -- no big table rows.
        return;
    }
    HashPartition partition = firstSmallTable.getHashPartitions()[partitionId];
    int rowCount = 0;
    int batchCount = 0;
    try {
        VectorMapJoinRowBytesContainer bigTable = partition.getMatchfileRowBytesContainer();
        bigTable.prepareForReading();
        while (bigTable.readNext()) {
            rowCount++;
            byte[] bytes = bigTable.currentBytes();
            int offset = bigTable.currentOffset();
            int length = bigTable.currentLength();
            bigTableVectorDeserializeRow.setBytes(bytes, offset, length);
            try {
                bigTableVectorDeserializeRow.deserialize(spillReplayBatch, spillReplayBatch.size);
            } catch (Exception e) {
                throw new HiveException("\nDeserializeRead detail: " + bigTableVectorDeserializeRow.getDetailedReadPositionString(), e);
            }
            spillReplayBatch.size++;
            if (spillReplayBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
                // call process once we have a full batch
                process(spillReplayBatch, posBigTable);
                spillReplayBatch.reset();
                batchCount++;
            }
        }
        // Process the final row batch, which has fewer than DEFAULT_SIZE rows
        if (spillReplayBatch.size > 0) {
            process(spillReplayBatch, posBigTable);
            spillReplayBatch.reset();
            batchCount++;
        }
        bigTable.clear();
    } catch (Exception e) {
        LOG.info(CLASS_NAME + " reProcessBigTable exception! " + e);
        throw new HiveException(e);
    }
    if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " reProcessBigTable exit! " + rowCount + " rows processed and " + batchCount + " batches processed");
    }
}
Also used: HiveException(org.apache.hadoop.hive.ql.metadata.HiveException), HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition), IOException(java.io.IOException), SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
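
The vectorized variant adds one twist: spilled rows are deserialized into a reusable batch, and process() runs only on full batches, plus one final partial batch. The following hypothetical sketch isolates that flush discipline, with a plain List standing in for VectorizedRowBatch and BATCH_SIZE for VectorizedRowBatch.DEFAULT_SIZE:

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

// Hypothetical sketch of batch-accumulate-and-flush replay.
class BatchReplaySketch {
    static final int BATCH_SIZE = 1024;  // stand-in for VectorizedRowBatch.DEFAULT_SIZE

    static void replay(Iterable<byte[]> spilledRows, Consumer<List<byte[]>> process) {
        List<byte[]> batch = new ArrayList<>(BATCH_SIZE);
        for (byte[] row : spilledRows) {
            batch.add(row);                  // analogous to deserializing into spillReplayBatch
            if (batch.size() == BATCH_SIZE) {
                process.accept(batch);       // full batch: hand it to the join
                batch.clear();               // analogous to spillReplayBatch.reset()
            }
        }
        if (!batch.isEmpty()) {
            process.accept(batch);           // flush the final batch with fewer than BATCH_SIZE rows
            batch.clear();
        }
    }
}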

Example 4 with HashPartition

Use of org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition in project hive by apache.

From class MapJoinOperator, method reloadHashTable:

/**
   * Reload the hash table from a hash partition.
   * This can involve two steps:
   * 1) Deserialize a serialized hash table, and
   * 2) Merge every key/value pair from the small table container into the hash table
   * @param pos position of the small table
   * @param partitionId the partition of the small table to reload from
   * @throws IOException
   * @throws HiveException
   * @throws SerDeException
   */
protected void reloadHashTable(byte pos, int partitionId) throws IOException, HiveException, SerDeException, ClassNotFoundException {
    HybridHashTableContainer container = (HybridHashTableContainer) mapJoinTables[pos];
    HashPartition partition = container.getHashPartitions()[partitionId];
    // Merge the sidefile into the newly created hash table
    // This is where the spilling may happen again
    LOG.info("Going to restore sidefile...");
    KeyValueContainer kvContainer = partition.getSidefileKVContainer();
    int rowCount = kvContainer.size();
    LOG.info("Hybrid Grace Hash Join: Number of rows restored from KeyValueContainer: " + kvContainer.size());
    // We're sure this part is smaller than the memory limit
    if (rowCount <= 0) {
        // rowCount is used later as the initialCapacity of a BytesBytesMultiHashMap,
        // which cannot be 0, so provide a reasonable positive number here
        rowCount = 1024 * 1024;
    }
    LOG.info("Going to restore hashmap...");
    BytesBytesMultiHashMap restoredHashMap = partition.getHashMapFromDisk(rowCount);
    rowCount += restoredHashMap.getNumValues();
    LOG.info("Hybrid Grace Hash Join: Deserializing spilled hash partition...");
    LOG.info("Hybrid Grace Hash Join: Number of rows in hashmap: " + rowCount);
    // The size of deserialized partition shouldn't exceed half of memory limit
    if (rowCount * container.getTableRowSize() >= container.getMemoryThreshold() / 2) {
        LOG.warn("Hybrid Grace Hash Join: Hash table cannot be reloaded since it" + " will be greater than memory limit. Recursive spilling is currently not supported");
    }
    KeyValueHelper writeHelper = container.getWriteHelper();
    while (kvContainer.hasNext()) {
        ObjectPair<HiveKey, BytesWritable> pair = kvContainer.next();
        Writable key = pair.getFirst();
        Writable val = pair.getSecond();
        writeHelper.setKeyValue(key, val);
        restoredHashMap.put(writeHelper, -1);
    }
    container.setTotalInMemRowCount(container.getTotalInMemRowCount() + restoredHashMap.getNumValues());
    kvContainer.clear();
    spilledMapJoinTables[pos] = new MapJoinBytesTableContainer(restoredHashMap);
    spilledMapJoinTables[pos].setInternalValueOi(container.getInternalValueOi());
    spilledMapJoinTables[pos].setSortableSortOrders(container.getSortableSortOrders());
    spilledMapJoinTables[pos].setNullMarkers(container.getNullMarkers());
    spilledMapJoinTables[pos].setNotNullMarkers(container.getNotNullMarkers());
}
Also used: HiveKey(org.apache.hadoop.hive.ql.io.HiveKey), KeyValueHelper(org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper), MapJoinBytesTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer), Writable(org.apache.hadoop.io.Writable), BytesWritable(org.apache.hadoop.io.BytesWritable), HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition), HybridHashTableContainer(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer), KeyValueContainer(org.apache.hadoop.hive.ql.exec.persistence.KeyValueContainer), BytesBytesMultiHashMap(org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap)
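
Conceptually the reload is "deserialize the on-disk hash table, then merge in the sidefile key/value pairs that arrived after the partition spilled". The hypothetical sketch below shows just the merge step, with a plain HashMap standing in for BytesBytesMultiHashMap:

import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of reloadHashTable's merge step: restoredFromDisk is the
// deserialized hash table (step 1); the loop merges the sidefile pairs (step 2),
// mirroring the kvContainer/writeHelper loop above.
class ReloadSketch {
    static Map<String, String> reload(Map<String, String> restoredFromDisk,
                                      Iterable<Map.Entry<String, String>> sidefile) {
        Map<String, String> merged = new HashMap<>(restoredFromDisk);
        for (Map.Entry<String, String> pair : sidefile) {
            merged.put(pair.getKey(), pair.getValue());
        }
        return merged;
    }
}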

Example 5 with HashPartition

Use of org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition in project hive by apache.

From class MapJoinOperator, method closeOp:

@Override
public void closeOp(boolean abort) throws HiveException {
    boolean spilled = false;
    for (MapJoinTableContainer container : mapJoinTables) {
        if (container != null) {
            spilled = spilled || container.hasSpill();
            container.dumpMetrics();
        }
    }
    // For Hybrid Grace Hash Join, we need to see if there is any spilled data to be processed next
    if (spilled) {
        if (!abort) {
            if (hashMapRowGetters == null) {
                hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
            }
            int numPartitions = 0;
            // Find out number of partitions for each small table (should be same across tables)
            for (byte pos = 0; pos < mapJoinTables.length; pos++) {
                if (pos != conf.getPosBigTable()) {
                    firstSmallTable = (HybridHashTableContainer) mapJoinTables[pos];
                    numPartitions = firstSmallTable.getHashPartitions().length;
                    break;
                }
            }
            assert numPartitions != 0 : "Number of partitions must be greater than 0!";
            if (firstSmallTable.hasSpill()) {
                spilledMapJoinTables = new MapJoinBytesTableContainer[mapJoinTables.length];
                hybridMapJoinLeftover = true;
                // Clear all in-memory partitions first
                for (byte pos = 0; pos < mapJoinTables.length; pos++) {
                    MapJoinTableContainer tableContainer = mapJoinTables[pos];
                    if (tableContainer instanceof HybridHashTableContainer) {
                        HybridHashTableContainer hybridHtContainer = (HybridHashTableContainer) tableContainer;
                        hybridHtContainer.dumpStats();
                        HashPartition[] hashPartitions = hybridHtContainer.getHashPartitions();
                        for (int i = 0; i < hashPartitions.length; i++) {
                            if (!hashPartitions[i].isHashMapOnDisk()) {
                                hybridHtContainer.setTotalInMemRowCount(hybridHtContainer.getTotalInMemRowCount() - hashPartitions[i].getHashMapFromMemory().getNumValues());
                                hashPartitions[i].getHashMapFromMemory().clear();
                            }
                        }
                        assert hybridHtContainer.getTotalInMemRowCount() == 0;
                    }
                }
                // Reprocess the spilled data
                for (int i = 0; i < numPartitions; i++) {
                    HashPartition[] hashPartitions = firstSmallTable.getHashPartitions();
                    if (hashPartitions[i].isHashMapOnDisk()) {
                        try {
                            // Re-process spilled data
                            continueProcess(i);
                        } catch (KryoException ke) {
                            LOG.error("Processing the spilled data failed due to Kryo error!");
                            LOG.error("Cleaning up all spilled data!");
                            cleanupGraceHashJoin();
                            throw new HiveException(ke);
                        } catch (Exception e) {
                            throw new HiveException(e);
                        }
                        for (byte pos = 0; pos < order.length; pos++) {
                            if (pos != conf.getPosBigTable())
                                spilledMapJoinTables[pos] = null;
                        }
                    }
                }
            }
        }
        if (isLogInfoEnabled) {
            LOG.info("spilled: " + spilled + " abort: " + abort + ". Clearing spilled partitions.");
        }
        // spilled tables are loaded always (no sharing), so clear it
        clearAllTableContainers();
        cache.remove(cacheKey);
    }
    // In the MapReduce case, we always need to clean up, as MapReduce doesn't have an object registry.
    if ((this.getExecContext() != null) && (this.getExecContext().getLocalWork() != null) && (this.getExecContext().getLocalWork().getInputFileChangeSensitive()) && !(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") && SparkUtilities.isDedicatedCluster(hconf))) {
        if (isLogInfoEnabled) {
            LOG.info("MR: Clearing all map join table containers.");
        }
        clearAllTableContainers();
    }
    this.loader = null;
    super.closeOp(abort);
}
Also used: KryoException(com.esotericsoftware.kryo.KryoException), HiveException(org.apache.hadoop.hive.ql.metadata.HiveException), HashPartition(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition), MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer), HybridHashTableContainer(org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer), IOException(java.io.IOException), SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
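
closeOp ties the pieces together: after the normal pass, it clears every in-memory partition, then reprocesses only the partitions whose hash map spilled to disk. A hypothetical condensation of that control flow:

import java.util.function.IntConsumer;

// Hypothetical condensation of the Hybrid Grace reprocessing loop in closeOp:
// visit every partition, reprocessing only those that spilled to disk.
class CloseOpSketch {
    static void reprocessSpilled(boolean[] hashMapOnDisk, IntConsumer continueProcess) {
        for (int i = 0; i < hashMapOnDisk.length; i++) {
            if (hashMapOnDisk[i]) {
                // continueProcess(i) reloads partition i's small-table hash map
                // (Example 4) and replays its spilled big-table rows (Example 2).
                continueProcess.accept(i);
            }
        }
    }
}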

Aggregations

HashPartition (org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition): 6
HybridHashTableContainer (org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer): 4
IOException (java.io.IOException): 2
ObjectContainer (org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer): 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 2
KryoException (com.esotericsoftware.kryo.KryoException): 1
BytesBytesMultiHashMap (org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap): 1
KeyValueContainer (org.apache.hadoop.hive.ql.exec.persistence.KeyValueContainer): 1
MapJoinBytesTableContainer (org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer): 1
KeyValueHelper (org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper): 1
MapJoinTableContainer (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer): 1
HiveKey (org.apache.hadoop.hive.ql.io.HiveKey): 1
Output (org.apache.hadoop.hive.serde2.ByteStream.Output): 1
BytesWritable (org.apache.hadoop.io.BytesWritable): 1
Writable (org.apache.hadoop.io.Writable): 1