Example 1 with RowContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.

From the class SMBMapJoinOperator, method joinFinalLeftData.

/*
   * This runs either when the input file of the big table changes or during
   * closeOp(). It fetches all remaining rows from the small tables and tries
   * to join them.
   */
private void joinFinalLeftData() throws HiveException {
    RowContainer bigTblRowContainer = this.candidateStorage[this.posBigTable];
    boolean allFetchDone = allFetchDone();
    // while the big table still has buffered rows, let the small tables catch up
    while (bigTblRowContainer != null && bigTblRowContainer.rowCount() > 0 && !allFetchDone) {
        joinOneGroup();
        bigTblRowContainer = this.candidateStorage[this.posBigTable];
        allFetchDone = allFetchDone();
    }
    while (!allFetchDone) {
        List<Byte> ret = joinOneGroup();
        if (ret == null || ret.size() == 0) {
            break;
        }
        reportProgress();
        numMapRowsRead++;
        allFetchDone = allFetchDone();
    }
    boolean dataInCache = true;
    while (dataInCache) {
        for (byte pos = 0; pos < order.length; pos++) {
            if (this.foundNextKeyGroup[pos] && this.nextKeyWritables[pos] != null) {
                promoteNextGroupToCandidate(pos);
            }
        }
        joinOneGroup();
        dataInCache = false;
        for (byte pos = 0; pos < order.length; pos++) {
            if (this.candidateStorage[pos] != null && this.candidateStorage[pos].hasRows()) {
                dataInCache = true;
                break;
            }
        }
    }
}
Also used: RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer)
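
Both drain loops above terminate on allFetchDone(), which is not shown. A minimal sketch of such a helper, assuming one boolean flag per input table (the fetchDone array visible in Example 5); the body is an illustration, not the Hive source:

private boolean allFetchDone() {
    // Sketch under the stated assumption: one fetch-done flag per input table.
    for (boolean done : fetchDone) {
        if (!done) {
            // at least one input still has key groups left to fetch
            return false;
        }
    }
    return true;
}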

Example 2 with RowContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.

From the class SkewJoinHandler, method initiliaze (sic, the misspelling is in the Hive source).

public void initiliaze(Configuration hconf) {
    this.hconf = hconf;
    JoinDesc desc = joinOp.getConf();
    skewKeyDefinition = desc.getSkewKeyDefinition();
    skewKeysTableObjectInspector = new HashMap<Byte, StructObjectInspector>(numAliases);
    tblDesc = desc.getSkewKeysValuesTables();
    tblSerializers = new HashMap<Byte, AbstractSerDe>(numAliases);
    bigKeysExistingMap = new HashMap<Byte, Boolean>(numAliases);
    taskId = Utilities.getTaskId(hconf);
    int[][] filterMap = desc.getFilterMap();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = conf.getTagOrder()[i];
        List<ObjectInspector> skewTableKeyInspectors = new ArrayList<ObjectInspector>();
        StructObjectInspector soi = (StructObjectInspector) joinOp.inputObjInspectors[alias];
        StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY.toString());
        List<? extends StructField> keyFields = ((StructObjectInspector) sf.getFieldObjectInspector()).getAllStructFieldRefs();
        int keyFieldSize = keyFields.size();
        for (int k = 0; k < keyFieldSize; k++) {
            skewTableKeyInspectors.add(keyFields.get(k).getFieldObjectInspector());
        }
        TableDesc joinKeyDesc = desc.getKeyTableDesc();
        List<String> keyColNames = Utilities.getColumnNames(joinKeyDesc.getProperties());
        StructObjectInspector structTblKeyInpector = ObjectInspectorFactory.getStandardStructObjectInspector(keyColNames, skewTableKeyInspectors);
        try {
            AbstractSerDe serializer = (AbstractSerDe) ReflectionUtils.newInstance(tblDesc.get(alias).getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(serializer, null, tblDesc.get(alias).getProperties(), null);
            tblSerializers.put((byte) i, serializer);
        } catch (SerDeException e) {
            LOG.error("Skewjoin will be disabled due to " + e.getMessage(), e);
            joinOp.handleSkewJoin = false;
            break;
        }
        boolean hasFilter = filterMap != null && filterMap[i] != null;
        TableDesc valTblDesc = JoinUtil.getSpillTableDesc(alias, joinOp.spillTableDesc, conf, !hasFilter);
        List<String> valColNames = new ArrayList<String>();
        if (valTblDesc != null) {
            valColNames = Utilities.getColumnNames(valTblDesc.getProperties());
        }
        StructObjectInspector structTblValInpector = ObjectInspectorFactory.getStandardStructObjectInspector(valColNames, joinOp.joinValuesStandardObjectInspectors[i]);
        StructObjectInspector structTblInpector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays.asList(structTblValInpector, structTblKeyInpector));
        skewKeysTableObjectInspector.put((byte) i, structTblInpector);
    }
    // reset each RowContainer's SerDe, ObjectInspector, and TableDesc.
    for (int i = 0; i < numAliases; i++) {
        Byte alias = conf.getTagOrder()[i];
        RowContainer<ArrayList<Object>> rc = (RowContainer) joinOp.storage[i];
        if (rc != null) {
            rc.setSerDe(tblSerializers.get((byte) i), skewKeysTableObjectInspector.get((byte) i));
            rc.setTableDesc(tblDesc.get(alias));
        }
    }
}
Also used: RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), ArrayList (java.util.ArrayList), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), JoinDesc (org.apache.hadoop.hive.ql.plan.JoinDesc), SerDeException (org.apache.hadoop.hive.serde2.SerDeException)
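
The last two factory calls in the loop define the spill-table row layout: value columns first, then key columns, merged by getUnionStructObjectInspector. A self-contained sketch of that pattern using the standard serde2 factories; the column names and field types here are made up for illustration:

import java.util.Arrays;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class UnionStructSketch {
    public static StructObjectInspector buildRowInspector() {
        // Hypothetical layout: one string value column and one long key column.
        StructObjectInspector valOI = ObjectInspectorFactory.getStandardStructObjectInspector(
            Arrays.asList("val0"),
            Arrays.<ObjectInspector>asList(PrimitiveObjectInspectorFactory.javaStringObjectInspector));
        StructObjectInspector keyOI = ObjectInspectorFactory.getStandardStructObjectInspector(
            Arrays.asList("key0"),
            Arrays.<ObjectInspector>asList(PrimitiveObjectInspectorFactory.javaLongObjectInspector));
        // Rows seen through the union inspector expose the fields [val0, key0] in order.
        return ObjectInspectorFactory.getUnionStructObjectInspector(Arrays.asList(valOI, keyOI));
    }
}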

Example 3 with RowContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.

From the class JoinUtil, method getRowContainer.

public static RowContainer<List<Object>> getRowContainer(Configuration hconf, List<ObjectInspector> structFieldObjectInspectors, Byte alias, int containerSize, TableDesc[] spillTableDesc, JoinDesc conf, boolean noFilter, Reporter reporter) throws HiveException {
    TableDesc tblDesc = JoinUtil.getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
    AbstractSerDe serde = JoinUtil.getSpillSerDe(alias, spillTableDesc, conf, noFilter);
    if (serde == null) {
        containerSize = -1;
    }
    RowContainer<List<Object>> rc = new RowContainer<List<Object>>(containerSize, hconf, reporter);
    StructObjectInspector rcOI = null;
    if (tblDesc != null) {
        // arbitrary column names used internally for serializing to spill table
        List<String> colNames = Utilities.getColumnNames(tblDesc.getProperties());
        // object inspector for serializing input tuples
        rcOI = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, structFieldObjectInspectors);
    }
    rc.setSerDe(serde, rcOI);
    rc.setTableDesc(tblDesc);
    return rc;
}
Also used: RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer), ArrayList (java.util.ArrayList), List (java.util.List), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
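
A hedged usage sketch of the returned container, following the write-then-scan pattern the join operators use. The addRow, first, next, and clearRows calls are from Hive's row-container API; the parameters mirror the method above and are assumed to be set up by the caller:

// Sketch only: hconf, inspectors, alias, spillTableDesc, joinConf and
// reporter are assumed to exist with the types getRowContainer expects.
static void scanExample(Configuration hconf, List<ObjectInspector> inspectors, Byte alias,
        TableDesc[] spillTableDesc, JoinDesc joinConf, Reporter reporter) throws HiveException {
    RowContainer<List<Object>> rc = JoinUtil.getRowContainer(
        hconf, inspectors, alias, 1000, spillTableDesc, joinConf, true, reporter);
    // rows buffer in memory and spill to the serialized spill table past containerSize
    rc.addRow(Arrays.<Object>asList("k1", 42));
    for (List<Object> row = rc.first(); row != null; row = rc.next()) {
        // process one stored row per iteration
    }
    // reuse the same container for the next key group
    rc.clearRows();
}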

Example 4 with RowContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.

From the class SkewJoinHandler, method endGroup (note that copyToDFSDirecory is the method's actual, misspelled name in the Hive source).

void endGroup() throws IOException, HiveException {
    if (skewKeyInCurrentGroup) {
        Path specPath = conf.getBigKeysDirMap().get((byte) currBigKeyTag);
        RowContainer<ArrayList<Object>> bigKey = (RowContainer) joinOp.storage[currBigKeyTag];
        Path outputPath = getOperatorOutputPath(specPath);
        FileSystem destFs = outputPath.getFileSystem(hconf);
        bigKey.copyToDFSDirecory(destFs, outputPath);
        for (int i = 0; i < numAliases; i++) {
            if (((byte) i) == currBigKeyTag) {
                continue;
            }
            RowContainer<ArrayList<Object>> values = (RowContainer) joinOp.storage[i];
            if (values != null) {
                specPath = conf.getSmallKeysDirMap().get((byte) currBigKeyTag).get((byte) i);
                values.copyToDFSDirecory(destFs, getOperatorOutputPath(specPath));
            }
        }
    }
    skewKeyInCurrentGroup = false;
}
Also used: Path (org.apache.hadoop.fs.Path), RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer), FileSystem (org.apache.hadoop.fs.FileSystem), ArrayList (java.util.ArrayList)
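
The helper getOperatorOutputPath is not shown above. A plausible sketch, assuming it nests a task-scoped temp path under the configured spec path; this body is a guess for illustration, not the verified Hive source:

// Assumed implementation: place this task's skew-join spill under a
// temp directory derived from the configured output path.
private Path getOperatorOutputPath(Path specPath) throws IOException {
    Path tmpPath = Utilities.toTempPath(specPath);
    return new Path(tmpPath, Utilities.toTempPath(taskId));
}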

Example 5 with RowContainer

Use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.

From the class CommonMergeJoinOperator, method joinFinalLeftData.

private void joinFinalLeftData() throws HiveException {
    @SuppressWarnings("rawtypes") RowContainer bigTblRowContainer = this.candidateStorage[this.posBigTable];
    boolean allFetchDone = allFetchDone();
    // while the big table still has buffered rows, let the small tables catch up
    while (bigTblRowContainer != null && bigTblRowContainer.rowCount() > 0 && !allFetchDone) {
        joinOneGroup();
        bigTblRowContainer = this.candidateStorage[this.posBigTable];
        allFetchDone = allFetchDone();
    }
    while (!allFetchDone) {
        List<Byte> ret = joinOneGroup();
        // if we are in close op phase, we have definitely exhausted the big table input
        fetchDone[posBigTable] = true;
        // First, handle the condition where the first fetch was never done (big table is empty).
        doFirstFetchIfNeeded();
        // in case of outer joins, we need to pull in records from the sides for which
        // we still need to produce output, apart from the big table, e.g. a full outer join
        // TODO: this reproduces the logic of the loop that was here before, assuming
        // firstFetchHappened == true. In reality it almost always calls joinOneGroup. Fix it?
        int lastPos = (fetchDone.length - 1);
        if (posBigTable != lastPos && (fetchInputAtClose.contains(lastPos)) && (fetchDone[lastPos] == false)) {
            // Do the join. It does fetching of next row groups itself.
            ret = joinOneGroup();
        }
        if (ret == null || ret.size() == 0) {
            break;
        }
        reportProgress();
        numMapRowsRead++;
        allFetchDone = allFetchDone();
    }
    boolean dataInCache = true;
    while (dataInCache) {
        for (byte pos = 0; pos < order.length; pos++) {
            if (this.foundNextKeyGroup[pos] && this.nextKeyWritables[pos] != null) {
                fetchNextGroup(pos);
            }
        }
        joinOneGroup();
        dataInCache = false;
        for (byte pos = 0; pos < order.length; pos++) {
            if (candidateStorage[pos] == null) {
                continue;
            }
            if (this.candidateStorage[pos].hasRows()) {
                dataInCache = true;
                break;
            }
        }
    }
}
Also used: RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer)
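
doFirstFetchIfNeeded() is not shown here. A minimal sketch of the first-fetch guard, reusing the fields from the snippet above (firstFetchHappened, fetchDone, order, fetchNextGroup); the body is an assumption for illustration, not the verified Hive source:

private void doFirstFetchIfNeeded() throws HiveException {
    // Assumed guard: fetch the first key group from every input exactly once,
    // so the drain loops have candidate rows to compare even if the big table
    // produced no rows at all.
    if (firstFetchHappened) {
        return;
    }
    firstFetchHappened = true;
    for (byte pos = 0; pos < order.length; pos++) {
        if (!fetchDone[pos]) {
            fetchNextGroup(pos);
        }
    }
}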

Aggregations

RowContainer (org.apache.hadoop.hive.ql.exec.persistence.RowContainer): 5 uses
ArrayList (java.util.ArrayList): 3 uses
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 2 uses
AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe): 2 uses
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2 uses
List (java.util.List): 1 use
FileSystem (org.apache.hadoop.fs.FileSystem): 1 use
Path (org.apache.hadoop.fs.Path): 1 use
JoinDesc (org.apache.hadoop.hive.ql.plan.JoinDesc): 1 use
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 1 use
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1 use
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 1 use