use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.
the class SMBMapJoinOperator method joinFinalLeftData.
/*
 * This happens either when the input file of the big table changes or in
 * closeOp. It needs to fetch all the remaining data from the small tables
 * and try to join it.
 */
private void joinFinalLeftData() throws HiveException {
  RowContainer bigTblRowContainer = this.candidateStorage[this.posBigTable];
  boolean allFetchDone = allFetchDone();
  // let the small tables catch up with the big table
  while (bigTblRowContainer != null && bigTblRowContainer.rowCount() > 0 && !allFetchDone) {
    joinOneGroup();
    bigTblRowContainer = this.candidateStorage[this.posBigTable];
    allFetchDone = allFetchDone();
  }
  while (!allFetchDone) {
    List<Byte> ret = joinOneGroup();
    if (ret == null || ret.size() == 0) {
      break;
    }
    reportProgress();
    numMapRowsRead++;
    allFetchDone = allFetchDone();
  }
  boolean dataInCache = true;
  while (dataInCache) {
    for (byte pos = 0; pos < order.length; pos++) {
      if (this.foundNextKeyGroup[pos] && this.nextKeyWritables[pos] != null) {
        promoteNextGroupToCandidate(pos);
      }
    }
    joinOneGroup();
    dataInCache = false;
    for (byte pos = 0; pos < order.length; pos++) {
      if (this.candidateStorage[pos] != null && this.candidateStorage[pos].hasRows()) {
        dataInCache = true;
        break;
      }
    }
  }
}
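The method drains candidateStorage, an array with one RowContainer per table alias that buffers the rows of the current key group. A minimal sketch of the iteration contract those loops rely on, assuming the standard first()/next() cursor of Hive's AbstractRowContainer (the process() consumer is a hypothetical placeholder):

private void drainGroup(RowContainer<List<Object>> group) throws HiveException {
  // first() rewinds the container (reading back any spilled blocks);
  // next() returns null once the group is exhausted.
  for (List<Object> row = group.first(); row != null; row = group.next()) {
    process(row); // hypothetical consumer of one buffered row
  }
}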
use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.
the class SkewJoinHandler method initiliaze.
public void initiliaze(Configuration hconf) {
  this.hconf = hconf;
  JoinDesc desc = joinOp.getConf();
  skewKeyDefinition = desc.getSkewKeyDefinition();
  skewKeysTableObjectInspector = new HashMap<Byte, StructObjectInspector>(numAliases);
  tblDesc = desc.getSkewKeysValuesTables();
  tblSerializers = new HashMap<Byte, AbstractSerDe>(numAliases);
  bigKeysExistingMap = new HashMap<Byte, Boolean>(numAliases);
  taskId = Utilities.getTaskId(hconf);
  int[][] filterMap = desc.getFilterMap();
  for (int i = 0; i < numAliases; i++) {
    Byte alias = conf.getTagOrder()[i];
    List<ObjectInspector> skewTableKeyInspectors = new ArrayList<ObjectInspector>();
    StructObjectInspector soi = (StructObjectInspector) joinOp.inputObjInspectors[alias];
    StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY.toString());
    List<? extends StructField> keyFields = ((StructObjectInspector) sf.getFieldObjectInspector()).getAllStructFieldRefs();
    int keyFieldSize = keyFields.size();
    for (int k = 0; k < keyFieldSize; k++) {
      skewTableKeyInspectors.add(keyFields.get(k).getFieldObjectInspector());
    }
    TableDesc joinKeyDesc = desc.getKeyTableDesc();
    List<String> keyColNames = Utilities.getColumnNames(joinKeyDesc.getProperties());
    StructObjectInspector structTblKeyInpector = ObjectInspectorFactory.getStandardStructObjectInspector(keyColNames, skewTableKeyInspectors);
    try {
      AbstractSerDe serializer = (AbstractSerDe) ReflectionUtils.newInstance(tblDesc.get(alias).getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(serializer, null, tblDesc.get(alias).getProperties(), null);
      tblSerializers.put((byte) i, serializer);
    } catch (SerDeException e) {
      LOG.error("Skewjoin will be disabled due to " + e.getMessage(), e);
      joinOp.handleSkewJoin = false;
      break;
    }
    boolean hasFilter = filterMap != null && filterMap[i] != null;
    TableDesc valTblDesc = JoinUtil.getSpillTableDesc(alias, joinOp.spillTableDesc, conf, !hasFilter);
    List<String> valColNames = new ArrayList<String>();
    if (valTblDesc != null) {
      valColNames = Utilities.getColumnNames(valTblDesc.getProperties());
    }
    StructObjectInspector structTblValInpector = ObjectInspectorFactory.getStandardStructObjectInspector(valColNames, joinOp.joinValuesStandardObjectInspectors[i]);
    StructObjectInspector structTblInpector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays.asList(structTblValInpector, structTblKeyInpector));
    skewKeysTableObjectInspector.put((byte) i, structTblInpector);
  }
  // reset each RowContainer's SerDe, ObjectInspector, and TableDesc
  for (int i = 0; i < numAliases; i++) {
    Byte alias = conf.getTagOrder()[i];
    RowContainer<ArrayList<Object>> rc = (RowContainer) joinOp.storage[i];
    if (rc != null) {
      rc.setSerDe(tblSerializers.get((byte) i), skewKeysTableObjectInspector.get((byte) i));
      rc.setTableDesc(tblDesc.get(alias));
    }
  }
}
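The final loop re-targets each alias's RowContainer at the skew-join spill tables. A hedged wiring sketch using only the setSerDe/setTableDesc/addRow calls seen in these snippets (blockSize, serde, standardOI, and spillDesc are illustrative names, not from the original):

// A RowContainer keeps on the order of blockSize rows in memory; the SerDe,
// ObjectInspector, and TableDesc come into play once overflow rows spill to disk.
RowContainer<List<Object>> rc = new RowContainer<List<Object>>(blockSize, hconf, reporter);
rc.setSerDe(serde, standardOI);   // SerDe plus matching ObjectInspector, as built in initiliaze()
rc.setTableDesc(spillDesc);       // describes the on-disk layout of spilled rows
rc.addRow(row);                   // buffered in memory until the block fills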
use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.
the class JoinUtil method getRowContainer.
public static RowContainer<List<Object>> getRowContainer(Configuration hconf, List<ObjectInspector> structFieldObjectInspectors, Byte alias, int containerSize, TableDesc[] spillTableDesc, JoinDesc conf, boolean noFilter, Reporter reporter) throws HiveException {
  TableDesc tblDesc = JoinUtil.getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
  AbstractSerDe serde = JoinUtil.getSpillSerDe(alias, spillTableDesc, conf, noFilter);
  if (serde == null) {
    containerSize = -1;
  }
  RowContainer<List<Object>> rc = new RowContainer<List<Object>>(containerSize, hconf, reporter);
  StructObjectInspector rcOI = null;
  if (tblDesc != null) {
    // arbitrary column names used internally for serializing to the spill table
    List<String> colNames = Utilities.getColumnNames(tblDesc.getProperties());
    // object inspector for serializing input tuples
    rcOI = ObjectInspectorFactory.getStandardStructObjectInspector(colNames, structFieldObjectInspectors);
  }
  rc.setSerDe(serde, rcOI);
  rc.setTableDesc(tblDesc);
  return rc;
}
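A hedged call-site sketch for getRowContainer (argument values are illustrative; hconf, oiList, spillTableDesc, joinDesc, and reporter are assumed to be prepared by the enclosing join operator):

// alias (byte) 0 with a 100-row in-memory block; getRowContainer itself
// falls back to containerSize = -1 when no spill SerDe is configured.
RowContainer<List<Object>> rc = JoinUtil.getRowContainer(
    hconf, oiList, (byte) 0, 100, spillTableDesc, joinDesc,
    true /* noFilter */, reporter);
rc.addRow(Arrays.asList(keyObject, valueObject)); // keyObject/valueObject are placeholders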
use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.
the class SkewJoinHandler method endGroup.
void endGroup() throws IOException, HiveException {
  if (skewKeyInCurrentGroup) {
    Path specPath = conf.getBigKeysDirMap().get((byte) currBigKeyTag);
    RowContainer<ArrayList<Object>> bigKey = (RowContainer) joinOp.storage[currBigKeyTag];
    Path outputPath = getOperatorOutputPath(specPath);
    FileSystem destFs = outputPath.getFileSystem(hconf);
    bigKey.copyToDFSDirecory(destFs, outputPath);
    for (int i = 0; i < numAliases; i++) {
      if (((byte) i) == currBigKeyTag) {
        continue;
      }
      RowContainer<ArrayList<Object>> values = (RowContainer) joinOp.storage[i];
      if (values != null) {
        specPath = conf.getSmallKeysDirMap().get((byte) currBigKeyTag).get((byte) i);
        values.copyToDFSDirecory(destFs, getOperatorOutputPath(specPath));
      }
    }
  }
  skewKeyInCurrentGroup = false;
}
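endGroup hands a skewed key group off to a follow-up task by persisting the buffered rows to HDFS; copyToDFSDirecory is the method's real, if misspelled, name in Hive's RowContainer. An illustrative condensed sequence (tag is a placeholder alias index):

RowContainer<ArrayList<Object>> rows = (RowContainer) joinOp.storage[tag];
Path dest = getOperatorOutputPath(conf.getBigKeysDirMap().get((byte) tag));
rows.copyToDFSDirecory(dest.getFileSystem(hconf), dest); // copy spill file(s) to DFS
rows.clearRows(); // presumably reset before the next key group (assumption, not shown above)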
use of org.apache.hadoop.hive.ql.exec.persistence.RowContainer in project hive by apache.
the class CommonMergeJoinOperator method joinFinalLeftData.
private void joinFinalLeftData() throws HiveException {
  @SuppressWarnings("rawtypes")
  RowContainer bigTblRowContainer = this.candidateStorage[this.posBigTable];
  boolean allFetchDone = allFetchDone();
  // let the small tables catch up with the big table
  while (bigTblRowContainer != null && bigTblRowContainer.rowCount() > 0 && !allFetchDone) {
    joinOneGroup();
    bigTblRowContainer = this.candidateStorage[this.posBigTable];
    allFetchDone = allFetchDone();
  }
  while (!allFetchDone) {
    List<Byte> ret = joinOneGroup();
    // if we are in the closeOp phase, we have definitely exhausted the big table input
    fetchDone[posBigTable] = true;
    // first, handle the case where the first fetch was never done (big table is empty)
    doFirstFetchIfNeeded();
    // in case of outer joins, we need to pull in records from the sides we still
    // need to produce output for apart from the big table, e.g. a full outer join.
    // TODO: this reproduces the logic of the loop that was here before, assuming
    // firstFetchHappened == true. In reality it almost always calls joinOneGroup. Fix it?
    int lastPos = (fetchDone.length - 1);
    if (posBigTable != lastPos && (fetchInputAtClose.contains(lastPos)) && (fetchDone[lastPos] == false)) {
      // do the join; it fetches the next row groups itself
      ret = joinOneGroup();
    }
    if (ret == null || ret.size() == 0) {
      break;
    }
    reportProgress();
    numMapRowsRead++;
    allFetchDone = allFetchDone();
  }
  boolean dataInCache = true;
  while (dataInCache) {
    for (byte pos = 0; pos < order.length; pos++) {
      if (this.foundNextKeyGroup[pos] && this.nextKeyWritables[pos] != null) {
        fetchNextGroup(pos);
      }
    }
    joinOneGroup();
    dataInCache = false;
    for (byte pos = 0; pos < order.length; pos++) {
      if (candidateStorage[pos] == null) {
        continue;
      }
      if (this.candidateStorage[pos].hasRows()) {
        dataInCache = true;
        break;
      }
    }
  }
}
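The trailing while (dataInCache) loop in both joinFinalLeftData variants terminates only when no candidate container still buffers rows. A sketch of that shared check, built on the hasRows() call seen above (anyRowsBuffered is not Hive's actual helper, just an extraction of the loop):

// Returns true while any per-alias container still holds rows of the
// current (or promoted) key group.
private static boolean anyRowsBuffered(RowContainer<?>[] candidateStorage) throws HiveException {
  for (RowContainer<?> rc : candidateStorage) {
    if (rc != null && rc.hasRows()) {
      return true;
    }
  }
  return false;
}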