use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
the class Utilities method createEmptyBuckets.
/**
 * Creates an empty bucket at each of the given paths.
 *
 * <p>The serializer and the Hive output format are resolved once from the FileSink's table
 * descriptor. Then, for every path, a record writer is obtained and closed straight away
 * without any row being written through it.
 *
 * @param hconf configuration that is copied into the {@code JobConf} handed to the record
 *          writers, and that is used to look up the Hive output format
 * @param paths the bucket paths for which an empty bucket is created
 * @param conf The definition of the FileSink.
 * @param reporter The mapreduce reporter object
 * @throws HiveException if the serializer cannot be instantiated or initialized
 * @throws IOException declared for I/O failures while creating or closing a record writer
 */
private static void createEmptyBuckets(Configuration hconf, List<Path> paths, FileSinkDesc conf, Reporter reporter) throws HiveException, IOException {
  // The production (JobConf) and test (plain Configuration) paths both need the very same
  // copy, so no instanceof dispatch is required.
  JobConf jc = new JobConf(hconf);
  boolean isCompressed = conf.getCompressed();
  TableDesc tableInfo = conf.getTableInfo();

  // Both locals are assigned inside the try; every catch rethrows, so they are definitely
  // assigned by the time the loop below runs.
  HiveOutputFormat<?, ?> hiveOutputFormat;
  Class<? extends Writable> outputClass;
  try {
    Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
    serializer.initialize(null, tableInfo.getProperties());
    outputClass = serializer.getSerializedClass();
    hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, tableInfo);
  } catch (SerDeException e) {
    throw new HiveException(e);
  } catch (InstantiationException e) {
    throw new HiveException(e);
  } catch (IllegalAccessException e) {
    throw new HiveException(e);
  }

  for (Path path : paths) {
    // Open a writer and close it immediately: nothing is written in between.
    // NOTE(review): the boolean passed to close() is presumably an "abort" flag — confirm.
    RecordWriter writer = HiveFileFormatUtils.getRecordWriter(jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path, reporter);
    writer.close(false);
    LOG.info("created empty bucket for enforcing bucketing at " + path);
  }
}
use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
the class ReduceSinkOperator method initializeOp.
/**
 * Initializes this reduce sink operator from its descriptor.
 *
 * <p>After delegating to the superclass, this resets the row counters, registers the
 * intermediate-records counter, builds evaluators for the key, value, partition and (when
 * present) bucket columns, instantiates and initializes the key and value serializers, and
 * sets up a top-N hash when a non-negative limit and a positive memory usage are configured.
 *
 * @param hconf configuration used for the logging interval, the counter name and the top-N hash
 * @throws HiveException declared by the signature; any exception raised inside the
 *           initialization body is logged and rethrown wrapped in a {@link RuntimeException}
 */
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  try {
    // Row accounting and progress logging.
    numRows = 0;
    cntr = 1;
    logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
    statsMap.put(getCounterName(Counter.RECORDS_OUT_INTERMEDIATE, hconf), recordCounter);

    // Key evaluators.
    List<ExprNodeDesc> keys = conf.getKeyCols();
    if (isLogDebugEnabled) {
      LOG.debug("keys size is " + keys.size());
      for (ExprNodeDesc key : keys) {
        LOG.debug("Key exprNodeDesc " + key.getExprString());
      }
    }
    keyEval = new ExprNodeEvaluator[keys.size()];
    for (int keyPos = 0; keyPos < keys.size(); keyPos++) {
      ExprNodeDesc keyExpr = keys.get(keyPos);
      // Remember where the constant carrying the bucket-number column name sits in the key.
      boolean isBucketNumberConstant = keyExpr instanceof ExprNodeConstantDesc
          && (BUCKET_NUMBER_COL_NAME).equals(((ExprNodeConstantDesc) keyExpr).getValue());
      if (isBucketNumberConstant) {
        buckColIdxInKeyForSdpo = keyPos;
      }
      keyEval[keyPos] = ExprNodeEvaluatorFactory.get(keyExpr);
    }

    numDistributionKeys = conf.getNumDistributionKeys();
    distinctColIndices = conf.getDistinctColumnIndices();
    numDistinctExprs = distinctColIndices.size();

    // Value evaluators.
    List<ExprNodeDesc> valueCols = conf.getValueCols();
    valueEval = new ExprNodeEvaluator[valueCols.size()];
    for (int valuePos = 0; valuePos < valueCols.size(); valuePos++) {
      valueEval[valuePos] = ExprNodeEvaluatorFactory.get(valueCols.get(valuePos));
    }

    // Partition evaluators: reuse the key evaluator when the expression is also a key.
    List<ExprNodeDesc> partitionCols = conf.getPartitionCols();
    partitionEval = new ExprNodeEvaluator[partitionCols.size()];
    for (int partPos = 0; partPos < partitionCols.size(); partPos++) {
      ExprNodeDesc partExpr = partitionCols.get(partPos);
      int keyIndex = ExprNodeDescUtils.indexOf(partExpr, keys);
      if (keyIndex < 0) {
        partitionEval[partPos] = ExprNodeEvaluatorFactory.get(partExpr);
      } else {
        partitionEval[partPos] = keyEval[keyIndex];
      }
    }

    // Bucket evaluators (optional), with the same key-evaluator reuse.
    List<ExprNodeDesc> bucketCols = conf.getBucketCols();
    if (bucketCols != null && !bucketCols.isEmpty()) {
      bucketEval = new ExprNodeEvaluator[bucketCols.size()];
      for (int bucketPos = 0; bucketPos < bucketCols.size(); bucketPos++) {
        ExprNodeDesc bucketExpr = bucketCols.get(bucketPos);
        int keyIndex = ExprNodeDescUtils.indexOf(bucketExpr, keys);
        if (keyIndex < 0) {
          bucketEval[bucketPos] = ExprNodeEvaluatorFactory.get(bucketExpr);
        } else {
          bucketEval[bucketPos] = keyEval[keyIndex];
        }
      }
      buckColIdxInKey = partitionCols.size();
    }

    // Tagging.
    tag = conf.getTag();
    tagByte[0] = (byte) tag;
    skipTag = conf.getSkipTag();
    if (isLogInfoEnabled) {
      LOG.info("Using tag = " + tag);
    }

    // Key serializer.
    TableDesc keyTableDesc = conf.getKeySerializeInfo();
    keySerializer = (Serializer) keyTableDesc.getDeserializerClass().newInstance();
    keySerializer.initialize(null, keyTableDesc.getProperties());
    keyIsText = keySerializer.getSerializedClass().equals(Text.class);

    // Value serializer.
    TableDesc valueTableDesc = conf.getValueSerializeInfo();
    valueSerializer = (Serializer) valueTableDesc.getDeserializerClass().newInstance();
    valueSerializer.initialize(null, valueTableDesc.getProperties());

    // Top-N hash, only when both a limit and a memory budget are configured.
    int limit = conf.getTopN();
    float memUsage = conf.getTopNMemoryUsage();
    if (limit >= 0 && memUsage > 0) {
      if (conf.isPTFReduceSink()) {
        reducerHash = new PTFTopNHash();
      } else {
        reducerHash = new TopNHash();
      }
      reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
    }

    useUniformHash = conf.getReducerTraits().contains(UNIFORM);
    firstRow = true;
  } catch (Exception e) {
    // Log with context, then rethrow unchecked with the original exception as the cause.
    String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
    LOG.error(msg, e);
    throw new RuntimeException(e);
  }
}
use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
the class SkewJoinHandler method initiliaze.
/**
 * Builds the per-alias state used for skew-join handling.
 *
 * <p>For each alias position this creates a serde from the skew-key table descriptor and an
 * object inspector that is the union of the join-value struct and the join-key struct. It then
 * points every non-null row container of the join operator at that serde, inspector and table
 * descriptor. If a serde fails to initialize, skew-join handling is switched off on the join
 * operator and the per-alias setup stops.
 *
 * @param hconf configuration kept on this handler and used to derive the task id
 */
public void initiliaze(Configuration hconf) {
  this.hconf = hconf;
  JoinDesc desc = joinOp.getConf();
  skewKeyDefinition = desc.getSkewKeyDefinition();
  skewKeysTableObjectInspector = new HashMap<Byte, StructObjectInspector>(numAliases);
  tblDesc = desc.getSkewKeysValuesTables();
  tblSerializers = new HashMap<Byte, AbstractSerDe>(numAliases);
  bigKeysExistingMap = new HashMap<Byte, Boolean>(numAliases);
  taskId = Utilities.getTaskId(hconf);
  int[][] filterMap = desc.getFilterMap();

  for (int pos = 0; pos < numAliases; pos++) {
    Byte alias = conf.getTagOrder()[pos];

    // Collect the inspectors of the individual key fields from this alias's input row.
    List<ObjectInspector> keyFieldInspectors = new ArrayList<ObjectInspector>();
    StructObjectInspector inputRowOI = (StructObjectInspector) joinOp.inputObjInspectors[alias];
    StructField keyField = inputRowOI.getStructFieldRef(Utilities.ReduceField.KEY.toString());
    List<? extends StructField> keyFields = ((StructObjectInspector) keyField.getFieldObjectInspector()).getAllStructFieldRefs();
    for (StructField field : keyFields) {
      keyFieldInspectors.add(field.getFieldObjectInspector());
    }

    // Struct inspector for the key part, named after the join key table's columns.
    TableDesc joinKeyDesc = desc.getKeyTableDesc();
    List<String> keyColNames = Utilities.getColumnNames(joinKeyDesc.getProperties());
    StructObjectInspector keyStructOI = ObjectInspectorFactory.getStandardStructObjectInspector(keyColNames, keyFieldInspectors);

    try {
      AbstractSerDe skewSerDe = (AbstractSerDe) ReflectionUtils.newInstance(tblDesc.get(alias).getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(skewSerDe, null, tblDesc.get(alias).getProperties(), null);
      tblSerializers.put((byte) pos, skewSerDe);
    } catch (SerDeException e) {
      LOG.error("Skewjoin will be disabled due to " + e.getMessage(), e);
      joinOp.handleSkewJoin = false;
      // NOTE(review): after this break the reset loop below still runs for every alias,
      // including positions that never got a serde/inspector entry — confirm this is intended.
      break;
    }

    // Struct inspector for the value part; column names come from the spill table if any.
    boolean hasFilter = filterMap != null && filterMap[pos] != null;
    TableDesc valTblDesc = JoinUtil.getSpillTableDesc(alias, joinOp.spillTableDesc, conf, !hasFilter);
    List<String> valColNames = (valTblDesc == null)
        ? new ArrayList<String>()
        : Utilities.getColumnNames(valTblDesc.getProperties());
    StructObjectInspector valueStructOI = ObjectInspectorFactory.getStandardStructObjectInspector(valColNames, joinOp.joinValuesStandardObjectInspectors[pos]);

    // Values first, keys second.
    StructObjectInspector rowOI = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays.asList(valueStructOI, keyStructOI));
    skewKeysTableObjectInspector.put((byte) pos, rowOI);
  }

  // Reset each row container's serde, object inspector and table descriptor.
  for (int pos = 0; pos < numAliases; pos++) {
    Byte alias = conf.getTagOrder()[pos];
    RowContainer<ArrayList<Object>> rowContainer = (RowContainer) joinOp.storage[pos];
    if (rowContainer == null) {
      continue;
    }
    rowContainer.setSerDe(tblSerializers.get((byte) pos), skewKeysTableObjectInspector.get((byte) pos));
    rowContainer.setTableDesc(tblDesc.get(alias));
  }
}
use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
the class MapOperator method setChildren.
/**
 * Registers the operators reachable from each input path as children of this operator.
 *
 * <p>For every input path and every alias mapped to it, the alias's operator is looked up and,
 * unless it is already registered for that path, a {@code MapOpCtx} is created, initialized
 * with the table's converted object inspector and stored in {@code opCtxMap} under the path.
 * Each operator seen for the first time gets this operator as its sole parent and is added to
 * the child list, which is finally installed via {@code setChildOperators}.
 *
 * @param hconf configuration from which the per-table configurations are derived
 * @throws Exception propagated from the helper methods called here
 */
public void setChildren(Configuration hconf) throws Exception {
  List<Operator<? extends OperatorDesc>> childOps = new ArrayList<Operator<? extends OperatorDesc>>();
  Map<String, Configuration> confByTableName = cloneConfsForNestedColPruning(hconf);
  Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(confByTableName);

  for (Map.Entry<Path, ArrayList<String>> pathEntry : conf.getPathToAliases().entrySet()) {
    Path inputPath = pathEntry.getKey();
    String inputPathKey = inputPath.toString();
    PartitionDesc partDesc = conf.getPathToPartitionInfo().get(inputPath);
    TableDesc tableDesc = partDesc.getTableDesc();
    Configuration tableConf = confByTableName.get(tableDesc.getTableName());

    for (String alias : pathEntry.getValue()) {
      Operator<? extends OperatorDesc> aliasOp = conf.getAliasToWork().get(alias);
      if (isLogDebugEnabled) {
        LOG.debug("Adding alias " + alias + " to work list for file " + inputPath);
      }

      // Lazily create the per-path context map the first time an alias of the path is seen.
      Map<Operator<?>, MapOpCtx> pathContexts = opCtxMap.get(inputPathKey);
      if (pathContexts == null) {
        pathContexts = new LinkedHashMap<Operator<?>, MapOpCtx>();
        opCtxMap.put(inputPathKey, pathContexts);
      }

      // An operator already registered for this path needs no further work.
      if (!pathContexts.containsKey(aliasOp)) {
        MapOpCtx context = new MapOpCtx(alias, aliasOp, partDesc);
        StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
        pathContexts.put(aliasOp, initObjectInspector(tableConf, context, tableRowOI));

        if (!childOps.contains(aliasOp)) {
          aliasOp.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
          aliasOp.getParentOperators().add(this);
          childOps.add(aliasOp);
        }
      }
    }
  }

  initOperatorContext(childOps);
  // All operators this map operator is supposed to feed have been found.
  setChildOperators(childOps);
}
use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
the class DynamicPartitionPruner method initialize.
/**
 * Rebuilds the pruner's bookkeeping from the event-source maps held by {@code work}.
 *
 * <p>After clearing existing state, every source is marked as waiting for events and gets fresh
 * expected/seen counters. For each table descriptor of a source a {@code SourceInfo} is created
 * from the matching column name, column type and partition key expression, and appended to the
 * source's list in {@code sourceInfoMap}.
 *
 * @throws SerDeException declared by the signature; presumably raised while creating a
 *           {@code SourceInfo}
 */
private void initialize() throws SerDeException {
  this.clear();
  // Most recent SourceInfo created for each column name, across all sources.
  Map<String, SourceInfo> latestInfoByColumn = new HashMap<String, SourceInfo>();

  // Sources represent vertex names.
  Set<String> sources = work.getEventSourceTableDescMap().keySet();
  sourcesWaitingForEvents.addAll(sources);

  for (String source : sources) {
    // Start at 0. The expected count is decremented once per column this source generates
    // events for, and is eventually used to determine the number of expected events for the
    // source (#columns x #tasks).
    numExpectedEventsPerSource.put(source, new MutableInt(0));
    numEventsSeenPerSource.put(source, new MutableInt(0));

    // The table descriptors, column names, column types and partition key expressions of a
    // source move in lock-step: one entry is added to each at the same time. The last three
    // are therefore walked with iterators alongside the table descriptors.
    List<TableDesc> tables = work.getEventSourceTableDescMap().get(source);
    // Real column names on which the operation is performed.
    Iterator<String> columnNameIt = work.getEventSourceColumnNameMap().get(source).iterator();
    // Column types.
    Iterator<String> columnTypeIt = work.getEventSourceColumnTypeMap().get(source).iterator();
    // Expressions for the operation, e.g. N^2 > 10.
    Iterator<ExprNodeDesc> partKeyExprIt = work.getEventSourcePartKeyExprMap().get(source).iterator();

    // A single source can process multiple columns and sends an event for each of them.
    for (TableDesc table : tables) {
      numExpectedEventsPerSource.get(source).decrement();
      ++sourceInfoCount;

      String columnName = columnNameIt.next();
      String columnType = columnTypeIt.next();
      ExprNodeDesc partKeyExpr = partKeyExprIt.next();
      SourceInfo info = createSourceInfo(table, partKeyExpr, columnName, columnType, jobConf);

      if (!sourceInfoMap.containsKey(source)) {
        sourceInfoMap.put(source, new ArrayList<SourceInfo>());
      }
      sourceInfoMap.get(source).add(info);

      // Several SourceInfos may target the same column. All sources are initialized up front,
      // so when an earlier SourceInfo exists for this column the new one takes over its
      // values and its skipPruning setting; events from the different sources then end up in
      // the same list, and pruning is disabled if either source asks for it to be skipped.
      if (latestInfoByColumn.containsKey(columnName)) {
        SourceInfo earlier = latestInfoByColumn.get(columnName);
        info.values = earlier.values;
        info.skipPruning = earlier.skipPruning;
      }
      latestInfoByColumn.put(columnName, info);
    }
  }
}
Aggregations