Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
From the class TestOperators, method testScriptOperator.
public void testScriptOperator() throws Throwable {
  try {
    System.out.println("Testing Script Operator");
    // col1
    ExprNodeDesc exprDesc1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "", false);
    // col2
    ExprNodeDesc expr1 = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col0", "", false);
    ExprNodeDesc expr2 = new ExprNodeConstantDesc("1");
    ExprNodeDesc exprDesc2 = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("concat", expr1, expr2);
    // select operator to project these two columns
    ArrayList<ExprNodeDesc> earr = new ArrayList<ExprNodeDesc>();
    earr.add(exprDesc1);
    earr.add(exprDesc2);
    ArrayList<String> outputCols = new ArrayList<String>();
    for (int i = 0; i < earr.size(); i++) {
      outputCols.add("_col" + i);
    }
    SelectDesc selectCtx = new SelectDesc(earr, outputCols);
    Operator<SelectDesc> op = OperatorFactory.get(new CompilationOpContext(), SelectDesc.class);
    op.setConf(selectCtx);
    // scriptOperator to echo the output of the select
    TableDesc scriptOutput = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
    TableDesc scriptInput = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
    ScriptDesc sd = new ScriptDesc("cat", scriptOutput, TextRecordWriter.class, scriptInput,
        TextRecordReader.class, TextRecordReader.class, PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "key"));
    Operator<ScriptDesc> sop = OperatorFactory.getAndMakeChild(sd, op);
    // Collect operator to observe the output of the script
    CollectDesc cd = new CollectDesc(Integer.valueOf(10));
    CollectOperator cdop = (CollectOperator) OperatorFactory.getAndMakeChild(cd, sop);
    op.initialize(new JobConf(TestOperators.class), new ObjectInspector[] { r[0].oi });
    // evaluate on row
    for (int i = 0; i < 5; i++) {
      op.process(r[i].o, 0);
    }
    op.close(false);
    InspectableObject io = new InspectableObject();
    for (int i = 0; i < 5; i++) {
      cdop.retrieve(io);
      System.out.println("[" + i + "] io.o=" + io.o);
      System.out.println("[" + i + "] io.oi=" + io.oi);
      StructObjectInspector soi = (StructObjectInspector) io.oi;
      assert (soi != null);
      StructField a = soi.getStructFieldRef("a");
      StructField b = soi.getStructFieldRef("b");
      assertEquals("" + (i + 1), ((PrimitiveObjectInspector) a.getFieldObjectInspector())
          .getPrimitiveJavaObject(soi.getStructFieldData(io.o, a)));
      assertEquals((i) + "1", ((PrimitiveObjectInspector) b.getFieldObjectInspector())
          .getPrimitiveJavaObject(soi.getStructFieldData(io.o, b)));
    }
    System.out.println("Script Operator ok");
  } catch (Throwable e) {
    e.printStackTrace();
    throw e;
  }
}
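The two TableDesc objects above describe the tab-separated rows flowing into and out of the "cat" script. The following is a minimal inspection sketch, not part of the Hive test: the property keys it reads (serdeConstants.LIST_COLUMNS, serdeConstants.SERIALIZATION_FORMAT) are assumptions about what PlanUtils.getDefaultTableDesc populates, and java.util.Properties plus org.apache.hadoop.hive.serde.serdeConstants are assumed to be imported.

// Hypothetical inspection sketch: what the default tab-separated TableDesc built
// above is expected to record. Property names are assumptions, not taken from the test.
TableDesc td = PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "a,b");
Properties props = td.getProperties();
// expected: the projected column names, e.g. "a,b"
System.out.println(props.getProperty(serdeConstants.LIST_COLUMNS));
// expected: the separator code passed in (the tab character's code)
System.out.println(props.getProperty(serdeConstants.SERIALIZATION_FORMAT));
// the serde class ScriptOperator will use to (de)serialize script rows
System.out.println(td.getDeserializerClass());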
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
From the class DemuxOperator, method initializeOp.
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  // A DemuxOperator should have at least one child
  if (childOperatorsArray.length == 0) {
    throw new HiveException("Expected number of children is at least 1. Found : " + childOperatorsArray.length);
  }
  newTagToOldTag = toArray(conf.getNewTagToOldTag());
  newTagToChildIndex = toArray(conf.getNewTagToChildIndex());
  childInputObjInspectors = new ObjectInspector[childOperators.size()][];
  cntrs = new long[newTagToOldTag.length];
  nextCntrs = new long[newTagToOldTag.length];
  try {
    // Build an input ObjectInspector for every new tag; the resulting
    // inspectors are stored in childInputObjInspectors.
    for (int i = 0; i < newTagToOldTag.length; i++) {
      int newTag = i;
      int oldTag = newTagToOldTag[i];
      int childIndex = newTagToChildIndex[newTag];
      cntrs[newTag] = 0;
      nextCntrs[newTag] = 0;
      TableDesc keyTableDesc = conf.getKeysSerializeInfos().get(newTag);
      Deserializer inputKeyDeserializer = ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
      TableDesc valueTableDesc = conf.getValuesSerializeInfos().get(newTag);
      Deserializer inputValueDeserializer = ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputValueDeserializer, null, valueTableDesc.getProperties(), null);
      List<ObjectInspector> oi = new ArrayList<ObjectInspector>();
      oi.add(inputKeyDeserializer.getObjectInspector());
      oi.add(inputValueDeserializer.getObjectInspector());
      int childParentsCount = conf.getChildIndexToOriginalNumParents().get(childIndex);
      // Multiple new tags can map to the same child, so first check whether
      // childInputObjInspectors already has an entry for childIndex.
      if (childInputObjInspectors[childIndex] == null) {
        childInputObjInspectors[childIndex] = new ObjectInspector[childParentsCount];
      }
      ObjectInspector[] ois = childInputObjInspectors[childIndex];
      ois[oldTag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, oi);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  childrenDone = 0;
  newChildOperatorsTag = new int[childOperators.size()][];
  for (int i = 0; i < childOperators.size(); i++) {
    Operator<? extends OperatorDesc> child = childOperators.get(i);
    List<Integer> childOperatorTags = new ArrayList<Integer>();
    if (child instanceof MuxOperator) {
      // This DemuxOperator can appear multiple times in MuxOperator's
      // parentOperators
      int index = 0;
      for (Operator<? extends OperatorDesc> parent : child.getParentOperators()) {
        if (this == parent) {
          childOperatorTags.add(index);
        }
        index++;
      }
    } else {
      childOperatorTags.add(child.getParentOperators().indexOf(this));
    }
    newChildOperatorsTag[i] = toArray(childOperatorTags);
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("newChildOperatorsTag " + Arrays.toString(newChildOperatorsTag));
  }
}
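The loop above applies the same TableDesc-to-ObjectInspector recipe twice, once for keys and once for values. A minimal sketch of that recipe as a standalone helper is shown below; the method name inspectorFor is hypothetical and not part of DemuxOperator, and it reuses only the calls already visible above.

// Hypothetical helper (not in DemuxOperator): turn a TableDesc into the
// ObjectInspector produced by its configured deserializer.
private static ObjectInspector inspectorFor(TableDesc desc) throws SerDeException {
  Deserializer deserializer = ReflectionUtil.newInstance(desc.getDeserializerClass(), null);
  // null Configuration, as in the operator code above
  SerDeUtils.initializeSerDe(deserializer, null, desc.getProperties(), null);
  return deserializer.getObjectInspector();
}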
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
From the class SparkDynamicPartitionPruner, method initialize.
public void initialize(MapWork work, JobConf jobConf) throws SerDeException {
  Map<String, SourceInfo> columnMap = new HashMap<String, SourceInfo>();
  Set<String> sourceWorkIds = work.getEventSourceTableDescMap().keySet();
  for (String id : sourceWorkIds) {
    List<TableDesc> tables = work.getEventSourceTableDescMap().get(id);
    // Real column name - on which the operation is being performed
    List<String> columnNames = work.getEventSourceColumnNameMap().get(id);
    // Column type
    List<String> columnTypes = work.getEventSourceColumnTypeMap().get(id);
    List<ExprNodeDesc> partKeyExprs = work.getEventSourcePartKeyExprMap().get(id);
    Iterator<String> cit = columnNames.iterator();
    Iterator<String> typit = columnTypes.iterator();
    Iterator<ExprNodeDesc> pit = partKeyExprs.iterator();
    for (TableDesc t : tables) {
      String columnName = cit.next();
      String columnType = typit.next();
      ExprNodeDesc partKeyExpr = pit.next();
      SourceInfo si = new SourceInfo(t, partKeyExpr, columnName, columnType, jobConf);
      if (!sourceInfoMap.containsKey(id)) {
        sourceInfoMap.put(id, new ArrayList<SourceInfo>());
      }
      sourceInfoMap.get(id).add(si);
      // Multiple sources may restrict the same column; share the values list so
      // we effectively take the union of the values in that case.
      if (columnMap.containsKey(columnName)) {
        si.values = columnMap.get(columnName).values;
      }
      columnMap.put(columnName, si);
    }
  }
}
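The method walks four parallel lists (TableDescs, column names, column types, partition key expressions) with lock-step iterators, so it silently depends on all of them having the same length. Below is a hedged sanity-check sketch under that assumption; it is not part of the Hive class, reuses the work and id variables from the method above, and calls only the MapWork getters already shown.

// Hypothetical pre-check (not in SparkDynamicPartitionPruner): verify the
// event-source lists for a work id are aligned before iterating them in lock step.
List<TableDesc> tables = work.getEventSourceTableDescMap().get(id);
List<String> columnNames = work.getEventSourceColumnNameMap().get(id);
List<String> columnTypes = work.getEventSourceColumnTypeMap().get(id);
List<ExprNodeDesc> partKeyExprs = work.getEventSourcePartKeyExprMap().get(id);
if (tables.size() != columnNames.size()
    || tables.size() != columnTypes.size()
    || tables.size() != partKeyExprs.size()) {
  throw new IllegalStateException("Event source lists for " + id + " are not aligned");
}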
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
From the class Utilities, method getInputSummaryWithPool.
@VisibleForTesting
static ContentSummary getInputSummaryWithPool(final Context ctx, Set<Path> pathNeedProcess, MapWork work,
    long[] summary, ExecutorService executor) throws IOException {
  List<Future<?>> results = new ArrayList<Future<?>>();
  final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
  HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
    @Override
    public void interrupt() {
      for (Path path : pathNeedProcess) {
        try {
          path.getFileSystem(ctx.getConf()).close();
        } catch (IOException ignore) {
          LOG.debug("Failed to close filesystem", ignore);
        }
      }
      if (executor != null) {
        executor.shutdownNow();
      }
    }
  });
  try {
    Configuration conf = ctx.getConf();
    JobConf jobConf = new JobConf(conf);
    for (Path path : pathNeedProcess) {
      final Path p = path;
      final String pathStr = path.toString();
      // All threads share the same Configuration and JobConf based on the
      // assumption that they are thread safe if only read operations are
      // executed. This is not stated in Hadoop's javadoc, but the source code
      // clearly shows that efforts were made for it, so we believe it is
      // thread safe. We will revisit this piece of code if we find the
      // assumption is not correct.
      final Configuration myConf = conf;
      final JobConf myJobConf = jobConf;
      final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
      final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
      final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
      Runnable r = new Runnable() {
        @Override
        public void run() {
          try {
            Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
            InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
            if (inputFormatObj instanceof ContentSummaryInputFormat) {
              ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
              resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
              return;
            }
            String metaTableStorage = null;
            if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
              metaTableStorage = partDesc.getTableDesc().getProperties()
                  .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
            }
            if (partDesc.getProperties() != null) {
              metaTableStorage = partDesc.getProperties()
                  .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
            }
            HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
            if (handler instanceof InputEstimator) {
              long total = 0;
              TableDesc tableDesc = partDesc.getTableDesc();
              InputEstimator estimator = (InputEstimator) handler;
              for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                JobConf jobConf = new JobConf(myJobConf);
                TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                Utilities.setColumnNameList(jobConf, scanOp, true);
                Utilities.setColumnTypeList(jobConf, scanOp, true);
                PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
              }
              resultMap.put(pathStr, new ContentSummary(total, -1, -1));
            } else {
              // todo: should nullify summary for non-native tables,
              // not to be selected as a mapjoin target
              FileSystem fs = p.getFileSystem(myConf);
              resultMap.put(pathStr, fs.getContentSummary(p));
            }
          } catch (Exception e) {
            // We safely ignore this exception for summary data.
            // We don't update the cache to protect it from polluting other
            // usages. The worst case is that IOException will always be
            // retried for another getInputSummary(), which is fine as
            // IOException is not considered as a common case.
            LOG.info("Cannot get size of {}. Safely ignored.", pathStr);
          }
        }
      };
      if (executor == null) {
        r.run();
      } else {
        Future<?> result = executor.submit(r);
        results.add(result);
      }
    }
    if (executor != null) {
      for (Future<?> result : results) {
        boolean executorDone = false;
        do {
          try {
            result.get();
            executorDone = true;
          } catch (InterruptedException e) {
            LOG.info("Interrupted when waiting threads: ", e);
            Thread.currentThread().interrupt();
            break;
          } catch (ExecutionException e) {
            throw new IOException(e);
          }
        } while (!executorDone);
      }
      executor.shutdown();
    }
    HiveInterruptUtils.checkInterrupted();
    for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
      ContentSummary cs = entry.getValue();
      summary[0] += cs.getLength();
      summary[1] += cs.getFileCount();
      summary[2] += cs.getDirectoryCount();
      ctx.addCS(entry.getKey(), cs);
      if (LOG.isInfoEnabled()) {
        LOG.info("Cache Content Summary for {} length: {} file count: {} " + " directory count: {}",
            entry.getKey(), cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
      }
    }
    return new ContentSummary(summary[0], summary[1], summary[2]);
  } finally {
    if (executor != null) {
      executor.shutdownNow();
    }
    HiveInterruptUtils.remove(interrup);
  }
}
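The TableDesc is consulted twice in the worker above: its properties supply the storage handler class name (with a partition-level override), and it is handed to PlanUtils and Utilities to configure the per-alias JobConf before the InputEstimator runs. A small sketch of the first step as a standalone helper follows; resolveStorageHandler is a hypothetical name, and the helper only repeats the lookups shown above.

// Hypothetical helper (not in Utilities): resolve the storage handler class name
// for a partition, letting partition-level properties override the TableDesc.
private static String resolveStorageHandler(PartitionDesc partDesc) {
  String handlerClass = null;
  TableDesc tableDesc = partDesc.getTableDesc();
  if (tableDesc != null && tableDesc.getProperties() != null) {
    handlerClass = tableDesc.getProperties()
        .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
  }
  if (partDesc.getProperties() != null) {
    // partition-level properties win when present
    handlerClass = partDesc.getProperties()
        .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, handlerClass);
  }
  return handlerClass;
}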
Use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.
From the class ReduceSinkOperator, method initializeOp.
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  try {
    numRows = 0;
    cntr = 1;
    logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
    List<ExprNodeDesc> keys = conf.getKeyCols();
    if (LOG.isDebugEnabled()) {
      LOG.debug("keys size is " + keys.size());
      for (ExprNodeDesc k : keys) {
        LOG.debug("Key exprNodeDesc " + k.getExprString());
      }
    }
    keyEval = new ExprNodeEvaluator[keys.size()];
    int i = 0;
    for (ExprNodeDesc e : keys) {
      if (e instanceof ExprNodeConstantDesc && (BUCKET_NUMBER_COL_NAME).equals(((ExprNodeConstantDesc) e).getValue())) {
        buckColIdxInKeyForSdpo = i;
      }
      keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
    }
    numDistributionKeys = conf.getNumDistributionKeys();
    distinctColIndices = conf.getDistinctColumnIndices();
    numDistinctExprs = distinctColIndices.size();
    valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
    i = 0;
    for (ExprNodeDesc e : conf.getValueCols()) {
      valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
    }
    partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
    i = 0;
    for (ExprNodeDesc e : conf.getPartitionCols()) {
      int index = ExprNodeDescUtils.indexOf(e, keys);
      partitionEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
    }
    if (conf.getBucketCols() != null && !conf.getBucketCols().isEmpty()) {
      bucketEval = new ExprNodeEvaluator[conf.getBucketCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getBucketCols()) {
        int index = ExprNodeDescUtils.indexOf(e, keys);
        bucketEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
      }
      buckColIdxInKey = conf.getPartitionCols().size();
    }
    tag = conf.getTag();
    tagByte[0] = (byte) tag;
    skipTag = conf.getSkipTag();
    if (LOG.isInfoEnabled()) {
      LOG.info("Using tag = " + tag);
    }
    TableDesc keyTableDesc = conf.getKeySerializeInfo();
    keySerializer = (Serializer) keyTableDesc.getDeserializerClass().newInstance();
    keySerializer.initialize(null, keyTableDesc.getProperties());
    keyIsText = keySerializer.getSerializedClass().equals(Text.class);
    TableDesc valueTableDesc = conf.getValueSerializeInfo();
    valueSerializer = (Serializer) valueTableDesc.getDeserializerClass().newInstance();
    valueSerializer.initialize(null, valueTableDesc.getProperties());
    int limit = conf.getTopN();
    float memUsage = conf.getTopNMemoryUsage();
    if (limit >= 0 && memUsage > 0) {
      reducerHash = conf.isPTFReduceSink() ? new PTFTopNHash() : new TopNHash();
      reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
    }
    useUniformHash = conf.getReducerTraits().contains(UNIFORM);
    firstRow = true;
  } catch (Exception e) {
    String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
    LOG.error(msg, e);
    throw new RuntimeException(e);
  }
}
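The key and value TableDescs are used the same way above: the serde class recorded in the descriptor is instantiated reflectively and initialized with the descriptor's properties before it serializes rows for the shuffle. A minimal sketch of that pattern follows; serializerFor is a hypothetical helper name, and the calls are the ones already used in the operator.

// Hypothetical helper (not in ReduceSinkOperator): build an initialized Serializer
// from the serde class and properties recorded in a TableDesc.
private static Serializer serializerFor(TableDesc desc) throws Exception {
  Serializer serializer = (Serializer) desc.getDeserializerClass().newInstance();
  // null Configuration, mirroring the operator's initialization above
  serializer.initialize(null, desc.getProperties());
  return serializer;
}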