Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class TestInputOutputFormat, method testCombinationInputFormatWithAcid.
// test non-vectorized, acid, combine
@Test
public void testCombinationInputFormatWithAcid() throws Exception {
  // get the object inspector for MyRow
  StructObjectInspector inspector;
  final int PARTITIONS = 2;
  final int BUCKETS = 3;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combinationAcid",
      inspector, false, PARTITIONS, MockFileSystem.class.getName());
  // write the orc file to the mock file system
  Path[] partDir = new Path[PARTITIONS];
  String[] paths = conf.getStrings("mapred.input.dir");
  for (int p = 0; p < PARTITIONS; ++p) {
    partDir[p] = new Path(paths[p]);
  }
  // write a base file in partition 0
  OrcRecordUpdater writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf)
      .maximumWriteId(10).writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir[0]));
  for (int i = 0; i < 10; ++i) {
    writer.insert(10, new MyRow(i, 2 * i));
  }
  writer.close(false);
  // base file
  Path base0 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00000");
  setBlocks(base0, conf, new MockBlock("host1", "host2"));
  // write a second base file (bucket 1) in partition 0
  writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf)
      .maximumWriteId(10).writingBase(true).bucket(1).inspector(inspector).finalDestination(partDir[0]));
  for (int i = 10; i < 20; ++i) {
    writer.insert(10, new MyRow(i, 2 * i));
  }
  writer.close(false);
  Path base1 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00001");
  setBlocks(base1, conf, new MockBlock("host1", "host2"));
  // write three files in partition 1
  for (int bucket = 0; bucket < BUCKETS; ++bucket) {
    Path path = new Path(partDir[1], "00000" + bucket + "_0");
    Writer orc = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    orc.addRow(new MyRow(1, 2));
    orc.close();
    setBlocks(path, conf, new MockBlock("host3", "host4"));
  }
  // call getsplits
  conf.setInt(hive_metastoreConstants.BUCKET_COUNT, BUCKETS);
  setupAcidProperties(conf, RowType.MYROW);
  HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
  InputSplit[] splits = inputFormat.getSplits(conf, 1);
  assertEquals(3, splits.length);
  HiveInputFormat.HiveInputSplit split = (HiveInputFormat.HiveInputSplit) splits[0];
  assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
  assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000", split.getPath().toString());
  assertEquals(0, split.getStart());
  assertEquals(702, split.getLength());
  split = (HiveInputFormat.HiveInputSplit) splits[1];
  assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
  assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00001", split.getPath().toString());
  assertEquals(0, split.getStart());
  assertEquals(726, split.getLength());
  CombineHiveInputFormat.CombineHiveInputSplit combineSplit =
      (CombineHiveInputFormat.CombineHiveInputSplit) splits[2];
  assertEquals(BUCKETS, combineSplit.getNumPaths());
  for (int bucket = 0; bucket < BUCKETS; ++bucket) {
    assertEquals("mock:/combinationAcid/p=1/00000" + bucket + "_0",
        combineSplit.getPath(bucket).toString());
    assertEquals(0, combineSplit.getOffset(bucket));
    assertEquals(253, combineSplit.getLength(bucket));
  }
  String[] hosts = combineSplit.getLocations();
  assertEquals(2, hosts.length);
}
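The assertions show the split layout this test expects: the two ACID base buckets in partition p=0 each keep their own HiveInputSplit, while the three flat files in p=1 are merged into a single combine split. As a hedged sketch (not part of the Hive test), the same API can be driven from plain client code, assuming the input directories and ACID table properties have already been placed on the conf:

// A real query obtains its input format from the hive.input.format setting; this test
// constructs CombineHiveInputFormat directly and drives getSplits() itself.
JobConf conf = new JobConf();
conf.set("hive.input.format", CombineHiveInputFormat.class.getName());
// Assumes mapred.input.dir and the ACID table properties are already set on conf,
// as createMockExecutionEnvironment and setupAcidProperties do in the test above.
HiveInputFormat<WritableComparable, Writable> inputFormat =
    new CombineHiveInputFormat<WritableComparable, Writable>();
InputSplit[] splits = inputFormat.getSplits(conf, 1);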
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class TestInputOutputFormat, method testVectorizationWithBuckets.
/**
 * Test vectorization, non-acid, non-combine.
 * @throws Exception
 */
@Test
public void testVectorizationWithBuckets() throws Exception {
  // get the object inspector for MyRow
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorBuckets",
      inspector, true, 1, MockFileSystem.class.getName());
  // write the orc file to the mock file system
  Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
  Writer writer = OrcFile.createWriter(path,
      OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  setBlocks(path, conf, new MockBlock("host0", "host1"));
  // call getsplits
  conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3);
  HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
  InputSplit[] splits = inputFormat.getSplits(conf, 10);
  assertEquals(1, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader =
      inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
  NullWritable key = reader.createKey();
  VectorizedRowBatch value = reader.createValue();
  assertEquals(true, reader.next(key, value));
  assertEquals(10, value.count());
  LongColumnVector col0 = (LongColumnVector) value.cols[0];
  for (int i = 0; i < 10; i++) {
    assertEquals("checking " + i, i, col0.vector[i]);
  }
  assertEquals(false, reader.next(key, value));
}
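Beyond the single-batch assertions above, a vectorized reader is normally drained in a loop. A minimal sketch under the same assumptions as the test (the reader obtained above, a dense batch with no row selection, and the LongColumnVector cast the test itself uses):

NullWritable k = reader.createKey();
VectorizedRowBatch batch = reader.createValue();
while (reader.next(k, batch)) {
  LongColumnVector first = (LongColumnVector) batch.cols[0];
  for (int r = 0; r < batch.size; ++r) {
    // assumes batch.selectedInUse is false, as in this test;
    // otherwise the row index would come from batch.selected[r]
    long v = first.vector[r];
  }
}
reader.close();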
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class FetchOperator, method getInputFormatFromCache.
@SuppressWarnings("unchecked")
static InputFormat getInputFormatFromCache(Class<? extends InputFormat> inputFormatClass,
    JobConf conf) throws IOException {
  if (Configurable.class.isAssignableFrom(inputFormatClass)
      || JobConfigurable.class.isAssignableFrom(inputFormatClass)) {
    return ReflectionUtil.newInstance(inputFormatClass, conf);
  }
  // TODO: why is this copy-pasted from HiveInputFormat?
  InputFormat format = inputFormats.get(inputFormatClass.getName());
  if (format == null) {
    try {
      format = ReflectionUtil.newInstance(inputFormatClass, conf);
      inputFormats.put(inputFormatClass.getName(), format);
    } catch (Exception e) {
      throw new IOException("Cannot create an instance of InputFormat class "
          + inputFormatClass.getName() + " as specified in mapredWork!", e);
    }
  }
  return format;
}
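The branch structure is the point of the method: formats that implement Configurable or JobConfigurable need the current JobConf injected, so they are rebuilt on every call, while stateless formats are cached by class name in the inputFormats map. A minimal standalone sketch of the same distinction using Hadoop's ReflectionUtils (newOrCached and its parameters are hypothetical names, not FetchOperator members):

static InputFormat newOrCached(Class<? extends InputFormat> formatClass, JobConf conf,
    Map<String, InputFormat> cache) {
  if (Configurable.class.isAssignableFrom(formatClass)
      || JobConfigurable.class.isAssignableFrom(formatClass)) {
    // conf-sensitive formats could carry a stale configuration if reused from a cache
    return ReflectionUtils.newInstance(formatClass, conf);
  }
  return cache.computeIfAbsent(formatClass.getName(),
      name -> ReflectionUtils.newInstance(formatClass, conf));
}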
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class FetchOperator, method processCurrPathForMmWriteIds.
private String processCurrPathForMmWriteIds(InputFormat inputFormat) throws IOException {
  if (inputFormat instanceof HiveInputFormat) {
    // No need to process here.
    return StringUtils.escapeString(currPath.toString());
  }
  ValidWriteIdList validWriteIdList;
  if (AcidUtils.isInsertOnlyTable(currDesc.getTableDesc().getProperties())) {
    validWriteIdList = extractValidWriteIdList();
  } else {
    // non-MM case
    validWriteIdList = null;
  }
  if (validWriteIdList != null) {
    Utilities.FILE_OP_LOGGER.info("Processing " + currDesc.getTableName() + " for MM paths");
  }
  Path[] dirs = HiveInputFormat.processPathsForMmRead(
      Lists.newArrayList(currPath), job, validWriteIdList);
  if (dirs == null || dirs.length == 0) {
    // No valid inputs. This condition is logged inside the call.
    return null;
  }
  StringBuffer str = new StringBuffer(StringUtils.escapeString(dirs[0].toString()));
  for (int i = 1; i < dirs.length; i++) {
    str.append(",").append(StringUtils.escapeString(dirs[i].toString()));
  }
  return str.toString();
}
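The tail of the method escapes each surviving directory and joins them with commas, which is the form mapred.input.dir expects. A small sketch of the same join (joinEscaped is a hypothetical helper, not FetchOperator code), using StringBuilder since no synchronization is needed:

static String joinEscaped(Path[] dirs) {
  // escape each path before joining, mirroring the loop above
  StringBuilder sb = new StringBuilder(StringUtils.escapeString(dirs[0].toString()));
  for (int i = 1; i < dirs.length; i++) {
    sb.append(',').append(StringUtils.escapeString(dirs[i].toString()));
  }
  return sb.toString();
}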
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache: class GenSparkSkewJoinProcessor, method processSkewJoin.
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask,
    ReduceWork reduceWork, ParseContext parseCtx) throws SemanticException {
  SparkWork currentWork = ((SparkTask) currTask).getWork();
  if (currentWork.getChildren(reduceWork).size() > 0) {
    LOG.warn("Skip runtime skew join as the ReduceWork has child work and hasn't been split.");
    return;
  }
  List<Task<? extends Serializable>> children = currTask.getChildTasks();
  Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
  JoinDesc joinDescriptor = joinOp.getConf();
  Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
  int numAliases = joinValues.size();
  Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
  Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
  Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
  Byte[] tags = joinDescriptor.getTagOrder();
  // for each joining table, set dir for big key and small keys properly
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    bigKeysDirMap.put(alias, GenMRSkewJoinProcessor.getBigKeysDir(baseTmpDir, alias));
    Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
    smallKeysDirMap.put(alias, smallKeysMap);
    for (Byte src2 : tags) {
      if (!src2.equals(alias)) {
        smallKeysMap.put(src2, GenMRSkewJoinProcessor.getSmallKeysDir(baseTmpDir, alias, src2));
      }
    }
    skewJoinJobResultsDir.put(alias,
        GenMRSkewJoinProcessor.getBigKeysSkewJoinResultDir(baseTmpDir, alias));
  }
  joinDescriptor.setHandleSkewJoin(true);
  joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
  joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
  joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(),
      HiveConf.ConfVars.HIVESKEWJOINKEY));
  // create proper table/column desc for spilled tables
  TableDesc keyTblDesc = (TableDesc) reduceWork.getKeyDesc().clone();
  List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
  List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
  Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
  Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
  Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
  // used for create mapJoinDesc, should be in order
  List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
  for (int i = 0; i < tags.length; i++) {
    newJoinValueTblDesc.add(null);
  }
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    List<ExprNodeDesc> valueCols = joinValues.get(alias);
    String colNames = "";
    String colTypes = "";
    int columnSize = valueCols.size();
    List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
    List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
    ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
    boolean first = true;
    for (int k = 0; k < columnSize; k++) {
      TypeInfo type = valueCols.get(k).getTypeInfo();
      // any name, it does not matter.
      String newColName = i + "_VALUE_" + k;
      ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
      columnInfos.add(columnInfo);
      newValueExpr.add(new ExprNodeColumnDesc(columnInfo.getType(),
          columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
      if (!first) {
        colNames = colNames + ",";
        colTypes = colTypes + ",";
      }
      first = false;
      colNames = colNames + newColName;
      colTypes = colTypes + valueCols.get(k).getTypeString();
    }
    // we are putting join keys at last part of the spilled table
    for (int k = 0; k < joinKeys.size(); k++) {
      if (!first) {
        colNames = colNames + ",";
        colTypes = colTypes + ",";
      }
      first = false;
      colNames = colNames + joinKeys.get(k);
      colTypes = colTypes + joinKeyTypes.get(k);
      ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k),
          TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
      columnInfos.add(columnInfo);
      newKeyExpr.add(new ExprNodeColumnDesc(columnInfo.getType(),
          columnInfo.getInternalName(), columnInfo.getTabAlias(), false));
    }
    newJoinValues.put(alias, newValueExpr);
    newJoinKeys.put(alias, newKeyExpr);
    tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
    rowSchemaList.put(alias, new RowSchema(columnInfos));
    // construct value table Desc
    String valueColNames = "";
    String valueColTypes = "";
    first = true;
    for (int k = 0; k < columnSize; k++) {
      // any name, it does not matter.
      String newColName = i + "_VALUE_" + k;
      if (!first) {
        valueColNames = valueColNames + ",";
        valueColTypes = valueColTypes + ",";
      }
      valueColNames = valueColNames + newColName;
      valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
      first = false;
    }
    newJoinValueTblDesc.set((byte) i, Utilities.getTableDesc(valueColNames, valueColTypes));
  }
  joinDescriptor.setSkewKeysValuesTables(tableDescList);
  joinDescriptor.setKeyTableDesc(keyTblDesc);
  // create N-1 map join tasks
  HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap =
      new HashMap<Path, Task<? extends Serializable>>();
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  for (int i = 0; i < numAliases - 1; i++) {
    Byte src = tags[i];
    HiveConf hiveConf = new HiveConf(parseCtx.getConf(), GenSparkSkewJoinProcessor.class);
    SparkWork sparkWork = new SparkWork(parseCtx.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
    Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(sparkWork);
    skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
    // create N TableScans
    Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
    for (int k = 0; k < tags.length; k++) {
      Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(
          joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
      ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
      parentOps[k] = ts;
    }
    // create the MapJoinOperator
    String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues,
        newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i,
        joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(),
        dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
    mapJoinDescriptor.setTagOrder(tags);
    mapJoinDescriptor.setHandleSkewJoin(false);
    mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
    mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
    // temporarily, mark it as child of all the TS
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        joinOp.getCompilationOpContext(), mapJoinDescriptor, null, parentOps);
    // clone the original join operator, and replace it with the MJ
    // this makes sure MJ has the same downstream operator plan as the original join
    List<Operator<?>> reducerList = new ArrayList<Operator<?>>();
    reducerList.add(reduceWork.getReducer());
    Operator<? extends OperatorDesc> reducer = SerializationUtilities.cloneOperatorTree(reducerList).get(0);
    Preconditions.checkArgument(reducer instanceof JoinOperator,
        "Reducer should be join operator, but actually is " + reducer.getName());
    JoinOperator cloneJoinOp = (JoinOperator) reducer;
    List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> childOp : childOps) {
      childOp.replaceParent(cloneJoinOp, mapJoinOp);
    }
    mapJoinOp.setChildOperators(childOps);
    // set memory usage for the MJ operator
    setMemUsage(mapJoinOp, skewJoinMapJoinTask, parseCtx);
    // create N MapWorks and add them to the SparkWork
    MapWork bigMapWork = null;
    Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
    for (int j = 0; j < tags.length; j++) {
      MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
      sparkWork.add(mapWork);
      // This code has been only added for testing
      boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(
          HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
      mapWork.setMapperCannotSpanPartns(mapperCannotSpanPartns);
      Operator<? extends OperatorDesc> tableScan = parentOps[j];
      String alias = tags[j].toString();
      ArrayList<String> aliases = new ArrayList<String>();
      aliases.add(alias);
      Path path;
      if (j == i) {
        path = bigKeysDirMap.get(tags[j]);
        bigKeysDirToTaskMap.put(path, skewJoinMapJoinTask);
        bigMapWork = mapWork;
      } else {
        path = smallTblDirs.get(tags[j]);
      }
      mapWork.addPathToAlias(path, aliases);
      mapWork.getAliasToWork().put(alias, tableScan);
      PartitionDesc partitionDesc = new PartitionDesc(tableDescList.get(tags[j]), null);
      mapWork.addPathToPartitionInfo(path, partitionDesc);
      mapWork.getAliasToPartnInfo().put(alias, partitionDesc);
      mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
    }
    // connect all small dir map work to the big dir map work
    Preconditions.checkArgument(bigMapWork != null, "Haven't identified big dir MapWork");
    // these 2 flags are intended only for the big-key map work
    bigMapWork.setNumMapTasks(HiveConf.getIntVar(hiveConf,
        HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
    bigMapWork.setMinSplitSize(HiveConf.getLongVar(hiveConf,
        HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
    // use HiveInputFormat so that we can control the number of map tasks
    bigMapWork.setInputformat(HiveInputFormat.class.getName());
    for (BaseWork work : sparkWork.getRoots()) {
      Preconditions.checkArgument(work instanceof MapWork,
          "All root work should be MapWork, but got " + work.getClass().getSimpleName());
      if (work != bigMapWork) {
        sparkWork.connect(work, bigMapWork, new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE));
      }
    }
    // insert SparkHashTableSink and Dummy operators
    for (int j = 0; j < tags.length; j++) {
      if (j != i) {
        insertSHTS(tags[j], (TableScanOperator) parentOps[j], bigMapWork);
      }
    }
    listWorks.add(skewJoinMapJoinTask.getWork());
    listTasks.add(skewJoinMapJoinTask);
  }
  if (children != null) {
    for (Task<? extends Serializable> tsk : listTasks) {
      for (Task<? extends Serializable> oldChild : children) {
        tsk.addDependentTask(oldChild);
      }
    }
    currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
    for (Task<? extends Serializable> oldChild : children) {
      oldChild.getParentTasks().remove(currTask);
    }
    listTasks.addAll(children);
    for (Task<? extends Serializable> oldChild : children) {
      listWorks.add(oldChild.getWork());
    }
  }
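  // Note: at run time, ConditionalResolverSkewJoin inspects the big-key directories recorded in
  // bigKeysDirToTaskMap and launches only the map-join tasks whose directories actually received
  // skewed rows; the context object built below carries that directory-to-task mapping.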
  ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx context =
      new ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  cndTsk.setResolver(new ConditionalResolverSkewJoin());
  cndTsk.setResolverCtx(context);
  currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
  currTask.addDependentTask(cndTsk);
}
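Two details from the loop above are worth pulling out: the big-key MapWork takes its task-count and split-size limits from HiveConf, and it is pinned to HiveInputFormat because, as the upstream comment says, that is what lets the number of map tasks be controlled. A hedged sketch of reading the same settings directly (illustration only, not part of GenSparkSkewJoinProcessor):

HiveConf hiveConf = new HiveConf();
// the hive.skewjoin.mapjoin.* settings back these two ConfVars
int numMapTasks = HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK);
long minSplitSize = HiveConf.getLongVar(hiveConf, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT);
// forcing the plain HiveInputFormat on a job conf mirrors bigMapWork.setInputformat(...) above
JobConf job = new JobConf(hiveConf);
job.set("hive.input.format", HiveInputFormat.class.getName());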