Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
From the class TestVectorCoalesceElt, the method doVectorCastTest:
private boolean doVectorCastTest(TypeInfo typeInfo, int iteration, List<String> columns, String[] columnNames, TypeInfo[] typeInfos, DataTypePhysicalVariation[] dataTypePhysicalVariations, List<ExprNodeDesc> children, GenericUDF udf, ExprNodeGenericFuncDesc exprDesc, CoalesceEltTestMode coalesceEltTestMode, VectorRandomBatchSource batchSource, ObjectInspector objectInspector, TypeInfo outputTypeInfo, Object[] resultObjects) throws Exception {
HiveConf hiveConf = new HiveConf();
if (coalesceEltTestMode == CoalesceEltTestMode.ADAPTOR) {
hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_TEST_VECTOR_ADAPTOR_OVERRIDE, true);
}
VectorizationContext vectorizationContext = new VectorizationContext("name", columns, Arrays.asList(typeInfos), Arrays.asList(dataTypePhysicalVariations), hiveConf);
VectorExpression vectorExpression = vectorizationContext.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION);
vectorExpression.transientInit(hiveConf);
if (coalesceEltTestMode == CoalesceEltTestMode.VECTOR_EXPRESSION && vectorExpression instanceof VectorUDFAdaptor) {
System.out.println("*NO NATIVE VECTOR EXPRESSION* typeInfo " + typeInfo.toString() + " coalesceEltTestMode " + coalesceEltTestMode + " vectorExpression " + vectorExpression.toString());
}
// System.out.println("*VECTOR EXPRESSION* " + vectorExpression.getClass().getSimpleName());
/*
System.out.println(
"*DEBUG* typeInfo " + typeInfo.toString() +
" coalesceEltTestMode " + coalesceEltTestMode +
" vectorExpression " + vectorExpression.toString());
*/
VectorRandomRowSource rowSource = batchSource.getRowSource();
VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
    columnNames,
    rowSource.typeInfos(),
    rowSource.dataTypePhysicalVariations(),
    /* dataColumnNums */ null,
    /* partitionColumnCount */ 0,
    /* virtualColumnCount */ 0,
    /* neededVirtualColumns */ null,
    vectorizationContext.getScratchColumnTypeNames(),
    vectorizationContext.getScratchDataTypePhysicalVariations());
VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
VectorExtractRow resultVectorExtractRow = new VectorExtractRow();
resultVectorExtractRow.init(new TypeInfo[] { outputTypeInfo }, new int[] { vectorExpression.getOutputColumnNum() });
Object[] scratchRow = new Object[1];
batchSource.resetBatchIteration();
int rowIndex = 0;
while (true) {
if (!batchSource.fillNextBatch(batch)) {
break;
}
vectorExpression.evaluate(batch);
extractResultObjects(batch, rowIndex, resultVectorExtractRow, scratchRow, objectInspector, resultObjects);
rowIndex += batch.size;
}
return true;
}
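The helper extractResultObjects called in the loop above is not shown on this page. The following is a minimal sketch of what such a helper typically does in these vectorization tests, assuming VectorExtractRow.extractRow(batch, batchIndex, row) and ObjectInspectorUtils.copyToStandardObject behave as their names suggest; it is an illustration, not the test's actual implementation.
private void extractResultObjects(VectorizedRowBatch batch, int rowIndex, VectorExtractRow resultVectorExtractRow, Object[] scratchRow, ObjectInspector objectInspector, Object[] resultObjects) {
    // Walk the batch honoring the selected-in-use mapping, pull each row's single
    // output column into scratchRow, and copy it out as a standard writable object.
    boolean selectedInUse = batch.selectedInUse;
    int[] selected = batch.selected;
    for (int logicalIndex = 0; logicalIndex < batch.size; logicalIndex++) {
        int batchIndex = selectedInUse ? selected[logicalIndex] : logicalIndex;
        resultVectorExtractRow.extractRow(batch, batchIndex, scratchRow);
        resultObjects[rowIndex++] = ObjectInspectorUtils.copyToStandardObject(
            scratchRow[0], objectInspector, ObjectInspectorUtils.ObjectInspectorCopyOption.WRITABLE);
    }
}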
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
From the class TestInputOutputFormat, the method createMockExecutionEnvironment:
/**
 * Create a mock execution environment with enough detail that
 * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
 * explode.
 * @param workDir a local filesystem work directory
 * @param warehouseDir a mock filesystem warehouse directory
 * @param tableName the table name
 * @param objectInspector object inspector for the row
 * @param isVectorized whether the job should run vectorized
 * @param partitions the number of partitions to create under the table
 * @param currFileSystemName the FileSystem implementation class name to register for the mock scheme
 * @return a JobConf that contains the necessary information
 * @throws IOException
 * @throws HiveException
 */
static JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions, String currFileSystemName) throws IOException, HiveException {
JobConf conf = new JobConf();
Utilities.clearWorkMap(conf);
conf.set("hive.exec.plan", workDir.toString());
conf.set("mapred.job.tracker", "local");
String isVectorizedString = Boolean.toString(isVectorized);
conf.set("hive.vectorized.execution.enabled", isVectorizedString);
conf.set(Utilities.VECTOR_MODE, isVectorizedString);
conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
conf.set("fs.mock.impl", currFileSystemName);
conf.set("mapred.mapper.class", ExecMapper.class.getName());
Path root = new Path(warehouseDir, tableName);
// clean out previous contents
((MockFileSystem) root.getFileSystem(conf)).clear();
// build partition strings
String[] partPath = new String[partitions];
StringBuilder buffer = new StringBuilder();
for (int p = 0; p < partitions; ++p) {
partPath[p] = new Path(root, "p=" + p).toString();
if (p != 0) {
buffer.append(',');
}
buffer.append(partPath[p]);
}
conf.set("mapred.input.dir", buffer.toString());
StringBuilder columnIds = new StringBuilder();
StringBuilder columnNames = new StringBuilder();
StringBuilder columnTypes = new StringBuilder();
StructObjectInspector structOI = (StructObjectInspector) objectInspector;
List<? extends StructField> fields = structOI.getAllStructFieldRefs();
int numCols = fields.size();
for (int i = 0; i < numCols; ++i) {
if (i != 0) {
columnIds.append(',');
columnNames.append(',');
columnTypes.append(',');
}
columnIds.append(i);
columnNames.append(fields.get(i).getFieldName());
columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName());
}
conf.set("hive.io.file.readcolumn.ids", columnIds.toString());
conf.set("partition_columns", "p");
conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);
fs.clear();
Properties tblProps = new Properties();
tblProps.put("name", tableName);
tblProps.put("serialization.lib", OrcSerde.class.getName());
tblProps.put("columns", columnNames.toString());
tblProps.put("columns.types", columnTypes.toString());
TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
MapWork mapWork = new MapWork();
mapWork.setVectorMode(isVectorized);
if (isVectorized) {
VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
vectorizedRowBatchCtx.init(structOI, new String[0]);
mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
}
mapWork.setUseBucketizedHiveInputFormat(false);
Map<Path, List<String>> aliasMap = new LinkedHashMap<>();
List<String> aliases = new ArrayList<String>();
aliases.add(tableName);
LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
for (int p = 0; p < partitions; ++p) {
Path path = new Path(partPath[p]);
aliasMap.put(path, aliases);
LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
PartitionDesc part = new PartitionDesc(tbl, partSpec);
if (isVectorized) {
part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false, null));
}
partMap.put(path, part);
}
mapWork.setPathToAliases(aliasMap);
mapWork.setPathToPartitionInfo(partMap);
// write the plan out
FileSystem localFs = FileSystem.getLocal(conf).getRaw();
Path mapXml = new Path(workDir, "map.xml");
localFs.delete(mapXml, true);
FSDataOutputStream planStream = localFs.create(mapXml);
SerializationUtilities.serializePlan(mapWork, planStream);
conf.setBoolean(Utilities.HAS_MAP_WORK, true);
planStream.close();
return conf;
}
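A hedged usage sketch for the method above. The paths, table name, and row-inspector helper are illustrative placeholders, not values taken from the actual tests.
// Hypothetical invocation from a test method (names and paths are placeholders):
Path workDir = new Path(System.getProperty("test.tmp.dir", "target/tmp"));
Path warehouseDir = new Path("mock:///warehouse");
ObjectInspector rowInspector = getRowObjectInspector(); // hypothetical helper returning the row's StructObjectInspector
JobConf job = createMockExecutionEnvironment(workDir, warehouseDir, "mock_table",
    rowInspector, /* isVectorized */ true, /* partitions */ 3, MockFileSystem.class.getName());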
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
From the class LlapInputFormat, the method createFakeVrbCtx:
static VectorizedRowBatchCtx createFakeVrbCtx(MapWork mapWork) throws HiveException {
// This is based on Vectorizer code, minus the validation.
// Add all non-virtual columns from the TableScan operator.
RowSchema rowSchema = findTsOp(mapWork).getSchema();
final List<String> colNames = new ArrayList<String>(rowSchema.getSignature().size());
final List<TypeInfo> colTypes = new ArrayList<TypeInfo>(rowSchema.getSignature().size());
ArrayList<VirtualColumn> virtualColumnList = new ArrayList<>(2);
for (ColumnInfo c : rowSchema.getSignature()) {
String columnName = c.getInternalName();
if (ALLOWED_VIRTUAL_COLUMNS.containsKey(columnName)) {
virtualColumnList.add(ALLOWED_VIRTUAL_COLUMNS.get(columnName));
} else if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
continue;
}
colNames.add(columnName);
colTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(c.getTypeName()));
}
// Determine the partition columns using the first partition descriptor.
// Note - like vectorizer, this assumes partition columns go after data columns.
int partitionColumnCount = 0;
Iterator<Path> paths = mapWork.getPathToAliases().keySet().iterator();
if (paths.hasNext()) {
PartitionDesc partDesc = mapWork.getPathToPartitionInfo().get(paths.next());
if (partDesc != null) {
LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();
if (partSpec != null && !partSpec.isEmpty()) {
partitionColumnCount = partSpec.size();
}
}
}
final VirtualColumn[] virtualColumns = virtualColumnList.toArray(new VirtualColumn[0]);
return new VectorizedRowBatchCtx(colNames.toArray(new String[colNames.size()]), colTypes.toArray(new TypeInfo[colTypes.size()]), null, null, partitionColumnCount, virtualColumns.length, virtualColumns, new String[0], null);
}
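The nine-argument VectorizedRowBatchCtx constructor recurs throughout these examples. The sketch below spells out each position as inferred from the calls on this page (the inline comments in doVectorCastTest and the arguments passed here and in the ACID reader tests); the placeholder values are chosen only for illustration.
// Placeholder arguments; real callers substitute their own row schema.
String[] rowColumnNames = { "c0" };                              // names of the row's columns
TypeInfo[] rowColumnTypeInfos = { TypeInfoFactory.longTypeInfo };// type info per column
DataTypePhysicalVariation[] rowDataTypePhysicalVariations = null;// per-column variations, or null
int[] dataColumnNums = null;                                     // projected data columns, or null for all
int partitionColumnCount = 0;                                    // partition columns come after the data columns
int virtualColumnCount = 0;                                      // number of virtual columns
VirtualColumn[] neededVirtualColumns = new VirtualColumn[0];     // virtual columns to populate
String[] scratchColumnTypeNames = new String[0];                 // scratch columns used by vector expressions
DataTypePhysicalVariation[] scratchDataTypePhysicalVariations = null;
VectorizedRowBatchCtx ctx = new VectorizedRowBatchCtx(
    rowColumnNames, rowColumnTypeInfos, rowDataTypePhysicalVariations, dataColumnNums,
    partitionColumnCount, virtualColumnCount, neededVirtualColumns,
    scratchColumnTypeNames, scratchDataTypePhysicalVariations);
VectorizedRowBatch batch = ctx.createVectorizedRowBatch();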
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
From the class TestVectorizedOrcAcidRowBatchReader, the method testFetchDeletedRows:
private void testFetchDeletedRows() throws Exception {
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
// Mark one of the transactions as an exception to test that invalid transactions
// are being handled properly.
// Exclude transaction 5
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:14:1:1:5");
// enable fetching deleted rows
conf.set(Constants.ACID_FETCH_DELETED_ROWS, "true");
// Project ROW__IS__DELETED
VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx(
    new String[] { "payload", VirtualColumn.ROWISDELETED.getName() },
    new TypeInfo[] { TypeInfoFactory.longTypeInfo, VirtualColumn.ROWISDELETED.getTypeInfo() },
    new DataTypePhysicalVariation[] { DataTypePhysicalVariation.NONE, DataTypePhysicalVariation.NONE },
    new int[] { 0 }, 0, 1,
    new VirtualColumn[] { VirtualColumn.ROWISDELETED },
    new String[0],
    new DataTypePhysicalVariation[] { DataTypePhysicalVariation.NONE, DataTypePhysicalVariation.NONE });
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, rbCtx);
VectorizedRowBatch vectorizedRowBatch = rbCtx.createVectorizedRowBatch();
// set data column count as 1.
vectorizedRowBatch.setPartitionInfo(1, 0);
long previousPayload = Long.MIN_VALUE;
while (vectorizedReader.next(null, vectorizedRowBatch)) {
LongColumnVector col = (LongColumnVector) vectorizedRowBatch.cols[0];
LongColumnVector rowIsDeletedColumnVector = (LongColumnVector) vectorizedRowBatch.cols[1];
for (int i = 0; i < vectorizedRowBatch.size; ++i) {
int idx = vectorizedRowBatch.selected[i];
long payload = col.vector[idx];
long owid = (payload / NUM_ROWID_PER_OWID) + 1;
long rowId = payload % NUM_ROWID_PER_OWID;
if (rowId % 2 == 0 || rowId % 3 == 0) {
assertEquals(1, rowIsDeletedColumnVector.vector[idx]);
} else {
assertEquals(0, rowIsDeletedColumnVector.vector[idx]);
}
// Check that writeid#5 has been excluded.
assertTrue(owid != 5);
// Check that the data is in sorted order.
assertTrue(payload >= previousPayload);
previousPayload = payload;
}
}
}
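The assertions in the loop above encode the expected deletion pattern: a row should be flagged deleted when its rowId is divisible by 2 or by 3, presumably matching how the delete deltas were created elsewhere in the test class (not shown on this page). A small hypothetical helper restating that expectation; the method name and shape are illustrative, not part of the test.
// Hypothetical helper mirroring the assertions above.
static boolean expectedDeleted(long payload, long numRowIdPerOwid) {
    long rowId = payload % numRowIdPerOwid;   // same decoding as the loop above
    return rowId % 2 == 0 || rowId % 3 == 0;  // rows the delete deltas are expected to cover
}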
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
From the class TestVectorizedOrcAcidRowBatchReader, the method testDeleteEventFiltering:
/**
 * Tests that we can figure out the min/max ROW__ID for each split and then use
 * that to load only the delete events between min and max.
 * This test doesn't actually check what is read - that is done in more
 * end-to-end unit tests.
 * @throws Exception
 */
private void testDeleteEventFiltering() throws Exception {
boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
int bucket = 0;
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(false).minimumWriteId(1).maximumWriteId(1).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
int bucketProperty = BucketCodec.V1.encode(options);
// create 3 insert deltas so that we have 3 splits
RecordUpdater updater = new OrcRecordUpdater(root, options);
// In the first delta add 2000 recs to simulate recs in multiple stripes.
int numRows = 2000;
for (int i = 1; i <= numRows; i++) {
updater.insert(options.getMinimumWriteId(), new DummyRow(i, i - 1, options.getMinimumWriteId(), bucket));
}
updater.close(false);
options.minimumWriteId(2).maximumWriteId(2);
updater = new OrcRecordUpdater(root, options);
updater.insert(options.getMinimumWriteId(), new DummyRow(4, 0, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(5, 1, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(6, 2, options.getMinimumWriteId(), bucket));
updater.close(false);
options.minimumWriteId(3).maximumWriteId(3);
updater = new OrcRecordUpdater(root, options);
updater.insert(options.getMinimumWriteId(), new DummyRow(7, 0, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(8, 1, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(9, 2, options.getMinimumWriteId(), bucket));
updater.close(false);
// delete 1 row from each of the insert deltas
options.minimumWriteId(4).maximumWriteId(4);
updater = new OrcRecordUpdater(root, options);
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 1, bucket));
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 1, 2, bucket));
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 3, bucket));
updater.close(false);
conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
// HWM is not important - just make sure deltas created above are read as
// if committed
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:5:" + Long.MAX_VALUE + "::");
// now we have 3 delete events total, but for each split we should only
// load 1 into DeleteRegistry (if filtering is on)
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
assertEquals(1, splitStrategies.size());
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
assertEquals(3, splits.size());
assertEquals(root.toUri().toString() + File.separator + "delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
assertFalse(splits.get(0).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "delta_0000002_0000002_0000/bucket_00000", splits.get(1).getPath().toUri().toString());
assertFalse(splits.get(1).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "delta_0000003_0000003_0000/bucket_00000", splits.get(2).getPath().toUri().toString());
assertFalse(splits.get(2).isOriginal());
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(1, bucketProperty, 0), new RecordIdentifier(1, bucketProperty, numRows - 1)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, new VectorizedRowBatchCtx());
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(2, bucketProperty, 0), new RecordIdentifier(2, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, new VectorizedRowBatchCtx());
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(3, bucketProperty, 0), new RecordIdentifier(3, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
}
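To make the filtering arithmetic above concrete: each split's min/max ROW__ID interval covers exactly one of the three delete events written under writeId 4 (the DummyRow deletes target original writeIds 1, 2, and 3). The loop below is a minimal sketch of that containment reasoning, built only from values visible in the test; it is an illustration, not code from the reader itself.
// Split 2 covers (writeId 2, bucketProperty, rowIds 0..2); only the delete aimed at
// (2, bucketProperty, 1) falls inside it, so with filtering on the registry holds 1 event.
RecordIdentifier min = new RecordIdentifier(2, bucketProperty, 0);
RecordIdentifier max = new RecordIdentifier(2, bucketProperty, 2);
RecordIdentifier[] deleteKeys = {
    new RecordIdentifier(1, bucketProperty, 0),
    new RecordIdentifier(2, bucketProperty, 1),
    new RecordIdentifier(3, bucketProperty, 2)
};
int loaded = 0;
for (RecordIdentifier key : deleteKeys) {
    if (key.compareTo(min) >= 0 && key.compareTo(max) <= 0) {
        loaded++;
    }
}
// loaded == 1, matching the filterOn branch of the "stripe 2" assertion above;
// without filtering, all 3 delete events would be loaded for every split.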