Use of org.apache.hadoop.hive.shims.CombineHiveKey in project hive by apache.
From the class TestInputOutputFormat, method testCombinationInputFormat.
// test non-vectorized, non-acid, combine
@Test
public void testCombinationInputFormat() throws Exception {
  // get the object inspector for MyRow
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
      "combination", inspector, false, 1);
  // write the orc file to the mock file system
  Path partDir = new Path(conf.get("mapred.input.dir"));
  Writer writer = OrcFile.createWriter(new Path(partDir, "0_0"),
      OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  Path path = new Path("mock:/combination/p=0/0_0");
  setBlocks(path, conf, new MockBlock("host0", "host1"));
  MockFileSystem mockFs = (MockFileSystem) partDir.getFileSystem(conf);
  int length0 = getLength(path, conf);
  writer = OrcFile.createWriter(new Path(partDir, "1_0"),
      OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 10; i < 20; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  Path path1 = new Path("mock:/combination/p=0/1_0");
  setBlocks(path1, conf, new MockBlock("host1", "host2"));
  // call getsplits
  HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
  InputSplit[] splits = inputFormat.getSplits(conf, 1);
  assertEquals(1, splits.length);
  CombineHiveInputFormat.CombineHiveInputSplit split =
      (CombineHiveInputFormat.CombineHiveInputSplit) splits[0];
  // check split
  assertEquals(2, split.getNumPaths());
  assertEquals(partDir.toString() + "/0_0", split.getPath(0).toString());
  assertEquals(partDir.toString() + "/1_0", split.getPath(1).toString());
  assertEquals(length0, split.getLength(0));
  assertEquals(getLength(path1, conf), split.getLength(1));
  assertEquals(0, split.getOffset(0));
  assertEquals(0, split.getOffset(1));
  // hadoop-1 gets 3 and hadoop-2 gets 0. *sigh*
  // best answer would be 1.
  assertTrue(3 >= split.getLocations().length);
  // read split
  org.apache.hadoop.mapred.RecordReader<CombineHiveKey, OrcStruct> reader =
      inputFormat.getRecordReader(split, conf, Reporter.NULL);
  CombineHiveKey key = reader.createKey();
  OrcStruct value = reader.createValue();
  for (int i = 0; i < 20; i++) {
    assertEquals(true, reader.next(key, value));
    assertEquals(i, ((IntWritable) value.getFieldValue(0)).get());
  }
  assertEquals(false, reader.next(key, value));
}
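The test above receives CombineHiveKey directly as the record reader's key type because CombineHiveInputFormat wraps the underlying ORC key when it stitches several files into one split. The following is a minimal sketch of that wrap/unwrap relationship, not taken from the Hive test; it assumes the CombineHiveKey(Object) constructor that accompanies the getKey() accessor used throughout these snippets.

import org.apache.hadoop.hive.shims.CombineHiveKey;
import org.apache.hadoop.io.NullWritable;

public class CombineHiveKeyUnwrapSketch {
  public static void main(String[] args) {
    // stand-in for an ORC or RCFile key produced by the wrapped input format
    Object underlying = NullWritable.get();
    // CombineHiveKey is only a thin wrapper around that key (assumed constructor)
    CombineHiveKey wrapped = new CombineHiveKey(underlying);
    // consumers call getKey() to recover the original key object
    Object recovered = wrapped.getKey();
    System.out.println(recovered == underlying); // expected: true
  }
}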
Use of org.apache.hadoop.hive.shims.CombineHiveKey in project hive by apache.
From the class ColumnTruncateMapper, method map.
@Override
public void map(Object k, RCFileValueBufferWrapper value,
    OutputCollector<Object, Object> output, Reporter reporter) throws IOException {
  try {
    RCFileKeyBufferWrapper key = null;
    if (k instanceof CombineHiveKey) {
      key = (RCFileKeyBufferWrapper) ((CombineHiveKey) k).getKey();
    } else {
      key = (RCFileKeyBufferWrapper) k;
    }
    if (work.getListBucketingCtx().calculateListBucketingLevel() > 0) {
      if (!this.tmpPathFixedConcatenate) {
        fixTmpPathConcatenate(key.getInputPath().getParent(),
            work.getListBucketingCtx().calculateListBucketingLevel());
        tmpPathFixedConcatenate = true;
      }
    }
    if (outWriter == null) {
      codec = key.getCodec();
      columnNumber = key.getKeyBuffer().getColumnNumber();
      RCFileOutputFormat.setColumnNumber(jc, columnNumber);
      outWriter = new RCFile.Writer(fs, jc, outPath, null, codec);
    }
    for (Integer i : work.getDroppedColumns()) {
      key.getKeyBuffer().nullColumn(i);
      value.getValueBuffer().nullColumn(i);
    }
    int keyLength = key.getKeyBuffer().getSize();
    int recordLength = key.getKeyBuffer().getSize();
    for (int columnLen : key.getKeyBuffer().getEachColumnValueLen()) {
      recordLength += columnLen;
    }
    outWriter.flushBlock(key.getKeyBuffer(), value.getValueBuffer(), recordLength,
        keyLength, key.getCompressedKeyLength());
  } catch (Throwable e) {
    this.exception = true;
    close();
    throw new IOException(e);
  }
}
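The first few lines of map() show the unwrap-if-wrapped idiom that recurs in every consumer below: when CombineHiveInputFormat is in play, the real key sits inside a CombineHiveKey. A hypothetical helper (not part of Hive) that centralizes the idiom, using only the getKey() accessor shown in these snippets, might look like this.

import org.apache.hadoop.hive.shims.CombineHiveKey;

public final class KeyUnwrapper {
  private KeyUnwrapper() {
  }

  /** Returns the underlying key, unwrapping a CombineHiveKey if necessary. */
  public static <T> T unwrap(Object k, Class<T> expected) {
    Object raw = (k instanceof CombineHiveKey) ? ((CombineHiveKey) k).getKey() : k;
    return expected.cast(raw);
  }
}

With such a helper, the branch at the top of the try block would collapse to a single line: RCFileKeyBufferWrapper key = KeyUnwrapper.unwrap(k, RCFileKeyBufferWrapper.class);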
Use of org.apache.hadoop.hive.shims.CombineHiveKey in project hive by apache.
From the class RCFileMergeOperator, method processKeyValuePairs.
private void processKeyValuePairs(Object k, Object v) throws HiveException {
  try {
    RCFileKeyBufferWrapper key;
    if (k instanceof CombineHiveKey) {
      key = (RCFileKeyBufferWrapper) ((CombineHiveKey) k).getKey();
    } else {
      key = (RCFileKeyBufferWrapper) k;
    }
    RCFileValueBufferWrapper value = (RCFileValueBufferWrapper) v;
    fixTmpPath(key.getInputPath().getParent());
    if (outWriter == null) {
      codec = key.getCodec();
      columnNumber = key.getKeyBuffer().getColumnNumber();
      RCFileOutputFormat.setColumnNumber(jc, columnNumber);
      outWriter = new RCFile.Writer(fs, jc, outPath, null, codec);
    }
    boolean sameCodec = ((codec == key.getCodec())
        || codec.getClass().equals(key.getCodec().getClass()));
    if ((key.getKeyBuffer().getColumnNumber() != columnNumber) || (!sameCodec)) {
      throw new IOException("RCFileMerge failed because the input files"
          + " use different CompressionCodec or have different column number"
          + " setting.");
    }
    outWriter.flushBlock(key.getKeyBuffer(), value.getValueBuffer(),
        key.getRecordLength(), key.getKeyLength(), key.getCompressedKeyLength());
  } catch (Throwable e) {
    this.exception = true;
    closeOp(true);
    throw new HiveException(e);
  }
}
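The merge refuses to combine RCFile blocks whose compression codec or column count differ from the block that initialized the output writer. Pulled out into a standalone predicate for clarity, the rule looks roughly like the sketch below; the class and method names are illustrative, not Hive API, and the null handling is an addition for safety.

import org.apache.hadoop.io.compress.CompressionCodec;

final class RcMergeCompatibilitySketch {
  private RcMergeCompatibilitySketch() {
  }

  /** Two RCFile blocks can be merged only if codec and column count match. */
  static boolean canMerge(CompressionCodec currentCodec, int currentColumns,
                          CompressionCodec incomingCodec, int incomingColumns) {
    boolean sameCodec = (currentCodec == incomingCodec)
        || (currentCodec != null && incomingCodec != null
            && currentCodec.getClass().equals(incomingCodec.getClass()));
    return sameCodec && currentColumns == incomingColumns;
  }
}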
Use of org.apache.hadoop.hive.shims.CombineHiveKey in project hive by apache.
From the class OrcFileMergeOperator, method processKeyValuePairs.
private void processKeyValuePairs(Object key, Object value) throws HiveException {
  String filePath = "";
  try {
    OrcFileValueWrapper v;
    OrcFileKeyWrapper k;
    if (key instanceof CombineHiveKey) {
      k = (OrcFileKeyWrapper) ((CombineHiveKey) key).getKey();
    } else {
      k = (OrcFileKeyWrapper) key;
    }
    // skip incompatible files; files that are missing stripe statistics are marked incompatible
    if (k.isIncompatFile()) {
      LOG.warn("Incompatible ORC file merge! Stripe statistics is missing. " + k.getInputPath());
      incompatFileSet.add(k.getInputPath());
      return;
    }
    filePath = k.getInputPath().toUri().getPath();
    fixTmpPath(k.getInputPath().getParent());
    v = (OrcFileValueWrapper) value;
    if (prevPath == null) {
      prevPath = k.getInputPath();
      reader = OrcFile.createReader(fs, k.getInputPath());
      if (isLogInfoEnabled) {
        LOG.info("ORC merge file input path: " + k.getInputPath());
      }
    }
    // input files must match this configuration before merging, otherwise they will not be merged
    if (outWriter == null) {
      compression = k.getCompression();
      compressBuffSize = k.getCompressBufferSize();
      version = k.getVersion();
      columnCount = k.getTypes().get(0).getSubtypesCount();
      rowIndexStride = k.getRowIndexStride();
      OrcFile.WriterOptions options = OrcFile.writerOptions(jc)
          .compress(compression)
          .version(version)
          .rowIndexStride(rowIndexStride)
          .inspector(reader.getObjectInspector());
      // compression buffer size should only be set if compression is enabled
      if (compression != CompressionKind.NONE) {
        // enforce is required to retain the buffer sizes of old files instead of the orc writer
        // inferring the optimal buffer size
        options.bufferSize(compressBuffSize).enforceBufferSize();
      }
      outWriter = OrcFile.createWriter(outPath, options);
      if (isLogDebugEnabled) {
        LOG.info("ORC merge file output path: " + outPath);
      }
    }
    if (!checkCompatibility(k)) {
      incompatFileSet.add(k.getInputPath());
      return;
    }
    // next file in the path
    if (!k.getInputPath().equals(prevPath)) {
      reader = OrcFile.createReader(fs, k.getInputPath());
    }
    // initialize buffer to read the entire stripe
    byte[] buffer = new byte[(int) v.getStripeInformation().getLength()];
    fdis = fs.open(k.getInputPath());
    fdis.readFully(v.getStripeInformation().getOffset(), buffer, 0,
        (int) v.getStripeInformation().getLength());
    // append the stripe buffer to the new ORC file
    outWriter.appendStripe(buffer, 0, buffer.length, v.getStripeInformation(),
        v.getStripeStatistics());
    if (isLogInfoEnabled) {
      LOG.info("Merged stripe from file " + k.getInputPath() + " [ offset : "
          + v.getStripeInformation().getOffset() + " length: "
          + v.getStripeInformation().getLength() + " row: "
          + v.getStripeStatistics().getColStats(0).getNumberOfValues() + " ]");
    }
    // add user metadata to the footer, if any
    if (v.isLastStripeInFile()) {
      outWriter.appendUserMetadata(v.getUserMetadata());
    }
  } catch (Throwable e) {
    this.exception = true;
    LOG.error("Closing operator..Exception: " + ExceptionUtils.getStackTrace(e));
    throw new HiveException(e);
  } finally {
    if (exception) {
      closeOp(true);
    }
    if (fdis != null) {
      try {
        fdis.close();
      } catch (IOException e) {
        throw new HiveException(String.format("Unable to close file %s", filePath), e);
      } finally {
        fdis = null;
      }
    }
  }
}
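The body of checkCompatibility(k) is not shown in this excerpt. Judging by the writer settings captured when outWriter is created, a plausible shape is a field-by-field comparison against the incoming key, as in the sketch below; this is an assumption, not the actual Hive implementation, and the helper class and method names are made up.

import org.apache.hadoop.hive.ql.io.orc.OrcFileKeyWrapper;

final class OrcMergeCompatibilitySketch {
  private OrcMergeCompatibilitySketch() {
  }

  /**
   * Sketch only: an incoming file's writer settings must match those captured from
   * the first file before its stripes can be appended verbatim to the merged output.
   */
  static boolean compatible(OrcFileKeyWrapper first, OrcFileKeyWrapper incoming) {
    return first.getCompression() == incoming.getCompression()
        && first.getCompressBufferSize() == incoming.getCompressBufferSize()
        && first.getVersion().equals(incoming.getVersion())
        && first.getTypes().get(0).getSubtypesCount()
            == incoming.getTypes().get(0).getSubtypesCount()
        && first.getRowIndexStride() == incoming.getRowIndexStride();
  }
}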
Use of org.apache.hadoop.hive.shims.CombineHiveKey in project hive by apache.
From the class PartialScanMapper, method map.
@Override
public void map(Object k, RCFileValueBufferWrapper value,
    OutputCollector<Object, Object> output, Reporter reporter) throws IOException {
  if (rp == null) {
    this.rp = reporter;
    MapredContext.get().setReporter(reporter);
  }
  try {
    // CombineHiveInputFormat may be set in PartialScanTask.
    RCFileKeyBufferWrapper key = (RCFileKeyBufferWrapper)
        ((k instanceof CombineHiveKey) ? ((CombineHiveKey) k).getKey() : k);
    // calculate rawdatasize
    KeyBuffer keyBuffer = key.getKeyBuffer();
    long[] uncompressedColumnSizes = new long[keyBuffer.getColumnNumber()];
    for (int i = 0; i < keyBuffer.getColumnNumber(); i++) {
      uncompressedColumnSizes[i] += keyBuffer.getEachColumnUncompressedValueLen()[i];
    }
    if (uncompressedColumnSizes != null) {
      for (int i = 0; i < uncompressedColumnSizes.length; i++) {
        uncompressedFileSize += uncompressedColumnSizes[i];
      }
    }
    // calculate no. of rows
    rowNo += keyBuffer.getNumberRows();
  } catch (Throwable e) {
    this.exception = true;
    close();
    throw new IOException(e);
  }
}
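The per-column array in this map() only serves as a scratch buffer before everything is summed into uncompressedFileSize (the null check on a freshly allocated array is always true). A minimal sketch of the same accumulation written directly, assuming only the RCFile.KeyBuffer accessors used above; the class name is illustrative.

import org.apache.hadoop.hive.ql.io.RCFile;

final class PartialScanAccumulatorSketch {
  long uncompressedFileSize;
  long rowNo;

  void accumulate(RCFile.KeyBuffer keyBuffer) {
    // raw data size = sum of each column's uncompressed value length
    for (int columnLen : keyBuffer.getEachColumnUncompressedValueLen()) {
      uncompressedFileSize += columnLen;
    }
    // the row count comes straight from the block's key buffer
    rowNo += keyBuffer.getNumberRows();
  }
}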