Example usage of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector from the Apache Hive project.
From the class TestInputOutputFormat, method testVectorReaderNoFooterSerialize.
@Test
public void testVectorReaderNoFooterSerialize() throws Exception {
  // Point the job at a mock filesystem table with footer serialization in
  // splits disabled, then verify the split metadata and that the vectorized
  // record reader issues exactly two filesystem read ops per file.
  MockFileSystem fs = new MockFileSystem(conf);
  MockPath tablePath = new MockPath(fs, "mock:///mocktable3");
  conf.set("hive.orc.splits.include.file.footer", "false");
  conf.set("mapred.input.dir", tablePath.toString());
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());

  StructObjectInspector rowInspector;
  // Reflection object inspectors are cached globally; guard against other
  // tests building inspectors concurrently (same convention as TestOrcFile).
  synchronized (TestOrcFile.class) {
    rowInspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  JobConf jobConf = createMockExecutionEnvironment(
      workDir, new Path("mock:///"), "mocktable3", rowInspector, true, 0);

  // Create two identical 10-row ORC files, one per bucket.
  for (String bucket : new String[] {"0_0", "0_1"}) {
    Writer bucketWriter = OrcFile.createWriter(new Path(tablePath + "/" + bucket),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(rowInspector));
    for (int row = 0; row < 10; ++row) {
      bucketWriter.addRow(new MyRow(row, 2 * row));
    }
    bucketWriter.close();
  }

  OrcInputFormat inputFormat = new OrcInputFormat();
  InputSplit[] splits = inputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);

  // Snapshot the mock filesystem's read-op counter before opening readers.
  int readOpsBefore = -1;
  for (FileSystem.Statistics stats : FileSystem.getAllStatistics()) {
    if (stats.getScheme().equalsIgnoreCase("mock")) {
      readOpsBefore = stats.getReadOps();
    }
  }
  assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

  for (InputSplit split : splits) {
    assertTrue("OrcSplit is expected", split instanceof OrcSplit);
    // ETL strategies will have start=3 (start of first stripe).
    String description = split.toString();
    assertTrue(description.contains("start=3"));
    assertTrue(description.contains("hasFooter=false"));
    assertTrue(description.contains("hasBase=true"));
    assertTrue(description.contains("deltas=0"));
    assertFalse("No footer serialize test for vector reader, hasFooter is not expected in orc splits.",
        ((OrcSplit) split).hasFooter());
    inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
  }

  // Measure how many read operations the readers issued.
  int readOpsDelta = -1;
  for (FileSystem.Statistics stats : FileSystem.getAllStatistics()) {
    if (stats.getScheme().equalsIgnoreCase("mock")) {
      readOpsDelta = stats.getReadOps() - readOpsBefore;
    }
  }
  // Two opens per split — one to read the footer and one to read the data —
  // for both mock:/mocktable3/0_0 and mock:/mocktable3/0_1.
  assertEquals(4, readOpsDelta);

  // Revert back to the local fs so subsequent tests are unaffected.
  conf.set("fs.defaultFS", "file:///");
}
Example usage of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector from the Apache Hive project.
From the class TestInputOutputFormat, method testMROutput.
@Test
public void testMROutput() throws Exception {
  // Write three NestedRow records through the MapReduce OutputFormat, then
  // read them back with only column 1 ("r", a struct) projected and verify
  // the unprojected column is null while the struct fields round-trip.
  StructObjectInspector inspector;
  // Reflection object inspectors are cached globally; guard against other
  // tests building inspectors concurrently.
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  AbstractSerDe serde = new OrcSerde();
  OutputFormat<?, ?> outputFormat = new OrcOutputFormat();
  RecordWriter recordWriter = outputFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
  // Rows written are (1,2,3), (4,5,6), (7,8,9).
  for (int base = 0; base < 3; ++base) {
    recordWriter.write(NullWritable.get(),
        serde.serialize(new NestedRow(3 * base + 1, 3 * base + 2, 3 * base + 3), inspector));
  }
  recordWriter.close(Reporter.NULL);

  // Re-initialize a fresh serde with the table schema for the read side.
  serde = new OrcSerde();
  Properties properties = new Properties();
  properties.setProperty("columns", "z,r");
  properties.setProperty("columns.types", "int:struct<x:int,y:int>");
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  inspector = (StructObjectInspector) serde.getObjectInspector();

  InputFormat<?, ?> inputFormat = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = inputFormat.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // Project only column index 1 ("r"); column 0 ("z") should read back null.
  ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
  conf.set("columns", "z,r");
  conf.set("columns.types", "int:struct<x:int,y:int>");
  org.apache.hadoop.mapred.RecordReader reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();

  List<? extends StructField> topFields = inspector.getAllStructFieldRefs();
  StructObjectInspector structInspector = (StructObjectInspector) topFields.get(1).getFieldObjectInspector();
  List<? extends StructField> innerFields = structInspector.getAllStructFieldRefs();
  IntObjectInspector intInspector = (IntObjectInspector) topFields.get(0).getFieldObjectInspector();

  int rowsRead = 0;
  while (reader.next(key, value)) {
    // "z" was not projected, so it must come back as null.
    assertEquals(null, inspector.getStructFieldData(value, topFields.get(0)));
    Object struct = inspector.getStructFieldData(value, topFields.get(1));
    assertEquals(3 * rowsRead + 1, intInspector.get(structInspector.getStructFieldData(struct, innerFields.get(0))));
    assertEquals(3 * rowsRead + 2, intInspector.get(structInspector.getStructFieldData(struct, innerFields.get(1))));
    rowsRead += 1;
  }
  assertEquals(3, rowsRead);
  reader.close();
}
Example usage of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector from the Apache Hive project.
From the class TestInputOutputFormat, method testACIDReaderNoFooterSerialize.
// Reads two ORC "bucket" files through the ACID (transactional) code path with
// footer serialization disabled, then checks that each OrcSplit carries no
// serialized footer and that exactly 4 filesystem read ops are issued
// (footer + data for each of the two files). The read-op accounting here
// depends on the exact sequence of calls below.
@Test
public void testACIDReaderNoFooterSerialize() throws Exception {
MockFileSystem fs = new MockFileSystem(conf);
MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
// Enable the ACID/transactional scan path; the schema-evolution column
// properties are required when the transactional scan flag is on.
conf.set("hive.transactional.table.scan", "true");
conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
// Do not serialize file footers into the splits themselves.
conf.set("hive.orc.splits.include.file.footer", "false");
conf.set("mapred.input.dir", mockPath.toString());
conf.set("fs.defaultFS", "mock:///");
conf.set("fs.mock.impl", MockFileSystem.class.getName());
StructObjectInspector inspector;
// Reflection object inspectors are cached globally; synchronize to avoid
// races with other tests building inspectors concurrently.
synchronized (TestOrcFile.class) {
inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
// Write two identical 10-row ORC files (two buckets of the mock table).
Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
for (int i = 0; i < 10; ++i) {
writer.addRow(new MyRow(i, 2 * i));
}
writer.close();
writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
for (int i = 0; i < 10; ++i) {
writer.addRow(new MyRow(i, 2 * i));
}
writer.close();
OrcInputFormat orcInputFormat = new OrcInputFormat();
InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
assertEquals(2, splits.length);
// Snapshot the mock filesystem's read-op counter before opening readers.
int readOpsBefore = -1;
for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
if (statistics.getScheme().equalsIgnoreCase("mock")) {
readOpsBefore = statistics.getReadOps();
}
}
assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
for (InputSplit split : splits) {
assertTrue("OrcSplit is expected", split instanceof OrcSplit);
// ETL strategies will have start=3 (start of first stripe)
assertTrue(split.toString().contains("start=3"));
assertTrue(split.toString().contains("hasFooter=false"));
assertTrue(split.toString().contains("hasBase=true"));
assertTrue(split.toString().contains("deltas=0"));
if (split instanceof OrcSplit) {
assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
}
// Opening the reader is what triggers the filesystem reads counted below.
orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
}
// Measure how many read operations the readers issued.
int readOpsDelta = -1;
for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
if (statistics.getScheme().equalsIgnoreCase("mock")) {
readOpsDelta = statistics.getReadOps() - readOpsBefore;
}
}
// call-1: open to read footer - split 1 => mock:/mocktable5/0_0
// call-2: open to read data - split 1 => mock:/mocktable5/0_0
// call-3: open to read footer - split 2 => mock:/mocktable5/0_1
// call-4: open to read data - split 2 => mock:/mocktable5/0_1
assertEquals(4, readOpsDelta);
// revert back to local fs
conf.set("fs.defaultFS", "file:///");
}
Example usage of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector from the Apache Hive project.
From the class TestInputOutputFormat, method testRowNumberUniquenessInDifferentSplits.
/**
 * Tests that {@link RecordReader#getRowNumber()} returns absolute, globally
 * consistent row numbers when one file is read through multiple splits:
 * split i is expected to start at row i * 5000.
 * Also see {@link TestOrcFile#testPredicatePushdown()}.
 *
 * @throws Exception on any failure writing or reading the test file
 */
@Test
public void testRowNumberUniquenessInDifferentSplits() throws Exception {
  Properties properties = new Properties();
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  StructObjectInspector inspector;
  // Reflection object inspectors are cached globally; guard against other
  // tests building inspectors concurrently.
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // Save the conf variable values so that they can be restored later.
  long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
  long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);

  // Set the conf variable values for this test.
  long newStripeSize = 10000L; // 10000 bytes per stripe
  // 100 bytes per split. NOTE: the previous comment here claimed 1024 bytes,
  // which did not match the value actually set.
  long newMaxSplitSize = 100L;
  conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
  conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);

  AbstractSerDe serde = new OrcSerde();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
  // The following loop should create 20 stripes in the orc file.
  for (int i = 0; i < newStripeSize * 10; ++i) {
    writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
  }
  writer.close(true);

  // Re-initialize the serde for the read side and sanity-check the schema.
  serde = new OrcSerde();
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
  inspector = (StructObjectInspector) serde.getObjectInspector();
  assertEquals("struct<x:int,y:int>", inspector.getTypeName());

  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  int numExpectedSplits = 20;
  InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
  assertEquals(numExpectedSplits, splits.length);

  for (int i = 0; i < numExpectedSplits; ++i) {
    OrcSplit split = (OrcSplit) splits[i];
    Reader.Options orcReaderOptions = new Reader.Options();
    orcReaderOptions.range(split.getStart(), split.getLength());
    OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
    Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
    RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
    // Each split is expected to cover 5000 rows; getRowNumber() must be
    // absolute within the file, not relative to the split's start offset.
    for (int j = 0; recordReader.hasNext(); j++) {
      long rowNum = (i * 5000) + j;
      long rowNumActual = recordReader.getRowNumber();
      assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
      // Advance the reader; the row contents themselves are not checked here
      // (the previous code stored the result in an unused local).
      recordReader.next(null);
    }
    recordReader.close();
  }

  // Reset the conf variable values that we changed for this test.
  if (oldDefaultStripeSize != -1L) {
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
  } else {
    // Nothing was previously set for the default stripe size, so unset it.
    conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
  }
  if (oldMaxSplitSize != -1L) {
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
  } else {
    // Nothing was previously set for the max split size, so unset it.
    // (The previous comment here wrongly said "stripe size" — copy/paste.)
    conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
  }
}
Example usage of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector from the Apache Hive project.
From the class OrcFileGenerator, method generateOrcFile.
/**
 * Generates an orc file based on the provided record class in the specified file system
 * at the output path.
 *
 * @param conf the configuration used to initialize the orc writer
 * @param fs the file system which will contain the generated orc file
 * @param outputPath the path where the generated orc file will be placed
 * @param recordClass a class that defines the record format for the generated orc file; this
 * class must have exactly one public constructor
 * @throws UnsupportedOperationException if the record class does not have exactly one
 * constructor, or if a parameter type has no registered batch generator
 */
public static void generateOrcFile(Configuration conf, FileSystem fs, Path outputPath, Class<?> recordClass)
    throws IOException, InstantiationException, IllegalAccessException, InvocationTargetException {
  ObjectInspector inspector;
  // Reflection object inspectors are cached globally; synchronize to avoid
  // races with tests building inspectors concurrently.
  synchronized (TestVectorizedORCReader.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        recordClass, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(fs, outputPath, conf, inspector, 100000, CompressionKind.ZLIB, 10000, 10000);
  try {
    // Parameterized (wildcard) reflection types replace the raw Class /
    // Constructor[] the original code used.
    Constructor<?>[] constructors = recordClass.getConstructors();
    if (constructors.length != 1) {
      throw new UnsupportedOperationException("The provided recordClass must have exactly one constructor.");
    }
    BatchDataDistribution[] dataDist = BatchDataDistribution.values();
    Class<?>[] columns = constructors[0].getParameterTypes();
    // Emit 3 full batches per data distribution; column c of batch i uses
    // distribution (i + c) % dataDist.length so columns are decorrelated.
    for (int i = 0; i < dataDist.length * 3; i++) {
      Object[][] rows = new Object[columns.length][VectorizedRowBatch.DEFAULT_SIZE];
      for (int c = 0; c < columns.length; c++) {
        if (!TYPE_TO_BATCH_GEN_MAP.containsKey(columns[c])) {
          throw new UnsupportedOperationException("No batch generator defined for type " + columns[c].getName());
        }
        rows[c] = TYPE_TO_BATCH_GEN_MAP.get(columns[c]).generateBatch(dataDist[(i + c) % dataDist.length]);
      }
      // Transpose the column-major generated data into row objects and write them.
      for (int r = 0; r < VectorizedRowBatch.DEFAULT_SIZE; r++) {
        Object[] row = new Object[columns.length];
        for (int c = 0; c < columns.length; c++) {
          row[c] = rows[c][r];
        }
        writer.addRow(constructors[0].newInstance(row));
      }
    }
  } finally {
    // Always close the writer so the ORC footer is written even on failure.
    writer.close();
  }
}
Aggregations