Use of org.apache.hadoop.mapred.InputSplit in the Apache Hive project.
Class TestInputOutputFormat, method testSplitGenReadOpsLocalCacheChangeModificationTime.
/**
 * Counts the read operations that split generation issues against the mock file system:
 * on the first run, after each of the two files has been touched in turn, and finally
 * when nothing has changed since the previous run.
 *
 * @throws Exception if writing the ORC files or generating splits fails
 */
@Test
public void testSplitGenReadOpsLocalCacheChangeModificationTime() throws Exception {
  MockFileSystem fs = new MockFileSystem(conf);
  // creates the static cache
  MockPath mockPath = new MockPath(fs, "mock:///mocktbl2");
  conf.set("hive.orc.cache.use.soft.references", "true");
  conf.set("mapred.input.dir", mockPath.toString());
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  try {
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
          MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }

    // write two ORC files, 0_0 and 0_1, with ten rows each
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();

    // round 1: first split generation
    int readOpsBefore = readOpsOfMockFs();
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsDelta = readOpsOfMockFs() - readOpsBefore;
    // call-1: listLocatedStatus - mock:/mocktbl2
    // call-2: check side file for mock:/mocktbl2/0_0
    // call-3: open - mock:/mocktbl2/0_0
    // call-4: check side file for mock:/mocktbl2/0_1
    // call-5: open - mock:/mocktbl2/0_1
    assertEquals(5, readOpsDelta);

    // round 2: change file modification time and look for cache misses
    MockFileSystem mockFs = (MockFileSystem) FileSystem.get(conf);
    MockFile mockFile = mockFs.findFile(new Path(mockPath + "/0_0"));
    mockFs.touch(mockFile);
    readOpsBefore = readOpsOfMockFs();
    orcInputFormat = new OrcInputFormat();
    splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    readOpsDelta = readOpsOfMockFs() - readOpsBefore;
    // call-1: listLocatedStatus - mock:/mocktbl2
    // call-2: check side file for the one file that is read again
    // call-3: open - that same file
    // NOTE(review): the previous comment named 0_1 here although 0_0 was touched;
    // presumably the touched file is the one that is re-read - confirm.
    assertEquals(3, readOpsDelta);

    // round 3: touch the next file
    mockFs = (MockFileSystem) FileSystem.get(conf);
    mockFile = mockFs.findFile(new Path(mockPath + "/0_1"));
    mockFs.touch(mockFile);
    readOpsBefore = readOpsOfMockFs();
    orcInputFormat = new OrcInputFormat();
    splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    readOpsDelta = readOpsOfMockFs() - readOpsBefore;
    // call-1: listLocatedStatus - mock:/mocktbl2
    // call-2: check side file for the one file that is read again
    // call-3: open - that same file
    // NOTE(review): the previous comment named 0_0 here although 0_1 was touched - confirm.
    assertEquals(3, readOpsDelta);

    // round 4: nothing touched since the previous split generation
    readOpsBefore = readOpsOfMockFs();
    orcInputFormat = new OrcInputFormat();
    splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    readOpsDelta = readOpsOfMockFs() - readOpsBefore;
    // call-1: listLocatedStatus - mock:/mocktbl2
    assertEquals(1, readOpsDelta);
  } finally {
    // revert back to local fs, even when an assertion above fails
    conf.set("fs.defaultFS", "file:///");
  }
}

/**
 * Returns the read-op count currently recorded for the "mock" scheme. When several
 * statistics entries match, the last one wins.
 *
 * @return the recorded read ops, or -1 when no statistics exist for the "mock" scheme
 */
private static int readOpsOfMockFs() {
  int readOps = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOps = statistics.getReadOps();
    }
  }
  return readOps;
}
Use of org.apache.hadoop.mapred.InputSplit in the Apache Hive project.
Class TestInputOutputFormat, method testSplitGenReadOps.
/**
 * Writes two ORC files to the mock file system and checks how many read operations
 * a single split generation issues against it.
 *
 * @throws Exception if writing the ORC files or generating splits fails
 */
@Test
public void testSplitGenReadOps() throws Exception {
  MockFileSystem fs = new MockFileSystem(conf);
  conf.set("mapred.input.dir", "mock:///mocktable");
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  try {
    MockPath mockPath = new MockPath(fs, "mock:///mocktable");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
          MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }

    // write two ORC files, 0_0 and 0_1, with ten rows each
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();

    // snapshot the mock file system's read-op counter before generating splits
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
      if (statistics.getScheme().equalsIgnoreCase("mock")) {
        readOpsBefore = statistics.getReadOps();
      }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);

    // read ops consumed by getSplits alone
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
      if (statistics.getScheme().equalsIgnoreCase("mock")) {
        readOpsDelta = statistics.getReadOps() - readOpsBefore;
      }
    }
    // call-1: listLocatedStatus - mock:/mocktable
    // call-2: check existence of side file for mock:/mocktable/0_0
    // call-3: open - mock:/mocktable/0_0
    // call-4: check existence of side file for mock:/mocktable/0_1
    // call-5: open - mock:/mocktable/0_1
    assertEquals(5, readOpsDelta);
    assertEquals(2, splits.length);
  } finally {
    // revert back to local fs, even when an assertion above fails
    conf.set("fs.defaultFS", "file:///");
  }
}
Use of org.apache.hadoop.mapred.InputSplit in the Apache Hive project.
Class TestInputOutputFormat, method testACIDReaderFooterSerializeWithDeltas.
/**
 * Writes a plain ORC file plus one ACID delta into the mock file system, generates splits
 * with {@code hive.orc.splits.include.file.footer} enabled, and checks both that every
 * split carries a footer and how many read operations opening the record readers costs.
 *
 * @throws Exception if writing the files, generating splits or opening a reader fails
 */
@Test
public void testACIDReaderFooterSerializeWithDeltas() throws Exception {
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  try {
    // ensures that FS object is cached so that everyone uses the same instance
    FileSystem fs = FileSystem.get(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable8");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    conf.set("hive.orc.splits.include.file.footer", "true");
    conf.set("mapred.input.dir", mockPath.toString());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
          MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }

    // plain ORC file 0_0 with ten rows
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();

    // ten more rows inserted through a RecordUpdater for write id 1, bucket 1
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
        .bucket(1).minimumWriteId(1).maximumWriteId(1).inspector(inspector)
        .finalDestination(mockPath);
    OrcOutputFormat of = new OrcOutputFormat();
    RecordUpdater ru = of.getRecordUpdater(mockPath, options);
    for (int i = 0; i < 10; ++i) {
      ru.insert(options.getMinimumWriteId(), new MyRow(i, 2 * i));
    }
    // this deletes the side file
    ru.close(false);

    // set up props for read
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    AcidUtils.setAcidOperationalProperties(conf, true, null);
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);

    // snapshot the read-op counter after split generation so only reader creation is counted
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
      if (statistics.getScheme().equalsIgnoreCase("mock")) {
        readOpsBefore = statistics.getReadOps();
      }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

    for (InputSplit split : splits) {
      assertTrue("OrcSplit is expected", split instanceof OrcSplit);
      // ETL strategies will have start=3 (start of first stripe)
      assertTrue(split.toString().contains("start=3"));
      assertTrue(split.toString().contains("hasFooter=true"));
      assertTrue(split.toString().contains("hasBase=true"));
      // the message now matches the assertion: the footer must be present
      assertTrue("Footer serialize test for ACID reader, hasFooter is expected in orc splits.",
          ((OrcSplit) split).hasFooter());
      // NOTE(review): the returned reader is neither consumed nor closed; it is left that
      // way so the read-op count below is not disturbed - confirm whether closing is safe.
      orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
    }

    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
      if (statistics.getScheme().equalsIgnoreCase("mock")) {
        readOpsDelta = statistics.getReadOps() - readOpsBefore;
      }
    }
    // call-1: open to read data - split 1 => mock:/mocktable8/0_0
    // call-2: listLocatedFileStatus(mock:/mocktable8)
    // call-3: getFileStatus(mock:/mocktable8/delta_0000001_0000001_0000/_metadata_acid)
    // call-4: getFileStatus(mock:/mocktable8/delta_0000001_0000001_0000/_metadata_acid)
    // call-5: open(mock:/mocktable8/delta_0000001_0000001_0000/bucket_00001)
    assertEquals(5, readOpsDelta);
  } finally {
    // revert back to local fs, even when an assertion above fails
    conf.set("fs.defaultFS", "file:///");
  }
}
Use of org.apache.hadoop.mapred.InputSplit in the Apache Hive project.
Class TestOrcRawRecordMerger, method testRecordReaderNewBaseAndDelta.
/**
 * Test the RecordReader when there is a new base and a delta.
 * This test creates multiple stripes in both base and delta files which affects how many splits
 * are created on read. With ORC-228 this could be done in E2E fashion with a query or
 * streaming ingest writing data.
 * <p>
 * Every row returned by any split must match one entry of the expected result set, and every
 * expected entry must be returned exactly once.
 *
 * @see #testRecordReaderOldBaseAndDelta()
 * @throws Exception if writing the base/deltas or reading the splits fails
 */
@Test
public void testRecordReaderNewBaseAndDelta() throws Exception {
  final int BUCKET = 11;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testRecordReaderNewBaseAndDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // write the base
  // memory manager that notifies the writers after every two added rows
  MemoryManager mgr = new MemoryManagerImpl(conf) {
    int rowsAddedSinceCheck = 0;

    @Override
    public synchronized void addedRow(int rows) throws IOException {
      rowsAddedSinceCheck += rows;
      if (rowsAddedSinceCheck >= 2) {
        notifyWriters();
        rowsAddedSinceCheck = 0;
      }
    }
  };
  // make 5 stripes with 2 rows each
  OrcRecordUpdater.OrcOptions options = (OrcRecordUpdater.OrcOptions)
      new OrcRecordUpdater.OrcOptions(conf).writingBase(true).minimumWriteId(0)
          .maximumWriteId(0).bucket(BUCKET).inspector(inspector).filesystem(fs);
  final int BUCKET_PROPERTY = BucketCodec.V1.encode(options);
  options.orcOptions(OrcFile.writerOptions(conf).stripeSize(1).blockPadding(false)
      .compress(CompressionKind.NONE).memory(mgr).batchSize(2));
  options.finalDestination(root);
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1",
      "3.0", "ignore.4", "ignore.5", "ignore.6" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new BigRow(i, i, values[i], i, i));
  }
  ru.close(false);

  // write a delta
  options.writingBase(false).minimumWriteId(1).maximumWriteId(1).recordIdColumn(5);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET_PROPERTY));
    }
  }
  ru.delete(1, new BigRow(9, 0, BUCKET_PROPERTY));
  ru.close(false);

  // write a delta
  options.minimumWriteId(100).maximumWriteId(100);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
  // the last value ("3.1") is written separately below, hence length - 1
  for (int i = 0; i < values.length - 1; ++i) {
    if (values[i] != null) {
      ru.update(100, new BigRow(i, i, values[i], i, i, i, 0, BUCKET_PROPERTY));
    }
  }
  // do this before next update so that delete_delta is properly sorted
  ru.delete(100, new BigRow(8, 0, BUCKET_PROPERTY));
  // because row 8 was updated and thus has a different RecordIdentifier now
  ru.update(100, new BigRow(7, 7, values[values.length - 1], 7, 7, 2, 1, BUCKET_PROPERTY));
  ru.close(false);

  // expected rows; slots 8 and 9 stay null, and matched slots are nulled out during the scan
  MyResult[] expected = new MyResult[10];
  int k = 0;
  expected[k++] = new MyResult(0, "0.0");
  expected[k++] = new MyResult(1, "0.1");
  expected[k++] = new MyResult(2, "1.0");
  expected[k++] = new MyResult(3, "1.1");
  expected[k++] = new MyResult(4, "2.0");
  expected[k++] = new MyResult(5, "2.1");
  expected[k++] = new MyResult(6, "3.0");
  expected[k] = new MyResult(7, "3.1");

  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.min.split.size", "1");
  job.set("mapred.max.split.size", "2");
  job.set("mapred.input.dir", root.toString());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  AcidUtils.setAcidOperationalProperties(job, true, null);
  job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  InputSplit[] splits = inf.getSplits(job, 5);
  // base has 10 rows, so 5 splits, 1 delta has 2 rows so 1 split, and 1 delta has 3 so 2 splits
  assertEquals(8, splits.length);

  for (InputSplit split : splits) {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
        inf.getRecordReader(split, job, Reporter.NULL);
    try {
      NullWritable key = rr.createKey();
      OrcStruct value = rr.createValue();
      while (rr.next(key, value)) {
        MyResult mr = new MyResult(Integer.parseInt(value.getFieldValue(0).toString()),
            value.getFieldValue(2).toString());
        // tick the row off the expected list; a row may be matched only once
        int i = 0;
        for (; i < expected.length; i++) {
          if (mr.equals(expected[i])) {
            expected[i] = null;
            break;
          }
        }
        if (i >= expected.length) {
          // not found
          assertTrue("Found unexpected row: " + mr, false);
        }
      }
    } finally {
      // release the reader of this split before moving on to the next one
      rr.close();
    }
  }
  // anything still non-null was never produced by any split
  for (MyResult mr : expected) {
    assertTrue("Expected " + mr + " not found in any InputSplit", mr == null);
  }
}
Use of org.apache.hadoop.mapred.InputSplit in the Apache Hive project.
Class TestOrcRawRecordMerger, method testRecordReaderIncompleteDelta.
/**
 * Reads a table made of a closed base and a delta that is still being written: first
 * before the delta has been flushed, then again after more rows were inserted and flushed.
 *
 * @param use130Format true means use delta_0001_0001_0000 format, else delta_0001_00001
 * @throws Exception if writing the base/delta or reading the splits fails
 */
private void testRecordReaderIncompleteDelta(boolean use130Format) throws Exception {
  final int BUCKET = 1;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf).getRaw();
  Path root = new Path(tmpDir, "testRecordReaderIncompleteDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // write a base
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).writingBase(true)
      .minimumWriteId(0).maximumWriteId(0).bucket(BUCKET).inspector(inspector)
      .filesystem(fs).finalDestination(root);
  if (!use130Format) {
    options.statementId(-1);
  }
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "1", "2", "3", "4", "5" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new MyRow(values[i]));
  }
  ru.close(false);

  // write a delta
  // NOTE(review): this delta updater is flushed further down but never closed in this
  // method - confirm whether that is intentional.
  options.writingBase(false).minimumWriteId(10).maximumWriteId(19);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "6", "7", "8" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(1, new MyRow(values[i]));
  }

  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", root.toString());
  job.set("bucket_count", "2");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
  AcidUtils.setAcidOperationalProperties(job, true, null);
  job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);

  // read the keys before the delta is flushed
  InputSplit[] splits = inf.getSplits(job, 1);
  // 1 split since we only have 1 bucket file in base/. delta is not flushed (committed) yet, i.e. empty
  assertEquals(1, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
      inf.getRecordReader(splits[0], job, Reporter.NULL);
  // key and value holders are created once and reused for the later readers as well
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  try {
    System.out.println("Looking at split " + splits[0]);
    for (int i = 1; i < 6; ++i) {
      System.out.println("Checking row " + i);
      assertEquals(true, rr.next(key, value));
      assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));
  } finally {
    rr.close();
  }

  // flush the delta, add two more rows and flush again
  ru.flush();
  ru.flush();
  values = new String[] { "9", "10" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(3, new MyRow(values[i]));
  }
  ru.flush();

  splits = inf.getSplits(job, 1);
  assertEquals(2, splits.length);
  Path sideFile = new Path(root + "/"
      + (use130Format ? AcidUtils.deltaSubdir(10, 19, 0) : AcidUtils.deltaSubdir(10, 19))
      + "/bucket_00001_flush_length");
  assertEquals(true, fs.exists(sideFile));
  assertEquals(32, fs.getFileStatus(sideFile).getLen());

  // first split: the five base rows
  rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
  try {
    for (int i = 1; i <= 5; ++i) {
      assertEquals(true, rr.next(key, value));
      assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));
  } finally {
    rr.close();
  }

  // second split: the five flushed delta rows, 6 through 10
  rr = inf.getRecordReader(splits[1], job, Reporter.NULL);
  try {
    for (int i = 6; i < 11; ++i) {
      assertEquals("i=" + i, true, rr.next(key, value));
      assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));
  } finally {
    rr.close();
  }
}
Aggregations