Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class TestSymlinkTextInputFormat, method testCombine.
/**
 * Test combining symlink text input files. Two input dirs, each containing one
 * file; a symlink file is then created that references both files. Without
 * combining, this would normally produce at least 2 splits.
 */
public void testCombine() throws Exception {
  JobConf newJob = new JobConf(job);
  FileSystem fs = dataDir1.getFileSystem(newJob);
  int symbolLinkedFileSize = 0;
  Path dir1_file1 = new Path(dataDir1, "combinefile1_1");
  writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
  symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
  Path dir2_file1 = new Path(dataDir2, "combinefile2_1");
  writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
  symbolLinkedFileSize += fs.getFileStatus(dir2_file1).getLen();
  // A symlink file that references the file from the first dir and the file
  // from the second dir.
  writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "combinefile1_1"), new Path(dataDir2, "combinefile2_1"));
  HiveConf hiveConf = new HiveConf(TestSymlinkTextInputFormat.class);
  hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
  HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_REWORK_MAPREDWORK, true);
  HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, false);
  Driver drv = new Driver(hiveConf);
  String tblName = "text_symlink_text";
  String createSymlinkTableCmd = "create table " + tblName + " (key int) stored as " + " inputformat 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' " + " outputformat 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'";
  SessionState.start(hiveConf);
  boolean tblCreated = false;
  try {
    int ecode = 0;
    ecode = drv.run(createSymlinkTableCmd).getResponseCode();
    if (ecode != 0) {
      throw new Exception("Create table command: " + createSymlinkTableCmd + " failed with exit code= " + ecode);
    }
    tblCreated = true;
    String loadFileCommand = "LOAD DATA LOCAL INPATH '" + new Path(symlinkDir, "symlink_file").toString() + "' INTO TABLE " + tblName;
    ecode = drv.run(loadFileCommand).getResponseCode();
    if (ecode != 0) {
      throw new Exception("Load data command: " + loadFileCommand + " failed with exit code= " + ecode);
    }
    String cmd = "select key*1 from " + tblName;
    ecode = drv.compile(cmd);
    if (ecode != 0) {
      throw new Exception("Select compile: " + cmd + " failed with exit code= " + ecode);
    }
    // create scratch dir
    Context ctx = new Context(newJob);
    Path emptyScratchDir = ctx.getMRTmpPath();
    FileSystem fileSys = emptyScratchDir.getFileSystem(newJob);
    fileSys.mkdirs(emptyScratchDir);
    QueryPlan plan = drv.getPlan();
    MapRedTask selectTask = (MapRedTask) plan.getRootTasks().get(0);
    List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx, false);
    Utilities.setInputPaths(newJob, inputPaths);
    Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());
    CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, newJob);
    InputSplit[] retSplits = combineInputFormat.getSplits(newJob, 1);
    assertEquals(1, retSplits.length);
  } catch (Exception e) {
    e.printStackTrace();
    fail("Caught exception " + e);
  } finally {
    if (tblCreated) {
      drv.run("drop table text_symlink_text").getResponseCode();
    }
  }
}
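The writeTextFile and writeSymlinkFile helpers used above are not shown in this excerpt. For SymlinkTextInputFormat the "symlink" is simply a text file whose lines are the paths of the real data files, so minimal sketches of such helpers (signatures and field usage are assumptions, not code from the test class) could look like the following:

// Assumed helper sketches, not taken from the test class. They rely on the
// test's 'job' JobConf field and on org.apache.hadoop.fs.{FileSystem, Path,
// FSDataOutputStream} plus java.io.IOException being imported.
private void writeTextFile(Path file, String content) throws IOException {
  FileSystem fs = file.getFileSystem(job);
  try (FSDataOutputStream out = fs.create(file, true)) {
    out.writeBytes(content);
  }
}

// A "symlink" file for SymlinkTextInputFormat is just a text file that lists
// the target data file paths, one per line.
private void writeSymlinkFile(Path symlinkPath, Path... targets) throws IOException {
  FileSystem fs = symlinkPath.getFileSystem(job);
  try (FSDataOutputStream out = fs.create(symlinkPath, true)) {
    for (Path target : targets) {
      out.writeBytes(target.toString() + "\n");
    }
  }
}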
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class TestSymlinkTextInputFormat, method testAccuracy1.
/**
 * Test scenario: two data directories and one symlink file containing two
 * paths, each pointing to a file in one of the data directories.
 */
public void testAccuracy1() throws IOException {
  // First data dir contains 2 files.
  FileSystem fs = dataDir1.getFileSystem(job);
  int symbolLinkedFileSize = 0;
  Path dir1_file1 = new Path(dataDir1, "file1");
  writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
  symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
  Path dir1_file2 = new Path(dataDir1, "file2");
  writeTextFile(dir1_file2, "dir1_file2_line1\n" + "dir1_file2_line2\n");
  // Second data dir contains 2 files.
  Path dir2_file1 = new Path(dataDir2, "file1");
  writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
  Path dir2_file2 = new Path(dataDir2, "file2");
  writeTextFile(dir2_file2, "dir2_file2_line1\n" + "dir2_file2_line2\n");
  symbolLinkedFileSize += fs.getFileStatus(dir2_file2).getLen();
  // A symlink file that references the first file from the first dir and the
  // second file from the second dir.
  writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "file1"), new Path(dataDir2, "file2"));
  SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
  // test content summary
  ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
  assertEquals(symbolLinkedFileSize, cs.getLength());
  assertEquals(2, cs.getFileCount());
  assertEquals(0, cs.getDirectoryCount());
  FileInputFormat.setInputPaths(job, symlinkDir);
  InputSplit[] splits = inputFormat.getSplits(job, 2);
  log.info("Number of splits: " + splits.length);
  // Read all values.
  List<String> received = new ArrayList<String>();
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      received.add(value.toString());
    }
    reader.close();
  }
  List<String> expected = new ArrayList<String>();
  expected.add("dir1_file1_line1");
  expected.add("dir1_file1_line2");
  expected.add("dir2_file2_line1");
  expected.add("dir2_file2_line2");
  assertEquals(expected, received);
}
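The split-reading loop above is repeated verbatim in testAccuracy2 below. A small refactoring sketch (not code from the test; it assumes the same job and reporter fields used above) that drains every value from the splits of an old-API mapred InputFormat:

// Refactoring sketch (assumed, not part of the original test class).
private List<String> readAllValues(InputFormat<LongWritable, Text> inputFormat, InputSplit[] splits) throws IOException {
  List<String> received = new ArrayList<String>();
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
    try {
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      while (reader.next(key, value)) {
        received.add(value.toString());
      }
    } finally {
      reader.close();
    }
  }
  return received;
}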
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class TestSymlinkTextInputFormat, method testAccuracy2.
/**
 * Scenario: Empty input directory, i.e. no symlink file.
 *
 * Expected: Should return an empty result set without any exception.
 */
public void testAccuracy2() throws IOException {
  fileSystem.mkdirs(symlinkDir);
  FileInputFormat.setInputPaths(job, symlinkDir);
  SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
  ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
  assertEquals(0, cs.getLength());
  assertEquals(0, cs.getFileCount());
  assertEquals(0, cs.getDirectoryCount());
  InputSplit[] splits = inputFormat.getSplits(job, 2);
  log.info("Number of splits: " + splits.length);
  // Read all values.
  List<String> received = new ArrayList<String>();
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      received.add(value.toString());
    }
    reader.close();
  }
  List<String> expected = new ArrayList<String>();
  assertEquals(expected, received);
}
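The three TestSymlinkTextInputFormat methods above rely on fields (job, fileSystem, dataDir1, dataDir2, symlinkDir, reporter, log) that are initialized outside this excerpt. A minimal setup sketch, with the field names taken from the code above but the initialization details assumed, might look like:

// Assumed setup sketch for the fields used by the tests above; the temp-dir
// property and directory names are illustrative, not from the original class.
private Configuration conf;
private JobConf job;
private FileSystem fileSystem;
private Path dataDir1, dataDir2, symlinkDir;
private Reporter reporter = Reporter.NULL;

protected void setUp() throws IOException {
  conf = new Configuration();
  job = new JobConf(conf);
  fileSystem = FileSystem.getLocal(conf);
  Path baseDir = new Path(System.getProperty("test.tmp.dir", "target/tmp"), "TestSymlinkTextInputFormat");
  dataDir1 = new Path(baseDir, "datadir1");
  dataDir2 = new Path(baseDir, "datadir2");
  symlinkDir = new Path(baseDir, "symlinkdir");
  fileSystem.mkdirs(dataDir1);
  fileSystem.mkdirs(dataDir2);
}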
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class TestInputOutputFormat, method testACIDReaderNoFooterSerialize.
@Test
public void testACIDReaderNoFooterSerialize() throws Exception {
  MockFileSystem fs = new MockFileSystem(conf);
  MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
  conf.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
  conf.set("hive.orc.splits.include.file.footer", "false");
  conf.set("mapred.input.dir", mockPath.toString());
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  OrcInputFormat orcInputFormat = new OrcInputFormat();
  InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);
  int readOpsBefore = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsBefore = statistics.getReadOps();
    }
  }
  assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
  for (InputSplit split : splits) {
    assertTrue("OrcSplit is expected", split instanceof OrcSplit);
    // ETL strategies will have start=3 (start of first stripe)
    assertTrue(split.toString().contains("start=3"));
    assertTrue(split.toString().contains("hasFooter=false"));
    assertTrue(split.toString().contains("hasBase=true"));
    assertTrue(split.toString().contains("deltas=0"));
    assertTrue(split.toString().contains("isOriginal=true"));
    if (split instanceof OrcSplit) {
      assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
    }
    orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
  }
  int readOpsDelta = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsDelta = statistics.getReadOps() - readOpsBefore;
    }
  }
  // call-1: open to read footer - split 1 => mock:/mocktable5/0_0
  // call-2: open to read data - split 1 => mock:/mocktable5/0_0
  // call-3: getAcidState - split 1 => mock:/mocktable5 (to compute offset for original read)
  // call-4: open to read footer - split 2 => mock:/mocktable5/0_1
  // call-5: open to read data - split 2 => mock:/mocktable5/0_1
  // call-6: getAcidState - split 2 => mock:/mocktable5 (to compute offset for original read)
  // call-7: open to read footer - split 2 => mock:/mocktable5/0_0 (to get row count)
  // call-8: file status - split 2 => mock:/mocktable5/0_0
  assertEquals(8, readOpsDelta);
  // revert back to local fs
  conf.set("fs.defaultFS", "file:///");
}
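The pattern of scanning FileSystem.getAllStatistics() for the mock scheme appears before and after split generation in this test and the next one. A small helper sketch (assumed, not part of TestInputOutputFormat) that captures it:

// Sketch of an assumed helper: return the current read-op count recorded for
// the given FileSystem scheme, or -1 if no statistics exist for that scheme.
private static int readOpsForScheme(String scheme) {
  int readOps = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase(scheme)) {
      readOps = statistics.getReadOps();
    }
  }
  return readOps;
}

With it, the bookkeeping above would reduce to readOpsBefore = readOpsForScheme("mock") before the reads and readOpsDelta = readOpsForScheme("mock") - readOpsBefore afterwards.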
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class TestInputOutputFormat, method testSplitGenReadOpsLocalCacheChangeFileLen.
@Test
public void testSplitGenReadOpsLocalCacheChangeFileLen() throws Exception {
  MockFileSystem fs = new MockFileSystem(conf);
  // creates the static cache
  MockPath mockPath = new MockPath(fs, "mock:///mocktbl1");
  conf.set("mapred.input.dir", mockPath.toString());
  conf.set("fs.defaultFS", "mock:///");
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 10; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  int readOpsBefore = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsBefore = statistics.getReadOps();
    }
  }
  assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
  OrcInputFormat orcInputFormat = new OrcInputFormat();
  InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);
  int readOpsDelta = -1;
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsDelta = statistics.getReadOps() - readOpsBefore;
    }
  }
  // call-1: listLocatedStatus - mock:/mocktbl1
  // call-2: check side file for mock:/mocktbl1/0_0
  // call-3: open - mock:/mocktbl1/0_0
  // call-4: check side file for mock:/mocktbl1/0_1
  // call-5: open - mock:/mocktbl1/0_1
  assertEquals(5, readOpsDelta);
  // change file length and look for cache misses
  fs.clear();
  writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 100; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < 100; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsBefore = statistics.getReadOps();
    }
  }
  orcInputFormat = new OrcInputFormat();
  splits = orcInputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsDelta = statistics.getReadOps() - readOpsBefore;
    }
  }
  // call-1: listLocatedStatus - mock:/mocktbl1
  // call-2: check side file for mock:/mocktbl1/0_0
  // call-3: open - mock:/mocktbl1/0_0
  // call-4: check side file for mock:/mocktbl1/0_1
  // call-5: open - mock:/mocktbl1/0_1
  assertEquals(5, readOpsDelta);
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsBefore = statistics.getReadOps();
    }
  }
  orcInputFormat = new OrcInputFormat();
  splits = orcInputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);
  for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
    if (statistics.getScheme().equalsIgnoreCase("mock")) {
      readOpsDelta = statistics.getReadOps() - readOpsBefore;
    }
  }
  // call-1: listLocatedStatus - mock:/mocktbl1
  assertEquals(1, readOpsDelta);
  // revert back to local fs
  conf.set("fs.defaultFS", "file:///");
}
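The create-writer/add-rows/close block appears four times in this test. A sketch of a helper that would factor it out (the helper name is assumed, not from the original class):

// Assumed helper sketch: write 'rowCount' MyRow records to 'path' using the
// same ORC writer options as the tests above.
private static void writeMyRowFile(Configuration conf, Path path, StructObjectInspector inspector, int rowCount) throws IOException {
  Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
  for (int i = 0; i < rowCount; ++i) {
    writer.addRow(new MyRow(i, 2 * i));
  }
  writer.close();
}

With that, writeMyRowFile(conf, new Path(mockPath + "/0_0"), inspector, 10) would replace the first block. The test's point is unchanged: split generation re-reads the file footers (5 read ops) after the files change length, but serves them from the local footer cache (1 read op, just the directory listing) when the files are unchanged.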