Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache: class TestAutoInputFormat, method testFormat.
@SuppressWarnings({ "unchecked", "deprecation" })
@Test
public void testFormat() throws IOException {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path txtFile = new Path(dir, "auto.txt");
  Path seqFile = new Path(dir, "auto.seq");
  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);
  Writer txtWriter = new OutputStreamWriter(fs.create(txtFile));
  try {
    for (int i = 0; i < LINES_COUNT; i++) {
      txtWriter.write("" + (10 * i));
      txtWriter.write("\n");
    }
  } finally {
    txtWriter.close();
  }
  SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, seqFile,
      IntWritable.class, LongWritable.class);
  try {
    for (int i = 0; i < RECORDS_COUNT; i++) {
      IntWritable key = new IntWritable(11 * i);
      LongWritable value = new LongWritable(12 * i);
      seqWriter.append(key, value);
    }
  } finally {
    seqWriter.close();
  }
  AutoInputFormat format = new AutoInputFormat();
  InputSplit[] splits = format.getSplits(job, SPLITS_COUNT);
  for (InputSplit split : splits) {
    RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    try {
      while (reader.next(key, value)) {
        if (key instanceof LongWritable) {
          assertEquals("Wrong value class.", Text.class, value.getClass());
          assertTrue("Invalid value", Integer.parseInt(((Text) value).toString()) % 10 == 0);
        } else {
          assertEquals("Wrong key class.", IntWritable.class, key.getClass());
          assertEquals("Wrong value class.", LongWritable.class, value.getClass());
          assertTrue("Invalid key.", ((IntWritable) key).get() % 11 == 0);
          assertTrue("Invalid value.", ((LongWritable) value).get() % 12 == 0);
        }
      }
    } finally {
      reader.close();
    }
  }
}
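The test above relies on AutoInputFormat choosing a reader per file type: line-oriented reading for the text file and key/value reading for the SequenceFile. As a rough illustration of the underlying idea, the helper below is a hypothetical class (not part of Hadoop) that peeks at a file's first bytes and branches on the SequenceFile magic header "SEQ"; the real AutoInputFormat's detection may differ in its details.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: sniff whether a file is a SequenceFile by its magic bytes.
class FormatSniffer {
  static boolean isSequenceFile(Configuration conf, Path file) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    try (FSDataInputStream in = fs.open(file)) {
      byte[] magic = new byte[3];
      // SequenceFiles start with the ASCII bytes 'S', 'E', 'Q'.
      return in.read(magic) == 3
          && magic[0] == 'S' && magic[1] == 'E' && magic[2] == 'Q';
    }
  }
}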
Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache: class TestDelegatingInputFormat, method testSplitting.
@Test
public void testSplitting() throws Exception {
  JobConf conf = new JobConf();
  MiniDFSCluster dfs = null;
  try {
    dfs = new MiniDFSCluster.Builder(conf).numDataNodes(4)
        .racks(new String[] { "/rack0", "/rack0", "/rack1", "/rack1" })
        .hosts(new String[] { "host0", "host1", "host2", "host3" })
        .build();
    FileSystem fs = dfs.getFileSystem();
    Path path = getPath("/foo/bar", fs);
    Path path2 = getPath("/foo/baz", fs);
    Path path3 = getPath("/bar/bar", fs);
    Path path4 = getPath("/bar/baz", fs);
    final int numSplits = 100;
    MultipleInputs.addInputPath(conf, path, TextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(conf, path2, TextInputFormat.class, MapClass2.class);
    MultipleInputs.addInputPath(conf, path3, KeyValueTextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(conf, path4, TextInputFormat.class, MapClass2.class);
    DelegatingInputFormat inFormat = new DelegatingInputFormat();
    InputSplit[] splits = inFormat.getSplits(conf, numSplits);
    int[] bins = new int[3];
    for (InputSplit split : splits) {
      assertTrue(split instanceof TaggedInputSplit);
      final TaggedInputSplit tis = (TaggedInputSplit) split;
      int index = -1;
      if (tis.getInputFormatClass().equals(KeyValueTextInputFormat.class)) {
        // path3
        index = 0;
      } else if (tis.getMapperClass().equals(MapClass.class)) {
        // path
        index = 1;
      } else {
        // path2 and path4
        index = 2;
      }
      bins[index]++;
    }
    // Each bin is a distinct Mapper/InputFormat combination and should receive
    // exactly numSplits splits, regardless of the number of paths that use
    // that Mapper/InputFormat.
    for (int count : bins) {
      assertEquals(numSplits, count);
    }
    assertTrue(true);
  } finally {
    if (dfs != null) {
      dfs.shutdown();
    }
  }
}
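For context, a minimal driver sketch of the MultipleInputs wiring this test exercises is shown below. The paths are placeholders and IdentityMapper stands in for the test's MapClass/MapClass2; MultipleInputs.addInputPath registers an InputFormat/Mapper pair per path, and DelegatingInputFormat later tags each split with the pair registered for its path.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsDriverSketch {
  public static void configure(JobConf conf) {
    // Each path gets its own InputFormat/Mapper pair; splits produced for a
    // path are routed through the format and mapper registered for it.
    MultipleInputs.addInputPath(conf, new Path("/data/plain"),    // placeholder path
        TextInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/keyvalue"), // placeholder path
        KeyValueTextInputFormat.class, IdentityMapper.class);
  }
}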
Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache: class TestPipesNonJavaInputFormat, method testFormat.
/**
 * test PipesNonJavaInputFormat
 */
@Test
public void testFormat() throws IOException {
  PipesNonJavaInputFormat inputFormat = new PipesNonJavaInputFormat();
  JobConf conf = new JobConf();
  Reporter reporter = mock(Reporter.class);
  RecordReader<FloatWritable, NullWritable> reader =
      inputFormat.getRecordReader(new FakeSplit(), conf, reporter);
  assertEquals(0.0f, reader.getProgress(), 0.001);
  // input and output files
  File input1 = new File(workSpace + File.separator + "input1");
  if (!input1.getParentFile().exists()) {
    Assert.assertTrue(input1.getParentFile().mkdirs());
  }
  if (!input1.exists()) {
    Assert.assertTrue(input1.createNewFile());
  }
  File input2 = new File(workSpace + File.separator + "input2");
  if (!input2.exists()) {
    Assert.assertTrue(input2.createNewFile());
  }
  // set data for splits
  conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
      StringUtils.escapeString(input1.getAbsolutePath()) + ","
          + StringUtils.escapeString(input2.getAbsolutePath()));
  InputSplit[] splits = inputFormat.getSplits(conf, 2);
  assertEquals(2, splits.length);
  PipesNonJavaInputFormat.PipesDummyRecordReader dummyRecordReader =
      new PipesNonJavaInputFormat.PipesDummyRecordReader(conf, splits[0]);
  // empty dummyRecordReader
  assertNull(dummyRecordReader.createKey());
  assertNull(dummyRecordReader.createValue());
  assertEquals(0, dummyRecordReader.getPos());
  assertEquals(0.0, dummyRecordReader.getProgress(), 0.001);
  // test method next
  assertTrue(dummyRecordReader.next(new FloatWritable(2.0f), NullWritable.get()));
  assertEquals(2.0, dummyRecordReader.getProgress(), 0.001);
  dummyRecordReader.close();
}
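Judging only from the assertions above, PipesDummyRecordReader appears to treat the key passed to next() as a progress value reported by the external (pipes) process rather than as real record data. The sketch below is a hypothetical reader with that contract, written against the old-API RecordReader interface; it is an illustration, not the Hadoop implementation.

import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

// Illustrative reader: carries no records, only a progress value pushed via next().
class ProgressOnlyRecordReader implements RecordReader<FloatWritable, NullWritable> {
  private float progress = 0.0f;

  @Override
  public boolean next(FloatWritable key, NullWritable value) throws IOException {
    // The "record" carries no data; the key is reused to report progress.
    progress = key.get();
    return true;
  }

  @Override
  public FloatWritable createKey() { return null; }  // no real keys

  @Override
  public NullWritable createValue() { return null; } // no real values

  @Override
  public long getPos() throws IOException { return 0; }

  @Override
  public float getProgress() throws IOException { return progress; }

  @Override
  public void close() throws IOException { }
}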
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache: class GenericUDTFGetSplits, method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema)
    throws IOException {
  DAG dag = DAG.create(work.getName());
  dag.setCredentials(job.getCredentials());
  DagUtils utils = DagUtils.getInstance();
  Context ctx = new Context(job);
  MapWork mapWork = (MapWork) work.getAllWork().get(0);
  // A bunch of things get set up in the context based on conf, but we need only
  // the MR tmp directory for the following method.
  JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
  // TODO: should we also whitelist input formats here? from mapred.input.format.class
  Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
  FileSystem fs = scratchDir.getFileSystem(job);
  try {
    LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
    Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr,
        new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
    String vertexName = wx.getName();
    dag.addVertex(wx);
    utils.addCredentials(mapWork, dag);
    // We have the DAG now; proceed to get the splits.
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
    HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
    List<Event> eventList = splitGenerator.initialize();
    InputSplit[] result = new InputSplit[eventList.size() - 1];
    InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
    List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
    Preconditions.checkState(hints.size() == eventList.size() - 1);
    if (LOG.isDebugEnabled()) {
      LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
    }
    LlapCoordinator coordinator = LlapCoordinator.getInstance();
    if (coordinator == null) {
      throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with "
          + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
    }
    // See the discussion in the implementation as to why we generate the app ID.
    ApplicationId applicationId = coordinator.createExtClientAppId();
    // This assumes the LLAP cluster owner is always the HS2 user.
    String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
    String queryUser = null;
    byte[] tokenBytes = null;
    LlapSigner signer = null;
    if (UserGroupInformation.isSecurityEnabled()) {
      signer = coordinator.getLlapSigner(job);
      // 1. Generate the token for the query user (applies to all splits).
      queryUser = SessionState.getUserFromAuthenticator();
      if (queryUser == null) {
        queryUser = UserGroupInformation.getCurrentUser().getUserName();
        LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
      }
      LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
      // We put the query user, not the LLAP user, into the message and token.
      Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
      LOG.info("Created the token for remote user: {}", token);
      bos.reset();
      token.write(dos);
      tokenBytes = bos.toByteArray();
    } else {
      queryUser = UserGroupInformation.getCurrentUser().getUserName();
    }
    LOG.info("Number of splits: " + (eventList.size() - 1));
    SignedMessage signedSvs = null;
    for (int i = 0; i < eventList.size() - 1; i++) {
      TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName,
          eventList.size() - 1, applicationId, i);
      // 2. Generate the vertex/submit information for all events.
      if (i == 0) {
        // The queryId could either be picked up from the current request being processed, or
        // generated. The current request isn't exactly correct since the query is 'done' once we
        // return the results. Generating a new one has the added benefit of working once this
        // is moved out of a UDTF into a proper API.
        // Setting this to the generated AppId, which is unique.
        // Despite the differences in TaskSpec, the vertex spec should be the same.
        signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser,
            applicationId.toString());
      }
      SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(),
          taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
      byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
      // 3. Generate the input event.
      SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
      // 4. Make location hints.
      SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
      result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature,
          locations, schema, llapUser, tokenBytes);
    }
    return result;
  } catch (Exception e) {
    throw new IOException(e);
  }
}
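The bos/dos pair used above for the token (and again in process() below for each split) appear to be a ByteArrayOutputStream wrapped in a DataOutputStream, held as fields of the UDTF. The standalone helper below sketches the same Writable-to-byte[] pattern with local streams; the class and method names are illustrative, not Hive's.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Illustrative helper: serialize any Writable (a Token, an InputSplit, ...) to bytes.
final class WritableBytes {
  static byte[] toBytes(Writable w) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(bos);
    w.write(dos);  // the Writable serializes itself onto the stream
    dos.flush();
    return bos.toByteArray();
  }
}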
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache: class GenericUDTFGetSplits, method process.
@Override
public void process(Object[] arguments) throws HiveException {
  String query = stringOI.getPrimitiveJavaObject(arguments[0]);
  int num = intOI.get(arguments[1]);
  PlanFragment fragment = createPlanFragment(query, num);
  TezWork tezWork = fragment.work;
  Schema schema = fragment.schema;
  try {
    for (InputSplit s : getSplits(jc, num, tezWork, schema)) {
      Object[] os = new Object[1];
      bos.reset();
      s.write(dos);
      byte[] frozen = bos.toByteArray();
      os[0] = frozen;
      forward(os);
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
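Each forwarded row therefore carries one split serialized as a byte[]. A consumer can rehydrate it through the usual Writable readFields path; the generic helper below is a sketch of that reverse step, with the concrete split class left to the caller (names are illustrative).

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Illustrative helper: populate a pre-constructed Writable from serialized bytes.
final class WritableFromBytes {
  static <T extends Writable> T fromBytes(byte[] bytes, T instance) throws IOException {
    try (DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes))) {
      instance.readFields(dis);  // the Writable reads its own fields back
    }
    return instance;
  }
}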