Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class HiveInputFormat, method getRecordReader.
@Override
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  HiveInputSplit hsplit = (HiveInputSplit) split;
  InputSplit inputSplit = hsplit.getInputSplit();
  String inputFormatClassName = null;
  Class inputFormatClass = null;
  try {
    // Resolve the input format recorded in the split.
    inputFormatClassName = hsplit.inputFormatClassName();
    inputFormatClass = job.getClassByName(inputFormatClassName);
  } catch (Exception e) {
    throw new IOException("cannot find class " + inputFormatClassName, e);
  }
  if (this.mrwork == null || pathToPartitionInfo == null) {
    init(job);
  }
  boolean nonNative = false;
  PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
  }
  try {
    if ((part != null) && (part.getTableDesc() != null)) {
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
      nonNative = part.getTableDesc().isNonNative();
    }
  } catch (HiveException e) {
    throw new IOException(e);
  }
  Path splitPath = hsplit.getPath();
  // Push projected columns and filter expressions down to the underlying format.
  pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
  InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
  try {
    // Wrap the format so reads can be served through LLAP where applicable.
    inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
  } catch (HiveException e) {
    throw new IOException(e);
  }
  RecordReader innerReader = null;
  try {
    innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
  } catch (Exception e) {
    // Let the configured IO exception handlers decide whether to recover or rethrow.
    innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
  }
  HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
  rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
  return rr;
}
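For context, a caller drives any mapred RecordReader, including the HiveRecordReader returned above, with the classic createKey/createValue/next loop. Below is a minimal, self-contained sketch of that loop; it uses plain TextInputFormat and a hypothetical input path rather than Hive's wrapped format, so treat it as an illustration of the pattern, not as Hive code.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class ReadLoopSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // hypothetical input directory
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      while (reader.next(key, value)) {
        System.out.println(value); // consume each record
      }
      reader.close();
    }
  }
}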
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class SymlinkTextInputFormat, method getSplits.
/**
 * Parses all target paths from job input directory which contains symlink
 * files, and splits the target data using TextInputFormat.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
  if (symlinksDirs.length == 0) {
    throw new IOException("No input paths specified in job.");
  }
  // Get all target paths first, because the number of total target paths
  // is used to determine number of splits of each target path.
  List<Path> targetPaths = new ArrayList<Path>();
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(job, symlinksDirs, targetPaths, symlinkPaths);
  } catch (Exception e) {
    throw new IOException("Error parsing symlinks from specified job input path.", e);
  }
  if (targetPaths.size() == 0) {
    return new InputSplit[0];
  }
  // The input should be in TextInputFormat.
  TextInputFormat inputFormat = new TextInputFormat();
  JobConf newjob = new JobConf(job);
  newjob.setInputFormat(TextInputFormat.class);
  inputFormat.configure(newjob);
  List<InputSplit> result = new ArrayList<InputSplit>();
  // ceil(numSplits / numPaths), so we can get at least numSplits splits.
  int numPaths = targetPaths.size();
  int numSubSplits = (numSplits + numPaths - 1) / numPaths;
  // For each path, do getSplits().
  for (int i = 0; i < numPaths; ++i) {
    Path targetPath = targetPaths.get(i);
    Path symlinkPath = symlinkPaths.get(i);
    FileInputFormat.setInputPaths(newjob, targetPath);
    InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
    for (InputSplit is : iss) {
      result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
    }
  }
  return result.toArray(new InputSplit[result.size()]);
}
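SymlinkTextInputFormat expects each file under the job's input directory to be a plain text file listing one target data path per line; getSplits() above then asks TextInputFormat for roughly ceil(numSplits / numTargetPaths) splits per listed target. The following sketch is not from the Hive sources and uses made-up paths; it only illustrates writing such a symlink file.

import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteSymlinkFile {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path symlinkFile = new Path("/warehouse/symlinks/part-00000"); // hypothetical symlink file
    try (BufferedWriter out = new BufferedWriter(
        new OutputStreamWriter(fs.create(symlinkFile, true), StandardCharsets.UTF_8))) {
      out.write("/data/logs/2024-01-01/file1.txt"); // one target data file per line
      out.newLine();
      out.write("/data/logs/2024-01-02/file2.txt");
      out.newLine();
    }
    // getSplits() would then split each listed target with TextInputFormat.
  }
}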
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class GenericUDTFGetSplits, method getSplits.
public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema, ApplicationId applicationId) throws IOException {
  DAG dag = DAG.create(work.getName());
  dag.setCredentials(job.getCredentials());
  DagUtils utils = DagUtils.getInstance();
  Context ctx = new Context(job);
  MapWork mapWork = (MapWork) work.getAllWork().get(0);
  // bunch of things get setup in the context based on conf but we need only the MR tmp directory
  // for the following method.
  JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
  // TODO: should we also whitelist input formats here? from mapred.input.format.class
  Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
  FileSystem fs = scratchDir.getFileSystem(job);
  try {
    LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(ctx.getConf()), utils, job);
    LlapCoordinator coordinator = LlapCoordinator.getInstance();
    if (coordinator == null) {
      throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
    }
    // Update the queryId to use the generated applicationId. See comment below about
    // why this is done.
    HiveConf.setVar(wxConf, HiveConf.ConfVars.HIVEQUERYID, applicationId.toString());
    Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, fs, ctx, false, work, work.getVertexType(mapWork), DagUtils.createTezLrMap(appJarLr, null));
    String vertexName = wx.getName();
    dag.addVertex(wx);
    utils.addCredentials(mapWork, dag);
    // we have the dag now proceed to get the splits:
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
    Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
    HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
    List<Event> eventList = splitGenerator.initialize();
    InputSplit[] result = new InputSplit[eventList.size() - 1];
    InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
    List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
    Preconditions.checkState(hints.size() == eventList.size() - 1);
    if (LOG.isDebugEnabled()) {
      LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
    }
    // This assumes LLAP cluster owner is always the HS2 user.
    String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
    String queryUser = null;
    byte[] tokenBytes = null;
    LlapSigner signer = null;
    if (UserGroupInformation.isSecurityEnabled()) {
      signer = coordinator.getLlapSigner(job);
      // 1. Generate the token for query user (applies to all splits).
      queryUser = SessionState.getUserFromAuthenticator();
      if (queryUser == null) {
        queryUser = UserGroupInformation.getCurrentUser().getUserName();
        LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
      }
      LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
      // We put the query user, not LLAP user, into the message and token.
      Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
      LOG.info("Created the token for remote user: {}", token);
      bos.reset();
      token.write(dos);
      tokenBytes = bos.toByteArray();
    } else {
      queryUser = UserGroupInformation.getCurrentUser().getUserName();
    }
    // Generate umbilical token (applies to all splits)
    Token<JobTokenIdentifier> umbilicalToken = JobTokenCreator.createJobToken(applicationId);
    LOG.info("Number of splits: " + (eventList.size() - 1));
    SignedMessage signedSvs = null;
    for (int i = 0; i < eventList.size() - 1; i++) {
      TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
      // 2. Generate the vertex/submit information for all events.
      if (i == 0) {
        // The queryId could either be picked up from the current request being processed, or
        // generated. The current request isn't exactly correct since the query is 'done' once we
        // return the results. Generating a new one has the added benefit of working once this
        // is moved out of a UDTF into a proper API.
        // Setting this to the generated AppId which is unique.
        // Despite the differences in TaskSpec, the vertex spec should be the same.
        signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
      }
      SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature, umbilicalToken);
      byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
      // 3. Generate input event.
      SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
      // 4. Make location hints.
      SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
      result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
    }
    return result;
  } catch (Exception e) {
    throw new IOException(e);
  }
}
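The bos and dos referenced above are fields of the UDTF, a ByteArrayOutputStream wrapped in a DataOutputStream that is reused to freeze Writables such as the LLAP token into byte arrays. Below is a minimal, self-contained sketch of that Writable-to-bytes pattern using only generic Hadoop types; the WritableBytes helper name is invented for illustration.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public final class WritableBytes {
  private WritableBytes() {}

  /** Serialize any Writable (a token, an InputSplit, ...) into a fresh byte array. */
  public static byte[] toBytes(Writable w) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(bos);
    w.write(dos);
    dos.flush();
    return bos.toByteArray();
  }
}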
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class GenericUDTFGetSplits, method process.
@Override
public void process(Object[] arguments) throws HiveException {
  String query = stringOI.getPrimitiveJavaObject(arguments[0]);
  int num = intOI.get(arguments[1]);
  // Generate applicationId for the LLAP splits
  LlapCoordinator coordinator = LlapCoordinator.getInstance();
  if (coordinator == null) {
    throw new HiveException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
  }
  ApplicationId applicationId = coordinator.createExtClientAppId();
  LOG.info("Generated appID {} for LLAP splits", applicationId.toString());
  PlanFragment fragment = createPlanFragment(query, num, applicationId);
  TezWork tezWork = fragment.work;
  Schema schema = fragment.schema;
  try {
    // Serialize each split with Writable.write() and forward it as a single binary column.
    for (InputSplit s : getSplits(jc, num, tezWork, schema, applicationId)) {
      Object[] os = new Object[1];
      bos.reset();
      s.write(dos);
      byte[] frozen = bos.toByteArray();
      os[0] = frozen;
      forward(os);
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
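Each forwarded row carries one split frozen to a byte[] through Writable.write(). Here is a hedged sketch of the reverse step a consumer could perform, assuming the concrete split class is on its classpath and has a no-arg constructor (which Writable implementations such as LlapInputSplit provide); the SplitBytes helper name is invented for illustration.

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public final class SplitBytes {
  private SplitBytes() {}

  /** Rebuild a Writable split of type T from the bytes emitted by the UDTF. */
  public static <T extends Writable> T fromBytes(byte[] frozen, Class<T> splitClass)
      throws IOException, ReflectiveOperationException {
    T split = splitClass.getDeclaredConstructor().newInstance();
    split.readFields(new DataInputStream(new ByteArrayInputStream(frozen)));
    return split;
  }
}

A call would then look like: LlapInputSplit split = SplitBytes.fromBytes(frozen, LlapInputSplit.class);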
Use of org.apache.hadoop.mapred.InputSplit in project apex-malhar by apache.
The class MapOperatorTest, method testNodeProcessingSchema.
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException {
  CollectorTestSink sortSink = new CollectorTestSink();
  oper.output.setSink(sortSink);
  oper.setMapClass(WordCount.Map.class);
  oper.setCombineClass(WordCount.Reduce.class);
  oper.setDirName(testMeta.testDir);
  oper.setConfigFile(null);
  oper.setInputFormatClass(TextInputFormat.class);

  // Build the splits for the test directory and serialize the first one,
  // since the operator reads its split back through Hadoop serialization.
  Configuration conf = new Configuration();
  JobConf jobConf = new JobConf(conf);
  FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(jobConf);
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
  keySerializer.open(oper.getOutstream());
  keySerializer.serialize(splits[0]);
  oper.setInputSplitClass(splits[0].getClass());
  keySerializer.close();

  // Run the operator through two windows and verify the emitted tuples.
  oper.setup(null);
  oper.beginWindow(0);
  oper.emitTuples();
  oper.emitTuples();
  oper.endWindow();
  oper.beginWindow(1);
  oper.emitTuples();
  oper.endWindow();
  Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
  for (Object o : sortSink.collectedTuples) {
    LOG.debug(o.toString());
  }
  LOG.debug("Done testing round\n");
  oper.teardown();
}
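The test relies on Hadoop's SerializationFactory to persist the first split for the operator. Below is a self-contained sketch of that serialize/deserialize round trip, using a mapred FileSplit and an in-memory stream; the path and length are made up for illustration.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.FileSplit;

public class SplitSerializationSketch {
  public static void main(String[] args) throws Exception {
    FileSplit original = new FileSplit(new Path("/tmp/data.txt"), 0, 1024, new String[0]);
    SerializationFactory factory = new SerializationFactory(new Configuration());

    // Serialize the split, mirroring how the test writes it to the operator's stream.
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    Serializer<FileSplit> serializer = factory.getSerializer(FileSplit.class);
    serializer.open(bos);
    serializer.serialize(original);
    serializer.close();

    // Deserialize it back into a fresh instance.
    Deserializer<FileSplit> deserializer = factory.getDeserializer(FileSplit.class);
    deserializer.open(new ByteArrayInputStream(bos.toByteArray()));
    FileSplit copy = deserializer.deserialize(new FileSplit());
    deserializer.close();

    System.out.println(copy.getPath() + " " + copy.getLength());
  }
}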