Example 51 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class HiveInputFormat, method getRecordReader:

@Override
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    // Unwrap the HiveInputSplit to recover the split produced by the underlying input format.
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
        inputFormatClassName = hsplit.inputFormatClassName();
        inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
        throw new IOException("cannot find class " + inputFormatClassName, e);
    }
    if (this.mrwork == null || pathToPartitionInfo == null) {
        init(job);
    }
    boolean nonNative = false;
    // Look up the partition descriptor for this split's path; it carries the table properties used below.
    PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }
    try {
        if ((part != null) && (part.getTableDesc() != null)) {
            Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
            nonNative = part.getTableDesc().isNonNative();
        }
    } catch (HiveException e) {
        throw new IOException(e);
    }
    Path splitPath = hsplit.getPath();
    // Push column projections and filter predicates down to the underlying input format.
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
        inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    // Create the reader from the wrapped input format, then wrap it in a HiveRecordReader below.
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordReader(org.apache.hadoop.mapred.RecordReader) IOException(java.io.IOException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
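
For orientation, here is a minimal, hypothetical sketch (not taken from the Hive sources) of how a caller typically drives a mapred RecordReader such as the one returned above. It assumes the JobConf already points at a Hive-readable input path; Reporter.NULL stands in for a real reporter.

HiveInputFormat<WritableComparable, Writable> format = new HiveInputFormat<WritableComparable, Writable>();
InputSplit[] splits = format.getSplits(job, 1);
RecordReader<WritableComparable, Writable> reader = format.getRecordReader(splits[0], job, Reporter.NULL);
WritableComparable key = reader.createKey();
Writable value = reader.createValue();
while (reader.next(key, value)) {
    // process one record from the underlying storage format
}
reader.close();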

Example 52 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class SymlinkTextInputFormat, method getSplits:

/**
 * Parses all target paths from job input directory which contains symlink
 * files, and splits the target data using TextInputFormat.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
    if (symlinksDirs.length == 0) {
        throw new IOException("No input paths specified in job.");
    }
    // Get all target paths first, because the number of total target paths
    // is used to determine number of splits of each target path.
    List<Path> targetPaths = new ArrayList<Path>();
    List<Path> symlinkPaths = new ArrayList<Path>();
    try {
        getTargetPathsFromSymlinksDirs(job, symlinksDirs, targetPaths, symlinkPaths);
    } catch (Exception e) {
        throw new IOException("Error parsing symlinks from specified job input path.", e);
    }
    if (targetPaths.size() == 0) {
        return new InputSplit[0];
    }
    // The input should be in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    JobConf newjob = new JobConf(job);
    newjob.setInputFormat(TextInputFormat.class);
    inputFormat.configure(newjob);
    List<InputSplit> result = new ArrayList<InputSplit>();
    // ceil(numSplits / numPaths), so we can get at least numSplits splits.
    int numPaths = targetPaths.size();
    int numSubSplits = (numSplits + numPaths - 1) / numPaths;
    // For each path, do getSplits().
    for (int i = 0; i < numPaths; ++i) {
        Path targetPath = targetPaths.get(i);
        Path symlinkPath = symlinkPaths.get(i);
        FileInputFormat.setInputPaths(newjob, targetPath);
        InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
        for (InputSplit is : iss) {
            result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
        }
    }
    return result.toArray(new InputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
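
As a quick check of the split-count arithmetic above (a hypothetical illustration, not part of the Hive code): with numSplits = 10 and three target paths, the integer ceiling division yields four sub-splits per path, so at least 12 splits are produced overall.

int numSplits = 10;
int numPaths = 3;
// Integer ceiling division: (10 + 3 - 1) / 3 = 4 sub-splits per path.
int numSubSplits = (numSplits + numPaths - 1) / numPaths;
// 3 paths * 4 sub-splits = 12 >= 10 requested splits.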

Example 53 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class GenericUDTFGetSplits, method getSplits:

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema, ApplicationId applicationId) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // bunch of things get setup in the context based on conf but we need only the MR tmp directory
    // for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(ctx.getConf()), utils, job);
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // Update the queryId to use the generated applicationId. See comment below about
        // why this is done.
        HiveConf.setVar(wxConf, HiveConf.ConfVars.HIVEQUERYID, applicationId.toString());
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, fs, ctx, false, work, work.getVertexType(mapWork), DagUtils.createTezLrMap(appJarLr, null));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // we have the dag now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        // Generate umbilical token (applies to all splits)
        Token<JobTokenIdentifier> umbilicalToken = JobTokenCreator.createJobToken(applicationId);
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature, umbilicalToken);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) SubmitWorkInfo(org.apache.hadoop.hive.llap.SubmitWorkInfo) LlapTokenIdentifier(org.apache.hadoop.hive.llap.security.LlapTokenIdentifier) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HiveSplitGenerator(org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator) TaskSpecBuilder(org.apache.tez.dag.api.TaskSpecBuilder) LlapSigner(org.apache.hadoop.hive.llap.security.LlapSigner) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapTokenLocalClient(org.apache.hadoop.hive.llap.security.LlapTokenLocalClient) DagUtils(org.apache.hadoop.hive.ql.exec.tez.DagUtils) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Context(org.apache.hadoop.hive.ql.Context) Path(org.apache.hadoop.fs.Path) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) SignedMessage(org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage) JobTokenIdentifier(org.apache.tez.common.security.JobTokenIdentifier) DAG(org.apache.tez.dag.api.DAG) IOException(java.io.IOException) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Event(org.apache.tez.runtime.api.Event) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent)

Example 54 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class GenericUDTFGetSplits, method process:

@Override
public void process(Object[] arguments) throws HiveException {
    String query = stringOI.getPrimitiveJavaObject(arguments[0]);
    int num = intOI.get(arguments[1]);
    // Generate applicationId for the LLAP splits
    LlapCoordinator coordinator = LlapCoordinator.getInstance();
    if (coordinator == null) {
        throw new HiveException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
    }
    ApplicationId applicationId = coordinator.createExtClientAppId();
    LOG.info("Generated appID {} for LLAP splits", applicationId.toString());
    PlanFragment fragment = createPlanFragment(query, num, applicationId);
    TezWork tezWork = fragment.work;
    Schema schema = fragment.schema;
    try {
        for (InputSplit s : getSplits(jc, num, tezWork, schema, applicationId)) {
            Object[] os = new Object[1];
            bos.reset();
            s.write(dos);
            byte[] frozen = bos.toByteArray();
            os[0] = frozen;
            forward(os);
        }
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Schema(org.apache.hadoop.hive.llap.Schema) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) IOException(java.io.IOException) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
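
Each row forwarded by process() is simply the Writable serialization of one LlapInputSplit. A minimal, hypothetical sketch of the read side (assuming LlapInputSplit's no-argument constructor, as Writable implementations require, and using java.io streams over the byte[] produced above):

static LlapInputSplit readSplit(byte[] frozen) throws IOException {
    // 'frozen' is the byte[] produced for one row by the process() method above.
    LlapInputSplit split = new LlapInputSplit();
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(frozen));
    try {
        split.readFields(in); // standard Writable round-trip of what s.write(dos) produced
    } finally {
        in.close();
    }
    return split;
}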

Example 55 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project apex-malhar by apache.

The class MapOperatorTest, method testNodeProcessingSchema:

public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException {
    CollectorTestSink sortSink = new CollectorTestSink();
    oper.output.setSink(sortSink);
    oper.setMapClass(WordCount.Map.class);
    oper.setCombineClass(WordCount.Reduce.class);
    oper.setDirName(testMeta.testDir);
    oper.setConfigFile(null);
    oper.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);
    FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);
    // Compute a single split over the test directory and hand it to the operator in
    // serialized form, the same way the engine would supply it.
    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    keySerializer.open(oper.getOutstream());
    keySerializer.serialize(splits[0]);
    oper.setInputSplitClass(splits[0].getClass());
    keySerializer.close();
    oper.setup(null);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.emitTuples();
    oper.endWindow();
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
    for (Object o : sortSink.collectedTuples) {
        LOG.debug(o.toString());
    }
    LOG.debug("Done testing round\n");
    oper.teardown();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) SerializationFactory(org.apache.hadoop.io.serializer.SerializationFactory) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink) Serializer(org.apache.hadoop.io.serializer.Serializer)
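
The serialization in the test has a straightforward counterpart on the read side. A minimal, hypothetical sketch (not part of the test) using the same org.apache.hadoop.io.serializer API, where splitClass and splitStream stand for the split class and an InputStream over the bytes the Serializer wrote:

SerializationFactory factory = new SerializationFactory(new Configuration());
Deserializer<InputSplit> deserializer = (Deserializer<InputSplit>) factory.getDeserializer(splitClass); // unchecked cast for the sketch
deserializer.open(splitStream);
InputSplit restored = deserializer.deserialize(null); // null asks the deserializer to allocate a new instance
deserializer.close();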

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161 usages
Path (org.apache.hadoop.fs.Path): 57 usages
JobConf (org.apache.hadoop.mapred.JobConf): 56 usages
Test (org.junit.Test): 49 usages
IOException (java.io.IOException): 47 usages
ArrayList (java.util.ArrayList): 29 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27 usages
FileSplit (org.apache.hadoop.mapred.FileSplit): 24 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 21 usages
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21 usages
InputFormat (org.apache.hadoop.mapred.InputFormat): 19 usages
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19 usages
NullWritable (org.apache.hadoop.io.NullWritable): 18 usages
Text (org.apache.hadoop.io.Text): 18 usages
Configuration (org.apache.hadoop.conf.Configuration): 14 usages
LongWritable (org.apache.hadoop.io.LongWritable): 11 usages
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10 usages
Properties (java.util.Properties): 9 usages
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9 usages
HashMap (java.util.HashMap): 8 usages