
Example 1 with LlapInputSplit

use of org.apache.hadoop.hive.llap.LlapInputSplit in project hive by apache.

the class GenericUDTFGetSplits method getSplits.

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // bunch of things get setup in the context based on conf but we need only the MR tmp directory
    // for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // we have the dag now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // See the discussion in the implementation as to why we generate app ID.
        ApplicationId applicationId = coordinator.createExtClientAppId();
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            // bos and dos are ByteArrayOutputStream/DataOutputStream fields of the
            // enclosing GenericUDTFGetSplits class, reused here to serialize the token.
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) SubmitWorkInfo(org.apache.hadoop.hive.llap.SubmitWorkInfo) LlapTokenIdentifier(org.apache.hadoop.hive.llap.security.LlapTokenIdentifier) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HiveSplitGenerator(org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator) TaskSpecBuilder(org.apache.tez.dag.api.TaskSpecBuilder) LlapSigner(org.apache.hadoop.hive.llap.security.LlapSigner) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapTokenLocalClient(org.apache.hadoop.hive.llap.security.LlapTokenLocalClient) DagUtils(org.apache.hadoop.hive.ql.exec.tez.DagUtils) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Context(org.apache.hadoop.hive.ql.Context) Path(org.apache.hadoop.fs.Path) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) SignedMessage(org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage) DAG(org.apache.tez.dag.api.DAG) IOException(java.io.IOException) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) CommandNeedRetryException(org.apache.hadoop.hive.ql.CommandNeedRetryException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Event(org.apache.tez.runtime.api.Event) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)
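
getSplits() above runs inside HiveServer2 (note the LlapCoordinator check). From an external client, the same splits are normally requested through LlapBaseInputFormat, which submits the split-generation request over JDBC and hands back the serialized LlapInputSplits. A minimal client-side sketch, assuming the documented llap.if.* JobConf keys; the connection URL, user, and query are placeholders.

import org.apache.hadoop.hive.llap.LlapBaseInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class GetLlapSplitsSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // Hypothetical connection settings; point these at an HS2 instance that has
        // the LLAP coordinator enabled (ConfVars.LLAP_HS2_ENABLE_COORDINATOR).
        job.set("llap.if.hs2.connection", "jdbc:hive2://hs2-host:10000/default");
        job.set("llap.if.query", "select * from some_table");
        job.set("llap.if.user", "hive");
        job.set("llap.if.pwd", "");
        // This triggers split generation in HS2, which ends up in the
        // GenericUDTFGetSplits.getSplits() method shown above.
        InputSplit[] splits = new LlapBaseInputFormat().getSplits(job, 1);
        System.out.println("Fetched " + splits.length + " LLAP splits");
    }
}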

Example 2 with LlapInputSplit

use of org.apache.hadoop.hive.llap.LlapInputSplit in project hive by apache.

the class LlapRowInputFormat method getRecordReader.

@Override
public RecordReader<NullWritable, Row> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    LlapInputSplit llapSplit = (LlapInputSplit) split;
    LlapBaseRecordReader<Text> reader = (LlapBaseRecordReader<Text>) baseInputFormat.getRecordReader(llapSplit, job, reporter);
    return new LlapRowRecordReader(job, reader.getSchema(), reader);
}
Also used : LlapRowRecordReader(org.apache.hadoop.hive.llap.LlapRowRecordReader) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) Text(org.apache.hadoop.io.Text) LlapBaseRecordReader(org.apache.hadoop.hive.llap.LlapBaseRecordReader)
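
A short usage sketch for the reader above; it assumes job already carries the llap.if.* connection settings and split is one of the LlapInputSplits returned by getSplits(), as in Example 1.

import java.io.IOException;

import org.apache.hadoop.hive.llap.LlapRowInputFormat;
import org.apache.hadoop.hive.llap.Row;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class ReadLlapSplitSketch {
    // Reads every row of one split through the getRecordReader() path above.
    static void readSplit(InputSplit split, JobConf job) throws IOException {
        LlapRowInputFormat inputFormat = new LlapRowInputFormat();
        RecordReader<NullWritable, Row> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
        try {
            NullWritable key = reader.createKey();
            Row value = reader.createValue();
            while (reader.next(key, value)) {
                // Each Row exposes the columns described by the Schema carried in the split.
                System.out.println(value);
            }
        } finally {
            reader.close();
        }
    }
}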

Example 3 with LlapInputSplit

use of org.apache.hadoop.hive.llap.LlapInputSplit in project hive by apache.

the class TestLlapInputSplit method testWritable.

@Test
public void testWritable() throws Exception {
    int splitNum = 88;
    byte[] planBytes = "0123456789987654321".getBytes();
    byte[] fragmentBytes = "abcdefghijklmnopqrstuvwxyz".getBytes();
    SplitLocationInfo[] locations = { new SplitLocationInfo("location1", false), new SplitLocationInfo("location2", false) };
    ArrayList<FieldDesc> colDescs = new ArrayList<FieldDesc>();
    colDescs.add(new FieldDesc("col1", new TypeDesc(TypeDesc.Type.STRING)));
    colDescs.add(new FieldDesc("col2", new TypeDesc(TypeDesc.Type.INT)));
    Schema schema = new Schema(colDescs);
    byte[] tokenBytes = new byte[] { 1 };
    LlapInputSplit split1 = new LlapInputSplit(splitNum, planBytes, fragmentBytes, null, locations, schema, "hive", tokenBytes);
    ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
    DataOutputStream dataOut = new DataOutputStream(byteOutStream);
    split1.write(dataOut);
    ByteArrayInputStream byteInStream = new ByteArrayInputStream(byteOutStream.toByteArray());
    DataInputStream dataIn = new DataInputStream(byteInStream);
    LlapInputSplit split2 = new LlapInputSplit();
    split2.readFields(dataIn);
    // Did we read all the data?
    assertEquals(0, byteInStream.available());
    checkLlapSplits(split1, split2);
}
Also used : SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) DataOutputStream(java.io.DataOutputStream) Schema(org.apache.hadoop.hive.llap.Schema) ArrayList(java.util.ArrayList) TypeDesc(org.apache.hadoop.hive.llap.TypeDesc) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataInputStream(java.io.DataInputStream) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) ByteArrayInputStream(java.io.ByteArrayInputStream) FieldDesc(org.apache.hadoop.hive.llap.FieldDesc) Test(org.junit.Test)
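
checkLlapSplits() is a helper defined elsewhere in TestLlapInputSplit and is not shown on this page. A hypothetical stand-in, meant to sit in the same test class, that exercises the same Writable round trip without relying on LlapInputSplit's individual accessors:

// Compares the two splits by re-serializing them and checking the raw bytes match.
static void checkLlapSplits(LlapInputSplit expected, LlapInputSplit actual) throws IOException {
    assertArrayEquals(splitBytes(expected), splitBytes(actual));
}

static byte[] splitBytes(LlapInputSplit split) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    split.write(new DataOutputStream(bytes));
    return bytes.toByteArray();
}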

Aggregations

LlapInputSplit (org.apache.hadoop.hive.llap.LlapInputSplit): 3
SplitLocationInfo (org.apache.hadoop.mapred.SplitLocationInfo): 2
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
DataInputStream (java.io.DataInputStream): 1
DataOutputStream (java.io.DataOutputStream): 1
FileNotFoundException (java.io.FileNotFoundException): 1
IOException (java.io.IOException): 1
URISyntaxException (java.net.URISyntaxException): 1
ArrayList (java.util.ArrayList): 1
LoginException (javax.security.auth.login.LoginException): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
FieldDesc (org.apache.hadoop.hive.llap.FieldDesc): 1
LlapBaseRecordReader (org.apache.hadoop.hive.llap.LlapBaseRecordReader): 1
LlapRowRecordReader (org.apache.hadoop.hive.llap.LlapRowRecordReader): 1
Schema (org.apache.hadoop.hive.llap.Schema): 1
SubmitWorkInfo (org.apache.hadoop.hive.llap.SubmitWorkInfo): 1
TypeDesc (org.apache.hadoop.hive.llap.TypeDesc): 1
LlapCoordinator (org.apache.hadoop.hive.llap.coordinator.LlapCoordinator): 1