Example 36 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class HiveAccumuloTableInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
    final Instance instance = accumuloParams.getInstance();
    final ColumnMapper columnMapper;
    try {
        columnMapper = getColumnMapper(jobConf);
    } catch (TooManyAccumuloColumnsException e) {
        throw new IOException(e);
    }
    JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
    Path[] tablePaths = FileInputFormat.getInputPaths(context);
    try {
        Connector connector = null;
        // Need to get a Connector so we can look up the user's authorizations if they were not otherwise specified
        if (accumuloParams.useSasl()) {
            log.info("Current user: " + UserGroupInformation.getCurrentUser());
            // In a YARN/Tez job we no longer have the Kerberos credentials, so use the delegation token instead
            AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
            if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
                // Convert the stub from the configuration back into a normal Token
                log.info("Found authentication token in Configuration: " + token);
                log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
                AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
                log.info("Converted authentication token from Configuration into: " + unwrappedToken);
                // unwrapAuthenticationToken will return back the original token (which we know is insufficient) if it could not unwrap the stub
                if (unwrappedToken != token) {
                    log.info("Creating Accumulo Connector with unwrapped delegation token");
                    connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
                } else {
                    log.info("Job credentials did not contain delegation token, fetching new token");
                }
            }
            if (connector == null) {
                log.info("Obtaining Accumulo Connector using KerberosToken");
                // Construct a KerberosToken -- relies on ProxyUser configuration: the connection is made as the client
                // user on top of HS2's Kerberos credentials, so Accumulo must have the proper proxy-user auth configs.
                connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
            }
        } else {
            // Still in the local JVM, use the username+password or Kerberos credentials
            connector = accumuloParams.getConnector(instance);
        }
        final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
        final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
        final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
        // An empty (but non-null) collection of ranges would make Accumulo scan the entire table. We don't want that.
        if (null != ranges && ranges.isEmpty()) {
            return new InputSplit[0];
        }
        // Set the relevant information in the Configuration for the AccumuloInputFormat
        configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
        int numColumns = columnMappings.size();
        List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
        // Sanity check
        if (numColumns < readColIds.size())
            throw new IOException("Number of column mappings (" + numColumns + ")" + " numbers less than the hive table columns. (" + readColIds.size() + ")");
        // get splits from Accumulo
        InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
        HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            RangeInputSplit ris = (RangeInputSplit) splits[i];
            ris.setLogLevel(Level.DEBUG);
            hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
        }
        return hiveSplits;
    } catch (AccumuloException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (AccumuloSecurityException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (SerDeException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    }
}
Also used : Connector(org.apache.accumulo.core.client.Connector) AuthenticationToken(org.apache.accumulo.core.client.security.tokens.AuthenticationToken) MockInstance(org.apache.accumulo.core.client.mock.MockInstance) Instance(org.apache.accumulo.core.client.Instance) RangeInputSplit(org.apache.accumulo.core.client.mapred.RangeInputSplit) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) JobContext(org.apache.hadoop.mapreduce.JobContext) RangeInputSplit(org.apache.accumulo.core.client.mapred.RangeInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) HiveAccumuloMapColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) HiveAccumuloColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Path(org.apache.hadoop.fs.Path) AccumuloException(org.apache.accumulo.core.client.AccumuloException) KerberosToken(org.apache.accumulo.core.client.security.tokens.KerberosToken) IOException(java.io.IOException) TooManyAccumuloColumnsException(org.apache.hadoop.hive.accumulo.serde.TooManyAccumuloColumnsException) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) AccumuloConnectionParameters(org.apache.hadoop.hive.accumulo.AccumuloConnectionParameters) ColumnMapper(org.apache.hadoop.hive.accumulo.columns.ColumnMapper)
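
The ShimLoader call above (newJobContext) exists only to bridge the old-API JobConf into a new-API JobContext so that FileInputFormat can resolve the configured input paths. Below is a minimal sketch of that bridge using only stock Hadoop classes and no Hive shims; the class and method names are illustrative, not part of Hive.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class JobContextBridgeSketch {

    // Wrap an old-API JobConf in a Job (which implements JobContext) and read the
    // configured input paths through the new-API FileInputFormat helper.
    public static Path[] inputPathsFromJobConf(JobConf jobConf) throws IOException {
        JobContext context = Job.getInstance(jobConf);
        return FileInputFormat.getInputPaths(context);
    }
}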

Example 37 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class HiveHBaseTableOutputFormat method checkOutputSpecs.

@Override
public void checkOutputSpecs(FileSystem fs, JobConf jc) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jc);
    }
    String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME);
    jc.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName);
    Job job = new Job(jc);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    try {
        checkOutputSpecs(jobContext);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
Also used : JobContext(org.apache.hadoop.mapreduce.JobContext) IOException(java.io.IOException) Job(org.apache.hadoop.mapreduce.Job)
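
The same JobConf-to-JobContext bridge is used here on the output side: the old-API checkOutputSpecs(FileSystem, JobConf) wraps the configuration and delegates to the new-API check. A hedged sketch of that adapter follows, with the OutputFormat passed in explicitly instead of being the HBase one, and with hypothetical class and method names.

import java.io.IOException;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;

public class CheckOutputSpecsAdapterSketch {

    // Delegate an old-API output-spec check to a new-API OutputFormat by wrapping the
    // JobConf in a Job, which doubles as the required JobContext.
    public static <K, V> void check(JobConf jc, OutputFormat<K, V> newApiFormat) throws IOException {
        JobContext jobContext = Job.getInstance(jc);
        try {
            newApiFormat.checkOutputSpecs(jobContext);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }
}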

Example 38 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class TestRCFileMapReduceInputFormat method writeThenReadByRecordReader.

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat = new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
    Configuration jonconf = new Configuration(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jonconf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splitNumber, splits.size());
    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(jonconf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RCFile(org.apache.hadoop.hive.ql.io.RCFile) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
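
Stripped of the RCFile and shim specifics, the test reduces to a common pattern: use a Job as the JobContext for getSplits(), then read each split back through a TaskAttemptContext. A condensed sketch of that pattern with the stock TextInputFormat and TaskAttemptContextImpl; the class name and helper are illustrative.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SplitReadBackSketch {

    // Count the records under inputDir by computing splits through a Job (acting as the
    // JobContext) and reading each split with its own TaskAttemptContext.
    public static long countRecords(Configuration conf, Path inputDir) throws IOException, InterruptedException {
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, inputDir);
        TextInputFormat inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);
        long readCount = 0;
        for (InputSplit split : splits) {
            TaskAttemptContext tac =
                new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(split, tac);
            rr.initialize(split, tac);
            while (rr.nextKeyValue()) {
                readCount++;
            }
            rr.close();
        }
        return readCount;
    }
}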

Example 39 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class HCatBaseInputFormat method createRecordReader.

/**
 * Create the RecordReader for the given InputSplit. Returns the underlying
 * RecordReader if the required operations are supported and schema matches
 * with HCatTable schema. Returns an HCatRecordReader if operations need to
 * be implemented in HCat.
 * @param split the split
 * @param taskContext the task attempt context
 * @return the record reader instance, either an HCatRecordReader(later) or
 *         the underlying storage handler's RecordReader
 * @throws IOException or InterruptedException
 */
@Override
public RecordReader<WritableComparable, HCatRecord> createRecordReader(InputSplit split, TaskAttemptContext taskContext) throws IOException, InterruptedException {
    HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
    PartInfo partitionInfo = hcatSplit.getPartitionInfo();
    // Ensure PartInfo's TableInfo is initialized.
    if (partitionInfo.getTableInfo() == null) {
        partitionInfo.setTableInfo(HCatUtil.getLastInputJobInfosFromConf(taskContext.getConfiguration()).getTableInfo());
    }
    JobContext jobContext = taskContext;
    Configuration conf = jobContext.getConfiguration();
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, partitionInfo);
    JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
    Map<String, String> jobProperties = partitionInfo.getJobProperties();
    HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
    Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(getOutputSchema(conf), partitionInfo);
    return new HCatRecordReader(storageHandler, valuesNotInDataCols);
}
Also used : HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) Configuration(org.apache.hadoop.conf.Configuration) JobContext(org.apache.hadoop.mapreduce.JobContext) JobConf(org.apache.hadoop.mapred.JobConf)
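
The assignment JobContext jobContext = taskContext compiles because TaskAttemptContext extends JobContext, so a task-level context can be handed to any code that only needs job-level settings. A small sketch of that narrowing, with a hypothetical helper that mirrors, but does not reproduce, HCatUtil.getJobConfFromContext.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class ContextNarrowingSketch {

    // Widen a task context to a JobContext (implicit upcast) and copy its settings
    // into an old-API JobConf for code that still expects one.
    public static JobConf toJobConf(TaskAttemptContext taskContext) {
        JobContext jobContext = taskContext;
        Configuration conf = jobContext.getConfiguration();
        return new JobConf(conf);
    }
}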

Example 40 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class FileOutputCommitterContainer method discoverPartitions.

/**
 * Run to discover dynamic partitions available
 */
private void discoverPartitions(JobContext context) throws IOException {
    if (!partitionsDiscovered) {
        // LOG.info("discover ptns called");
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
        harProcessor.setEnabled(jobInfo.getHarRequested());
        List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
        int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();
        Path loadPath = new Path(jobInfo.getLocation());
        FileSystem fs = loadPath.getFileSystem(context.getConfiguration());
        // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
        String dynPathSpec = loadPath.toUri().getPath();
        dynPathSpec = dynPathSpec.replace("__HIVE_DEFAULT_PARTITION__", "*");
        // LOG.info("Searching for "+dynPathSpec);
        Path pathPattern = new Path(dynPathSpec);
        FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
        partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
        contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();
        if (status.length == 0) {
        // LOG.warn("No partition found genereated by dynamic partitioning in ["
        // +loadPath+"] with depth["+jobInfo.getTable().getPartitionKeysSize()
        // +"], dynSpec["+dynPathSpec+"]");
        } else {
            if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
                this.partitionsDiscovered = true;
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
            }
            for (FileStatus st : status) {
                LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
                if (!customDynamicLocationUsed) {
                    Warehouse.makeSpecFromName(fullPartSpec, st.getPath(), null);
                } else {
                    HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo, st.getPath().toString());
                }
                partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
                JobConf jobConf = (JobConf) context.getConfiguration();
                JobContext currContext = HCatMapRedUtil.createJobContext(jobConf, context.getJobID(), InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf, ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
                HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
                contextDiscoveredByPath.put(st.getPath().toString(), currContext);
            }
        }
        // for (Entry<String,Map<String,String>> spec : partitionsDiscoveredByPath.entrySet()){
        // LOG.info("Partition "+ spec.getKey());
        // for (Entry<String,String> e : spec.getValue().entrySet()){
        // LOG.info(e.getKey() + "=>" +e.getValue());
        // }
        // }
        this.partitionsDiscovered = true;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HCatException(org.apache.hive.hcatalog.common.HCatException) LinkedHashMap(java.util.LinkedHashMap) FileSystem(org.apache.hadoop.fs.FileSystem) JobContext(org.apache.hadoop.mapreduce.JobContext) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) JobConf(org.apache.hadoop.mapred.JobConf)
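
Each discovered partition directory gets its own JobContext so that its output storage handler can be configured independently of the parent job. A shim-free sketch of that idea using JobContextImpl directly; the configuration key and class name below are invented for illustration and are not defined by HCatalog.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class PerPartitionContextSketch {

    // Clone the parent job's Configuration, point it at a single partition directory,
    // and wrap it in a JobContextImpl keyed by the same JobID.
    public static JobContext contextForPartition(JobContext parent, String partitionPath) {
        Configuration partConf = new Configuration(parent.getConfiguration());
        // Illustrative key only; real HCatalog configuration carries its own serialized job info.
        partConf.set("example.partition.output.location", partitionPath);
        return new JobContextImpl(partConf, parent.getJobID());
    }
}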

Aggregations

JobContext (org.apache.hadoop.mapreduce.JobContext)85
Configuration (org.apache.hadoop.conf.Configuration)41
Job (org.apache.hadoop.mapreduce.Job)35
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)34
Test (org.junit.Test)31
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl)29
InputSplit (org.apache.hadoop.mapreduce.InputSplit)28
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl)25
Path (org.apache.hadoop.fs.Path)24
IOException (java.io.IOException)22
File (java.io.File)19
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID)16
ArrayList (java.util.ArrayList)13
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter)11
JobConf (org.apache.hadoop.mapred.JobConf)10
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter)10
LongWritable (org.apache.hadoop.io.LongWritable)9
MapFile (org.apache.hadoop.io.MapFile)9
JobID (org.apache.hadoop.mapreduce.JobID)7
FileSystem (org.apache.hadoop.fs.FileSystem)6