Example 81 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class HiveHBaseTableInputFormat method getSplitsInternal.

private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    Connection conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
    TableName tableName = TableName.valueOf(hbaseTableName);
    initializeTable(conn, tableName);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
    try {
        if (hbaseColumnsMapping == null) {
            throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
        }
        ColumnMappings columnMappings = null;
        try {
            columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        } catch (SerDeException e) {
            throw new IOException(e);
        }
        int iKey = columnMappings.getKeyIndex();
        int iTimestamp = columnMappings.getTimestampIndex();
        ColumnMapping keyMapping = columnMappings.getKeyMapping();
        // Take filter pushdown into account while calculating splits; this
        // allows us to prune off regions immediately.  Note that although
        // the Javadoc for the superclass getSplits says that it returns one
        // split per region, the implementation actually takes the scan
        // definition into account and excludes regions which don't satisfy
        // the start/stop row conditions (HBASE-1829).
        Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));
        // The list of families that have been added to the scan
        List<String> addedFamilies = new ArrayList<String>();
        // same as in getRecordReader?
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    // add the column only if the family has not already been added
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
        }
        setScan(scan);
        Job job = new Job(jobConf);
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
        Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
        List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
        InputSplit[] results = new InputSplit[splits.size()];
        for (int i = 0; i < splits.size(); i++) {
            results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
        }
        return results;
    } finally {
        closeTable();
        conn.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Connection(org.apache.hadoop.hbase.client.Connection) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TableName(org.apache.hadoop.hbase.TableName) TableSplit(org.apache.hadoop.hbase.mapreduce.TableSplit) Scan(org.apache.hadoop.hbase.client.Scan) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ColumnMapping(org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping)
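
The JobContext plumbing in this example boils down to: wrap the mapred-era JobConf in a mapreduce Job, derive a JobContext from it (here through Hive's ShimLoader), and hand that context to FileInputFormat and super.getSplits. Below is a minimal standalone sketch of the same pattern, assuming a plain Hadoop 2.x classpath; the class and method names are illustrative, and JobContextImpl stands in for the shim call.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class JobContextFromJobConf {

    // Illustrative helper: build a mapreduce JobContext from a mapred JobConf
    // and resolve the input paths configured on it.
    public static Path[] inputPathsOf(JobConf jobConf) throws IOException {
        // Job.getInstance copies the JobConf into a mapreduce-era configuration.
        Job job = Job.getInstance(jobConf);
        // Stands in for ShimLoader.getHadoopShims().newJobContext(job) in the example above.
        JobContext jobContext = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Returns whatever input directories were set on the configuration.
        return FileInputFormat.getInputPaths(jobContext);
    }
}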

Example 82 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class DruidQueryBasedInputFormat method getInputSplits.

protected HiveDruidSplit[] getInputSplits(Configuration conf) throws IOException {
    String address = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
    String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID);
    if (StringUtils.isEmpty(address)) {
        throw new IOException("Druid broker address not specified in configuration");
    }
    String druidQuery = StringEscapeUtils.unescapeJava(conf.get(Constants.DRUID_QUERY_JSON));
    String druidQueryType;
    if (StringUtils.isEmpty(druidQuery)) {
        // Empty, maybe because CBO did not run; we fall back to
        // full Select query
        LOG.warn("Druid query is empty; creating Select query");
        String dataSource = conf.get(Constants.DRUID_DATA_SOURCE);
        if (dataSource == null || dataSource.isEmpty()) {
            throw new IOException("Druid data source cannot be empty or null");
        }
        druidQuery = DruidStorageHandlerUtils.createScanAllQuery(dataSource, Utilities.getColumnNames(conf));
        druidQueryType = Query.SCAN;
        conf.set(Constants.DRUID_QUERY_TYPE, druidQueryType);
    } else {
        druidQueryType = conf.get(Constants.DRUID_QUERY_TYPE);
        if (druidQueryType == null) {
            throw new IOException("Druid query type not recognized");
        }
    }
    // Add Hive Query ID to Druid Query
    if (queryId != null) {
        druidQuery = withQueryId(druidQuery, queryId);
    }
    // hive depends on FileSplits
    Job job = Job.getInstance(conf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    // Then, create splits with the Druid queries.
    switch(druidQueryType) {
        case Query.TIMESERIES:
        case Query.TOPN:
        case Query.GROUP_BY:
            return new HiveDruidSplit[] { new HiveDruidSplit(druidQuery, paths[0], new String[] { address }) };
        case Query.SCAN:
            ScanQuery scanQuery = DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, ScanQuery.class);
            return distributeScanQuery(address, scanQuery, paths[0]);
        default:
            throw new IOException("Druid query type not recognized");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ScanQuery(org.apache.druid.query.scan.ScanQuery) IOException(java.io.IOException) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job)
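
The switch at the end wraps every Druid query into HiveDruidSplit objects because, as the comment notes, Hive's planner expects FileSplits even though no files are actually read. The sketch below illustrates the idea with an invented class (it is not the real HiveDruidSplit): a FileSplit subclass that carries a query string and serializes it alongside a placeholder path.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical split type: looks like a FileSplit to Hive, but really carries a query.
public class QueryCarryingSplit extends FileSplit {

    private String query = "";

    // No-arg constructor is required for Writable deserialization.
    public QueryCarryingSplit() {
        super();
    }

    public QueryCarryingSplit(String query, Path dummyPath, String[] hosts) {
        // Zero start/length: the path is only a placeholder for file-based planning.
        super(dummyPath, 0, 0, hosts);
        this.query = query;
    }

    public String getQuery() {
        return query;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        super.write(out);
        Text.writeString(out, query);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        super.readFields(in);
        query = Text.readString(in);
    }
}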

Example 83 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class FileOutputCommitterContainer method abortJob.

@Override
public void abortJob(JobContext jobContext, State state) throws IOException {
    try {
        if (dynamicPartitioningUsed) {
            discoverPartitions(jobContext);
        }
        org.apache.hadoop.mapred.JobContext mapRedJobContext = HCatMapRedUtil.createJobContext(jobContext);
        if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
            getBaseOutputCommitter().abortJob(mapRedJobContext, state);
        } else if (dynamicPartitioningUsed) {
            for (JobContext currContext : contextDiscoveredByPath.values()) {
                try {
                    new JobConf(currContext.getConfiguration()).getOutputCommitter().abortJob(currContext, state);
                } catch (Exception e) {
                    throw new IOException(e);
                }
            }
        }
        Path src;
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
        Path tblPath = new Path(jobInfo.getTableInfo().getTableLocation());
        if (dynamicPartitioningUsed) {
            if (!customDynamicLocationUsed) {
                src = new Path(getPartitionRootLocation(jobInfo.getLocation(), jobInfo.getTableInfo().getTable().getPartitionKeysSize()));
            } else {
                src = new Path(getCustomPartitionRootLocation(jobInfo, jobContext.getConfiguration()));
            }
        } else {
            src = new Path(jobInfo.getLocation());
        }
        FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
        // Note: fs.delete will fail on Windows. The reason is that in OutputCommitter,
        // Hadoop is still writing to _logs/history. On Linux the OS does not care that a file
        // is still open and removes the directory anyway, but Windows refuses to remove a
        // directory containing open files. So on Windows we leave the output directory
        // behind when the job fails; the user has to remove it manually.
        LOG.info("Job failed. Try cleaning up temporary directory [{}].", src);
        if (!src.equals(tblPath)) {
            fs.delete(src, true);
        }
    } finally {
        cancelDelegationTokens(jobContext);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) JobContext(org.apache.hadoop.mapreduce.JobContext) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) HCatException(org.apache.hive.hcatalog.common.HCatException) TException(org.apache.thrift.TException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException)
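
The cleanup half of abortJob follows a common committer pattern: delegate the abort to the wrapped committer(s), then delete the job's scratch directory, taking care never to delete the table location itself. A minimal sketch of that pattern follows; the class and field names are illustrative rather than HCatalog's, and it is not a drop-in replacement for FileOutputCommitterContainer.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical committer: delegate everything, but clean up scratch output on abort.
public class CleanupOnAbortCommitter extends OutputCommitter {

    private final OutputCommitter delegate;
    private final Path scratchDir;
    private final Path tableRoot;

    public CleanupOnAbortCommitter(OutputCommitter delegate, Path scratchDir, Path tableRoot) {
        this.delegate = delegate;
        this.scratchDir = scratchDir;
        this.tableRoot = tableRoot;
    }

    @Override
    public void abortJob(JobContext context, JobStatus.State state) throws IOException {
        try {
            delegate.abortJob(context, state);
        } finally {
            // Never delete the table location itself, only the job's scratch directory.
            if (!scratchDir.equals(tableRoot)) {
                FileSystem fs = scratchDir.getFileSystem(context.getConfiguration());
                fs.delete(scratchDir, true);
            }
        }
    }

    // The remaining OutputCommitter methods simply forward to the delegate.
    @Override public void setupJob(JobContext context) throws IOException { delegate.setupJob(context); }
    @Override public void setupTask(TaskAttemptContext context) throws IOException { delegate.setupTask(context); }
    @Override public boolean needsTaskCommit(TaskAttemptContext context) throws IOException { return delegate.needsTaskCommit(context); }
    @Override public void commitTask(TaskAttemptContext context) throws IOException { delegate.commitTask(context); }
    @Override public void abortTask(TaskAttemptContext context) throws IOException { delegate.abortTask(context); }
}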

Example 84 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class FileOutputCommitterContainer method commitJob.

@Override
public void commitJob(JobContext jobContext) throws IOException {
    if (dynamicPartitioningUsed) {
        discoverPartitions(jobContext);
        // commit the output of each dynamically discovered partition directory
        for (JobContext context : contextDiscoveredByPath.values()) {
            new JobConf(context.getConfiguration()).getOutputCommitter().commitJob(context);
        }
    }
    if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
        getBaseOutputCommitter().commitJob(HCatMapRedUtil.createJobContext(jobContext));
    }
    registerPartitions(jobContext);
    // create _SUCCESS FILE if so requested.
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
    if (getOutputDirMarking(jobContext.getConfiguration())) {
        Path outputPath = new Path(jobInfo.getLocation());
        FileSystem fileSys = outputPath.getFileSystem(jobContext.getConfiguration());
        // create a file in the folder to mark it
        if (fileSys.exists(outputPath)) {
            Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
            if (!fileSys.exists(filePath)) {
                // may have been created by baseCommitter.commitJob()
                fileSys.create(filePath).close();
            }
        }
    }
    // Commit has succeeded (since no exceptions have been thrown.)
    // Safe to cancel delegation tokens now.
    cancelDelegationTokens(jobContext);
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) JobContext(org.apache.hadoop.mapreduce.JobContext) JobConf(org.apache.hadoop.mapred.JobConf)
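
The _SUCCESS handling at the end is a small, self-contained pattern: if the output directory exists and the marker is not already there (a base committer may have written it), create an empty marker file. A standalone sketch of just that step follows; the helper name is invented and the HCatalog-specific getOutputDirMarking check is not reproduced.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;

public final class SuccessMarker {

    // Conventional marker name; Hadoop's FileOutputCommitter uses the same file name.
    private static final String SUCCEEDED_FILE_NAME = "_SUCCESS";

    private SuccessMarker() {
    }

    // Creates an empty _SUCCESS file in outputPath if the directory exists.
    public static void markIfPresent(JobContext jobContext, Path outputPath) throws IOException {
        FileSystem fs = outputPath.getFileSystem(jobContext.getConfiguration());
        if (fs.exists(outputPath)) {
            Path marker = new Path(outputPath, SUCCEEDED_FILE_NAME);
            // Idempotent: a base committer may already have created the marker.
            if (!fs.exists(marker)) {
                fs.create(marker).close();
            }
        }
    }
}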

Example 85 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project hive by apache.

the class Hadoop23Shims method getCombineFileInputFormat.

@Override
public HadoopShims.CombineFileInputFormatShim getCombineFileInputFormat() {
    return new CombineFileInputFormatShim() {

        @Override
        public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
            throw new IOException("CombineFileInputFormat.getRecordReader not needed.");
        }

        @Override
        protected List<FileStatus> listStatus(JobContext job) throws IOException {
            List<FileStatus> result = super.listStatus(job);
            Iterator<FileStatus> it = result.iterator();
            while (it.hasNext()) {
                FileStatus stat = it.next();
                if (!stat.isFile() || (stat.getLen() == 0 && !stat.getPath().toUri().getScheme().equals("nullscan"))) {
                    it.remove();
                }
            }
            return result;
        }
    };
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) HdfsFileStatus(org.apache.hadoop.hdfs.protocol.HdfsFileStatus) HdfsLocatedFileStatus(org.apache.hadoop.hdfs.protocol.HdfsLocatedFileStatus) Reporter(org.apache.hadoop.mapred.Reporter) IOException(java.io.IOException) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
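
The interesting part of this shim is the listStatus override, which drops directories and zero-length files before the combined splits are built. The sketch below applies the same filtering to the plain mapreduce-era CombineFileInputFormat, without Hive's shim layer; the class name is invented and the nullscan special case is omitted.

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

public class NonEmptyFilesCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        List<FileStatus> result = super.listStatus(job);
        Iterator<FileStatus> it = result.iterator();
        while (it.hasNext()) {
            FileStatus stat = it.next();
            // Keep only regular, non-empty files; empty inputs add splits but no data.
            if (!stat.isFile() || stat.getLen() == 0) {
                it.remove();
            }
        }
        return result;
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // Record reading is out of scope for this sketch.
        throw new IOException("createRecordReader is not implemented in this sketch");
    }
}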

Aggregations

JobContext (org.apache.hadoop.mapreduce.JobContext): 85
Configuration (org.apache.hadoop.conf.Configuration): 41
Job (org.apache.hadoop.mapreduce.Job): 35
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 34
Test (org.junit.Test): 31
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 29
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 28
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 25
Path (org.apache.hadoop.fs.Path): 24
IOException (java.io.IOException): 22
File (java.io.File): 19
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 16
ArrayList (java.util.ArrayList): 13
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 11
JobConf (org.apache.hadoop.mapred.JobConf): 10
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 10
LongWritable (org.apache.hadoop.io.LongWritable): 9
MapFile (org.apache.hadoop.io.MapFile): 9
JobID (org.apache.hadoop.mapreduce.JobID): 7
FileSystem (org.apache.hadoop.fs.FileSystem): 6