Example 1 with InputJobInfo

Use of org.apache.hive.hcatalog.mapreduce.InputJobInfo in project hive by apache.

The class HCatLoader, method setLocation. Pig calls setLocation on both the frontend and the backend: the first call resolves the table via HCatInputFormat.setInput and caches the resulting configuration (including the serialized InputJobInfo) in the UDF context, so later calls can restore it without another metastore round trip.

@Override
public void setLocation(String location, Job job) throws IOException {
    HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get().setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true);
    UDFContext udfContext = UDFContext.getUDFContext();
    Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature });
    job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature);
    Pair<String, String> dbTablePair = PigHCatUtil.getDBTableNames(location);
    dbName = dbTablePair.first;
    tableName = dbTablePair.second;
    RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO);
    // setLocation() runs on both the Pig frontend and backend; if it already
    // ran for this signature, restore the saved properties from the UDF
    // context into the Configuration instead of contacting the metastore again.
    if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) {
        for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements(); ) {
            PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(), emr.nextElement().toString());
        }
        if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            //Combine credentials and credentials from job takes precedence for freshness
            Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature);
            job.getCredentials().addAll(crd);
        }
    } else {
        Job clone = new Job(job.getConfiguration());
        HCatInputFormat.setInput(job, dbName, tableName, getPartitionFilterString());
        InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));
        SpecialCases.addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo());
        // Store all new/changed job properties in the UDF context so that
        // HCatInputFormat.setInput need not be called many times.
        for (Entry<String, String> keyValue : job.getConfiguration()) {
            String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
            if ((oldValue == null) || !keyValue.getValue().equals(oldValue)) {
                udfProps.put(keyValue.getKey(), keyValue.getValue());
            }
        }
        udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true);
        //Store credentials in a private hash map and not the udf context to
        // make sure they are not public.
        Credentials crd = new Credentials();
        crd.addAll(job.getCredentials());
        jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd);
    }
    if (requiredFieldsInfo != null) {
        // convert to hcatschema and pass to HCatInputFormat
        try {
            //push down projections to columnar store works for RCFile and ORCFile
            ArrayList<Integer> list = new ArrayList<Integer>(requiredFieldsInfo.getFields().size());
            for (RequiredField rf : requiredFieldsInfo.getFields()) {
                list.add(rf.getIndex());
            }
            ColumnProjectionUtils.setReadColumns(job.getConfiguration(), list);
            outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass());
            HCatInputFormat.setOutputSchema(job, outputSchema);
        } catch (Exception e) {
            throw new IOException(e);
        }
    } else {
        // else - this means pig's optimizer never invoked the pushProjection
        // method - so we need all fields and hence we should not call the
        // setOutputSchema on HCatInputFormat
        ColumnProjectionUtils.setReadAllColumns(job.getConfiguration());
        if (HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            try {
                HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
                outputSchema = hcatTableSchema;
                HCatInputFormat.setOutputSchema(job, outputSchema);
            } catch (Exception e) {
                throw new IOException(e);
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("outputSchema=" + outputSchema);
    }
}
Also used : ArrayList (java.util.ArrayList), Properties (java.util.Properties), IOException (java.io.IOException), UDFContext (org.apache.pig.impl.util.UDFContext), PigException (org.apache.pig.PigException), HCatException (org.apache.hive.hcatalog.common.HCatException), HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema), Job (org.apache.hadoop.mapreduce.Job), InputJobInfo (org.apache.hive.hcatalog.mapreduce.InputJobInfo), Credentials (org.apache.hadoop.security.Credentials)
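
Both examples recover the InputJobInfo the same way: HCatInputFormat.setInput serializes it into the job configuration under HCatConstants.HCAT_KEY_JOB_INFO, and HCatUtil.deserialize reads it back. Below is a minimal round-trip sketch; the database and table names are placeholders, and setInput needs a reachable Hive metastore to resolve the table.

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.InputJobInfo;

public class InputJobInfoRoundTrip {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance();
        // "default" and "mytable" are hypothetical names; null means no partition filter.
        HCatInputFormat.setInput(job, "default", "mytable", null);
        // setInput stores a serialized InputJobInfo in the job configuration;
        // read it back the same way HCatLoader does above.
        InputJobInfo info = (InputJobInfo) HCatUtil.deserialize(
            job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));
        System.out.println(info.getDatabaseName() + "." + info.getTableName());
    }
}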

Example 2 with InputJobInfo

Use of org.apache.hive.hcatalog.mapreduce.InputJobInfo in project hive by apache.

The class HCatLoader, method getStatistics. The loader deserializes the InputJobInfo stored in the job configuration and reports the total input size in megabytes.

/**
   * Get statistics about the data to be loaded. Only input data size is implemented at this time.
   */
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
    try {
        ResourceStatistics stats = new ResourceStatistics();
        InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));
        stats.setmBytes(getSizeInBytes(inputJobInfo) / 1024 / 1024);
        return stats;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : ResourceStatistics (org.apache.pig.ResourceStatistics), IOException (java.io.IOException), InputJobInfo (org.apache.hive.hcatalog.mapreduce.InputJobInfo), HCatException (org.apache.hive.hcatalog.common.HCatException), PigException (org.apache.pig.PigException)
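
getSizeInBytes is a helper of HCatLoader; the statistics reported to Pig are simply the total input bytes, converted to megabytes. As an illustration (not the exact upstream implementation), the size could be derived from the partitions carried by the InputJobInfo, assuming each PartInfo location points at a filesystem path:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hive.hcatalog.mapreduce.InputJobInfo;
import org.apache.hive.hcatalog.mapreduce.PartInfo;

final class InputSizeEstimator {
    // Sum the bytes under each partition location. Illustrative sketch only;
    // HCatLoader's real getSizeInBytes may differ in detail.
    static long sizeInBytes(InputJobInfo inputJobInfo, Configuration conf) throws IOException {
        long total = 0;
        for (PartInfo partInfo : inputJobInfo.getPartitions()) {
            Path location = new Path(partInfo.getLocation());
            FileSystem fs = location.getFileSystem(conf);
            total += fs.getContentSummary(location).getLength();
        }
        return total;
    }
}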

Aggregations

IOException (java.io.IOException): 2
HCatException (org.apache.hive.hcatalog.common.HCatException): 2
InputJobInfo (org.apache.hive.hcatalog.mapreduce.InputJobInfo): 2
PigException (org.apache.pig.PigException): 2
ArrayList (java.util.ArrayList): 1
Properties (java.util.Properties): 1
Job (org.apache.hadoop.mapreduce.Job): 1
Credentials (org.apache.hadoop.security.Credentials): 1
HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 1
ResourceStatistics (org.apache.pig.ResourceStatistics): 1
UDFContext (org.apache.pig.impl.util.UDFContext): 1