Example 1 with ReworkMapredInputFormat

Use of org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat in project hive by apache.

From the class Utilities, method reworkMapRedWork.

/**
 * The check here is not entirely clean. It first uses a for loop to go through
 * all input formats and collects the ones that implement ReworkMapredInputFormat
 * into a set, then goes through that set and calls rework on each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. In today's Hive and Hadoop that is not possible,
 * because many of the input formats Hive uses live in Hadoop's code base, and
 * most of Hadoop's input formats only implement the InputFormat interface.
 *
 * @param task the task whose MapReduce work may be reworked
 * @param reworkMapredWork whether reworking is enabled
 * @param conf the Hive configuration passed to each rework call
 * @throws SemanticException if an input format's rework fails with an IOException
 */
public static void reworkMapRedWork(Task<?> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
    if (reworkMapredWork && (task instanceof MapRedTask)) {
        try {
            MapredWork mapredWork = ((MapRedTask) task).getWork();
            Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
            for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
                Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
                if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
                    reworkInputFormats.add(inputFormatCls);
                }
            }
            if (reworkInputFormats.size() > 0) {
                for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
                    ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
                    inst.rework(conf, mapredWork);
                }
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask), MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork), ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat), OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat), HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat), SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat), ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat), InputFormat(org.apache.hadoop.mapred.InputFormat), FileInputFormat(org.apache.hadoop.mapred.FileInputFormat), TextInputFormat(org.apache.hadoop.mapred.TextInputFormat), PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc), IOException(java.io.IOException), HashSet(java.util.HashSet), SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
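
For context, here is a minimal sketch of what an input format that participates in this hook could look like. The class name ReworkAwareTextInputFormat and the property tweak inside rework are hypothetical; only the ReworkMapredInputFormat interface and the rework(conf, mapredWork) call shape are taken from the example above.

import java.io.IOException;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical input format that opts into the rework hook. Because it
// implements ReworkMapredInputFormat, reworkMapRedWork above will collect
// its class and invoke rework(conf, mapredWork) before the job is submitted.
public class ReworkAwareTextInputFormat extends TextInputFormat implements ReworkMapredInputFormat {

    @Override
    public void rework(HiveConf job, MapredWork work) throws IOException {
        // Inspect and adjust the partition descriptors the map work will read.
        // Tagging each PartitionDesc with a marker property is purely
        // illustrative; a real implementation would rewrite paths or formats.
        for (PartitionDesc part : work.getMapWork().getPathToPartitionInfo().values()) {
            part.getProperties().setProperty("rework.visited", "true");
        }
    }
}

With a format like this in place, any partition using it would cause the loop above to call rework exactly once per class, since the classes are first deduplicated in a HashSet.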

Example 2 with ReworkMapredInputFormat

Use of org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat in project hive by apache.

From the class Utilities, method reworkMapRedWork.

/**
 * The check here is not entirely clean. It first uses a for loop to go through
 * all input formats and collects the ones that implement ReworkMapredInputFormat
 * into a set, then goes through that set and calls rework on each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. In today's Hive and Hadoop that is not possible,
 * because many of the input formats Hive uses live in Hadoop's code base, and
 * most of Hadoop's input formats only implement the InputFormat interface.
 *
 * @param task the task whose MapReduce work may be reworked
 * @param reworkMapredWork whether reworking is enabled
 * @param conf the Hive configuration passed to each rework call
 * @throws SemanticException if an input format's rework fails with an IOException
 */
public static void reworkMapRedWork(Task<? extends Serializable> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
    if (reworkMapredWork && (task instanceof MapRedTask)) {
        try {
            MapredWork mapredWork = ((MapRedTask) task).getWork();
            Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
            for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
                Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
                if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
                    reworkInputFormats.add(inputFormatCls);
                }
            }
            if (reworkInputFormats.size() > 0) {
                for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
                    ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
                    inst.rework(conf, mapredWork);
                }
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask), MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork), SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat), ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat), ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat), InputFormat(org.apache.hadoop.mapred.InputFormat), FileInputFormat(org.apache.hadoop.mapred.FileInputFormat), TextInputFormat(org.apache.hadoop.mapred.TextInputFormat), OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat), HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat), PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc), IOException(java.io.IOException), HashSet(java.util.HashSet), SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
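
As a usage sketch, the snippet below shows how a caller might drive this method over a query plan's root tasks. The class ReworkDriver, the rootTasks list, and the enabled flag are assumed for illustration; only Utilities.reworkMapRedWork itself, with the Task<?> signature from Example 1, comes from the source above.

import java.util.List;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.parse.SemanticException;

// Hypothetical driver: give every MapReduce root task a chance to be reworked.
public final class ReworkDriver {

    private ReworkDriver() {
    }

    public static void reworkAll(List<Task<?>> rootTasks, boolean enabled, HiveConf conf) throws SemanticException {
        // reworkMapRedWork is a no-op unless the flag is set and the task is a
        // MapRedTask, so it is safe to call it on every root task.
        for (Task<?> task : rootTasks) {
            Utilities.reworkMapRedWork(task, enabled, conf);
        }
    }
}

Passing false for enabled short-circuits the whole pass, matching the guard at the top of the method.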

Aggregations

IOException (java.io.IOException)2
HashSet (java.util.HashSet)2
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)2
ContentSummaryInputFormat (org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat)2
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat)2
OneNullRowInputFormat (org.apache.hadoop.hive.ql.io.OneNullRowInputFormat)2
ReworkMapredInputFormat (org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat)2
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)2
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)2
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)2
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat)2
InputFormat (org.apache.hadoop.mapred.InputFormat)2
SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat)2
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)2