Example 1 with IntervalSplitter

Use of org.apache.hive.storage.jdbc.spitter.IntervalSplitter in the Apache Hive project.

From class JdbcInputFormat, method getSplits:

/**
 * {@inheritDoc}
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    try {
        String partitionColumn = job.get(Constants.JDBC_PARTITION_COLUMN);
        int numPartitions = job.getInt(Constants.JDBC_NUM_PARTITIONS, -1);
        String lowerBound = job.get(Constants.JDBC_LOW_BOUND);
        String upperBound = job.get(Constants.JDBC_UPPER_BOUND);
        InputSplit[] splits;
        if (!job.getBoolean(Constants.JDBC_SPLIT_QUERY, true) || numPartitions <= 1) {
            // We will not split this query if:
            // 1. hive.sql.query.split is set to false (either manually or automatically by Calcite), or
            // 2. numPartitions <= 1
            splits = new InputSplit[1];
            splits[0] = new JdbcInputSplit(FileInputFormat.getInputPaths(job)[0]);
            LOGGER.info("Creating 1 input split " + splits[0]);
            return splits;
        }
        dbAccessor = DatabaseAccessorFactory.getAccessor(job);
        Path[] tablePaths = FileInputFormat.getInputPaths(job);
        // We will split this query into n splits
        LOGGER.debug("Creating {} input splits", numPartitions);
        if (partitionColumn != null) {
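            // Interval-based splitting: validate the partition column, then cut its
            // value range into numPartitions intervals.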
            List<String> columnNames = dbAccessor.getColumnNames(job);
            if (!columnNames.contains(partitionColumn)) {
                throw new IOException("Cannot find partitionColumn:" + partitionColumn + " in " + columnNames);
            }
            List<TypeInfo> hiveColumnTypesList = TypeInfoUtils.getTypeInfosFromTypeString(job.get(serdeConstants.LIST_COLUMN_TYPES));
            TypeInfo typeInfo = hiveColumnTypesList.get(columnNames.indexOf(partitionColumn));
            if (!(typeInfo instanceof PrimitiveTypeInfo)) {
                throw new IOException(partitionColumn + " is a complex type, only primitive type can be a partition column");
            }
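            // If either bound was not supplied, fetch the missing side(s) from the
            // database (typically the MIN/MAX of the partition column).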
            if (lowerBound == null || upperBound == null) {
                Pair<String, String> boundary = dbAccessor.getBounds(job, partitionColumn, lowerBound == null, upperBound == null);
                if (lowerBound == null) {
                    lowerBound = boundary.getLeft();
                }
                if (upperBound == null) {
                    upperBound = boundary.getRight();
                }
            }
            if (lowerBound == null) {
                throw new IOException("lowerBound of " + partitionColumn + " cannot be null");
            }
            if (upperBound == null) {
                throw new IOException("upperBound of " + partitionColumn + " cannot be null");
            }
            IntervalSplitter intervalSplitter = IntervalSplitterFactory.newIntervalSpitter(typeInfo);
            List<MutablePair<String, String>> intervals = intervalSplitter.getIntervals(lowerBound, upperBound, numPartitions, typeInfo);
            if (intervals.size() <= 1) {
                LOGGER.debug("Creating 1 input splits");
                splits = new InputSplit[1];
                splits[0] = new JdbcInputSplit(FileInputFormat.getInputPaths(job)[0]);
                return splits;
            }
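            // Leave the outermost bounds open (null) so rows below the computed lower
            // bound or above the upper bound still land in the first or last split.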
            intervals.get(0).setLeft(null);
            intervals.get(intervals.size() - 1).setRight(null);
            splits = new InputSplit[intervals.size()];
            for (int i = 0; i < intervals.size(); i++) {
                splits[i] = new JdbcInputSplit(partitionColumn, intervals.get(i).getLeft(), intervals.get(i).getRight(), tablePaths[0]);
            }
        } else {
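            // No partition column: fall back to record-count splitting, where each
            // split reads a limit/offset window of the query result.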
            int numRecords = dbAccessor.getTotalNumberOfRecords(job);
            if (numRecords < numPartitions) {
                numPartitions = numRecords;
            }
            int numRecordsPerSplit = numRecords / numPartitions;
            int numSplitsWithExtraRecords = numRecords % numPartitions;
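            // The first (numRecords % numPartitions) splits each take one extra record,
            // e.g. 10 records over 3 partitions -> split sizes 4, 3, 3.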
            LOGGER.debug("Num records = {}", numRecords);
            splits = new InputSplit[numPartitions];
            int offset = 0;
            for (int i = 0; i < numPartitions; i++) {
                int numRecordsInThisSplit = numRecordsPerSplit;
                if (i < numSplitsWithExtraRecords) {
                    numRecordsInThisSplit++;
                }
                splits[i] = new JdbcInputSplit(numRecordsInThisSplit, offset, tablePaths[0]);
                offset += numRecordsInThisSplit;
            }
        }
        dbAccessor = null;
        LOGGER.info("Num input splits created {}", splits.length);
        for (InputSplit split : splits) {
            LOGGER.info("split:" + split.toString());
        }
        return splits;
    } catch (Exception e) {
        LOGGER.error("Error while splitting input data.", e);
        throw new IOException(e);
    }
}
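
For orientation, here is a minimal, hypothetical sketch of how a caller might set the split-related properties this method reads before asking it for splits. The Constants keys are the ones referenced above; the input path, column name, and bound values are invented, and a real job would also need the query, connection, and column-type properties that are out of scope here.

JobConf job = new JobConf();
// Hypothetical table location; getSplits() only uses the first input path.
FileInputFormat.setInputPaths(job, new Path("/tmp/jdbc_table"));
// Request interval-based splitting on a primitive column named "id" (invented).
job.set(Constants.JDBC_PARTITION_COLUMN, "id");
job.setInt(Constants.JDBC_NUM_PARTITIONS, 4);
// Bounds are optional; if either is missing, getSplits() asks the DatabaseAccessor for it.
job.set(Constants.JDBC_LOW_BOUND, "0");
job.set(Constants.JDBC_UPPER_BOUND, "1000");

InputSplit[] splits = new JdbcInputFormat().getSplits(job, 4);
// Expect 4 splits covering the "id" range, with the first lower bound and the
// last upper bound left null so out-of-range rows are still read.

If Constants.JDBC_PARTITION_COLUMN is left unset, the same call takes the record-count branch instead and produces splits addressed by limit and offset.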
Also used: Path (org.apache.hadoop.fs.Path), IntervalSplitter (org.apache.hive.storage.jdbc.spitter.IntervalSplitter), IOException (java.io.IOException), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), MutablePair (org.apache.commons.lang3.tuple.MutablePair), InputSplit (org.apache.hadoop.mapred.InputSplit)
