Search in sources :

Example 1 with RegionSizeCalculator

use of org.apache.hadoop.hbase.util.RegionSizeCalculator in project hbase by apache.

Source: class TableInputFormatBase, method getSplits.

/**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    boolean closeOnFinish = false;
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
        initialize(context);
        // We opened the table here, so we are responsible for closing it in the finally block.
        closeOnFinish = true;
    }
    // null check in case our child overrides getTable to not throw.
    try {
        if (getTable() == null) {
            // initialize() must not have been implemented in the subclass.
            throw new IOException(INITIALIZATION_ERROR);
        }
    } catch (IllegalStateException exception) {
        // Surface subclass initialization failures as IOException, preserving the cause.
        throw new IOException(INITIALIZATION_ERROR, exception);
    }
    try {
        RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(getRegionLocator(), getAdmin());
        TableName tableName = getTable().getName();
        Pair<byte[][], byte[][]> keys = getStartEndKeys();
        // No start/end keys means the table has a single (unbounded) region: emit exactly one split.
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            HRegionLocation regLoc = getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
            if (null == regLoc) {
                throw new IOException("Expecting at least one region.");
            }
            List<InputSplit> splits = new ArrayList<>(1);
            long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
            // Empty start/stop rows cover the whole table; host is taken from the region's hostname:port.
            TableSplit split = new TableSplit(tableName, scan, HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc.getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
            splits.add(split);
            return splits;
        }
        // One candidate split per region; regions outside the scan's row range are skipped below.
        List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
        for (int i = 0; i < keys.getFirst().length; i++) {
            if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            byte[] startRow = scan.getStartRow();
            byte[] stopRow = scan.getStopRow();
            // determine if the given start and stop key fall into the region
            // (an empty startRow/stopRow means "unbounded"; an empty region end key means "last region").
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                // Clamp the split boundaries to the intersection of the scan range and the region range.
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
                // The below InetSocketAddress creation does a name resolution.
                InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
                if (isa.isUnresolved()) {
                    LOG.warn("Failed resolve " + isa);
                }
                InetAddress regionAddress = isa.getAddress();
                String regionLocation;
                regionLocation = reverseDNS(regionAddress);
                byte[] regionName = location.getRegionInfo().getRegionName();
                String encodedRegionName = location.getRegionInfo().getEncodedName();
                long regionSize = sizeCalculator.getRegionSize(regionName);
                TableSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("getSplits: split -> " + i + " -> " + split);
                }
            }
        }
        //The default value of "hbase.mapreduce.input.autobalance" is false, which means not enabled.
        boolean enableAutoBalance = context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false);
        if (enableAutoBalance) {
            long totalRegionSize = 0;
            for (int i = 0; i < splits.size(); i++) {
                TableSplit ts = (TableSplit) splits.get(i);
                totalRegionSize += ts.getLength();
            }
            long averageRegionSize = totalRegionSize / splits.size();
            // the averageRegionSize must be positive.
            if (averageRegionSize <= 0) {
                LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " + "set it to 1.");
                averageRegionSize = 1;
            }
            return calculateRebalancedSplits(splits, context, averageRegionSize);
        } else {
            return splits;
        }
    } finally {
        if (closeOnFinish) {
            closeTable();
        }
    }
}
Also used : RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TableName(org.apache.hadoop.hbase.TableName) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) InputSplit(org.apache.hadoop.mapreduce.InputSplit) InetAddress(java.net.InetAddress)

Example 2 with RegionSizeCalculator

use of org.apache.hadoop.hbase.util.RegionSizeCalculator in project phoenix by apache.

Source: class PhoenixInputFormat, method generateSplits.

/**
 * Generates one input split per scan (or per scan group) of the query plan,
 * annotated with the hosting region's location and size for locality-aware scheduling.
 *
 * @param jobConf the job configuration (provides input paths and split-by-stats flag)
 * @param qplan   the Phoenix query plan whose scans drive the splits; must not be null
 * @param splits  the key ranges backing the plan; must not be null
 * @param query   the query string attached to each generated split
 * @return the list of generated {@code PhoenixInputSplit}s
 * @throws IOException if region metadata cannot be fetched
 */
private List<InputSplit> generateSplits(final JobConf jobConf, final QueryPlan qplan, final List<KeyRange> splits, String query) throws IOException {
    Preconditions.checkNotNull(qplan);
    Preconditions.checkNotNull(splits);
    final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
    Path[] tablePaths = FileInputFormat.getInputPaths(ShimLoader.getHadoopShims().newJobContext(new Job(jobConf)));
    boolean splitByStats = jobConf.getBoolean(PhoenixStorageHandlerConstants.SPLIT_BY_STATS, false);
    setScanCacheSize(jobConf);
    // Adding Localization
    HConnection connection = HConnectionManager.createConnection(PhoenixConnectionUtil.getConfiguration(jobConf));
    // FIX: the connection was previously never closed, leaking a connection (and its
    // ZooKeeper/RPC resources) on every call. Ensure it is released when we are done.
    try {
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(qplan.getTableRef().getTable().getPhysicalName().toString()));
        RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, connection.getAdmin());
        for (List<Scan> scans : qplan.getScans()) {
            PhoenixInputSplit inputSplit;
            // All scans in a group target the same region, so locate it via the first scan.
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            long regionSize = sizeCalculator.getRegionSize(location.getRegionInfo().getRegionName());
            String regionLocation = PhoenixStorageHandlerUtil.getRegionLocation(location, LOG);
            if (splitByStats) {
                // One split per scan: finer-grained parallelism based on statistics.
                for (Scan aScan : scans) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Split for  scan : " + aScan + "with scanAttribute : " + aScan.getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : [" + aScan.getCaching() + ", " + aScan.getCacheBlocks() + ", " + aScan.getBatch() + "] and  regionLocation : " + regionLocation);
                    }
                    inputSplit = new PhoenixInputSplit(Lists.newArrayList(aScan), tablePaths[0], regionLocation, regionSize);
                    inputSplit.setQuery(query);
                    psplits.add(inputSplit);
                }
            } else {
                // One split per region covering the whole scan group.
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Scan count[" + scans.size() + "] : " + Bytes.toStringBinary(scans.get(0).getStartRow()) + " ~ " + Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                    LOG.debug("First scan : " + scans.get(0) + "with scanAttribute : " + scans.get(0).getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : " + "[" + scans.get(0).getCaching() + ", " + scans.get(0).getCacheBlocks() + ", " + scans.get(0).getBatch() + "] and  regionLocation : " + regionLocation);
                    for (int i = 0, limit = scans.size(); i < limit; i++) {
                        LOG.debug("EXPECTED_UPPER_REGION_KEY[" + i + "] : " + Bytes.toStringBinary(scans.get(i).getAttribute(BaseScannerRegionObserver.EXPECTED_UPPER_REGION_KEY)));
                    }
                }
                inputSplit = new PhoenixInputSplit(scans, tablePaths[0], regionLocation, regionSize);
                inputSplit.setQuery(query);
                psplits.add(inputSplit);
            }
        }
    } finally {
        connection.close();
    }
    return psplits;
}
Also used : Path(org.apache.hadoop.fs.Path) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) HConnection(org.apache.hadoop.hbase.client.HConnection) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Scan(org.apache.hadoop.hbase.client.Scan) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 3 with RegionSizeCalculator

use of org.apache.hadoop.hbase.util.RegionSizeCalculator in project phoenix by apache.

Source: class PhoenixInputFormat, method generateSplits.

/**
 * Generates one input split per scan (or per scan group) of the query plan,
 * annotated with the hosting region's hostname and size for locality-aware scheduling.
 *
 * @param qplan  the Phoenix query plan whose scans drive the splits; must not be null
 * @param splits the key ranges backing the plan; must not be null
 * @param config the Hadoop configuration used to open the HBase connection
 * @return the list of generated {@code PhoenixInputSplit}s
 * @throws IOException if the connection cannot be opened or region metadata fetched
 */
private List<InputSplit> generateSplits(final QueryPlan qplan, final List<KeyRange> splits, Configuration config) throws IOException {
    Preconditions.checkNotNull(qplan);
    Preconditions.checkNotNull(splits);
    // Generate splits based off statistics, or just region splits?
    // Hoisted out of the loop: the configuration value cannot change between iterations.
    boolean splitByStats = PhoenixConfigurationUtil.getSplitByStats(config);
    final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
    // FIX: the connection was previously never closed, leaking a connection (and its
    // ZooKeeper/RPC resources) on every call. try-with-resources guarantees release.
    try (org.apache.hadoop.hbase.client.Connection connection = ConnectionFactory.createConnection(config)) {
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(qplan.getTableRef().getTable().getPhysicalName().toString()));
        // Get the RegionSizeCalculator
        RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, connection.getAdmin());
        for (List<Scan> scans : qplan.getScans()) {
            // Get the region location; all scans in a group target the same region.
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            // Get the region size
            long regionSize = sizeCalculator.getRegionSize(location.getRegionInfo().getRegionName());
            if (splitByStats) {
                // One split per scan: finer-grained parallelism based on statistics.
                for (Scan aScan : scans) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Split for  scan : " + aScan + "with scanAttribute : " + aScan.getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : [" + aScan.getCaching() + ", " + aScan.getCacheBlocks() + ", " + aScan.getBatch() + "] and  regionLocation : " + regionLocation);
                    }
                    psplits.add(new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
                }
            } else {
                // One split per region covering the whole scan group.
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Scan count[" + scans.size() + "] : " + Bytes.toStringBinary(scans.get(0).getStartRow()) + " ~ " + Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                    LOG.debug("First scan : " + scans.get(0) + "with scanAttribute : " + scans.get(0).getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : " + "[" + scans.get(0).getCaching() + ", " + scans.get(0).getCacheBlocks() + ", " + scans.get(0).getBatch() + "] and  regionLocation : " + regionLocation);
                    for (int i = 0, limit = scans.size(); i < limit; i++) {
                        LOG.debug("EXPECTED_UPPER_REGION_KEY[" + i + "] : " + Bytes.toStringBinary(scans.get(i).getAttribute(BaseScannerRegionObserver.EXPECTED_UPPER_REGION_KEY)));
                    }
                }
                psplits.add(new PhoenixInputSplit(scans, regionSize, regionLocation));
            }
        }
    }
    return psplits;
}
Also used : RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) org.apache.hadoop.hbase.client(org.apache.hadoop.hbase.client) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 4 with RegionSizeCalculator

use of org.apache.hadoop.hbase.util.RegionSizeCalculator in project hbase by apache.

Source: class MultiTableInputFormatBase, method getSplits.

/**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
   */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (scans.isEmpty()) {
        throw new IOException("No scans were provided.");
    }
    // Group the configured scans by target table so we open one connection per table.
    Map<TableName, List<Scan>> tableMaps = new HashMap<>();
    for (Scan scan : scans) {
        byte[] tableNameBytes = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
        if (tableNameBytes == null) {
            throw new IOException("A scan object did not have a table name");
        }
        TableName tableName = TableName.valueOf(tableNameBytes);
        // Idiom fix: computeIfAbsent replaces the manual get/null-check/put dance.
        tableMaps.computeIfAbsent(tableName, k -> new ArrayList<>()).add(scan);
    }
    List<InputSplit> splits = new ArrayList<>();
    // Idiom fix: typed for-each replaces the raw Iterator and its unchecked cast.
    for (Map.Entry<TableName, List<Scan>> entry : tableMaps.entrySet()) {
        TableName tableName = entry.getKey();
        List<Scan> scanList = entry.getValue();
        try (Connection conn = ConnectionFactory.createConnection(context.getConfiguration());
            Table table = conn.getTable(tableName);
            RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
            RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, conn.getAdmin());
            Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
            for (Scan scan : scanList) {
                if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
                    throw new IOException("Expecting at least one region for table : " + tableName.getNameAsString());
                }
                int count = 0;
                byte[] startRow = scan.getStartRow();
                byte[] stopRow = scan.getStopRow();
                for (int i = 0; i < keys.getFirst().length; i++) {
                    if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                        continue;
                    }
                    // Does this region overlap the scan's [startRow, stopRow) range?
                    // (Empty startRow/stopRow means "unbounded"; empty region end key means "last region".)
                    if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                        // Clamp the split boundaries to the intersection of scan range and region range.
                        byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow;
                        byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                        HRegionLocation hregionLocation = regionLocator.getRegionLocation(keys.getFirst()[i], false);
                        String regionHostname = hregionLocation.getHostname();
                        HRegionInfo regionInfo = hregionLocation.getRegionInfo();
                        String encodedRegionName = regionInfo.getEncodedName();
                        long regionSize = sizeCalculator.getRegionSize(regionInfo.getRegionName());
                        TableSplit split = new TableSplit(table.getName(), scan, splitStart, splitStop, regionHostname, encodedRegionName, regionSize);
                        splits.add(split);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
                        }
                    }
                }
            }
        }
    }
    return splits;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HRegionInfo(org.apache.hadoop.hbase.HRegionInfo) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) InputSplit(org.apache.hadoop.mapreduce.InputSplit) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) Table(org.apache.hadoop.hbase.client.Table) RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) Connection(org.apache.hadoop.hbase.client.Connection) IOException(java.io.IOException) TableName(org.apache.hadoop.hbase.TableName) Scan(org.apache.hadoop.hbase.client.Scan) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

HRegionLocation (org.apache.hadoop.hbase.HRegionLocation)4 RegionSizeCalculator (org.apache.hadoop.hbase.util.RegionSizeCalculator)4 InputSplit (org.apache.hadoop.mapreduce.InputSplit)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 TableName (org.apache.hadoop.hbase.TableName)2 RegionLocator (org.apache.hadoop.hbase.client.RegionLocator)2 Scan (org.apache.hadoop.hbase.client.Scan)2 InetAddress (java.net.InetAddress)1 InetSocketAddress (java.net.InetSocketAddress)1 HashMap (java.util.HashMap)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Map (java.util.Map)1 Path (org.apache.hadoop.fs.Path)1 HRegionInfo (org.apache.hadoop.hbase.HRegionInfo)1 org.apache.hadoop.hbase.client (org.apache.hadoop.hbase.client)1 Connection (org.apache.hadoop.hbase.client.Connection)1 HConnection (org.apache.hadoop.hbase.client.HConnection)1 Table (org.apache.hadoop.hbase.client.Table)1