Search in sources :

Example 1 with BatchInputSplit

use of org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit in project accumulo by apache.

in the class AbstractInputFormat, method getSplits.

/**
 * Gets the splits of the tables that have been set on the job by reading the metadata table for the specified ranges.
 *
 * <p>
 * For every configured input table this method resolves the table id, validates that the BatchScanner optimization is
 * only requested together with compatible settings, bins the requested ranges by tablet server and tablet (retrying
 * with a random 100-200 ms pause until binning succeeds), and finally emits either one {@link BatchInputSplit} per
 * tablet, one {@link RangeInputSplit} per range clipped to a tablet (auto-adjust enabled), or one
 * {@link RangeInputSplit} per original range carrying every location that hosts part of it (auto-adjust disabled).
 *
 * @param context
 *          the job context holding the input format configuration
 * @return the splits from the tables based on the ranges.
 * @throws java.io.IOException
 *           if a table set on the job doesn't exist or an error occurs initializing the tablet locator; any exception
 *           raised while binning ranges (including a deleted or offline table) is wrapped in an IOException
 * @throws IllegalArgumentException
 *           if the BatchScanner optimization is enabled together with offline scan, isolated scanners, local iterators,
 *           or with auto-adjust ranges disabled
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    Level logLevel = getLogLevel(context);
    log.setLevel(logLevel);
    validateOptions(context);
    Random random = new Random();
    LinkedList<InputSplit> splits = new LinkedList<>();
    // Shared by all tables so each tablet server address is resolved to a host name at most once per call
    HashMap<String, String> hostNameCache = new HashMap<>();
    Map<String, InputTableConfig> tableConfigs = getInputTableConfigs(context);
    for (Map.Entry<String, InputTableConfig> tableConfigEntry : tableConfigs.entrySet()) {
        String tableName = tableConfigEntry.getKey();
        InputTableConfig tableConfig = tableConfigEntry.getValue();
        Instance instance = getInstance(context);
        Table.ID tableId;
        // resolve table name to id once, and use id from this point forward
        if (DeprecationUtil.isMockInstance(instance)) {
            tableId = Table.ID.of("");
        } else {
            try {
                tableId = Tables.getTableId(instance, tableName);
            } catch (TableNotFoundException e) {
                throw new IOException(e);
            }
        }
        Authorizations auths = getScanAuthorizations(context);
        String principal = getPrincipal(context);
        AuthenticationToken token = getAuthenticationToken(context);
        boolean batchScan = InputConfigurator.isBatchScan(CLASS, context.getConfiguration());
        boolean supportBatchScan = !(tableConfig.isOfflineScan() || tableConfig.shouldUseIsolatedScanners() || tableConfig.shouldUseLocalIterators());
        if (batchScan && !supportBatchScan) {
            throw new IllegalArgumentException("BatchScanner optimization not available for offline scan, isolated, or local iterators");
        }
        boolean autoAdjust = tableConfig.shouldAutoAdjustRanges();
        if (batchScan && !autoAdjust) {
            throw new IllegalArgumentException("AutoAdjustRanges must be enabled when using BatchScanner optimization");
        }
        List<Range> ranges = autoAdjust ? Range.mergeOverlapping(tableConfig.getRanges()) : tableConfig.getRanges();
        if (ranges.isEmpty()) {
            // no ranges configured: scan the whole table
            ranges = new ArrayList<>(1);
            ranges.add(new Range());
        }
        // get the metadata information for these ranges
        Map<String, Map<KeyExtent, List<Range>>> binnedRanges = new HashMap<>();
        TabletLocator tl;
        try {
            if (tableConfig.isOfflineScan()) {
                binnedRanges = binOfflineTable(context, tableId, ranges);
                while (binnedRanges == null) {
                    // Some tablets were still online, try again
                    // sleep randomly between 100 and 200 ms
                    sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                    binnedRanges = binOfflineTable(context, tableId, ranges);
                }
            } else {
                tl = InputConfigurator.getTabletLocator(CLASS, context.getConfiguration(), tableId);
                // it's possible that the cache could contain complete, but old information about a table's tablets... so clear it
                tl.invalidateCache();
                // reuse the instance and credentials resolved above instead of looking them up a second time
                ClientContext clientContext = new ClientContext(instance, new Credentials(principal, token), getClientConfiguration(context));
                while (!tl.binRanges(clientContext, ranges, binnedRanges).isEmpty()) {
                    if (!DeprecationUtil.isMockInstance(instance)) {
                        String tableIdStr = tableId.canonicalID();
                        if (!Tables.exists(instance, tableId)) {
                            throw new TableDeletedException(tableIdStr);
                        }
                        if (Tables.getTableState(instance, tableId) == TableState.OFFLINE) {
                            throw new TableOfflineException(instance, tableIdStr);
                        }
                    }
                    // discard the partial result before retrying
                    binnedRanges.clear();
                    log.warn("Unable to locate bins for specified ranges. Retrying.");
                    // sleep randomly between 100 and 200 ms
                    sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                    tl.invalidateCache();
                }
            }
        } catch (Exception e) {
            throw new IOException(e);
        }
        // all of this code will add either one range per location, or split the ranges and add range-location splits
        // Map from Range to list of locations; only used when ranges are not split (auto-adjust disabled)
        HashMap<Range, ArrayList<String>> splitsToAdd = null;
        if (!autoAdjust) {
            splitsToAdd = new HashMap<>();
        }
        for (Map.Entry<String, Map<KeyExtent, List<Range>>> tserverBin : binnedRanges.entrySet()) {
            // the bin key is "address:port"; keep only the address part
            String ip = tserverBin.getKey().split(":", 2)[0];
            String location = hostNameCache.get(ip);
            if (location == null) {
                InetAddress inetAddress = InetAddress.getByName(ip);
                location = inetAddress.getCanonicalHostName();
                hostNameCache.put(ip, location);
            }
            for (Map.Entry<KeyExtent, List<Range>> extentRanges : tserverBin.getValue().entrySet()) {
                Range ke = extentRanges.getKey().toDataRange();
                if (batchScan) {
                    // group ranges by tablet to be read by a BatchScanner
                    ArrayList<Range> clippedRanges = new ArrayList<>();
                    for (Range r : extentRanges.getValue()) {
                        clippedRanges.add(ke.clip(r));
                    }
                    BatchInputSplit split = new BatchInputSplit(tableName, tableId, clippedRanges, new String[] { location });
                    SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
                    splits.add(split);
                } else {
                    // not grouping by tablet
                    for (Range r : extentRanges.getValue()) {
                        if (autoAdjust) {
                            // divide ranges into smaller ranges, based on the tablets
                            RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonicalID(), ke.clip(r), new String[] { location });
                            SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
                            split.setOffline(tableConfig.isOfflineScan());
                            split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                            split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                            splits.add(split);
                        } else {
                            // don't divide ranges; just remember every location that serves part of this range
                            ArrayList<String> locations = splitsToAdd.get(r);
                            if (locations == null) {
                                locations = new ArrayList<>(1);
                                splitsToAdd.put(r, locations);
                            }
                            locations.add(location);
                        }
                    }
                }
            }
        }
        if (!autoAdjust) {
            // emit one split per original range, listing all locations collected above
            for (Map.Entry<Range, ArrayList<String>> entry : splitsToAdd.entrySet()) {
                RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonicalID(), entry.getKey(), entry.getValue().toArray(new String[0]));
                SplitUtils.updateSplit(split, instance, tableConfig, principal, token, auths, logLevel);
                split.setOffline(tableConfig.isOfflineScan());
                split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                splits.add(split);
            }
        }
    }
    return splits;
}
Also used : AuthenticationToken(org.apache.accumulo.core.client.security.tokens.AuthenticationToken) TableOfflineException(org.apache.accumulo.core.client.TableOfflineException) Instance(org.apache.accumulo.core.client.Instance) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) BatchInputSplit(org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit) KeyExtent(org.apache.accumulo.core.data.impl.KeyExtent) TableDeletedException(org.apache.accumulo.core.client.TableDeletedException) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) Random(java.util.Random) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) BatchInputSplit(org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Authorizations(org.apache.accumulo.core.security.Authorizations) Table(org.apache.accumulo.core.client.impl.Table) ClientContext(org.apache.accumulo.core.client.impl.ClientContext) IOException(java.io.IOException) Range(org.apache.accumulo.core.data.Range) LinkedList(java.util.LinkedList) TableOfflineException(org.apache.accumulo.core.client.TableOfflineException) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) TableDeletedException(org.apache.accumulo.core.client.TableDeletedException) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) IOException(java.io.IOException) AccumuloException(org.apache.accumulo.core.client.AccumuloException) TabletLocator(org.apache.accumulo.core.client.impl.TabletLocator) Level(org.apache.log4j.Level) Map(java.util.Map) HashMap(java.util.HashMap) InetAddress(java.net.InetAddress) Credentials(org.apache.accumulo.core.client.impl.Credentials)

Example 2 with BatchInputSplit

use of org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit in project accumulo by apache.

in the class AccumuloInputFormatIT, method testGetSplits.

/**
 * Tests several different paths through the getSplits() method by setting different properties and verifying the results.
 *
 * <p>
 * Covered in order: no ranges set, one range per split point, offline scan (first against an online table, which must
 * fail, then against the offlined table), overlapping ranges with and without auto-adjust, and the BatchScanner
 * optimization together with each setting it is rejected with (offline scan, isolated scanners, local iterators),
 * finishing with a check that batch scanning yields {@link BatchInputSplit} instances.
 */
@Test
public void testGetSplits() throws Exception {
    Connector conn = getConnector();
    String table = getUniqueNames(1)[0];
    conn.tableOperations().create(table);
    insertData(table, currentTimeMillis());
    ClientConfiguration clientConf = cluster.getClientConfig();
    AccumuloConfiguration clusterClientConf = new ConfigurationCopy(DefaultConfiguration.getInstance());
    // Pass SSL and CredentialProvider options into the ClientConfiguration given to AccumuloInputFormat
    boolean sslEnabled = Boolean.valueOf(clusterClientConf.get(Property.INSTANCE_RPC_SSL_ENABLED));
    if (sslEnabled) {
        ClientProperty[] sslProperties = new ClientProperty[] { ClientProperty.INSTANCE_RPC_SSL_ENABLED, ClientProperty.INSTANCE_RPC_SSL_CLIENT_AUTH, ClientProperty.RPC_SSL_KEYSTORE_PATH, ClientProperty.RPC_SSL_KEYSTORE_TYPE, ClientProperty.RPC_SSL_KEYSTORE_PASSWORD, ClientProperty.RPC_SSL_TRUSTSTORE_PATH, ClientProperty.RPC_SSL_TRUSTSTORE_TYPE, ClientProperty.RPC_SSL_TRUSTSTORE_PASSWORD, ClientProperty.RPC_USE_JSSE, ClientProperty.GENERAL_SECURITY_CREDENTIAL_PROVIDER_PATHS };
        for (ClientProperty prop : sslProperties) {
            // The default property is returned if it's not in the ClientConfiguration so we don't have to check if the value is actually defined
            clientConf.setProperty(prop, clusterClientConf.get(prop.getKey()));
        }
    }
    Job job = Job.getInstance();
    AccumuloInputFormat.setInputTableName(job, table);
    AccumuloInputFormat.setZooKeeperInstance(job, clientConf);
    AccumuloInputFormat.setConnectorInfo(job, getAdminPrincipal(), getAdminToken());
    // split table
    TreeSet<Text> splitsToAdd = new TreeSet<>();
    for (int i = 0; i < 10000; i += 1000) {
        splitsToAdd.add(new Text(String.format("%09d", i)));
    }
    conn.tableOperations().addSplits(table, splitsToAdd);
    // wait for splits to be propagated
    sleepUninterruptibly(500, TimeUnit.MILLISECONDS);
    // get splits without setting any range
    Collection<Text> actualSplits = conn.tableOperations().listSplits(table);
    List<InputSplit> splits = inputFormat.getSplits(job);
    // No ranges set on the job so it'll start with -inf
    assertEquals(actualSplits.size() + 1, splits.size());
    // set ranges and get splits
    List<Range> ranges = new ArrayList<>();
    for (Text text : actualSplits) {
        ranges.add(new Range(text));
    }
    AccumuloInputFormat.setRanges(job, ranges);
    splits = inputFormat.getSplits(job);
    assertEquals(actualSplits.size(), splits.size());
    // offline mode
    AccumuloInputFormat.setOfflineTableScan(job, true);
    try {
        inputFormat.getSplits(job);
        fail("An exception should have been thrown");
    } catch (IOException expected) {
        // expected: the table is still online, so an offline scan must be rejected
    }
    conn.tableOperations().offline(table, true);
    splits = inputFormat.getSplits(job);
    assertEquals(actualSplits.size(), splits.size());
    // auto adjust ranges
    ranges = new ArrayList<>();
    for (int i = 0; i < 5; i++) {
        // overlapping ranges
        ranges.add(new Range(String.format("%09d", i), String.format("%09d", i + 2)));
    }
    AccumuloInputFormat.setRanges(job, ranges);
    splits = inputFormat.getSplits(job);
    assertEquals(2, splits.size());
    AccumuloInputFormat.setAutoAdjustRanges(job, false);
    splits = inputFormat.getSplits(job);
    assertEquals(ranges.size(), splits.size());
    // BatchScan not available for offline scans
    AccumuloInputFormat.setBatchScan(job, true);
    // Reset auto-adjust ranges too
    AccumuloInputFormat.setAutoAdjustRanges(job, true);
    AccumuloInputFormat.setOfflineTableScan(job, true);
    try {
        inputFormat.getSplits(job);
        fail("An exception should have been thrown");
    } catch (IllegalArgumentException expected) {
        // expected: batch scan cannot be combined with an offline scan
    }
    conn.tableOperations().online(table, true);
    AccumuloInputFormat.setOfflineTableScan(job, false);
    // test for resumption of success
    splits = inputFormat.getSplits(job);
    assertEquals(2, splits.size());
    // BatchScan not available with isolated iterators
    AccumuloInputFormat.setScanIsolation(job, true);
    try {
        inputFormat.getSplits(job);
        fail("An exception should have been thrown");
    } catch (IllegalArgumentException expected) {
        // expected: batch scan cannot be combined with isolated scanners
    }
    AccumuloInputFormat.setScanIsolation(job, false);
    // test for resumption of success
    splits = inputFormat.getSplits(job);
    assertEquals(2, splits.size());
    // BatchScan not available with local iterators
    AccumuloInputFormat.setLocalIterators(job, true);
    try {
        inputFormat.getSplits(job);
        fail("An exception should have been thrown");
    } catch (IllegalArgumentException expected) {
        // expected: batch scan cannot be combined with local iterators
    }
    AccumuloInputFormat.setLocalIterators(job, false);
    // Check we are getting back correct type of split
    conn.tableOperations().online(table);
    splits = inputFormat.getSplits(job);
    for (InputSplit split : splits) {
        // Use an explicit failure rather than the `assert` keyword, which is skipped unless the JVM runs with -ea
        if (!(split instanceof BatchInputSplit)) {
            fail("Expected a BatchInputSplit but got " + (split == null ? "null" : split.getClass().getName()));
        }
    }
    // We should divide along the tablet lines similar to when using `setAutoAdjustRanges(job, true)`
    assertEquals(2, splits.size());
}
Also used : Connector(org.apache.accumulo.core.client.Connector) ConfigurationCopy(org.apache.accumulo.core.conf.ConfigurationCopy) ClientProperty(org.apache.accumulo.core.client.ClientConfiguration.ClientProperty) ArrayList(java.util.ArrayList) BatchInputSplit(org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) Range(org.apache.accumulo.core.data.Range) TreeSet(java.util.TreeSet) Job(org.apache.hadoop.mapreduce.Job) BatchInputSplit(org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit) RangeInputSplit(org.apache.accumulo.core.client.mapreduce.RangeInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) ClientConfiguration(org.apache.accumulo.core.client.ClientConfiguration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Test(org.junit.Test)

Aggregations

IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 BatchInputSplit (org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit)2 Range (org.apache.accumulo.core.data.Range)2 InputSplit (org.apache.hadoop.mapreduce.InputSplit)2 InetAddress (java.net.InetAddress)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 Random (java.util.Random)1 TreeSet (java.util.TreeSet)1 AccumuloException (org.apache.accumulo.core.client.AccumuloException)1 AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException)1 ClientConfiguration (org.apache.accumulo.core.client.ClientConfiguration)1 ClientProperty (org.apache.accumulo.core.client.ClientConfiguration.ClientProperty)1 Connector (org.apache.accumulo.core.client.Connector)1 Instance (org.apache.accumulo.core.client.Instance)1 TableDeletedException (org.apache.accumulo.core.client.TableDeletedException)1 TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException)1