
Example 6 with InputTableConfig

Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.

The class AccumuloRecordReader, method initialize.

/**
 * Initialize a scanner over the given input split using this task attempt configuration.
 */
public void initialize(InputSplit inSplit, JobConf job) throws IOException {
    baseSplit = (org.apache.accumulo.hadoopImpl.mapreduce.RangeInputSplit) inSplit;
    log.debug("Initializing input split: " + baseSplit);
    client = createClient(job, CLASS);
    ClientContext context = (ClientContext) client;
    Authorizations authorizations = InputConfigurator.getScanAuthorizations(CLASS, job);
    String classLoaderContext = InputConfigurator.getClassLoaderContext(CLASS, job);
    String table = baseSplit.getTableName();
    // in case the table name changed, we can still use the previous name in terms of
    // configuration, but the scanner will use the table id resolved at job setup time
    InputTableConfig tableConfig = InputConfigurator.getInputTableConfig(CLASS, job, baseSplit.getTableName());
    log.debug("Created client with user: " + context.whoami());
    log.debug("Creating scanner for table: " + table);
    log.debug("Authorizations are: " + authorizations);
    if (baseSplit instanceof BatchInputSplit) {
        BatchScanner scanner;
        BatchInputSplit multiRangeSplit = (BatchInputSplit) baseSplit;
        try {
            // Note: BatchScanner will use at most one thread per tablet, currently BatchInputSplit
            // will not span tablets
            int scanThreads = 1;
            scanner = context.createBatchScanner(baseSplit.getTableName(), authorizations, scanThreads);
            setupIterators(job, scanner, baseSplit);
            if (classLoaderContext != null) {
                scanner.setClassLoaderContext(classLoaderContext);
            }
        } catch (TableNotFoundException e) {
            throw new IOException(e);
        }
        scanner.setRanges(multiRangeSplit.getRanges());
        scannerBase = scanner;
    } else if (baseSplit instanceof RangeInputSplit) {
        split = (RangeInputSplit) baseSplit;
        Boolean isOffline = baseSplit.isOffline();
        if (isOffline == null) {
            isOffline = tableConfig.isOfflineScan();
        }
        Boolean isIsolated = baseSplit.isIsolatedScan();
        if (isIsolated == null) {
            isIsolated = tableConfig.shouldUseIsolatedScanners();
        }
        Boolean usesLocalIterators = baseSplit.usesLocalIterators();
        if (usesLocalIterators == null) {
            usesLocalIterators = tableConfig.shouldUseLocalIterators();
        }
        Scanner scanner;
        try {
            if (isOffline) {
                scanner = new OfflineScanner(context, TableId.of(baseSplit.getTableId()), authorizations);
            } else {
                scanner = new ScannerImpl(context, TableId.of(baseSplit.getTableId()), authorizations);
            }
            if (isIsolated) {
                log.info("Creating isolated scanner");
                scanner = new IsolatedScanner(scanner);
            }
            if (usesLocalIterators) {
                log.info("Using local iterators");
                scanner = new ClientSideIteratorScanner(scanner);
            }
            setupIterators(job, scanner, baseSplit);
        } catch (RuntimeException e) {
            throw new IOException(e);
        }
        scanner.setRange(baseSplit.getRange());
        scannerBase = scanner;
    } else {
        throw new IllegalArgumentException("Can not initialize from " + baseSplit.getClass());
    }
    Collection<IteratorSetting.Column> columns = baseSplit.getFetchedColumns();
    if (columns == null) {
        columns = tableConfig.getFetchedColumns();
    }
    // setup a scanner within the bounds of this split
    for (Pair<Text, Text> c : columns) {
        if (c.getSecond() != null) {
            log.debug("Fetching column " + c.getFirst() + ":" + c.getSecond());
            scannerBase.fetchColumn(c.getFirst(), c.getSecond());
        } else {
            log.debug("Fetching column family " + c.getFirst());
            scannerBase.fetchColumnFamily(c.getFirst());
        }
    }
    SamplerConfiguration samplerConfig = baseSplit.getSamplerConfiguration();
    if (samplerConfig == null) {
        samplerConfig = tableConfig.getSamplerConfiguration();
    }
    if (samplerConfig != null) {
        scannerBase.setSamplerConfiguration(samplerConfig);
    }
    Map<String, String> executionHints = baseSplit.getExecutionHints();
    if (executionHints == null || executionHints.isEmpty()) {
        executionHints = tableConfig.getExecutionHints();
    }
    if (executionHints != null) {
        scannerBase.setExecutionHints(executionHints);
    }
    scannerIterator = scannerBase.iterator();
    numKeysRead = 0;
}
Also used: BatchScanner (org.apache.accumulo.core.client.BatchScanner), OfflineScanner (org.apache.accumulo.core.clientImpl.OfflineScanner), ClientSideIteratorScanner (org.apache.accumulo.core.client.ClientSideIteratorScanner), IsolatedScanner (org.apache.accumulo.core.client.IsolatedScanner), Scanner (org.apache.accumulo.core.client.Scanner), SamplerConfiguration (org.apache.accumulo.core.client.sample.SamplerConfiguration), TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException), Authorizations (org.apache.accumulo.core.security.Authorizations), ClientContext (org.apache.accumulo.core.clientImpl.ClientContext), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), ScannerImpl (org.apache.accumulo.core.clientImpl.ScannerImpl), InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig)
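
Note how every scan-level setting in initialize follows the same pattern: a value set directly on the split wins, and a null falls back to the job-level InputTableConfig. A minimal sketch of that fallback, using a hypothetical resolveOffline helper (not part of the Accumulo API):

// Sketch of the override-then-fallback pattern used above; resolveOffline is a
// hypothetical helper name, not an Accumulo method.
static boolean resolveOffline(org.apache.accumulo.hadoopImpl.mapreduce.RangeInputSplit split,
        InputTableConfig tableConfig) {
    Boolean isOffline = split.isOffline();       // null when the split carries no override
    if (isOffline == null) {
        isOffline = tableConfig.isOfflineScan(); // fall back to the per-table job configuration
    }
    return isOffline;
}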

Example 7 with InputTableConfig

Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.

The class MultiTableInputFormatTest, method testStoreTables.

/**
 * Verify {@link InputTableConfig} objects get correctly serialized in the JobContext.
 */
@Test
public void testStoreTables() throws Exception {
    String table1Name = testName.getMethodName() + "1";
    String table2Name = testName.getMethodName() + "2";
    JobConf job = new JobConf();
    Properties clientProps = org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormatTest.setupClientProperties();
    List<Range> ranges = singletonList(new Range("a", "b"));
    Set<IteratorSetting.Column> cols = singleton(new IteratorSetting.Column(new Text("CF1"), new Text("CQ1")));
    IteratorSetting iter1 = new IteratorSetting(50, "iter1", "iterclass1");
    IteratorSetting iter2 = new IteratorSetting(60, "iter2", "iterclass2");
    List<IteratorSetting> allIters = new ArrayList<>();
    allIters.add(iter1);
    allIters.add(iter2);
    // if auths are not set, the client will try to get them from the server; we don't want that here
    Authorizations auths = Authorizations.EMPTY;
    // @formatter:off
    AccumuloInputFormat.configure().clientProperties(clientProps)
        // table 1
        .table(table1Name).auths(auths).ranges(ranges).fetchColumns(cols)
        .addIterator(iter1).addIterator(iter2).localIterators(true).offlineScan(true)
        // table 2
        .table(table2Name).auths(auths).ranges(ranges).fetchColumns(cols).addIterator(iter2)
        .store(job);
    // @formatter:on
    InputTableConfig table1 = new InputTableConfig();
    table1.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).setUseLocalIterators(true).setOfflineScan(true);
    allIters.forEach(table1::addIterator);
    InputTableConfig table2 = new InputTableConfig();
    table2.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).addIterator(iter2);
    assertEquals(table1, InputConfigurator.getInputTableConfig(CLASS, job, table1Name));
    assertEquals(table2, InputConfigurator.getInputTableConfig(CLASS, job, table2Name));
}
Also used: Authorizations (org.apache.accumulo.core.security.Authorizations), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), Properties (java.util.Properties), Range (org.apache.accumulo.core.data.Range), IteratorSetting (org.apache.accumulo.core.client.IteratorSetting), InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig), Column (org.apache.accumulo.core.client.IteratorSetting.Column), JobConf (org.apache.hadoop.mapred.JobConf), Test (org.junit.Test)

Example 8 with InputTableConfig

Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.

The class InputConfigurator, method getInputTableConfigs.

/**
 * Returns all {@link InputTableConfig} objects associated with this job.
 *
 * @param implementingClass
 *          the class whose name will be used as a prefix for the property configuration key
 * @param conf
 *          the Hadoop configuration object from which to read the serialized table configurations
 * @param tableName
 *          the table name for which to retrieve the configuration
 * @return all of the table query configs for the job
 * @since 1.6.0
 */
private static Map<String, InputTableConfig> getInputTableConfigs(Class<?> implementingClass, Configuration conf, String tableName) {
    Map<String, InputTableConfig> configs = new HashMap<>();
    Map.Entry<String, InputTableConfig> defaultConfig = getDefaultInputTableConfig(implementingClass, conf, tableName);
    if (defaultConfig != null)
        configs.put(defaultConfig.getKey(), defaultConfig.getValue());
    String configString = conf.get(enumToConfKey(implementingClass, ScanOpts.TABLE_CONFIGS));
    MapWritable mapWritable = new MapWritable();
    if (configString != null) {
        try {
            byte[] bytes = Base64.getDecoder().decode(configString);
            ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
            mapWritable.readFields(new DataInputStream(bais));
            bais.close();
        } catch (IOException e) {
            throw new IllegalStateException("The table query configurations could not be deserialized" + " from the given configuration");
        }
    }
    for (Map.Entry<Writable, Writable> entry : mapWritable.entrySet()) {
        configs.put(entry.getKey().toString(), (InputTableConfig) entry.getValue());
    }
    return configs;
}
}
Also used: HashMap (java.util.HashMap), Writable (org.apache.hadoop.io.Writable), MapWritable (org.apache.hadoop.io.MapWritable), IOException (java.io.IOException), DataInputStream (java.io.DataInputStream), InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig), ByteArrayInputStream (java.io.ByteArrayInputStream), Map (java.util.Map)
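
The configuration value decoded here is simply a Base64 string wrapping a Writable-serialized MapWritable. For orientation, a sketch of the inverse write path; this is an assumption about how such a value could be produced with plain Hadoop and JDK APIs, not the actual InputConfigurator serialization code:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;

// Encode a MapWritable of table configs into the Base64 form getInputTableConfigs() reads back.
static void storeTableConfigs(Configuration conf, String confKey, MapWritable configs) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (DataOutputStream dos = new DataOutputStream(baos)) {
        configs.write(dos); // Writable serialization of the whole map
    }
    conf.set(confKey, Base64.getEncoder().encodeToString(baos.toByteArray()));
}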

Example 9 with InputTableConfig

Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.

The class AccumuloRecordReader, method getSplits.

/**
 * Gets the splits of the tables that have been set on the job by reading the metadata table for
 * the specified ranges.
 */
public static InputSplit[] getSplits(JobConf job, Class<?> callingClass) throws IOException {
    validateOptions(job, callingClass);
    LinkedList<InputSplit> splits = new LinkedList<>();
    Map<String, InputTableConfig> tableConfigs = InputConfigurator.getInputTableConfigs(callingClass, job);
    try (AccumuloClient client = createClient(job, callingClass);
        var context = ((ClientContext) client)) {
        for (Map.Entry<String, InputTableConfig> tableConfigEntry : tableConfigs.entrySet()) {
            String tableName = tableConfigEntry.getKey();
            InputTableConfig tableConfig = tableConfigEntry.getValue();
            TableId tableId;
            // resolve table name to id once, and use id from this point forward
            try {
                tableId = context.getTableId(tableName);
            } catch (TableNotFoundException e) {
                throw new IOException(e);
            }
            boolean batchScan = InputConfigurator.isBatchScan(callingClass, job);
            boolean supportBatchScan = !(tableConfig.isOfflineScan() || tableConfig.shouldUseIsolatedScanners() || tableConfig.shouldUseLocalIterators());
            if (batchScan && !supportBatchScan)
                throw new IllegalArgumentException("BatchScanner optimization not available for offline" + " scan, isolated, or local iterators");
            boolean autoAdjust = tableConfig.shouldAutoAdjustRanges();
            if (batchScan && !autoAdjust)
                throw new IllegalArgumentException("AutoAdjustRanges must be enabled when using BatchScanner optimization");
            List<Range> ranges = autoAdjust ? Range.mergeOverlapping(tableConfig.getRanges()) : tableConfig.getRanges();
            if (ranges.isEmpty()) {
                ranges = new ArrayList<>(1);
                ranges.add(new Range());
            }
            // get the metadata information for these ranges
            Map<String, Map<KeyExtent, List<Range>>> binnedRanges = new HashMap<>();
            TabletLocator tl;
            try {
                if (tableConfig.isOfflineScan()) {
                    binnedRanges = binOfflineTable(job, tableId, ranges, callingClass);
                    while (binnedRanges == null) {
                        // Some tablets were still online, try again
                        // sleep randomly between 100 and 200 ms
                        sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                        binnedRanges = binOfflineTable(job, tableId, ranges, callingClass);
                    }
                } else {
                    tl = InputConfigurator.getTabletLocator(callingClass, job, tableId);
                // it's possible that the cache could contain complete, but old, information about
                // a table's tablets... so clear it
                    tl.invalidateCache();
                    while (!tl.binRanges(context, ranges, binnedRanges).isEmpty()) {
                        context.requireNotDeleted(tableId);
                        context.requireNotOffline(tableId, tableName);
                        binnedRanges.clear();
                        log.warn("Unable to locate bins for specified ranges. Retrying.");
                        // sleep randomly between 100 and 200 ms
                        sleepUninterruptibly(100 + random.nextInt(100), TimeUnit.MILLISECONDS);
                        tl.invalidateCache();
                    }
                }
            } catch (TableOfflineException | TableNotFoundException | AccumuloException | AccumuloSecurityException e) {
                throw new IOException(e);
            }
            HashMap<Range, ArrayList<String>> splitsToAdd = null;
            if (!autoAdjust)
                splitsToAdd = new HashMap<>();
            HashMap<String, String> hostNameCache = new HashMap<>();
            for (Map.Entry<String, Map<KeyExtent, List<Range>>> tserverBin : binnedRanges.entrySet()) {
                String ip = tserverBin.getKey().split(":", 2)[0];
                String location = hostNameCache.get(ip);
                if (location == null) {
                    InetAddress inetAddress = InetAddress.getByName(ip);
                    location = inetAddress.getCanonicalHostName();
                    hostNameCache.put(ip, location);
                }
                for (Map.Entry<KeyExtent, List<Range>> extentRanges : tserverBin.getValue().entrySet()) {
                    Range ke = extentRanges.getKey().toDataRange();
                    if (batchScan) {
                        // group ranges by tablet to be read by a BatchScanner
                        ArrayList<Range> clippedRanges = new ArrayList<>();
                        for (Range r : extentRanges.getValue()) clippedRanges.add(ke.clip(r));
                        BatchInputSplit split = new BatchInputSplit(tableName, tableId, clippedRanges, new String[] { location });
                        SplitUtils.updateSplit(split, tableConfig);
                        splits.add(split);
                    } else {
                        // not grouping by tablet
                        for (Range r : extentRanges.getValue()) {
                            if (autoAdjust) {
                                // divide ranges into smaller ranges, based on the tablets
                                RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonical(), ke.clip(r), new String[] { location });
                                SplitUtils.updateSplit(split, tableConfig);
                                split.setOffline(tableConfig.isOfflineScan());
                                split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                                split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                                splits.add(split);
                            } else {
                                // don't divide ranges
                                ArrayList<String> locations = splitsToAdd.get(r);
                                if (locations == null)
                                    locations = new ArrayList<>(1);
                                locations.add(location);
                                splitsToAdd.put(r, locations);
                            }
                        }
                    }
                }
            }
            if (!autoAdjust)
                for (Map.Entry<Range, ArrayList<String>> entry : splitsToAdd.entrySet()) {
                    RangeInputSplit split = new RangeInputSplit(tableName, tableId.canonical(), entry.getKey(), entry.getValue().toArray(new String[0]));
                    SplitUtils.updateSplit(split, tableConfig);
                    split.setOffline(tableConfig.isOfflineScan());
                    split.setIsolatedScan(tableConfig.shouldUseIsolatedScanners());
                    split.setUsesLocalIterators(tableConfig.shouldUseLocalIterators());
                    splits.add(split);
                }
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
Also used: AccumuloClient (org.apache.accumulo.core.client.AccumuloClient), TableId (org.apache.accumulo.core.data.TableId), TableOfflineException (org.apache.accumulo.core.client.TableOfflineException), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), KeyExtent (org.apache.accumulo.core.dataImpl.KeyExtent), TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException), AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException), LinkedList (java.util.LinkedList), List (java.util.List), InputSplit (org.apache.hadoop.mapred.InputSplit), AccumuloException (org.apache.accumulo.core.client.AccumuloException), IOException (java.io.IOException), Range (org.apache.accumulo.core.data.Range), TabletLocator (org.apache.accumulo.core.clientImpl.TabletLocator), InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig), Map (java.util.Map), InetAddress (java.net.InetAddress)
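
Two Range operations carry most of the weight in the autoAdjust branch above: Range.mergeOverlapping collapses overlapping user ranges before binning, and Range.clip trims each binned range to a single tablet's data range. A small standalone illustration; the row values are made up for the example:

import java.util.Arrays;
import java.util.List;
import org.apache.accumulo.core.data.Range;

public class RangeClipDemo {
    public static void main(String[] args) {
        // Overlapping user ranges are merged once per table when autoAdjust is enabled...
        List<Range> merged = Range.mergeOverlapping(Arrays.asList(new Range("a", "f"), new Range("d", "m")));
        // ...then each merged range is clipped to the data range of the tablet it was binned to,
        // yielding one RangeInputSplit per (tablet, range) pair.
        Range tabletExtent = new Range("c", "k"); // stand-in for extentRanges.getKey().toDataRange()
        for (Range r : merged) {
            System.out.println(tabletExtent.clip(r)); // prints the portion of r inside this tablet
        }
    }
}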

Example 10 with InputTableConfig

Use of org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig in project accumulo by apache.

The class MultiTableInputFormatTest, method testManyTables.

@Test
public void testManyTables() throws Exception {
    Job job = Job.getInstance();
    Properties clientProps = org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormatTest.setupClientProperties();
    // if auths are not set, the client will try to get them from the server; we don't want that here
    Authorizations auths = Authorizations.EMPTY;
    // set the client properties once then loop over tables
    InputFormatBuilder.TableParams<Job> opts = AccumuloInputFormat.configure().clientProperties(clientProps);
    for (int i = 0; i < 10_000; i++) {
        List<Range> ranges = singletonList(new Range("a" + i, "b" + i));
        Set<Column> cols = singleton(new Column(new Text("CF" + i), new Text("CQ" + i)));
        IteratorSetting iter = new IteratorSetting(50, "iter" + i, "iterclass" + i);
        opts.table("table" + i).auths(auths).ranges(ranges).fetchColumns(cols).addIterator(iter);
    }
    opts.store(job);
    // verify
    Map<String, InputTableConfig> configs = InputConfigurator.getInputTableConfigs(CLASS, job.getConfiguration());
    assertEquals(10_000, configs.size());
    // create objects to test against
    for (int i = 0; i < 10_000; i++) {
        InputTableConfig t = new InputTableConfig();
        List<Range> ranges = singletonList(new Range("a" + i, "b" + i));
        Set<Column> cols = singleton(new Column(new Text("CF" + i), new Text("CQ" + i)));
        IteratorSetting iter = new IteratorSetting(50, "iter" + i, "iterclass" + i);
        t.setScanAuths(auths).setRanges(ranges).fetchColumns(cols).addIterator(iter);
        assertEquals(t, configs.get("table" + i));
    }
}
Also used: Authorizations (org.apache.accumulo.core.security.Authorizations), Text (org.apache.hadoop.io.Text), Properties (java.util.Properties), Range (org.apache.accumulo.core.data.Range), IteratorSetting (org.apache.accumulo.core.client.IteratorSetting), InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig), Column (org.apache.accumulo.core.client.IteratorSetting.Column), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
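
The two tests above only check that the table configurations survive the round trip through the job configuration. In an actual job the same fluent builder would be followed by the usual MapReduce driver wiring; a minimal sketch under that assumption, where the table names, client properties, and mapper body are placeholders:

import java.util.Properties;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiTableDriver {

    // AccumuloInputFormat hands the mapper one Key/Value pair per scanned entry.
    public static class PassThroughMapper extends Mapper<Key, Value, Text, Value> {
        @Override
        protected void map(Key k, Value v, Context ctx) {
            // application logic goes here
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(MultiTableDriver.class);
        Properties clientProps = new Properties(); // assumed to carry instance, ZooKeeper, and credential settings
        AccumuloInputFormat.configure().clientProperties(clientProps)
            .table("table1").auths(Authorizations.EMPTY) // hypothetical table names
            .table("table2").auths(Authorizations.EMPTY)
            .store(job);
        job.setInputFormatClass(AccumuloInputFormat.class);
        job.setMapperClass(PassThroughMapper.class);
        // output format and reducer configuration omitted
    }
}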

Aggregations

InputTableConfig (org.apache.accumulo.hadoopImpl.mapreduce.InputTableConfig): 10
IOException (java.io.IOException): 6
IteratorSetting (org.apache.accumulo.core.client.IteratorSetting): 6
Range (org.apache.accumulo.core.data.Range): 6
Text (org.apache.hadoop.io.Text): 6
Properties (java.util.Properties): 5
Authorizations (org.apache.accumulo.core.security.Authorizations): 5
HashMap (java.util.HashMap): 4
Map (java.util.Map): 4
Column (org.apache.accumulo.core.client.IteratorSetting.Column): 4
Test (org.junit.Test): 4
ArrayList (java.util.ArrayList): 3
TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException): 3
AccumuloException (org.apache.accumulo.core.client.AccumuloException): 2
AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException): 2
SamplerConfiguration (org.apache.accumulo.core.client.sample.SamplerConfiguration): 2
MapWritable (org.apache.hadoop.io.MapWritable): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
Job (org.apache.hadoop.mapreduce.Job): 2
ByteArrayInputStream (java.io.ByteArrayInputStream): 1