Examples with MultiIterator - org.apache.accumulo.core.iteratorsImpl.system.MultiIterator

Example 11 with MultiIterator

use of org.apache.accumulo.core.iteratorsImpl.system.MultiIterator in project accumulo by apache.

In class RFileScanner, method iterator.

/**
 * Assembles the iterator stack over every configured RFile source and exposes it as a plain
 * Java iterator.
 *
 * <p>Steps, in order: open one {@code RFile.Reader} per source; if a sampler configuration is
 * set, replace each reader with its sample reader; merge the readers with a
 * {@link MultiIterator} (using {@code opts.bounds} when present); optionally add the system scan
 * iterators; load the configured iterators; seek to the scan range.
 *
 * @return an adapter over the fully assembled and seeked iterator stack
 * @throws RuntimeException wrapping any {@link IOException} raised while building the stack
 */
@Override
public Iterator<Entry<Key, Value>> iterator() {
    try {
        RFileSource[] sources = opts.in.getSources();
        List<SortedKeyValueIterator<Key, Value>> readers = new ArrayList<>(sources.length);
        CacheProvider cacheProvider = new BasicCacheProvider(indexCache, dataCache);

        int sourceIdx = 0;
        for (RFileSource source : sources) {
            // TODO may have been a bug with multiple files and caching in older version...
            FSDataInputStream inputStream = (FSDataInputStream) source.getInputStream();
            CachableBuilder cb = new CachableBuilder()
                .input(inputStream, "source-" + sourceIdx)
                .length(source.getLength())
                .conf(opts.in.getConf())
                .cacheProvider(cacheProvider)
                .cryptoService(cryptoService);
            readers.add(new RFile.Reader(cb));
            sourceIdx++;
        }

        // When sampling is requested, every full reader is swapped for its sample reader.
        if (getSamplerConfiguration() != null) {
            for (int idx = 0; idx < readers.size(); idx++) {
                Reader fullReader = (Reader) readers.get(idx);
                SamplerConfigurationImpl samplerConf = new SamplerConfigurationImpl(getSamplerConfiguration());
                readers.set(idx, fullReader.getSample(samplerConf));
            }
        }

        SortedKeyValueIterator<Key, Value> iterator = opts.bounds != null
            ? new MultiIterator(readers, opts.bounds)
            : new MultiIterator(readers, false);

        Set<ByteSequence> families = Collections.emptySet();
        if (opts.useSystemIterators) {
            SortedSet<Column> cols = this.getFetchedColumns();
            families = LocalityGroupUtil.families(cols);
            iterator = SystemIteratorUtil.setupSystemScanIterators(iterator, cols, getAuthorizations(), EMPTY_BYTES, tableConf);
        }

        // With a non-empty table config the iterator settings come from tableConf and the
        // Accumulo class loader is used; otherwise only the scanner's own iterators are loaded.
        boolean hasTableConfig = opts.tableConfig != null && !opts.tableConfig.isEmpty();
        IterLoad iterLoad = hasTableConfig
            ? IterConfigUtil.loadIterConf(IteratorScope.scan, serverSideIteratorList, serverSideIteratorOptions, tableConf)
            : new IterLoad().iters(serverSideIteratorList).iterOpts(serverSideIteratorOptions);
        iterator = IterConfigUtil.loadIterators(iterator, iterLoad.iterEnv(new IterEnv()).useAccumuloClassLoader(hasTableConfig));

        // Column families are only passed as an inclusive filter when some were fetched.
        iterator.seek(getRange() == null ? EMPTY_RANGE : getRange(), families, !families.isEmpty());
        return new IteratorAdapter(iterator);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

Also used : IteratorAdapter(org.apache.accumulo.core.iterators.IteratorAdapter) BasicCacheProvider(org.apache.accumulo.core.file.blockfile.impl.BasicCacheProvider) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) ArrayList(java.util.ArrayList) RFile(org.apache.accumulo.core.file.rfile.RFile) Column(org.apache.accumulo.core.data.Column) CachableBuilder(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile.CachableBuilder) MultiIterator(org.apache.accumulo.core.iteratorsImpl.system.MultiIterator) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) IOException(java.io.IOException) BasicCacheProvider(org.apache.accumulo.core.file.blockfile.impl.BasicCacheProvider) CacheProvider(org.apache.accumulo.core.file.blockfile.impl.CacheProvider) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) IterLoad(org.apache.accumulo.core.conf.IterLoad) Value(org.apache.accumulo.core.data.Value) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Key(org.apache.accumulo.core.data.Key) ByteSequence(org.apache.accumulo.core.data.ByteSequence)

Example 12 with MultiIterator

use of org.apache.accumulo.core.iteratorsImpl.system.MultiIterator in project accumulo by apache.

In class GenerateSplits, method getSplitsFromFullScan.

/**
 * Derives split points by scanning the full contents of the given files.
 *
 * <p>A scan reader is opened for each file, the readers are merged with a {@link MultiIterator},
 * and the merged stream is handed to {@code getQuantiles}. All opened readers are closed before
 * returning, whether or not the scan succeeded.
 *
 * @param accumuloConf table configuration passed to each file reader
 * @param hadoopConf Hadoop configuration used to open the files
 * @param files files to scan
 * @param fs file system the files live on
 * @param numSplits number of splits requested from {@code getQuantiles}
 * @param base64encode flag forwarded to {@code encode} for each split
 * @return the encoded splits, sorted
 * @throws IOException if a reader cannot be opened, read, or closed
 */
private TreeSet<String> getSplitsFromFullScan(SiteConfiguration accumuloConf, Configuration hadoopConf, List<Path> files, FileSystem fs, int numSplits, boolean base64encode) throws IOException {
    // Kept separately from the merge inputs so every opened reader can be closed afterwards.
    List<FileSKVIterator> openedReaders = new ArrayList<>(files.size());
    List<SortedKeyValueIterator<Key, Value>> mergeInputs = new ArrayList<>(files.size());
    Text[] quantiles;
    try {
        for (Path file : files) {
            FileSKVIterator reader = FileOperations.getInstance()
                .newScanReaderBuilder()
                .forFile(file.toString(), fs, hadoopConf, CryptoServiceFactory.newDefaultInstance())
                .withTableConfiguration(accumuloConf)
                .overRange(new Range(), Set.of(), false)
                .build();
            mergeInputs.add(reader);
            openedReaders.add(reader);
        }
        SortedKeyValueIterator<Key, Value> merged = new MultiIterator(mergeInputs, false);
        merged.seek(new Range(), Collections.emptySet(), false);
        quantiles = getQuantiles(merged, numSplits);
    } finally {
        for (FileSKVIterator opened : openedReaders) {
            opened.close();
        }
    }
    log.debug("Got {} splits from quantiles across {} files", quantiles.length, files.size());

    TreeSet<String> encodedSplits = new TreeSet<>();
    for (Text split : quantiles) {
        encodedSplits.add(encode(base64encode, split));
    }
    return encodedSplits;
}

Also used : Path(org.apache.hadoop.fs.Path) ConfigOpts(org.apache.accumulo.core.cli.ConfigOpts) Arrays(java.util.Arrays) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) Text(org.apache.hadoop.io.Text) BinaryComparable(org.apache.hadoop.io.BinaryComparable) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) TextUtil(org.apache.accumulo.core.util.TextUtil) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Collectors.toCollection(java.util.stream.Collectors.toCollection) FileOperations(org.apache.accumulo.core.file.FileOperations) Key(org.apache.accumulo.core.data.Key) Configuration(org.apache.hadoop.conf.Configuration) OutputStreamWriter(java.io.OutputStreamWriter) Path(org.apache.hadoop.fs.Path) Value(org.apache.accumulo.core.data.Value) KeywordExecutable(org.apache.accumulo.start.spi.KeywordExecutable) MultiIterator(org.apache.accumulo.core.iteratorsImpl.system.MultiIterator) PrintWriter(java.io.PrintWriter) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) UTF_8(java.nio.charset.StandardCharsets.UTF_8) BufferedWriter(java.io.BufferedWriter) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) FileOutputStream(java.io.FileOutputStream) Set(java.util.Set) IOException(java.io.IOException) CryptoServiceFactory(org.apache.accumulo.core.crypto.CryptoServiceFactory) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Range(org.apache.accumulo.core.data.Range) SiteConfiguration(org.apache.accumulo.core.conf.SiteConfiguration) Base64(java.util.Base64) List(java.util.List) AutoService(com.google.auto.service.AutoService) Collections(java.util.Collections) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings) ItemsSketch(org.apache.datasketches.quantiles.ItemsSketch) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) MultiIterator(org.apache.accumulo.core.iteratorsImpl.system.MultiIterator) 
ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Text(org.apache.hadoop.io.Text) Range(org.apache.accumulo.core.data.Range) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key)

Example 13 with MultiIterator

use of org.apache.accumulo.core.iteratorsImpl.system.MultiIterator in project accumulo by apache.

In class GenerateSplits, method getIndexKeys.

/**
 * Derives split points from the index keys of the given files. Scanning the indexed keys is
 * tried first because it is more efficient than a full file scan.
 *
 * <p>An index reader is opened for each file, the readers are merged with a
 * {@link MultiIterator}, and the merged stream is handed to {@code getQuantiles}. All opened
 * readers are closed before returning, whether or not the scan succeeded.
 *
 * @param accumuloConf table configuration passed to each index reader
 * @param hadoopConf Hadoop configuration used to open the files
 * @param fs file system the files live on
 * @param files files whose indexes are read
 * @param requestedNumSplits number of splits requested from {@code getQuantiles}
 * @param base64encode flag forwarded to {@code encode} for each split
 * @return the encoded splits, sorted
 * @throws IOException if a reader cannot be opened, read, or closed
 */
private TreeSet<String> getIndexKeys(AccumuloConfiguration accumuloConf, Configuration hadoopConf, FileSystem fs, List<Path> files, int requestedNumSplits, boolean base64encode) throws IOException {
    List<SortedKeyValueIterator<Key, Value>> mergeInputs = new ArrayList<>(files.size());
    // Kept separately from the merge inputs so every opened reader can be closed afterwards.
    List<FileSKVIterator> openedReaders = new ArrayList<>(files.size());
    Text[] quantiles;
    try {
        for (Path file : files) {
            FileSKVIterator indexReader = FileOperations.getInstance()
                .newIndexReaderBuilder()
                .forFile(file.toString(), fs, hadoopConf, CryptoServiceFactory.newDefaultInstance())
                .withTableConfiguration(accumuloConf)
                .build();
            mergeInputs.add(indexReader);
            openedReaders.add(indexReader);
        }
        MultiIterator merged = new MultiIterator(mergeInputs, true);
        quantiles = getQuantiles(merged, requestedNumSplits);
    } finally {
        for (FileSKVIterator opened : openedReaders) {
            opened.close();
        }
    }
    log.debug("Got {} splits from indices of {}", quantiles.length, files);
    return Arrays.stream(quantiles)
        .map(split -> encode(base64encode, split))
        .collect(toCollection(TreeSet::new));
}

Example 14 with MultiIterator

use of org.apache.accumulo.core.iteratorsImpl.system.MultiIterator in project accumulo by apache.

In class CombinerTest, method test5.

/**
 * Aggregates across multiple data sets that contain the exact same key with different values
 * (2, 3 and 4) and checks that the summing combiner reports a single entry with value 9.
 */
@Test
public void test5() throws IOException {
    Encoder<Long> encoder = LongCombiner.STRING_ENCODER;

    // One single-entry map per value, each holding the same key, each wrapped as its own source.
    List<SortedKeyValueIterator<Key, Value>> sources = new ArrayList<>(3);
    for (Long addend : new Long[] {2L, 3L, 4L}) {
        TreeMap<Key, Value> data = new TreeMap<>();
        newKeyValue(data, 1, 1, 1, 1, false, addend, encoder);
        sources.add(new SortedMapIterator(data));
    }

    IteratorSetting setting = new IteratorSetting(1, SummingCombiner.class);
    LongCombiner.setEncodingType(setting, StringEncoder.class);
    Combiner.setColumns(setting, Collections.singletonList(new IteratorSetting.Column("cf001")));

    Combiner combiner = new SummingCombiner();
    MultiIterator merged = new MultiIterator(sources, true);
    combiner.init(merged, setting.getOptions(), SCAN_IE);
    combiner.seek(new Range(), EMPTY_COL_FAMS, false);

    assertTrue(combiner.hasTop());
    assertEquals(newKey(1, 1, 1, 1), combiner.getTopKey());
    String decodedSum = encoder.decode(combiner.getTopValue().get()).toString();
    assertEquals("9", decodedSum);
}

Also used : MultiIterator(org.apache.accumulo.core.iteratorsImpl.system.MultiIterator) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) LongCombiner(org.apache.accumulo.core.iterators.LongCombiner) TypedValueCombiner(org.apache.accumulo.core.iterators.TypedValueCombiner) Combiner(org.apache.accumulo.core.iterators.Combiner) TreeMap(java.util.TreeMap) SortedMapIterator(org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) Test(org.junit.jupiter.api.Test)

Example 15 with MultiIterator

use of org.apache.accumulo.core.iteratorsImpl.system.MultiIterator in project accumulo by apache.

In class IntersectingIteratorTest, method test3.

/**
 * Runs an {@link IntersectingIterator} on top of a {@link MultiIterator} that merges two
 * independently created source stacks, and checks that every returned column qualifier is in
 * {@code docs} and that the number of hits equals the size of {@code docs}.
 */
@Test
public void test3() throws IOException {
    // Column families the intersecting iterator is configured with.
    columnFamilies = new Text[6];
    columnFamilies[0] = new Text("C");
    columnFamilies[1] = new Text("E");
    columnFamilies[2] = new Text("G");
    columnFamilies[3] = new Text("H");
    columnFamilies[4] = new Text("I");
    columnFamilies[5] = new Text("J");
    // Additional column families passed to createIteratorStack only.
    otherColumnFamilies = new Text[4];
    otherColumnFamilies[0] = new Text("A");
    otherColumnFamilies[1] = new Text("B");
    otherColumnFamilies[2] = new Text("D");
    otherColumnFamilies[3] = new Text("F");
    float hitRatio = 0.5f;
    SortedKeyValueIterator<Key, Value> source = createIteratorStack(hitRatio, NUM_ROWS, NUM_DOCIDS, columnFamilies, otherColumnFamilies, docs);
    SortedKeyValueIterator<Key, Value> source2 = createIteratorStack(hitRatio, NUM_ROWS, NUM_DOCIDS, columnFamilies, otherColumnFamilies, docs);
    ArrayList<SortedKeyValueIterator<Key, Value>> sourceIters = new ArrayList<>();
    sourceIters.add(source);
    sourceIters.add(source2);
    MultiIterator mi = new MultiIterator(sourceIters, false);
    IteratorSetting is = new IteratorSetting(1, IntersectingIterator.class);
    IntersectingIterator.setColumnFamilies(is, columnFamilies);
    IntersectingIterator iter = new IntersectingIterator();
    iter.init(mi, is.getOptions(), env);
    iter.seek(new Range(), EMPTY_COL_FAMS, false);
    int hitCount = 0;
    while (iter.hasTop()) {
        hitCount++;
        Key k = iter.getTopKey();
        assertTrue(docs.contains(k.getColumnQualifier()));
        iter.next();
    }
    // JUnit's signature is assertEquals(expected, actual): docs holds the expected documents,
    // hitCount is the measured result, so a failure message labels the values correctly.
    assertEquals(docs.size(), hitCount);
    cleanup();
}

Aggregations

MultiIterator (org.apache.accumulo.core.iteratorsImpl.system.MultiIterator)20 Key (org.apache.accumulo.core.data.Key)19 Value (org.apache.accumulo.core.data.Value)18 ArrayList (java.util.ArrayList)17 SortedKeyValueIterator (org.apache.accumulo.core.iterators.SortedKeyValueIterator)17 Range (org.apache.accumulo.core.data.Range)11 FileSKVIterator (org.apache.accumulo.core.file.FileSKVIterator)8 SortedMapIterator (org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator)8 Text (org.apache.hadoop.io.Text)8 Test (org.junit.jupiter.api.Test)8 TreeMap (java.util.TreeMap)7 IOException (java.io.IOException)6 Path (org.apache.hadoop.fs.Path)6 AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration)4 IterLoad (org.apache.accumulo.core.conf.IterLoad)4 TreeSet (java.util.TreeSet)3 IteratorSetting (org.apache.accumulo.core.client.IteratorSetting)3 TabletFile (org.apache.accumulo.core.metadata.TabletFile)3 Parameter (com.beust.jcommander.Parameter)2 AutoService (com.google.auto.service.AutoService)2