Example 11 with MultiIterator

use of org.apache.accumulo.core.iterators.system.MultiIterator in project accumulo by apache.

the class FileUtil method estimatePercentageLTE.

public static double estimatePercentageLTE(VolumeManager fs, String tabletDir, AccumuloConfiguration acuconf, Text prevEndRow, Text endRow, Collection<String> mapFiles, Text splitRow) throws IOException {
    Configuration conf = CachedConfiguration.getInstance();
    Path tmpDir = null;
    int maxToOpen = acuconf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
    ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());
    try {
        if (mapFiles.size() > maxToOpen) {
            tmpDir = createTmpDir(acuconf, fs, tabletDir);
            log.debug("Too many indexes ({}) to open at once for {} {}, reducing in tmpDir = {}", mapFiles.size(), endRow, prevEndRow, tmpDir);
            long t1 = System.currentTimeMillis();
            mapFiles = reduceFiles(acuconf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir, 0);
            long t2 = System.currentTimeMillis();
            log.debug("Finished reducing indexes for {} {} in {}", endRow, prevEndRow, String.format("%6.2f secs", (t2 - t1) / 1000.0));
        }
        if (prevEndRow == null)
            prevEndRow = new Text();
        long numKeys = 0;
        numKeys = countIndexEntries(acuconf, prevEndRow, endRow, mapFiles, true, conf, fs, readers);
        if (numKeys == 0) {
            // not enough info in the index to answer the question, so instead of going to
            // the data just punt and return .5
            return .5;
        }
        List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<>(readers);
        MultiIterator mmfi = new MultiIterator(iters, true);
        // skip the prevEndRow
        while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0) {
            mmfi.next();
        }
        int numLte = 0;
        while (mmfi.hasTop() && mmfi.getTopKey().compareRow(splitRow) <= 0) {
            numLte++;
            mmfi.next();
        }
        if (numLte > numKeys) {
            // something went wrong
            throw new RuntimeException("numLte > numKeys " + numLte + " " + numKeys + " " + prevEndRow + " " + endRow + " " + splitRow + " " + mapFiles);
        }
        // do not want to return 0% or 100%, so add 1 and 2 below
        return (numLte + 1) / (double) (numKeys + 2);
    } finally {
        cleanupIndexOp(tmpDir, fs, readers);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) Configuration(org.apache.hadoop.conf.Configuration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Text(org.apache.hadoop.io.Text)
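
The smoothing on the last line of estimatePercentageLTE is worth spelling out. Below is a minimal sketch (the helper name is hypothetical, not part of FileUtil) of the same (numLte + 1) / (numKeys + 2) calculation, showing why the method never reports exactly 0% or 100%:

static double smoothedPercentageLTE(long numLte, long numKeys) {
    // add 1 to the numerator and 2 to the denominator so the estimate
    // stays strictly between 0.0 and 1.0, mirroring the return statement above
    return (numLte + 1) / (double) (numKeys + 2);
}

// smoothedPercentageLTE(0, 10)  -> 1.0 / 12.0,  about 0.083 instead of 0.0
// smoothedPercentageLTE(10, 10) -> 11.0 / 12.0, about 0.917 instead of 1.0
// smoothedPercentageLTE(0, 0)   -> 0.5, matching the explicit numKeys == 0 branch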

Example 12 with MultiIterator

use of org.apache.accumulo.core.iterators.system.MultiIterator in project accumulo by apache.

the class FileUtil method findMidPoint.

/**
 * @param mapFiles
 *          - list of MapFiles in which to find the mid point key
 *
 *          ISSUES : This method uses the index files to find the mid point. If the map files have different index intervals this method will not return an
 *          accurate mid point. Also, it would be tricky to use this method in conjunction with an in memory map because the indexing interval is unknown.
 */
public static SortedMap<Double, Key> findMidPoint(VolumeManager fs, String tabletDirectory, AccumuloConfiguration acuConf, Text prevEndRow, Text endRow, Collection<String> mapFiles, double minSplit, boolean useIndex) throws IOException {
    Configuration conf = CachedConfiguration.getInstance();
    Collection<String> origMapFiles = mapFiles;
    Path tmpDir = null;
    int maxToOpen = acuConf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
    ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());
    try {
        if (mapFiles.size() > maxToOpen) {
            if (!useIndex)
                throw new IOException("Cannot find mid point using data files, too many " + mapFiles.size());
            tmpDir = createTmpDir(acuConf, fs, tabletDirectory);
            log.debug("Too many indexes ({}) to open at once for {} {}, reducing in tmpDir = {}", mapFiles.size(), endRow, prevEndRow, tmpDir);
            long t1 = System.currentTimeMillis();
            mapFiles = reduceFiles(acuConf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir, 0);
            long t2 = System.currentTimeMillis();
            log.debug("Finished reducing indexes for {} {} in {}", endRow, prevEndRow, String.format("%6.2f secs", (t2 - t1) / 1000.0));
        }
        if (prevEndRow == null)
            prevEndRow = new Text();
        long t1 = System.currentTimeMillis();
        long numKeys = 0;
        numKeys = countIndexEntries(acuConf, prevEndRow, endRow, mapFiles, tmpDir == null ? useIndex : false, conf, fs, readers);
        if (numKeys == 0) {
            if (useIndex) {
                log.warn("Failed to find mid point using indexes, falling back to data files which is slower. No entries between {} and {} for {}", prevEndRow, endRow, mapFiles);
                // need to pass original map files, not possibly reduced indexes
                return findMidPoint(fs, tabletDirectory, acuConf, prevEndRow, endRow, origMapFiles, minSplit, false);
            }
            throw new IOException("Failed to find mid point, no entries between " + prevEndRow + " and " + endRow + " for " + mapFiles);
        }
        List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<>(readers);
        MultiIterator mmfi = new MultiIterator(iters, true);
        // skip the prevEndRow
        while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0) mmfi.next();
        // read half of the keys in the index
        TreeMap<Double, Key> ret = new TreeMap<>();
        Key lastKey = null;
        long keysRead = 0;
        Key keyBeforeMidPoint = null;
        long keyBeforeMidPointPosition = 0;
        while (keysRead < numKeys / 2) {
            if (lastKey != null && !lastKey.equals(mmfi.getTopKey(), PartialKey.ROW) && (keysRead - 1) / (double) numKeys >= minSplit) {
                keyBeforeMidPoint = new Key(lastKey);
                keyBeforeMidPointPosition = keysRead - 1;
            }
            if (lastKey == null)
                lastKey = new Key();
            lastKey.set(mmfi.getTopKey());
            keysRead++;
            // consume minimum
            mmfi.next();
        }
        if (keyBeforeMidPoint != null)
            ret.put(keyBeforeMidPointPosition / (double) numKeys, keyBeforeMidPoint);
        long t2 = System.currentTimeMillis();
        log.debug(String.format("Found midPoint from indexes in %6.2f secs.%n", ((t2 - t1) / 1000.0)));
        ret.put(.5, mmfi.getTopKey());
        // sanity check
        for (Key key : ret.values()) {
            boolean inRange = (key.compareRow(prevEndRow) > 0 && (endRow == null || key.compareRow(endRow) < 1));
            if (!inRange) {
                throw new IOException("Found mid point is not in range " + key + " " + prevEndRow + " " + endRow + " " + mapFiles);
            }
        }
        return ret;
    } finally {
        cleanupIndexOp(tmpDir, fs, readers);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) Configuration(org.apache.hadoop.conf.Configuration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Key(org.apache.accumulo.core.data.Key) PartialKey(org.apache.accumulo.core.data.PartialKey)
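
Both FileUtil methods lean on MultiIterator to present many sorted index readers as a single sorted stream. The following standalone sketch is illustrative only (the class name and data are made up); it shows that merge behaviour over two in-memory SortedMapIterator sources, constructed with init = true as above and then seeked over the full range:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeMap;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.SortedMapIterator;
import org.apache.accumulo.core.iterators.system.MultiIterator;

public class MultiIteratorMergeSketch {
    public static void main(String[] args) throws Exception {
        // two sorted sources with interleaved rows
        TreeMap<Key, Value> tm1 = new TreeMap<>();
        tm1.put(new Key("row1", "cf", "cq"), new Value("a".getBytes()));
        tm1.put(new Key("row3", "cf", "cq"), new Value("c".getBytes()));
        TreeMap<Key, Value> tm2 = new TreeMap<>();
        tm2.put(new Key("row2", "cf", "cq"), new Value("b".getBytes()));

        List<SortedKeyValueIterator<Key, Value>> sources = new ArrayList<>();
        sources.add(new SortedMapIterator(tm1));
        sources.add(new SortedMapIterator(tm2));

        MultiIterator mi = new MultiIterator(sources, true);
        mi.seek(new Range(), Collections.emptySet(), false);

        // prints row1, row2, row3 in merged sorted order
        while (mi.hasTop()) {
            System.out.println(mi.getTopKey() + " -> " + mi.getTopValue());
            mi.next();
        }
    }
}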

Example 13 with MultiIterator

use of org.apache.accumulo.core.iterators.system.MultiIterator in project accumulo by apache.

the class CombinerTest method test5.

@Test
public void test5() throws IOException {
    Encoder<Long> encoder = LongCombiner.STRING_ENCODER;
    // try aggregating across multiple data sets that contain
    // the exact same keys w/ different values
    TreeMap<Key, Value> tm1 = new TreeMap<>();
    newKeyValue(tm1, 1, 1, 1, 1, false, 2L, encoder);
    TreeMap<Key, Value> tm2 = new TreeMap<>();
    newKeyValue(tm2, 1, 1, 1, 1, false, 3L, encoder);
    TreeMap<Key, Value> tm3 = new TreeMap<>();
    newKeyValue(tm3, 1, 1, 1, 1, false, 4L, encoder);
    Combiner ai = new SummingCombiner();
    IteratorSetting is = new IteratorSetting(1, SummingCombiner.class);
    LongCombiner.setEncodingType(is, StringEncoder.class);
    Combiner.setColumns(is, Collections.singletonList(new IteratorSetting.Column("cf001")));
    List<SortedKeyValueIterator<Key, Value>> sources = new ArrayList<>(3);
    sources.add(new SortedMapIterator(tm1));
    sources.add(new SortedMapIterator(tm2));
    sources.add(new SortedMapIterator(tm3));
    MultiIterator mi = new MultiIterator(sources, true);
    ai.init(mi, is.getOptions(), SCAN_IE);
    ai.seek(new Range(), EMPTY_COL_FAMS, false);
    assertTrue(ai.hasTop());
    assertEquals(newKey(1, 1, 1, 1), ai.getTopKey());
    assertEquals("9", encoder.decode(ai.getTopValue().get()).toString());
}
Also used : MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) LongCombiner(org.apache.accumulo.core.iterators.LongCombiner) TypedValueCombiner(org.apache.accumulo.core.iterators.TypedValueCombiner) Combiner(org.apache.accumulo.core.iterators.Combiner) TreeMap(java.util.TreeMap) SortedMapIterator(org.apache.accumulo.core.iterators.SortedMapIterator) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
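
The expected value "9" in the final assertion is just the encoder's view of 2 + 3 + 4. A short sketch of that round trip on its own (same imports and assertEquals as the test above; not part of CombinerTest):

Encoder<Long> encoder = LongCombiner.STRING_ENCODER;
byte[] encoded = encoder.encode(2L + 3L + 4L);           // stored as the bytes of the decimal string "9"
assertEquals(Long.valueOf(9L), encoder.decode(encoded)); // decodes back to the summed long
assertEquals("9", encoder.decode(encoded).toString());   // the form the test compares against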

Example 14 with MultiIterator

use of org.apache.accumulo.core.iterators.system.MultiIterator in project accumulo by apache.

the class IntersectingIteratorTest method test3.

@Test
public void test3() throws IOException {
    columnFamilies = new Text[6];
    columnFamilies[0] = new Text("C");
    columnFamilies[1] = new Text("E");
    columnFamilies[2] = new Text("G");
    columnFamilies[3] = new Text("H");
    columnFamilies[4] = new Text("I");
    columnFamilies[5] = new Text("J");
    otherColumnFamilies = new Text[4];
    otherColumnFamilies[0] = new Text("A");
    otherColumnFamilies[1] = new Text("B");
    otherColumnFamilies[2] = new Text("D");
    otherColumnFamilies[3] = new Text("F");
    float hitRatio = 0.5f;
    SortedKeyValueIterator<Key, Value> source = createIteratorStack(hitRatio, NUM_ROWS, NUM_DOCIDS, columnFamilies, otherColumnFamilies, docs);
    SortedKeyValueIterator<Key, Value> source2 = createIteratorStack(hitRatio, NUM_ROWS, NUM_DOCIDS, columnFamilies, otherColumnFamilies, docs);
    ArrayList<SortedKeyValueIterator<Key, Value>> sourceIters = new ArrayList<>();
    sourceIters.add(source);
    sourceIters.add(source2);
    MultiIterator mi = new MultiIterator(sourceIters, false);
    IteratorSetting is = new IteratorSetting(1, IntersectingIterator.class);
    IntersectingIterator.setColumnFamilies(is, columnFamilies);
    IntersectingIterator iter = new IntersectingIterator();
    iter.init(mi, is.getOptions(), env);
    iter.seek(new Range(), EMPTY_COL_FAMS, false);
    int hitCount = 0;
    while (iter.hasTop()) {
        hitCount++;
        Key k = iter.getTopKey();
        assertTrue(docs.contains(k.getColumnQualifier()));
        iter.next();
    }
    assertEquals(docs.size(), hitCount);
    cleanup();
}
Also used : MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Text(org.apache.hadoop.io.Text) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
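
In application code the same IteratorSetting is usually attached to a scanner rather than wired onto a hand-built MultiIterator stack. The sketch below is an assumption-laden illustration, not part of IntersectingIteratorTest: the Connector, table name, authorizations and iterator name are placeholders, and it assumes the standard java.util imports plus the client classes (Connector, BatchScanner, TableNotFoundException from org.apache.accumulo.core.client and Authorizations from org.apache.accumulo.core.security). Matching documents come back with the document id in the column qualifier, which is exactly what the assertion in test3 checks.

// Illustrative sketch only; 'conn' and "shardTable" are assumptions.
void printMatchingDocIds(Connector conn) throws TableNotFoundException {
    Text[] terms = { new Text("C"), new Text("E"), new Text("G") };
    IteratorSetting is = new IteratorSetting(20, "ii", IntersectingIterator.class);
    IntersectingIterator.setColumnFamilies(is, terms);
    BatchScanner bs = conn.createBatchScanner("shardTable", Authorizations.EMPTY, 4);
    try {
        bs.setRanges(Collections.singleton(new Range()));
        bs.addScanIterator(is);
        for (Map.Entry<Key, Value> entry : bs) {
            // one entry per document containing every term; the doc id is the column qualifier
            System.out.println(entry.getKey().getColumnQualifier());
        }
    } finally {
        bs.close();
    }
}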

Aggregations

MultiIterator (org.apache.accumulo.core.iterators.system.MultiIterator) 14
Key (org.apache.accumulo.core.data.Key) 13
Value (org.apache.accumulo.core.data.Value) 12
ArrayList (java.util.ArrayList) 11
SortedKeyValueIterator (org.apache.accumulo.core.iterators.SortedKeyValueIterator) 10
Range (org.apache.accumulo.core.data.Range) 6
Text (org.apache.hadoop.io.Text) 6
TreeMap (java.util.TreeMap) 5
FileSKVIterator (org.apache.accumulo.core.file.FileSKVIterator) 5
IOException (java.io.IOException) 4
SortedMapIterator (org.apache.accumulo.core.iterators.SortedMapIterator) 4
IteratorSetting (org.apache.accumulo.core.client.IteratorSetting) 3
AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration) 3
PartialKey (org.apache.accumulo.core.data.PartialKey) 3
SamplerConfigurationImpl (org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) 3
CachedConfiguration (org.apache.accumulo.core.util.CachedConfiguration) 3
Configuration (org.apache.hadoop.conf.Configuration) 3
Test (org.junit.Test) 3
HashMap (java.util.HashMap) 2
ConfigurationCopy (org.apache.accumulo.core.conf.ConfigurationCopy) 2