Search in sources :

Example 26 with FileSKVIterator

use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.

the class FileUtil method findMidPoint.

 * @param mapFiles
 *          - list MapFiles to find the mid point key
 *          ISSUES : This method uses the index files to find the mid point. If the map files have different index intervals, this method will not return an
 *          accurate mid point. Also, it would be tricky to use this method in conjunction with an in-memory map because the indexing interval is unknown.
/**
 * Finds candidate split keys for the range (prevEndRow, endRow] by merging the given map files and walking roughly half of the counted entries.
 *
 * The returned map always gets an entry at position .5 (the key on top of the merged iterator after the walk) and may also get one earlier entry: the last
 * key seen before a row change whose relative position is at least minSplit.
 *
 * NOTE(review): this listing appears to have lost its closing braces and several statements during extraction (see the notes on the two loops below), so it
 * does not compile as shown. The comments describe only what the visible lines do.
 *
 * @param fs
 *          volume manager, passed to createTmpDir, reduceFiles, countIndexEntries and cleanupIndexOp
 * @param tabletDirectory
 *          passed to createTmpDir when the files must first be reduced
 * @param acuConf
 *          configuration; supplies TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN
 * @param prevEndRow
 *          exclusive lower bound row; null is replaced with an empty Text
 * @param endRow
 *          inclusive upper bound row; null is accepted by the final range check
 * @param mapFiles
 *          files to examine
 * @param minSplit
 *          minimum relative position (fraction of numKeys) for the optional key recorded before the mid point
 * @param useIndex
 *          when false and there are more files than can be opened at once, an IOException is thrown; when true and no entries are found, the method retries
 *          itself with useIndex set to false
 * @return map from relative position to key
 * @throws IOException
 *           if there are too many files to open without indexes, if no entries are found, or if a found key fails the range check
 */
public static SortedMap<Double, Key> findMidPoint(VolumeManager fs, String tabletDirectory, AccumuloConfiguration acuConf, Text prevEndRow, Text endRow, Collection<String> mapFiles, double minSplit, boolean useIndex) throws IOException {
    Configuration conf = CachedConfiguration.getInstance();
    // keep the caller's file list; mapFiles may be replaced by reduced files below
    Collection<String> origMapFiles = mapFiles;
    // stays null unless the reduction step below runs
    Path tmpDir = null;
    int maxToOpen = acuConf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
    // handed to countIndexEntries and later to cleanupIndexOp in the finally block
    ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());
    try {
        // more files than the configured limit: reduce them into tmpDir first (only allowed when using indexes)
        if (mapFiles.size() > maxToOpen) {
            if (!useIndex)
                throw new IOException("Cannot find mid point using data files, too many " + mapFiles.size());
            tmpDir = createTmpDir(acuConf, fs, tabletDirectory);
            log.debug("Too many indexes ({}) to open at once for {} {}, reducing in tmpDir = {}", mapFiles.size(), endRow, prevEndRow, tmpDir);
            long t1 = System.currentTimeMillis();
            mapFiles = reduceFiles(acuConf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir, 0);
            long t2 = System.currentTimeMillis();
            log.debug("Finished reducing indexes for {} {} in {}", endRow, prevEndRow, String.format("%6.2f secs", (t2 - t1) / 1000.0));
        // null lower bound is normalized to an empty Text so the compareRow calls below are safe
        if (prevEndRow == null)
            prevEndRow = new Text();
        long t1 = System.currentTimeMillis();
        long numKeys = 0;
        // useIndex is forced to false here whenever the files were reduced (tmpDir != null)
        numKeys = countIndexEntries(acuConf, prevEndRow, endRow, mapFiles, tmpDir == null ? useIndex : false, conf, fs, readers);
        if (numKeys == 0) {
            if (useIndex) {
                log.warn("Failed to find mid point using indexes, falling back to data files which is slower. No entries between {} and {} for {}", prevEndRow, endRow, mapFiles);
                // need to pass original map files, not possibly reduced indexes
                return findMidPoint(fs, tabletDirectory, acuConf, prevEndRow, endRow, origMapFiles, minSplit, false);
            throw new IOException("Failed to find mid point, no entries between " + prevEndRow + " and " + endRow + " for " + mapFiles);
        // merge all opened readers into a single iterator
        List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<>(readers);
        MultiIterator mmfi = new MultiIterator(iters, true);
        // skip the prevendrow
        // NOTE(review): as shown this loop has an empty body and never advances mmfi; the advancing statement was presumably lost in extraction - confirm against upstream
        while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0);
        // read half of the keys in the index
        TreeMap<Double, Key> ret = new TreeMap<>();
        Key lastKey = null;
        long keysRead = 0;
        Key keyBeforeMidPoint = null;
        long keyBeforeMidPointPosition = 0;
        // NOTE(review): nothing visible in this loop updates lastKey's contents, increments keysRead, or advances mmfi; those statements appear to be missing from the listing
        while (keysRead < numKeys / 2) {
            // on a row change at or beyond the minSplit fraction, remember the previous key and its position
            if (lastKey != null && !lastKey.equals(mmfi.getTopKey(), PartialKey.ROW) && (keysRead - 1) / (double) numKeys >= minSplit) {
                keyBeforeMidPoint = new Key(lastKey);
                keyBeforeMidPointPosition = keysRead - 1;
            if (lastKey == null)
                lastKey = new Key();
            // consume minimum
        // record the optional earlier candidate, keyed by its relative position
        if (keyBeforeMidPoint != null)
            ret.put(keyBeforeMidPointPosition / (double) numKeys, keyBeforeMidPoint);
        long t2 = System.currentTimeMillis();
        log.debug(String.format("Found midPoint from indexes in %6.2f secs.%n", ((t2 - t1) / 1000.0)));
        // the key currently on top of the merged iterator is recorded as the mid point
        ret.put(.5, mmfi.getTopKey());
        // sanity check
        // every returned key must have a row > prevEndRow and, when endRow is given, <= endRow
        for (Key key : ret.values()) {
            boolean inRange = (key.compareRow(prevEndRow) > 0 && (endRow == null || key.compareRow(endRow) < 1));
            if (!inRange) {
                throw new IOException("Found mid point is not in range " + key + " " + prevEndRow + " " + endRow + " " + mapFiles);
        return ret;
    } finally {
        // always runs, whether the method returns or throws
        cleanupIndexOp(tmpDir, fs, readers);
Also used : Path(org.apache.hadoop.fs.Path) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) Configuration(org.apache.hadoop.conf.Configuration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Key(org.apache.accumulo.core.data.Key) PartialKey(org.apache.accumulo.core.data.PartialKey)

Example 27 with FileSKVIterator

use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.

the class AccumuloFileOutputFormatIT method handleWriteTests.

/**
 * Runs MRTester against either TEST_TABLE or EMPTY_TABLE and checks how many "part-m-" output files were produced: exactly one when content is expected, none
 * otherwise.
 *
 * NOTE(review): closing braces (including the end of the anonymous FileFilter) are missing from this listing, so it does not compile as shown.
 *
 * @param content
 *          true to run against TEST_TABLE and expect one output file; false to run against EMPTY_TABLE and expect none
 */
private void handleWriteTests(boolean content) throws Exception {
    // create a file named after the current test method, then delete it; only the path is handed to MRTester below
    File f = folder.newFile(testName.getMethodName());
    if (f.delete()) {
        log.debug("Deleted {}", f);
    MRTester.main(new String[] { content ? TEST_TABLE : EMPTY_TABLE, f.getAbsolutePath() });
    // keep only files whose names start with "part-m-"
    File[] files = f.listFiles(new FileFilter() {

        public boolean accept(File file) {
            return file.getName().startsWith("part-m-");
    if (content) {
        assertEquals(1, files.length);
        Configuration conf = CachedConfiguration.getInstance();
        DefaultConfiguration acuconf = DefaultConfiguration.getInstance();
        // open the single output file as an RFile and request its sample iterator for SAMPLER_CONFIG
        // NOTE(review): nothing visible here uses 'sample' afterwards; follow-up assertions may have been dropped from the listing
        FileSKVIterator sample = RFileOperations.getInstance().newReaderBuilder().forFile(files[0].toString(), FileSystem.get(conf), conf).withTableConfiguration(acuconf).build().getSample(new SamplerConfigurationImpl(SAMPLER_CONFIG));
    } else {
        assertEquals(0, files.length);
Also used : FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) FileFilter(java.io.FileFilter) File(java.io.File)

Example 28 with FileSKVIterator

use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.

the class PrintInfo method execute.

/**
 * Command-line entry point: parses the arguments under the program name "accumulo rfile-info", then for each given file opens an RFile reader and, depending
 * on the options, dumps entries, gathers histogram / key statistics / visibility metrics, and prints a summary.
 *
 * NOTE(review): this listing is heavily truncated - closing braces and many loop and branch bodies are missing, and at least one line is a fragment - so it
 * does not compile as shown. The comments describe only what the visible lines do.
 *
 * @param args
 *          command-line arguments handed to Opts.parseArgs
 */
public void execute(final String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs("accumulo rfile-info", args);
    if (opts.files.isEmpty()) {
        System.err.println("No files were given");
    // build a Hadoop configuration, layering on every configuration file named in the options
    Configuration conf = new Configuration();
    for (String confFile : opts.configFiles) {
        log.debug("Adding Hadoop configuration file {}", confFile);
        conf.addResource(new Path(confFile));
    FileSystem hadoopFs = FileSystem.get(conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    LogHistogram kvHistogram = new LogHistogram();
    KeyStats dataKeyStats = new KeyStats();
    KeyStats indexKeyStats = new KeyStats();
    for (String arg : opts.files) {
        Path path = new Path(arg);
        FileSystem fs;
        // an argument containing ':' is resolved through the path's own filesystem; otherwise probe hadoopFs, then local
        if (arg.contains(":"))
            fs = path.getFileSystem(conf);
        else {
            log.warn("Attempting to find file across filesystems. Consider providing URI instead of path");
            // fall back to local
            fs = hadoopFs.exists(path) ? hadoopFs : localFs;
        System.out.println("Reading file: " + path.makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString());
        CachableBlockFile.Reader _rdr = new CachableBlockFile.Reader(fs, path, conf, null, null, SiteConfiguration.getInstance());
        Reader iter = new RFile.Reader(_rdr);
        MetricsGatherer<Map<String, ArrayList<VisibilityMetric>>> vmg = new VisMetricsGatherer();
        // NOTE(review): as shown, this 'if' guards the PrintInfo.main call on the next line; the statement it originally guarded was presumably lost in extraction - confirm against upstream
        if (opts.vis || opts.hash)
        org.apache.accumulo.core.file.rfile.bcfile.PrintInfo.main(new String[] { arg });
        Map<String, ArrayList<ByteSequence>> localityGroupCF = null;
        // the data is only examined when at least one per-entry option is enabled
        if (opts.histogram || opts.dump || opts.vis || opts.hash || opts.keyStats) {
            localityGroupCF = iter.getLocalityGroupCF();
            FileSKVIterator dataIter;
            // read from the file's sample data when requested, otherwise from the full reader
            if (opts.useSample) {
                dataIter = iter.getSample();
                if (dataIter == null) {
                    System.out.println("ERROR : This rfile has no sample data");
            } else {
                dataIter = iter;
            if (opts.keyStats) {
                FileSKVIterator indexIter = iter.getIndex();
                // NOTE(review): the body of this loop is missing from the listing
                while (indexIter.hasTop()) {
            for (String lgName : localityGroupCF.keySet()) {
                // NOTE(review): the next line is a fragment; the start of the call it belongs to is missing
      , new Range(), lgName, localityGroupCF);
                while (dataIter.hasTop()) {
                    Key key = dataIter.getTopKey();
                    Value value = dataIter.getTopValue();
                    if (opts.dump) {
                        System.out.println(key + " -> " + value);
                        // NOTE(review): the statement guarded by this check is missing from the listing
                        if (System.out.checkError())
                    if (opts.histogram) {
                        // histogram is fed the combined size of key and value
                        kvHistogram.add(key.getSize() + value.getSize());
                    if (opts.keyStats) {
        if (opts.printSummary) {
            SummaryReader.print(iter, System.out);
        if (opts.vis || opts.hash) {
            vmg.printMetrics(opts.hash, "Visibility", System.out);
        if (opts.histogram) {
        if (opts.keyStats) {
            System.out.println("Statistics for keys in data :");
            System.out.println("Statistics for keys in index :");
        // If the output stream has closed, there is no reason to keep going.
        if (System.out.checkError())
Also used : Path(org.apache.hadoop.fs.Path) FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) SiteConfiguration(org.apache.accumulo.core.conf.SiteConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) SummaryReader(org.apache.accumulo.core.summary.SummaryReader) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) Range(org.apache.accumulo.core.data.Range) FileSystem(org.apache.hadoop.fs.FileSystem) Value(org.apache.accumulo.core.data.Value) CachableBlockFile(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile) Map(java.util.Map) Key(org.apache.accumulo.core.data.Key)

Example 29 with FileSKVIterator

use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.

the class MapFileOperations method openScanReader.

/**
 * Opens a scan reader for a map file: builds a MapFileIterator from the table configuration, file system, file name and Hadoop configuration carried by the
 * options, wraps it in a RangeIterator, and returns the wrapper.
 *
 * NOTE(review): the closing brace is missing and the second statement is garbled in this listing (see the note below), so it does not compile as shown.
 *
 * @param options
 *          supplies the table configuration, file system, file name, configuration, column families and range-inclusive flag
 * @return the RangeIterator wrapping the underlying map file iterator
 */
protected FileSKVIterator openScanReader(OpenScanReaderOperation options) throws IOException {
    MapFileIterator mfIter = new MapFileIterator(options.getTableConfiguration(), options.getFileSystem(), options.getFilename(), options.getConfiguration());
    // NOTE(review): the text after the first ';' looks like the tail of a separate call that takes the column families and inclusive flag; the start of that call appears to have been lost in extraction - confirm against upstream
    FileSKVIterator iter = new RangeIterator(mfIter);, options.getColumnFamilies(), options.isRangeInclusive());
    return iter;
Also used : FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) MapFileIterator(org.apache.accumulo.core.iterators.system.MapFileIterator)

Example 30 with FileSKVIterator

use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.

the class RFileTest method testSample.

/**
 * Writes 2 * num entries to a test RFile configured with a RowSampler (murmur3_32 hasher, varying modulus), then verifies that the sample iterator and its deep
 * copies return the expected sample data, and that full-data iterators hash to the same value as the data that was written.
 *
 * NOTE(review): closing braces are missing from this listing, and statements between the visible lines may also have been dropped, so it does not compile as
 * shown.
 */
public void testSample() throws IOException {
    int num = 10000;
    for (int sampleBufferSize : new int[] { 1 << 10, 1 << 20 }) {
        // force sample buffer to flush for smaller data
        // NOTE(review): sampleBufferSize is not referenced by any visible line; the statement that applies it is presumably missing
        for (int modulus : new int[] { 19, 103, 1019 }) {
            // dataHasher and sampleData are handed to the add(...) helper together with every entry
            Hasher dataHasher = Hashing.md5().newHasher();
            List<Entry<Key, Value>> sampleData = new ArrayList<>();
            // start from the shared 'conf' when it is set, otherwise from the default configuration, and enable row sampling
            ConfigurationCopy sampleConf = new ConfigurationCopy(conf == null ? DefaultConfiguration.getInstance() : conf);
            sampleConf.set(Property.TABLE_SAMPLER, RowSampler.class.getName());
            sampleConf.set(Property.TABLE_SAMPLER_OPTS + "hasher", "murmur3_32");
            sampleConf.set(Property.TABLE_SAMPLER_OPTS + "modulus", modulus + "");
            Sampler sampler = SamplerFactory.newSampler(SamplerConfigurationImpl.newSamplerConfig(sampleConf), sampleConf);
            TestRFile trf = new TestRFile(sampleConf);
            // two entries per i, built with second argument 0 and 1
            for (int i = 0; i < num; i++) {
                add(trf, newKey(i, 0), newValue(i, 0), dataHasher, sampleData, sampler);
                add(trf, newKey(i, 1), newValue(i, 1), dataHasher, sampleData, sampler);
            HashCode expectedDataHash = dataHasher.hash();
            FileSKVIterator sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
            checkSample(sample, sampleData);
            // the full reader must hash to the same value as the data that was added
            Assert.assertEquals(expectedDataHash, hash(trf.reader));
            SampleIE ie = new SampleIE(SamplerConfigurationImpl.newSamplerConfig(sampleConf).toSamplerConfiguration());
            for (int i = 0; i < 3; i++) {
                // test opening and closing deep copies a few times.
                sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
                // deep copies made with 'ie' are checked as sample iterators below
                SortedKeyValueIterator<Key, Value> sampleDC1 = sample.deepCopy(ie);
                SortedKeyValueIterator<Key, Value> sampleDC2 = sample.deepCopy(ie);
                SortedKeyValueIterator<Key, Value> sampleDC3 = trf.reader.deepCopy(ie);
                // deep copies made with SampleIE(null) are expected to see all data (checked via the hash)
                SortedKeyValueIterator<Key, Value> allDC1 = sampleDC1.deepCopy(new SampleIE(null));
                SortedKeyValueIterator<Key, Value> allDC2 = sample.deepCopy(new SampleIE(null));
                Assert.assertEquals(expectedDataHash, hash(allDC1));
                Assert.assertEquals(expectedDataHash, hash(allDC2));
                checkSample(sample, sampleData);
                checkSample(sampleDC1, sampleData);
                checkSample(sampleDC2, sampleData);
                checkSample(sampleDC3, sampleData);
Also used : FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) ConfigurationCopy(org.apache.accumulo.core.conf.ConfigurationCopy) ArrayList(java.util.ArrayList) RowSampler(org.apache.accumulo.core.client.sample.RowSampler) Hasher(com.google.common.hash.Hasher) Entry(java.util.Map.Entry) HashCode(com.google.common.hash.HashCode) Sampler(org.apache.accumulo.core.client.sample.Sampler) RowSampler(org.apache.accumulo.core.client.sample.RowSampler) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) PartialKey(org.apache.accumulo.core.data.PartialKey) CryptoTest(org.apache.accumulo.core.security.crypto.CryptoTest) Test(org.junit.Test)


FileSKVIterator (org.apache.accumulo.core.file.FileSKVIterator)32 Key ( FileSystem (org.apache.hadoop.fs.FileSystem)17 ArrayList (java.util.ArrayList)13 PartialKey ( Value ( IOException ( Configuration (org.apache.hadoop.conf.Configuration)10 Path (org.apache.hadoop.fs.Path)9 Range ( CachedConfiguration (org.apache.accumulo.core.util.CachedConfiguration)7 AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration)5 ConfigurationCopy (org.apache.accumulo.core.conf.ConfigurationCopy)5 SortedKeyValueIterator (org.apache.accumulo.core.iterators.SortedKeyValueIterator)5 MultiIterator (org.apache.accumulo.core.iterators.system.MultiIterator)5 Text ( Test (org.junit.Test)5 File ( HashMap (java.util.HashMap)4 CryptoTest (