
Example 1 with Reader

Use of org.apache.accumulo.core.file.rfile.RFile.Reader in project accumulo by apache.

From the SplitLarge class, method main:

public static void main(String[] args) throws Exception {
    Configuration conf = CachedConfiguration.getInstance();
    FileSystem fs = FileSystem.get(conf);
    Opts opts = new Opts();
    opts.parseArgs(SplitLarge.class.getName(), args);
    for (String file : opts.files) {
        AccumuloConfiguration aconf = DefaultConfiguration.getInstance();
        Path path = new Path(file);
        // Reject files without the RFile extension before opening a reader on them.
        if (!file.endsWith(".rf")) {
            throw new IllegalArgumentException("File must end with .rf");
        }
        CachableBlockFile.Reader rdr = new CachableBlockFile.Reader(fs, path, conf, null, null, aconf);
        try (Reader iter = new RFile.Reader(rdr)) {
            // Derive the output names from the input name: foo.rf -> foo_small.rf / foo_large.rf.
            String smallName = file.substring(0, file.length() - 3) + "_small.rf";
            String largeName = file.substring(0, file.length() - 3) + "_large.rf";
            int blockSize = (int) aconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE);
            try (Writer small = new RFile.Writer(new CachableBlockFile.Writer(fs, new Path(smallName), "gz", null, conf, aconf), blockSize);
                Writer large = new RFile.Writer(new CachableBlockFile.Writer(fs, new Path(largeName), "gz", null, conf, aconf), blockSize)) {
                small.startDefaultLocalityGroup();
                large.startDefaultLocalityGroup();
                // Scan the whole file: empty range, no column family filtering.
                iter.seek(new Range(), new ArrayList<>(), false);
                while (iter.hasTop()) {
                    Key key = iter.getTopKey();
                    Value value = iter.getTopValue();
                    // Route each entry by its serialized size relative to the threshold.
                    if (key.getSize() + value.getSize() < opts.maxSize) {
                        small.append(key, value);
                    } else {
                        large.append(key, value);
                    }
                    iter.next();
                }
            }
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) Range(org.apache.accumulo.core.data.Range) FileSystem(org.apache.hadoop.fs.FileSystem) Value(org.apache.accumulo.core.data.Value) CachableBlockFile(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile) Writer(org.apache.accumulo.core.file.rfile.RFile.Writer) Key(org.apache.accumulo.core.data.Key)
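As a follow-up, the split output can be checked with the public scanner API (org.apache.accumulo.core.client.rfile.RFile, a different class from the internal RFile above) that the later examples also use. A minimal sketch, assuming hypothetical output names produced by SplitLarge:

import java.util.Map.Entry;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.rfile.RFile;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class VerifySplit {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // "data_small.rf" and "data_large.rf" are hypothetical outputs of SplitLarge.
        for (String file : new String[] { "data_small.rf", "data_large.rf" }) {
            Scanner scanner = RFile.newScanner().from(file).withFileSystem(fs).build();
            long count = 0;
            // Count the entries routed into this file.
            for (Entry<Key, Value> entry : scanner) {
                count++;
            }
            scanner.close();
            System.out.println(file + ": " + count + " entries");
        }
    }
}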

Example 2 with Reader

Use of org.apache.accumulo.core.file.rfile.RFile.Reader in project accumulo by apache.

From the RFileTest class, method runVersionTest:

private void runVersionTest(int version) throws IOException {
    // Load a pre-generated RFile of the given format version from test resources.
    InputStream in = this.getClass().getClassLoader().getResourceAsStream("org/apache/accumulo/core/file/rfile/ver_" + version + ".rf");
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] buf = new byte[1024];
    int read;
    while ((read = in.read(buf)) > 0) {
        baos.write(buf, 0, read);
    }
    in.close();
    byte[] data = baos.toByteArray();
    // Wrap the bytes in a seekable stream so the RFile reader can do random access.
    SeekableByteArrayInputStream bais = new SeekableByteArrayInputStream(data);
    FSDataInputStream in2 = new FSDataInputStream(bais);
    AccumuloConfiguration aconf = DefaultConfiguration.getInstance();
    CachableBlockFile.Reader _cbr = new CachableBlockFile.Reader(in2, data.length, CachedConfiguration.getInstance(), aconf);
    Reader reader = new RFile.Reader(_cbr);
    checkIndex(reader);
    ColumnFamilySkippingIterator iter = new ColumnFamilySkippingIterator(reader);
    for (int start : new int[] { 0, 10, 100, 998 }) {
        // First pass: seek with one column family fetched (inclusive filter) at a time.
        for (int cf = 1; cf <= 4; cf++) {
            if (start == 0)
                iter.seek(new Range(), newColFamByteSequence(formatString("cf_", cf)), true);
            else
                iter.seek(new Range(formatString("r_", start), null), newColFamByteSequence(formatString("cf_", cf)), true);
            for (int i = start; i < 1000; i++) {
                assertTrue(iter.hasTop());
                assertEquals(newKey(formatString("r_", i), formatString("cf_", cf), formatString("cq_", 0), "", 1000 - i), iter.getTopKey());
                assertEquals(newValue(i + ""), iter.getTopValue());
                iter.next();
            }
            assertFalse(iter.hasTop());
        }
        // Second pass: seek with no column family filter and expect all four families per row.
        if (start == 0)
            iter.seek(new Range(), newColFamByteSequence(), false);
        else
            iter.seek(new Range(formatString("r_", start), null), newColFamByteSequence(), false);
        for (int i = start; i < 1000; i++) {
            for (int cf = 1; cf <= 4; cf++) {
                assertTrue(iter.hasTop());
                assertEquals(newKey(formatString("r_", i), formatString("cf_", cf), formatString("cq_", 0), "", 1000 - i), iter.getTopKey());
                assertEquals(newValue(i + ""), iter.getTopValue());
                iter.next();
            }
        }
        assertFalse(iter.hasTop());
    }
    reader.close();
}
Also used: ColumnFamilySkippingIterator(org.apache.accumulo.core.iterators.system.ColumnFamilySkippingIterator) ByteArrayInputStream(java.io.ByteArrayInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) DataInputStream(java.io.DataInputStream) InputStream(java.io.InputStream) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) CachableBlockFile(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Range(org.apache.accumulo.core.data.Range) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration)
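This test leans on RFileTest helpers (formatString, newKey, newValue, newColFamByteSequence) defined elsewhere in the class. Plausible shapes are sketched below; the pad width and exact bodies are assumptions, not the verbatim helpers:

import static java.nio.charset.StandardCharsets.UTF_8;
import java.util.HashSet;
import java.util.Set;
import org.apache.accumulo.core.data.ArrayByteSequence;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.io.Text;

// Zero-pad so rows like r_000010 sort lexically in numeric order (pad width assumed).
static String formatString(String prefix, int i) {
    return String.format("%s%06d", prefix, i);
}

static Key newKey(String row, String cf, String cq, String cv, long ts) {
    return new Key(new Text(row), new Text(cf), new Text(cq), new Text(cv), ts);
}

static Value newValue(String v) {
    return new Value(v.getBytes(UTF_8));
}

// Build the fetched-column-family set that seek() expects.
static Set<ByteSequence> newColFamByteSequence(String... cfs) {
    Set<ByteSequence> cfSet = new HashSet<>();
    for (String cf : cfs) {
        cfSet.add(new ArrayByteSequence(cf));
    }
    return cfSet;
}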

Example 3 with Reader

Use of org.apache.accumulo.core.file.rfile.RFile.Reader in project accumulo by apache.

From the RFileTest class, method testWriterTableProperties:

@Test
public void testWriterTableProperties() throws Exception {
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    String testFile = createTmpTestFile();
    Map<String, String> props = new HashMap<>();
    props.put(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE.getKey(), "1K");
    props.put(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX.getKey(), "1K");
    RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withTableProperties(props).build();
    SortedMap<Key, Value> testData1 = createTestData(10, 10, 10);
    writer.append(testData1.entrySet());
    writer.close();
    Reader reader = getReader(localFs, testFile);
    FileSKVIterator iiter = reader.getIndex();
    // Walk the file's index and count its entries.
    int count = 0;
    while (iiter.hasTop()) {
        count++;
        iiter.next();
    }
    // If the 1K block-size settings took effect, even this small file should
    // produce many index entries; with default block sizes it would have very few.
    Assert.assertTrue(count > 10);
    reader.close();
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
    Assert.assertEquals(testData1, toMap(scanner));
    scanner.close();
}
Also used: FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) Scanner(org.apache.accumulo.core.client.Scanner) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) HashMap(java.util.HashMap) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
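The toMap helper used throughout these tests drains a scanner into a sorted map for equality assertions. A minimal sketch of such a helper (an assumption, not the verbatim RFileTest code):

import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;

static SortedMap<Key, Value> toMap(Scanner scanner) {
    SortedMap<Key, Value> map = new TreeMap<>();
    for (Entry<Key, Value> entry : scanner) {
        // Copy the value bytes, since iterators may reuse backing buffers.
        map.put(entry.getKey(), new Value(entry.getValue().get()));
    }
    return map;
}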

Example 4 with Reader

Use of org.apache.accumulo.core.file.rfile.RFile.Reader in project accumulo by apache.

From the RFileTest class, method testLocalityGroups:

@Test
public void testLocalityGroups() throws Exception {
    SortedMap<Key, Value> testData1 = createTestData(0, 10, 0, 2, 10);
    SortedMap<Key, Value> testData2 = createTestData(0, 10, 2, 1, 10);
    SortedMap<Key, Value> defaultData = createTestData(0, 10, 3, 7, 10);
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    String testFile = createTmpTestFile();
    RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).build();
    // Locality groups must be written in order; each named group covers a set of column families.
    writer.startNewLocalityGroup("z", colStr(0), colStr(1));
    writer.append(testData1.entrySet());
    writer.startNewLocalityGroup("h", colStr(2));
    writer.append(testData2.entrySet());
    // Everything not assigned to a named group goes into the default group.
    writer.startDefaultLocalityGroup();
    writer.append(defaultData.entrySet());
    writer.close();
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
    scanner.fetchColumnFamily(new Text(colStr(0)));
    scanner.fetchColumnFamily(new Text(colStr(1)));
    Assert.assertEquals(testData1, toMap(scanner));
    scanner.clearColumns();
    scanner.fetchColumnFamily(new Text(colStr(2)));
    Assert.assertEquals(testData2, toMap(scanner));
    scanner.clearColumns();
    for (int i = 3; i < 10; i++) {
        scanner.fetchColumnFamily(new Text(colStr(i)));
    }
    Assert.assertEquals(defaultData, toMap(scanner));
    scanner.clearColumns();
    Assert.assertEquals(createTestData(10, 10, 10), toMap(scanner));
    scanner.close();
    Reader reader = getReader(localFs, testFile);
    // The reader exposes each named locality group and its column families.
    Map<String, ArrayList<ByteSequence>> lGroups = reader.getLocalityGroupCF();
    Assert.assertTrue(lGroups.containsKey("z"));
    Assert.assertEquals(2, lGroups.get("z").size());
    Assert.assertTrue(lGroups.get("z").contains(new ArrayByteSequence(colStr(0))));
    Assert.assertTrue(lGroups.get("z").contains(new ArrayByteSequence(colStr(1))));
    Assert.assertTrue(lGroups.containsKey("h"));
    Assert.assertEquals(Arrays.asList(new ArrayByteSequence(colStr(2))), lGroups.get("h"));
    reader.close();
}
Also used: Scanner(org.apache.accumulo.core.client.Scanner) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) ArrayList(java.util.ArrayList) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) Text(org.apache.hadoop.io.Text) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) Value(org.apache.accumulo.core.data.Value) ArrayByteSequence(org.apache.accumulo.core.data.ArrayByteSequence) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
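The same grouping can be declared on a live table through TableOperations rather than per file. A minimal sketch, assuming an existing Connector; the table name and column families are hypothetical:

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.accumulo.core.client.Connector;
import org.apache.hadoop.io.Text;

static void declareGroups(Connector connector) throws Exception {
    Map<String, Set<Text>> groups = new HashMap<>();
    // Group names mirror the test above; "cf1".."cf3" and "mytable" are hypothetical.
    groups.put("z", new HashSet<>(Arrays.asList(new Text("cf1"), new Text("cf2"))));
    groups.put("h", Collections.singleton(new Text("cf3")));
    connector.tableOperations().setLocalityGroups("mytable", groups);
}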

Example 5 with Reader

Use of org.apache.accumulo.core.file.rfile.RFile.Reader in project accumulo by apache.

From the RFileScanner class, method iterator:

@Override
public Iterator<Entry<Key, Value>> iterator() {
    try {
        RFileSource[] sources = opts.in.getSources();
        List<SortedKeyValueIterator<Key, Value>> readers = new ArrayList<>(sources.length);
        for (int i = 0; i < sources.length; i++) {
            // TODO may have been a bug with multiple files and caching in older version...
            FSDataInputStream inputStream = (FSDataInputStream) sources[i].getInputStream();
            // Wrap each source in a block-file reader that shares the scanner's data and index caches.
            readers.add(new RFile.Reader(new CachableBlockFile.Reader("source-" + i, inputStream, sources[i].getLength(), opts.in.getConf(), dataCache, indexCache, DefaultConfiguration.getInstance())));
        }
        // When sampling is configured, replace each reader with its sample view.
        if (getSamplerConfiguration() != null) {
            for (int i = 0; i < readers.size(); i++) {
                readers.set(i, ((Reader) readers.get(i)).getSample(new SamplerConfigurationImpl(getSamplerConfiguration())));
            }
        }
        SortedKeyValueIterator<Key, Value> iterator;
        if (opts.bounds != null) {
            iterator = new MultiIterator(readers, opts.bounds);
        } else {
            iterator = new MultiIterator(readers, false);
        }
        Set<ByteSequence> families = Collections.emptySet();
        if (opts.useSystemIterators) {
            SortedSet<Column> cols = this.getFetchedColumns();
            families = LocalityGroupUtil.families(cols);
            // Layer the standard system scan iterators (deletes, visibility, column filtering) on top.
            iterator = IteratorUtil.setupSystemScanIterators(iterator, cols, getAuthorizations(), EMPTY_BYTES);
        }
        }
        try {
            if (opts.tableConfig != null && opts.tableConfig.size() > 0) {
                ConfigurationCopy conf = new ConfigurationCopy(opts.tableConfig);
                iterator = IteratorUtil.loadIterators(IteratorScope.scan, iterator, null, conf, serverSideIteratorList, serverSideIteratorOptions, new IterEnv());
            } else {
                iterator = IteratorUtil.loadIterators(iterator, serverSideIteratorList, serverSideIteratorOptions, new IterEnv(), false, null);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        iterator.seek(getRange() == null ? EMPTY_RANGE : getRange(), families, !families.isEmpty());
        return new IteratorAdapter(iterator);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used: MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) ConfigurationCopy(org.apache.accumulo.core.conf.ConfigurationCopy) IteratorAdapter(org.apache.accumulo.core.iterators.IteratorAdapter) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) ArrayList(java.util.ArrayList) SortedKeyValueIterator(org.apache.accumulo.core.iterators.SortedKeyValueIterator) Reader(org.apache.accumulo.core.file.rfile.RFile.Reader) RFile(org.apache.accumulo.core.file.rfile.RFile) IOException(java.io.IOException) Column(org.apache.accumulo.core.data.Column) Value(org.apache.accumulo.core.data.Value) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Key(org.apache.accumulo.core.data.Key) ByteSequence(org.apache.accumulo.core.data.ByteSequence)
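For comparison, the bounds consumed by opts.bounds above are populated through the public builder's withBounds option, so the same bounded scan can be expressed without touching RFileScanner internals. A minimal sketch; the file name and range are hypothetical:

import java.util.Map.Entry;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.rfile.RFile;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class BoundedScan {
    public static void main(String[] args) throws Exception {
        Scanner scanner = RFile.newScanner()
            .from("/tmp/example.rf") // hypothetical file
            .withFileSystem(FileSystem.getLocal(new Configuration()))
            .withBounds(new Range("r_000000", "r_000999")) // becomes opts.bounds above
            .build();
        for (Entry<Key, Value> entry : scanner) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
        scanner.close();
    }
}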

Aggregations

Reader (org.apache.accumulo.core.file.rfile.RFile.Reader): 6
Key (org.apache.accumulo.core.data.Key): 5
Value (org.apache.accumulo.core.data.Value): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
ArrayList (java.util.ArrayList): 3
DefaultConfiguration (org.apache.accumulo.core.conf.DefaultConfiguration): 3
Range (org.apache.accumulo.core.data.Range): 3
CachableBlockFile (org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile): 3
Scanner (org.apache.accumulo.core.client.Scanner): 2
NewTableConfiguration (org.apache.accumulo.core.client.admin.NewTableConfiguration): 2
SamplerConfiguration (org.apache.accumulo.core.client.sample.SamplerConfiguration): 2
SummarizerConfiguration (org.apache.accumulo.core.client.summary.SummarizerConfiguration): 2
AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration): 2
FileSKVIterator (org.apache.accumulo.core.file.FileSKVIterator): 2
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem): 2
Path (org.apache.hadoop.fs.Path): 2
Test (org.junit.Test): 2
ByteArrayInputStream (java.io.ByteArrayInputStream): 1