Use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.
The class FileUtil, method findMidPoint.
/**
 * @param mapFiles
 *          - list of MapFiles in which to find the mid point key
 *
 *          ISSUES : This method uses the index files to find the mid point. If the map files have different index intervals this method will not return an
 *          accurate mid point. Also, it would be tricky to use this method in conjunction with an in memory map because the indexing interval is unknown.
 */
public static SortedMap<Double, Key> findMidPoint(VolumeManager fs, String tabletDirectory, AccumuloConfiguration acuConf, Text prevEndRow, Text endRow,
    Collection<String> mapFiles, double minSplit, boolean useIndex) throws IOException {
  Configuration conf = CachedConfiguration.getInstance();
  Collection<String> origMapFiles = mapFiles;
  Path tmpDir = null;
  int maxToOpen = acuConf.getCount(Property.TSERV_TABLET_SPLIT_FINDMIDPOINT_MAXOPEN);
  ArrayList<FileSKVIterator> readers = new ArrayList<>(mapFiles.size());
  try {
    if (mapFiles.size() > maxToOpen) {
      if (!useIndex)
        throw new IOException("Cannot find mid point using data files, too many " + mapFiles.size());
      tmpDir = createTmpDir(acuConf, fs, tabletDirectory);
      log.debug("Too many indexes ({}) to open at once for {} {}, reducing in tmpDir = {}", mapFiles.size(), endRow, prevEndRow, tmpDir);
      long t1 = System.currentTimeMillis();
      mapFiles = reduceFiles(acuConf, conf, fs, prevEndRow, endRow, mapFiles, maxToOpen, tmpDir, 0);
      long t2 = System.currentTimeMillis();
      log.debug("Finished reducing indexes for {} {} in {}", endRow, prevEndRow, String.format("%6.2f secs", (t2 - t1) / 1000.0));
    }
    if (prevEndRow == null)
      prevEndRow = new Text();
    long t1 = System.currentTimeMillis();
    long numKeys = 0;
    numKeys = countIndexEntries(acuConf, prevEndRow, endRow, mapFiles, tmpDir == null ? useIndex : false, conf, fs, readers);
    if (numKeys == 0) {
      if (useIndex) {
        log.warn("Failed to find mid point using indexes, falling back to data files which is slower. No entries between {} and {} for {}", prevEndRow, endRow, mapFiles);
        // need to pass original map files, not possibly reduced indexes
        return findMidPoint(fs, tabletDirectory, acuConf, prevEndRow, endRow, origMapFiles, minSplit, false);
      }
      throw new IOException("Failed to find mid point, no entries between " + prevEndRow + " and " + endRow + " for " + mapFiles);
    }
    List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<>(readers);
    MultiIterator mmfi = new MultiIterator(iters, true);
    // skip the prevEndRow
    while (mmfi.hasTop() && mmfi.getTopKey().compareRow(prevEndRow) <= 0)
      mmfi.next();
    // read half of the keys in the index
    TreeMap<Double, Key> ret = new TreeMap<>();
    Key lastKey = null;
    long keysRead = 0;
    Key keyBeforeMidPoint = null;
    long keyBeforeMidPointPosition = 0;
    while (keysRead < numKeys / 2) {
      if (lastKey != null && !lastKey.equals(mmfi.getTopKey(), PartialKey.ROW) && (keysRead - 1) / (double) numKeys >= minSplit) {
        keyBeforeMidPoint = new Key(lastKey);
        keyBeforeMidPointPosition = keysRead - 1;
      }
      if (lastKey == null)
        lastKey = new Key();
      lastKey.set(mmfi.getTopKey());
      keysRead++;
      // consume minimum
      mmfi.next();
    }
    if (keyBeforeMidPoint != null)
      ret.put(keyBeforeMidPointPosition / (double) numKeys, keyBeforeMidPoint);
    long t2 = System.currentTimeMillis();
    log.debug(String.format("Found midPoint from indexes in %6.2f secs.%n", ((t2 - t1) / 1000.0)));
    ret.put(.5, mmfi.getTopKey());
    // sanity check
    for (Key key : ret.values()) {
      boolean inRange = (key.compareRow(prevEndRow) > 0 && (endRow == null || key.compareRow(endRow) < 1));
      if (!inRange) {
        throw new IOException("Found mid point is not in range " + key + " " + prevEndRow + " " + endRow + " " + mapFiles);
      }
    }
    return ret;
  } finally {
    cleanupIndexOp(tmpDir, fs, readers);
  }
}
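A minimal sketch of how a caller might use findMidPoint to pick a split key. It assumes a VolumeManager and AccumuloConfiguration are already available from the surrounding server code; the tablet directory, file paths, and row boundaries below are hypothetical placeholders, not values from the Accumulo source.
static Key pickSplit(VolumeManager fs, AccumuloConfiguration acuConf) throws IOException {
  // Hypothetical rfiles belonging to one tablet.
  Collection<String> files = Arrays.asList("hdfs://nn/accumulo/tables/2/t-0001/F0000a.rf", "hdfs://nn/accumulo/tables/2/t-0001/F0000b.rf");
  // Candidate split keys between rows "a" (exclusive) and "m" (inclusive); minSplit of 0.25
  // means no pre-midpoint candidate is recorded until a quarter of the index entries have been
  // read, and useIndex=true reads the rfile indexes rather than the full data.
  SortedMap<Double, Key> splits = FileUtil.findMidPoint(fs, "hdfs://nn/accumulo/tables/2/t-0001", acuConf, new Text("a"), new Text("m"), files, 0.25, true);
  // The entry keyed at 0.5 is the mid point; an earlier entry, if present, is a fall-back candidate.
  return splits.get(splits.lastKey());
}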
Use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.
The class AccumuloFileOutputFormatIT, method handleWriteTests.
private void handleWriteTests(boolean content) throws Exception {
  File f = folder.newFile(testName.getMethodName());
  if (f.delete()) {
    log.debug("Deleted {}", f);
  }
  MRTester.main(new String[] { content ? TEST_TABLE : EMPTY_TABLE, f.getAbsolutePath() });
  assertTrue(f.exists());
  File[] files = f.listFiles(new FileFilter() {
    @Override
    public boolean accept(File file) {
      return file.getName().startsWith("part-m-");
    }
  });
  assertNotNull(files);
  if (content) {
    assertEquals(1, files.length);
    assertTrue(files[0].exists());
    Configuration conf = CachedConfiguration.getInstance();
    DefaultConfiguration acuconf = DefaultConfiguration.getInstance();
    FileSKVIterator sample = RFileOperations.getInstance().newReaderBuilder().forFile(files[0].toString(), FileSystem.get(conf), conf)
        .withTableConfiguration(acuconf).build().getSample(new SamplerConfigurationImpl(SAMPLER_CONFIG));
    assertNotNull(sample);
  } else {
    assertEquals(0, files.length);
  }
}
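For context, the sample section being asserted on only exists if the job that wrote the file configured a sampler on the output format. A hedged sketch of that job setup, assuming MRTester uses the mapreduce AccumuloFileOutputFormat with the same SAMPLER_CONFIG; the output directory variable is a placeholder.
// Hedged sketch: configure the output format so written rfiles include a sample
// section built with SAMPLER_CONFIG (assumed to mirror what MRTester does).
Job job = Job.getInstance(conf);
job.setOutputFormatClass(AccumuloFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(outputDir)); // outputDir is a placeholder
AccumuloFileOutputFormat.setSampler(job, SAMPLER_CONFIG);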
Use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.
The class PrintInfo, method execute.
@Override
public void execute(final String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs("accumulo rfile-info", args);
  if (opts.files.isEmpty()) {
    System.err.println("No files were given");
    System.exit(-1);
  }
  Configuration conf = new Configuration();
  for (String confFile : opts.configFiles) {
    log.debug("Adding Hadoop configuration file {}", confFile);
    conf.addResource(new Path(confFile));
  }
  FileSystem hadoopFs = FileSystem.get(conf);
  FileSystem localFs = FileSystem.getLocal(conf);
  LogHistogram kvHistogram = new LogHistogram();
  KeyStats dataKeyStats = new KeyStats();
  KeyStats indexKeyStats = new KeyStats();
  for (String arg : opts.files) {
    Path path = new Path(arg);
    FileSystem fs;
    if (arg.contains(":"))
      fs = path.getFileSystem(conf);
    else {
      log.warn("Attempting to find file across filesystems. Consider providing URI instead of path");
      // fall back to local
      fs = hadoopFs.exists(path) ? hadoopFs : localFs;
    }
    System.out.println("Reading file: " + path.makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString());
    CachableBlockFile.Reader _rdr = new CachableBlockFile.Reader(fs, path, conf, null, null, SiteConfiguration.getInstance());
    Reader iter = new RFile.Reader(_rdr);
    MetricsGatherer<Map<String, ArrayList<VisibilityMetric>>> vmg = new VisMetricsGatherer();
    if (opts.vis || opts.hash)
      iter.registerMetrics(vmg);
    iter.printInfo(opts.printIndex);
    System.out.println();
    org.apache.accumulo.core.file.rfile.bcfile.PrintInfo.main(new String[] { arg });
    Map<String, ArrayList<ByteSequence>> localityGroupCF = null;
    if (opts.histogram || opts.dump || opts.vis || opts.hash || opts.keyStats) {
      localityGroupCF = iter.getLocalityGroupCF();
      FileSKVIterator dataIter;
      if (opts.useSample) {
        dataIter = iter.getSample();
        if (dataIter == null) {
          System.out.println("ERROR : This rfile has no sample data");
          return;
        }
      } else {
        dataIter = iter;
      }
      if (opts.keyStats) {
        FileSKVIterator indexIter = iter.getIndex();
        while (indexIter.hasTop()) {
          indexKeyStats.add(indexIter.getTopKey());
          indexIter.next();
        }
      }
      for (String lgName : localityGroupCF.keySet()) {
        LocalityGroupUtil.seek(dataIter, new Range(), lgName, localityGroupCF);
        while (dataIter.hasTop()) {
          Key key = dataIter.getTopKey();
          Value value = dataIter.getTopValue();
          if (opts.dump) {
            System.out.println(key + " -> " + value);
            if (System.out.checkError())
              return;
          }
          if (opts.histogram) {
            kvHistogram.add(key.getSize() + value.getSize());
          }
          if (opts.keyStats) {
            dataKeyStats.add(key);
          }
          dataIter.next();
        }
      }
    }
    if (opts.printSummary) {
      SummaryReader.print(iter, System.out);
    }
    iter.close();
    if (opts.vis || opts.hash) {
      System.out.println();
      vmg.printMetrics(opts.hash, "Visibility", System.out);
    }
    if (opts.histogram) {
      System.out.println();
      kvHistogram.print("");
    }
    if (opts.keyStats) {
      System.out.println();
      System.out.println("Statistics for keys in data :");
      dataKeyStats.print("\t");
      System.out.println();
      System.out.println("Statistics for keys in index :");
      indexKeyStats.print("\t");
    }
    // If the output stream has closed, there is no reason to keep going.
    if (System.out.checkError())
      return;
  }
}
Use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.
The class MapFileOperations, method openScanReader.
@Override
protected FileSKVIterator openScanReader(OpenScanReaderOperation options) throws IOException {
  MapFileIterator mfIter = new MapFileIterator(options.getTableConfiguration(), options.getFileSystem(), options.getFilename(), options.getConfiguration());
  FileSKVIterator iter = new RangeIterator(mfIter);
  iter.seek(options.getRange(), options.getColumnFamilies(), options.isRangeInclusive());
  return iter;
}
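The iterator returned by openScanReader has already been seeked to the requested range, so a caller only needs to drain it. A minimal sketch, where the options object and the per-entry handling are placeholders:
// Drain a scan reader; it was seeked to options.getRange() inside openScanReader.
FileSKVIterator scanIter = openScanReader(options); // options: a hypothetical OpenScanReaderOperation
while (scanIter.hasTop()) {
  Key k = scanIter.getTopKey();
  Value v = scanIter.getTopValue();
  System.out.println(k + " -> " + v); // placeholder for real per-entry processing
  scanIter.next();
}
scanIter.close();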
Use of org.apache.accumulo.core.file.FileSKVIterator in project accumulo by apache.
The class RFileTest, method testSample.
@Test
public void testSample() throws IOException {
  int num = 10000;
  for (int sampleBufferSize : new int[] { 1 << 10, 1 << 20 }) {
    // force sample buffer to flush for smaller data
    RFile.setSampleBufferSize(sampleBufferSize);
    for (int modulus : new int[] { 19, 103, 1019 }) {
      Hasher dataHasher = Hashing.md5().newHasher();
      List<Entry<Key, Value>> sampleData = new ArrayList<>();
      ConfigurationCopy sampleConf = new ConfigurationCopy(conf == null ? DefaultConfiguration.getInstance() : conf);
      sampleConf.set(Property.TABLE_SAMPLER, RowSampler.class.getName());
      sampleConf.set(Property.TABLE_SAMPLER_OPTS + "hasher", "murmur3_32");
      sampleConf.set(Property.TABLE_SAMPLER_OPTS + "modulus", modulus + "");
      Sampler sampler = SamplerFactory.newSampler(SamplerConfigurationImpl.newSamplerConfig(sampleConf), sampleConf);
      TestRFile trf = new TestRFile(sampleConf);
      trf.openWriter();
      for (int i = 0; i < num; i++) {
        add(trf, newKey(i, 0), newValue(i, 0), dataHasher, sampleData, sampler);
        add(trf, newKey(i, 1), newValue(i, 1), dataHasher, sampleData, sampler);
      }
      HashCode expectedDataHash = dataHasher.hash();
      trf.closeWriter();
      trf.openReader();
      FileSKVIterator sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
      checkSample(sample, sampleData);
      Assert.assertEquals(expectedDataHash, hash(trf.reader));
      SampleIE ie = new SampleIE(SamplerConfigurationImpl.newSamplerConfig(sampleConf).toSamplerConfiguration());
      for (int i = 0; i < 3; i++) {
        // test opening and closing deep copies a few times.
        trf.reader.closeDeepCopies();
        sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
        SortedKeyValueIterator<Key, Value> sampleDC1 = sample.deepCopy(ie);
        SortedKeyValueIterator<Key, Value> sampleDC2 = sample.deepCopy(ie);
        SortedKeyValueIterator<Key, Value> sampleDC3 = trf.reader.deepCopy(ie);
        SortedKeyValueIterator<Key, Value> allDC1 = sampleDC1.deepCopy(new SampleIE(null));
        SortedKeyValueIterator<Key, Value> allDC2 = sample.deepCopy(new SampleIE(null));
        Assert.assertEquals(expectedDataHash, hash(allDC1));
        Assert.assertEquals(expectedDataHash, hash(allDC2));
        checkSample(sample, sampleData);
        checkSample(sampleDC1, sampleData);
        checkSample(sampleDC2, sampleData);
        checkSample(sampleDC3, sampleData);
      }
      trf.reader.closeDeepCopies();
      trf.closeReader();
    }
  }
}
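The sampler settings placed on the ConfigurationCopy above correspond to table properties on a live system. A hedged sketch of applying them through the client API; the Connector, table name, and modulus value are placeholders, and the property keys are assumed to match Property.TABLE_SAMPLER and Property.TABLE_SAMPLER_OPTS:
// Hypothetical: enable the same row sampler on an existing table so that newly
// written rfiles carry a sample section readable via getSample(...).
connector.tableOperations().setProperty("mytable", "table.sampler", RowSampler.class.getName());
connector.tableOperations().setProperty("mytable", "table.sampler.opt.hasher", "murmur3_32");
connector.tableOperations().setProperty("mytable", "table.sampler.opt.modulus", "1009");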