Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by Twitter.
From the class TestLuceneIndexRecordReader, method testLuceneIndexRecordReader:
private void testLuceneIndexRecordReader(ArrayList<String> queryStrings,
                                         ArrayList<Path> indexPaths,
                                         ArrayList<ArrayList<ArrayList<Integer>>> indexesQueriesDocIds)
    throws Exception {
  LuceneIndexInputSplit split = createStrictMock(LuceneIndexInputSplit.class);
  expect(split.getIndexDirs()).andReturn(indexPaths);
  replay(split);

  Configuration conf = new Configuration();
  TaskAttemptContext context = createStrictMock(TaskAttemptContext.class);
  expect(HadoopCompat.getConfiguration(context)).andStubReturn(conf);
  // casting to avoid Hadoop 2 incompatibility
  ((Progressable) context).progress();
  expectLastCall().atLeastOnce();
  replay(context);

  LuceneIndexInputFormat.setQueries(queryStrings, conf);

  LuceneIndexRecordReader<IntWritable> rr = createMockBuilder(MockRecordReader.class)
      .addMockedMethod("openIndex")
      .addMockedMethod("createSearcher")
      .createMock();

  Query[] queries = new Query[queryStrings.size()];
  for (int i = 0; i < queries.length; i++) {
    Query query = createStrictMock(Query.class);
    replay(query);
    queries[i] = query;
    expect(rr.deserializeQuery(queryStrings.get(i))).andReturn(query);
  }

  for (int index = 0; index < indexPaths.size(); index++) {
    IndexReader reader = createStrictMock(IndexReader.class);
    expect(reader.maxDoc()).andStubReturn(4);
    replay(reader);
    expect(rr.openIndex(indexPaths.get(index), conf)).andReturn(reader);

    IndexSearcher searcher = createStrictMock(IndexSearcher.class);
    expect(rr.createSearcher(reader)).andReturn(searcher);

    for (int query = 0; query < queries.length; query++) {
      final ArrayList<Integer> ids = indexesQueriesDocIds.get(index).get(query);
      final Capture<Collector> collectorCapture = new Capture<Collector>();
      expect(searcher.getIndexReader()).andReturn(reader);
      searcher.search(eq(queries[query]), capture(collectorCapture));
      // feed the expected doc ids to whatever Collector the record reader passes in
      expectLastCall().andAnswer(new IAnswer<Void>() {
        @Override
        public Void answer() throws Throwable {
          for (int id : ids) {
            collectorCapture.getValue().collect(id);
          }
          return null;
        }
      });
      for (int docId : ids) {
        // docs is a Document[] fixture defined elsewhere in this test class
        expect(searcher.doc(docId)).andReturn(docs[docId]);
      }
    }
    replay(searcher);
  }
  replay(rr);

  rr.initialize(split, context);

  float prevProgress = -1;
  for (int index = 0; index < indexesQueriesDocIds.size(); index++) {
    for (int query = 0; query < indexesQueriesDocIds.get(index).size(); query++) {
      for (int docId : indexesQueriesDocIds.get(index).get(query)) {
        assertTrue(rr.nextKeyValue());
        // the key is the index of the query that matched this document
        assertEquals(query, rr.getCurrentKey().get());
        // docsAndValues maps each Document fixture to its expected int value
        assertEquals(docsAndValues.get(docs[docId]), (Integer) rr.getCurrentValue().get());
        float newProgress = rr.getProgress();
        assertTrue(newProgress > prevProgress);
        assertTrue(newProgress <= 1.0);
      }
    }
  }
  assertFalse(rr.nextKeyValue());
  assertFalse(rr.nextKeyValue());
  verifyAll();
}
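The helper above is parameterized so that a single method can exercise many index/query combinations: one query string per key, one path per mocked index, and, for each index, a list of matching doc ids per query. A minimal hypothetical driver (the query strings, paths, and doc ids below are invented for illustration; the project's actual test cases live elsewhere in this class) might look like:

// Hypothetical driver: two queries against a single index.
// indexesQueriesDocIds.get(index).get(query) lists the doc ids the mocked
// searcher should "find" for that (index, query) pair; ids stay below the
// stubbed maxDoc() of 4.
ArrayList<String> queries = Lists.newArrayList("query1", "query2");
ArrayList<Path> indexes = Lists.newArrayList(new Path("/index/a"));
ArrayList<ArrayList<ArrayList<Integer>>> docIds = Lists.newArrayList(
    Lists.newArrayList(
        Lists.newArrayList(0, 1),    // doc ids matching "query1" in /index/a
        Lists.newArrayList(2, 3)));  // doc ids matching "query2" in /index/a
testLuceneIndexRecordReader(queries, indexes, docIds);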
Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by Twitter.
From the class TestLuceneIndexInputFormat, method testCombineSplits:
@Test
public void testCombineSplits() throws Exception {
  DummyLuceneInputFormat lif = new DummyLuceneInputFormat();

  PriorityQueue<LuceneIndexInputSplit> splits = new PriorityQueue<LuceneIndexInputSplit>();
  String[] paths = new String[] { "/index/1", "/index/2", "/index/3", "/index/4", "/index/5", "/index/6" };
  Long[] sizes = new Long[] { 500L, 300L, 100L, 150L, 1200L, 500L };
  for (int i = 0; i < paths.length; i++) {
    splits.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path(paths[i])), sizes[i]));
  }

  // combine with a max combined index size of 1000 and a max of 10000 indexes per split
  List<InputSplit> combined = lif.combineSplits(splits, 1000L, 10000L);
  assertEquals(3, combined.size());

  List<Path> dirs = ((LuceneIndexInputSplit) combined.get(0)).getIndexDirs();
  Set<String> dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
  assertEquals(3, dirsStrings.size());
  assertTrue(dirsStrings.contains("/index/2"));
  assertTrue(dirsStrings.contains("/index/3"));
  assertTrue(dirsStrings.contains("/index/4"));

  dirs = ((LuceneIndexInputSplit) combined.get(1)).getIndexDirs();
  dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
  assertEquals(2, dirsStrings.size());
  assertTrue(dirsStrings.contains("/index/1"));
  assertTrue(dirsStrings.contains("/index/6"));

  dirs = ((LuceneIndexInputSplit) combined.get(2)).getIndexDirs();
  dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
  assertEquals(1, dirsStrings.size());
  assertTrue(dirsStrings.contains("/index/5"));
}
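The expected grouping follows from a greedy, smallest-first packing: the priority queue yields splits in ascending size order, and indexes accumulate until adding the next one would push the combined size past the 1000 cap (550 for {/index/3, /index/4, /index/2}, exactly 1000 for {/index/1, /index/6}, and /index/5 alone at 1200). The following is a simplified sketch of that packing logic, assuming only what this test asserts, not the project's actual combineSplits implementation (uses java.util.List, ArrayList, and PriorityQueue):

// Illustrative only: greedy smallest-first packing consistent with the
// assertions above; not elephant-bird's real combineSplits.
static List<List<Long>> pack(PriorityQueue<Long> sizes, long maxCombinedSize) {
  List<List<Long>> groups = new ArrayList<>();
  List<Long> current = new ArrayList<>();
  long currentSize = 0;
  while (!sizes.isEmpty()) {
    long next = sizes.poll();
    if (!current.isEmpty() && currentSize + next > maxCombinedSize) {
      groups.add(current);  // close the group before it would overflow the cap
      current = new ArrayList<>();
      currentSize = 0;
    }
    current.add(next);      // an oversized item still lands in its own group
    currentSize += next;
  }
  if (!current.isEmpty()) {
    groups.add(current);
  }
  return groups;
}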
Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by Twitter.
From the class TestLuceneIndexInputFormat, method testCombineSplitsAllTooBig:
@Test
public void testCombineSplitsAllTooBig() throws Exception {
  DummyLuceneInputFormat lif = new DummyLuceneInputFormat();

  PriorityQueue<LuceneIndexInputSplit> splits = new PriorityQueue<LuceneIndexInputSplit>();
  String[] paths = new String[] { "/index/1", "/index/2", "/index/3" };
  Long[] sizes = new Long[] { 1500L, 1501L, 1502L };
  for (int i = 0; i < paths.length; i++) {
    splits.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path(paths[i])), sizes[i]));
  }

  List<InputSplit> combined = lif.combineSplits(splits, 1000L, 10000L);
  assertEquals(3, combined.size());

  for (int i = 0; i < paths.length; i++) {
    List<Path> dirs = ((LuceneIndexInputSplit) combined.get(i)).getIndexDirs();
    List<String> dirsStrings = Lists.newLinkedList(Iterables.transform(dirs, Functions.toStringFunction()));
    assertEquals(1, dirsStrings.size());
    assertEquals("/index/" + String.valueOf(i + 1), dirsStrings.get(0));
  }
}
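Together the two tests pin down the boundary behavior: an index at exactly the size cap can still be combined (the 500 + 500 = 1000 grouping above), while any index larger than the cap always becomes its own split. Both tests also rely on the splits being drained smallest-first from the PriorityQueue, which is what makes the grouping deterministic. A small hedged illustration of that ordering assumption, suitable for a throws-Exception test method:

// Assumption: LuceneIndexInputSplit.compareTo orders splits by size,
// smallest first — implied by the PriorityQueue usage in these tests.
PriorityQueue<LuceneIndexInputSplit> q = new PriorityQueue<LuceneIndexInputSplit>();
q.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path("/index/big")), 1200L));
q.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path("/index/small")), 100L));
// getLength() is assumed here to report the split's total index size
assertEquals(100L, q.poll().getLength());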