
Example 6 with LuceneIndexInputSplit

Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by twitter.

From the class TestLuceneIndexRecordReader, method testLuceneIndexRecordReader:

private void testLuceneIndexRecordReader(ArrayList<String> queryStrings, ArrayList<Path> indexPaths, ArrayList<ArrayList<ArrayList<Integer>>> indexesQueriesDocIds) throws Exception {
    // Mock a split that returns the index directories under test
    LuceneIndexInputSplit split = createStrictMock(LuceneIndexInputSplit.class);
    expect(split.getIndexDirs()).andReturn(indexPaths);
    replay(split);
    Configuration conf = new Configuration();
    TaskAttemptContext context = createStrictMock(TaskAttemptContext.class);
    // HadoopCompat.getConfiguration delegates to context.getConfiguration(),
    // so this records the expectation on the mocked context
    expect(HadoopCompat.getConfiguration(context)).andStubReturn(conf);
    // casting to avoid Hadoop 2 incompatibility
    ((Progressable) context).progress();
    expectLastCall().atLeastOnce();
    replay(context);
    LuceneIndexInputFormat.setQueries(queryStrings, conf);
    // Partial mock: openIndex and createSearcher are stubbed explicitly;
    // abstract methods such as deserializeQuery are mocked automatically
    LuceneIndexRecordReader<IntWritable> rr = createMockBuilder(MockRecordReader.class).addMockedMethod("openIndex").addMockedMethod("createSearcher").createMock();
    Query[] queries = new Query[queryStrings.size()];
    for (int i = 0; i < queries.length; i++) {
        Query query = createStrictMock(Query.class);
        replay(query);
        queries[i] = query;
        expect(rr.deserializeQuery(queryStrings.get(i))).andReturn(query);
    }
    for (int index = 0; index < indexPaths.size(); index++) {
        IndexReader reader = createStrictMock(IndexReader.class);
        expect(reader.maxDoc()).andStubReturn(4);
        replay(reader);
        expect(rr.openIndex(indexPaths.get(index), conf)).andReturn(reader);
        IndexSearcher searcher = createStrictMock(IndexSearcher.class);
        expect(rr.createSearcher(reader)).andReturn(searcher);
        for (int query = 0; query < queries.length; query++) {
            final ArrayList<Integer> ids = indexesQueriesDocIds.get(index).get(query);
            // Capture the Collector so the mocked search can replay the
            // doc ids this query is expected to hit
            final Capture<Collector> collectorCapture = new Capture<Collector>();
            expect(searcher.getIndexReader()).andReturn(reader);
            searcher.search(eq(queries[query]), capture(collectorCapture));
            expectLastCall().andAnswer(new IAnswer<Void>() {

                @Override
                public Void answer() throws Throwable {
                    for (int id : ids) {
                        collectorCapture.getValue().collect(id);
                    }
                    return null;
                }
            });
            // docs and docsAndValues are fixtures defined elsewhere in the test class
            for (int docId : ids) {
                expect(searcher.doc(docId)).andReturn(docs[docId]);
            }
        }
        replay(searcher);
    }
    replay(rr);
    rr.initialize(split, context);
    float prevProgress = -1;
    for (int index = 0; index < indexesQueriesDocIds.size(); index++) {
        for (int query = 0; query < indexesQueriesDocIds.get(index).size(); query++) {
            for (int docId : indexesQueriesDocIds.get(index).get(query)) {
                assertTrue(rr.nextKeyValue());
                assertEquals(query, rr.getCurrentKey().get());
                assertEquals(docsAndValues.get(docs[docId]), (Integer) rr.getCurrentValue().get());
                float newProgress = rr.getProgress();
                // progress should advance with each record and never exceed 1.0
                assertTrue(newProgress > prevProgress);
                assertTrue(newProgress <= 1.0);
                prevProgress = newProgress;
            }
        }
    }
    // after the input is exhausted, nextKeyValue must keep returning false
    assertFalse(rr.nextKeyValue());
    assertFalse(rr.nextKeyValue());
    verifyAll();
}
Also used: LuceneIndexInputSplit (com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit), IndexSearcher (org.apache.lucene.search.IndexSearcher), Configuration (org.apache.hadoop.conf.Configuration), Query (org.apache.lucene.search.Query), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Capture (org.easymock.Capture), Progressable (org.apache.hadoop.util.Progressable), IndexReader (org.apache.lucene.index.IndexReader), Collector (org.apache.lucene.search.Collector), IntWritable (org.apache.hadoop.io.IntWritable)
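
To make the helper's shape concrete, here is a hypothetical invocation for a single index queried by two queries. The query strings, path, and doc ids are illustrative assumptions, not values from the actual test class; the query strings are opaque here because deserializeQuery is mocked.

    // Hypothetical invocation of the helper above; all values are illustrative.
    // One index at /index/a; the first query matches docs 0 and 2, the second
    // matches doc 1 (reader.maxDoc() is stubbed to 4, so ids 0-3 are valid).
    ArrayList<String> queries = Lists.newArrayList("query1", "query2");
    ArrayList<Path> indexes = Lists.newArrayList(new Path("/index/a"));
    ArrayList<ArrayList<ArrayList<Integer>>> docIds = Lists.newArrayList(
        Lists.newArrayList(
            Lists.newArrayList(0, 2),   // doc ids hit by the first query
            Lists.newArrayList(1)));    // doc ids hit by the second query
    testLuceneIndexRecordReader(queries, indexes, docIds);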

Example 7 with LuceneIndexInputSplit

Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by twitter.

From the class TestLuceneIndexInputFormat, method testCombineSplits:

@Test
public void testCombineSplits() throws Exception {
    DummyLuceneInputFormat lif = new DummyLuceneInputFormat();
    PriorityQueue<LuceneIndexInputSplit> splits = new PriorityQueue<LuceneIndexInputSplit>();
    String[] paths = new String[] { "/index/1", "/index/2", "/index/3", "/index/4", "/index/5", "/index/6" };
    Long[] sizes = new Long[] { 500L, 300L, 100L, 150L, 1200L, 500L };
    for (int i = 0; i < paths.length; i++) {
        splits.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path(paths[i])), sizes[i]));
    }
    // 1000L caps the combined size of a split; the 10000L limit is large
    // enough that it does not bind in this test
    List<InputSplit> combined = lif.combineSplits(splits, 1000L, 10000L);
    assertEquals(3, combined.size());
    List<Path> dirs = ((LuceneIndexInputSplit) combined.get(0)).getIndexDirs();
    Set<String> dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
    assertEquals(3, dirsStrings.size());
    assertTrue(dirsStrings.contains("/index/2"));
    assertTrue(dirsStrings.contains("/index/3"));
    assertTrue(dirsStrings.contains("/index/4"));
    dirs = ((LuceneIndexInputSplit) combined.get(1)).getIndexDirs();
    dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
    assertEquals(2, dirsStrings.size());
    assertTrue(dirsStrings.contains("/index/1"));
    assertTrue(dirsStrings.contains("/index/6"));
    dirs = ((LuceneIndexInputSplit) combined.get(2)).getIndexDirs();
    dirsStrings = Sets.newHashSet(Iterables.transform(dirs, Functions.toStringFunction()));
    assertEquals(1, dirsStrings.size());
    assertTrue(dirsStrings.contains("/index/5"));
}
Also used: LuceneIndexInputSplit (com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit), Path (org.apache.hadoop.fs.Path), PriorityQueue (java.util.PriorityQueue), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
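
The expected grouping follows from greedy, smallest-first packing: with a 1000L cap, sizes 100, 150, and 300 combine (total 550, since adding 500 would overflow), the two 500s combine (exactly 1000), and 1200 stands alone. Below is a minimal sketch of that behavior, assuming combineSplits drains the priority queue smallest-first and closes a group when the next split would exceed the cap. This is an illustration of the behavior the test asserts, not the actual elephant-bird implementation (which also enforces the max-index-count limit passed as 10000L above).

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.PriorityQueue;

    public class CombineSketch {
        // Greedy smallest-first packing: each group's total stays within
        // maxCombinedSize, except that a single oversized element still
        // forms its own group.
        static List<List<Long>> combine(PriorityQueue<Long> sizes, long maxCombinedSize) {
            List<List<Long>> combined = new ArrayList<>();
            while (!sizes.isEmpty()) {
                List<Long> current = new ArrayList<>();
                // the smallest remaining size always opens a new group
                long total = sizes.peek();
                current.add(sizes.poll());
                while (!sizes.isEmpty() && total + sizes.peek() <= maxCombinedSize) {
                    total += sizes.peek();
                    current.add(sizes.poll());
                }
                combined.add(current);
            }
            return combined;
        }

        public static void main(String[] args) {
            PriorityQueue<Long> sizes =
                new PriorityQueue<>(Arrays.asList(500L, 300L, 100L, 150L, 1200L, 500L));
            // prints [[100, 150, 300], [500, 500], [1200]], mirroring the
            // three combined splits the test asserts
            System.out.println(combine(sizes, 1000L));
        }
    }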

Example 8 with LuceneIndexInputSplit

Use of com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit in project elephant-bird by twitter.

From the class TestLuceneIndexInputFormat, method testCombineSplitsAllTooBig:

@Test
public void testCombineSplitsAllTooBig() throws Exception {
    DummyLuceneInputFormat lif = new DummyLuceneInputFormat();
    PriorityQueue<LuceneIndexInputSplit> splits = new PriorityQueue<LuceneIndexInputSplit>();
    String[] paths = new String[] { "/index/1", "/index/2", "/index/3" };
    Long[] sizes = new Long[] { 1500L, 1501L, 1502L };
    for (int i = 0; i < paths.length; i++) {
        splits.add(new LuceneIndexInputSplit(Lists.newArrayList(new Path(paths[i])), sizes[i]));
    }
    // every split exceeds the 1000L cap, so none can be combined
    List<InputSplit> combined = lif.combineSplits(splits, 1000L, 10000L);
    assertEquals(3, combined.size());
    for (int i = 0; i < paths.length; i++) {
        List<Path> dirs = ((LuceneIndexInputSplit) combined.get(i)).getIndexDirs();
        List<String> dirsStrings = Lists.newLinkedList(Iterables.transform(dirs, Functions.toStringFunction()));
        assertEquals(1, dirsStrings.size());
        assertEquals("/index/" + String.valueOf(i + 1), dirsStrings.get(0));
    }
}
Also used: LuceneIndexInputSplit (com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit), Path (org.apache.hadoop.fs.Path), PriorityQueue (java.util.PriorityQueue), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
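
Under the same assumed greedy behavior, a split that alone exceeds the cap opens and immediately closes its own group, which is why this test expects three singleton splits ordered by size:

    // Reusing the hypothetical CombineSketch.combine() from the previous note:
    PriorityQueue<Long> sizes = new PriorityQueue<>(Arrays.asList(1500L, 1501L, 1502L));
    // prints [[1500], [1501], [1502]]: each size exceeds the 1000L cap,
    // so nothing is combined
    System.out.println(CombineSketch.combine(sizes, 1000L));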

Aggregations

LuceneIndexInputSplit (com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat.LuceneIndexInputSplit): 8 usages
Path (org.apache.hadoop.fs.Path): 6 usages
Test (org.junit.Test): 6 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 5 usages
PriorityQueue (java.util.PriorityQueue): 4 usages
Configuration (org.apache.hadoop.conf.Configuration): 3 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 1 usage
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1 usage
DataInputStream (java.io.DataInputStream): 1 usage
DataOutputStream (java.io.DataOutputStream): 1 usage
IntWritable (org.apache.hadoop.io.IntWritable): 1 usage
JobContext (org.apache.hadoop.mapreduce.JobContext): 1 usage
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 1 usage
Progressable (org.apache.hadoop.util.Progressable): 1 usage
IndexReader (org.apache.lucene.index.IndexReader): 1 usage
Collector (org.apache.lucene.search.Collector): 1 usage
IndexSearcher (org.apache.lucene.search.IndexSearcher): 1 usage
Query (org.apache.lucene.search.Query): 1 usage
Capture (org.easymock.Capture): 1 usage