Example 86 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

From class BlendedTermQueryTests, method testDismaxQuery().

public void testDismaxQuery() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    String[] username = new String[] { "foo fighters", "some cool fan", "cover band" };
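    // NOTE: "foo fighers" in song[1] below is misspelled as in the original test data;
    // the second query block further down queries that exact misspelled term.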
    String[] song = new String[] { "generator", "foo fighers - generator", "foo fighters generator" };
    final boolean omitNorms = random().nextBoolean();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(random().nextBoolean() ? IndexOptions.DOCS : IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(omitNorms);
    ft.freeze();
    FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
    ft1.setIndexOptions(random().nextBoolean() ? IndexOptions.DOCS : IndexOptions.DOCS_AND_FREQS);
    ft1.setOmitNorms(omitNorms);
    ft1.freeze();
    for (int i = 0; i < username.length; i++) {
        Document d = new Document();
        d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
        d.add(new Field("username", username[i], ft));
        d.add(new Field("song", song[i], ft));
        w.addDocument(d);
    }
    int iters = scaledRandomIntBetween(25, 100);
    for (int j = 0; j < iters; j++) {
        Document d = new Document();
        d.add(new TextField("id", Integer.toString(username.length + j), Field.Store.YES));
        d.add(new Field("username", "foo fighters", ft1));
        d.add(new Field("song", "some bogus text to bump up IDF", ft1));
        w.addDocument(d);
    }
    w.commit();
    DirectoryReader reader = DirectoryReader.open(w);
    IndexSearcher searcher = setSimilarity(newSearcher(reader));
    {
        String[] fields = new String[] { "username", "song" };
        BooleanQuery.Builder query = new BooleanQuery.Builder();
        query.setDisableCoord(true);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f), BooleanClause.Occur.SHOULD);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "fighters"), 0.1f), BooleanClause.Occur.SHOULD);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "generator"), 0.1f), BooleanClause.Occur.SHOULD);
        TopDocs search = searcher.search(query.build(), 10);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
    }
    {
        BooleanQuery.Builder query = new BooleanQuery.Builder();
        query.setDisableCoord(true);
        DisjunctionMaxQuery uname = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "foo")), new TermQuery(new Term("song", "foo"))), 0.0f);
        DisjunctionMaxQuery s = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "fighers")), new TermQuery(new Term("song", "fighers"))), 0.0f);
        DisjunctionMaxQuery gen = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "generator")), new TermQuery(new Term("song", "generator"))), 0f);
        query.add(uname, BooleanClause.Occur.SHOULD);
        query.add(s, BooleanClause.Occur.SHOULD);
        query.add(gen, BooleanClause.Occur.SHOULD);
        TopDocs search = searcher.search(query.build(), 4);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
    }
    reader.close();
    w.close();
    dir.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), BooleanQuery (org.apache.lucene.search.BooleanQuery), TermQuery (org.apache.lucene.search.TermQuery), DirectoryReader (org.apache.lucene.index.DirectoryReader), DisjunctionMaxQuery (org.apache.lucene.search.DisjunctionMaxQuery), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), FieldType (org.apache.lucene.document.FieldType), TopDocs (org.apache.lucene.search.TopDocs), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), IndexWriter (org.apache.lucene.index.IndexWriter), Directory (org.apache.lucene.store.Directory)
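
To try the core of this pattern outside the Elasticsearch test harness, here is a minimal, self-contained sketch. It assumes plain Lucene of roughly the same era (6.x), substitutes RAMDirectory and StandardAnalyzer for newDirectory()/MockAnalyzer, and uses a stock DisjunctionMaxQuery in place of BlendedTermQuery; the class name DismaxSketch and the sample values are made up for illustration.

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class DismaxSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        // A frozen custom FieldType, as in the test: indexed with term frequencies, not stored.
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        ft.freeze();

        Document d = new Document();
        d.add(new TextField("id", "0", Field.Store.YES));
        d.add(new Field("username", "foo fighters", ft));
        d.add(new Field("song", "generator", ft));
        w.addDocument(d);
        w.commit();

        DirectoryReader reader = DirectoryReader.open(w);
        IndexSearcher searcher = new IndexSearcher(reader);

        // Dismax over both fields: the document is scored by its best-matching field.
        DisjunctionMaxQuery q = new DisjunctionMaxQuery(
                Arrays.asList(new TermQuery(new Term("username", "foo")),
                              new TermQuery(new Term("song", "foo"))), 0.0f);
        TopDocs hits = searcher.search(q, 10);
        System.out.println("hits: " + hits.totalHits);

        reader.close();
        w.close();
        dir.close();
    }
}

With a tie breaker of 0.0f only the highest-scoring field contributes to each document's score, which is also what the plain DisjunctionMaxQuery block in the test above uses.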

Example 87 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

From class MinDocQueryTests, method testRandom().

public void testRandom() throws IOException {
    final int numDocs = randomIntBetween(10, 200);
    final Document doc = new Document();
    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; ++i) {
        w.addDocument(doc);
    }
    final IndexReader reader = w.getReader();
    final IndexSearcher searcher = newSearcher(reader);
    for (int i = 0; i <= numDocs; ++i) {
        assertEquals(numDocs - i, searcher.count(new MinDocQuery(i)));
    }
    w.close();
    reader.close();
    dir.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), IndexReader (org.apache.lucene.index.IndexReader), Document (org.apache.lucene.document.Document), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
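
MinDocQuery matches every document whose Lucene doc ID is greater than or equal to the given minimum, which is exactly what the loop above asserts (numDocs - i matches for a threshold of i). Below is a minimal sketch of the same check without the randomized fixtures; it assumes Elasticsearch's org.apache.lucene.queries.MinDocQuery is on the classpath, since that class ships with Elasticsearch rather than with Lucene itself.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
// Assumption: MinDocQuery lives in this package in the Elasticsearch sources; it is not part of Lucene.
import org.apache.lucene.queries.MinDocQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class MinDocQuerySketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        // Index ten empty documents; only their Lucene doc IDs (0..9) matter here.
        for (int i = 0; i < 10; i++) {
            w.addDocument(new Document());
        }
        DirectoryReader reader = DirectoryReader.open(w);
        IndexSearcher searcher = new IndexSearcher(reader);
        // MinDocQuery(minDoc) matches every doc with ID >= minDoc, so the count shrinks as the threshold rises.
        System.out.println(searcher.count(new MinDocQuery(0))); // 10
        System.out.println(searcher.count(new MinDocQuery(4))); // 6
        reader.close();
        w.close();
        dir.close();
    }
}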

Example 88 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

From class CustomPostingsHighlighterTests, method testCustomPostingsHighlighter().

public void testCustomPostingsHighlighter() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    //good position but only one match
    final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.add(body);
    body.setStringValue(firstValue);
    //two matches, not the best snippet due to its length though
    final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
    Field body2 = new Field("body", "", offsetsType);
    doc.add(body2);
    body2.setStringValue(secondValue);
    //two matches and short, will be scored highest
    final String thirdValue = "This is highlighting the third short highlighting value.";
    Field body3 = new Field("body", "", offsetsType);
    doc.add(body3);
    body3.setStringValue(thirdValue);
    //one match, same as first but at the end, will be scored lower due to its position
    final String fourthValue = "Just a test4 highlighting from postings highlighter.";
    Field body4 = new Field("body", "", offsetsType);
    doc.add(body4);
    body4.setStringValue(fourthValue);
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
    String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
    String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
    String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
    IndexSearcher searcher = newSearcher(ir);
    Query query = new TermQuery(new Term("body", "highlighting"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    int docId = topDocs.scoreDocs[0].doc;
    String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
    CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
    Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
    assertThat(snippets.length, equalTo(4));
    assertThat(snippets[0].getText(), equalTo(firstHlValue));
    assertThat(snippets[1].getText(), equalTo(secondHlValue));
    assertThat(snippets[2].getText(), equalTo(thirdHlValue));
    assertThat(snippets[3].getText(), equalTo(fourthHlValue));
    ir.close();
    dir.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), TermQuery (org.apache.lucene.search.TermQuery), Query (org.apache.lucene.search.Query), Term (org.apache.lucene.index.Term), Snippet (org.apache.lucene.search.highlight.Snippet), Document (org.apache.lucene.document.Document), FieldType (org.apache.lucene.document.FieldType), TopDocs (org.apache.lucene.search.TopDocs), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), DefaultEncoder (org.apache.lucene.search.highlight.DefaultEncoder), IndexReader (org.apache.lucene.index.IndexReader), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
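
The prerequisite this test depends on is that the body field is indexed with IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, so the highlighter can read offsets straight from the postings. The sketch below shows that setup in isolation, but swaps in Lucene's stock UnifiedHighlighter (available since Lucene 6.3) for the Elasticsearch-specific CustomPostingsHighlighter and CustomPassageFormatter, so it does not reproduce the exact snippet ordering asserted above; the class name HighlightSketch and the sample text are invented.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class HighlightSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(analyzer));

        // Offsets must be recorded in the postings for postings-based highlighting to work.
        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
        offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        offsetsType.freeze();

        Document doc = new Document();
        doc.add(new Field("body", "Just a test highlighting from postings.", offsetsType));
        w.addDocument(doc);
        w.close();

        DirectoryReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new TermQuery(new Term("body", "highlighting"));
        TopDocs topDocs = searcher.search(query, 10);

        UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
        String[] fragments = highlighter.highlight("body", query, topDocs);
        System.out.println(fragments[0]); // e.g. "Just a test <b>highlighting</b> from postings."

        reader.close();
        dir.close();
    }
}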

Example 89 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

From class ShardDocumentLoader, method loadObject().

public CachedObjectWrapper<Document, Long> loadObject(String docIdKey, int start, int end, LayeredCache<String, Document, Long, Document> cache) throws IOException {
    Document document = new Document();
    // TODO should probably get SchemaField from Schema object.
    String keyField = "docId";
    Field field = new Field(keyField, this.parseDocIdFromKey(docIdKey), Store.YES, Index.ANALYZED);
    document.add(field);
    // TODO, get from result
    Long versionIdentifier = 0L;
    return new CachedObjectWrapper<Document, Long>(document, versionIdentifier, System.currentTimeMillis());
}
Also used: CachedObjectWrapper (org.solbase.cache.CachedObjectWrapper), Field (org.apache.lucene.document.Field), Document (org.apache.lucene.document.Document)
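
A side note for readers on current Lucene: the Field(name, value, Store.YES, Index.ANALYZED) constructor used here is the Lucene 3.x API and was removed in Lucene 4. Assuming a stored, analyzed docId field is still what is wanted, a rough modern equivalent is a TextField; the helper below is a hypothetical sketch, not part of Solbase.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;

public class DocIdFieldSketch {
    /** Builds a Document carrying a stored, analyzed docId field (Lucene 4+ API). */
    public static Document docWithId(String docId) {
        Document document = new Document();
        document.add(new TextField("docId", docId, Field.Store.YES));
        return document;
    }
}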

Example 90 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

From class SolbaseInitialIndexMapper, method map().

protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
    context.getCounter(Counters.TOTAL_ROWS).increment(1);
    context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
    // global id is user_media row key
    String globalId = Bytes.toString(row.get());
    Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
    byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
    if (doc == null) {
        // validation must have failed if it returned null
        return;
    }
    // exists already
    if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
        docId = SolbaseUtil.generateUniqId();
        this.idCounter = 0;
    } else {
        docId--;
    }
    // for us, docId is going to be global uniq id, meaning we are tied to 2 billion docs limitation
    // it doesn't really hurt to add this field to doc. and it only really matters when sharding comes in, trying to fetch docs by their docid
    indexerUtil.addFieldToDoc(doc, "docId", docId + "");
    // incrementing chunking sequence (lucene doc id)
    this.idCounter++;
    try {
        ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
        List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
        MapWritable mapWritable = new MapWritable();
        DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
        mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
        for (TermDocMetadata metadata : metadatas) {
            byte[] key = metadata.getFieldTermKey();
            ByteBuffer buf = metadata.serialize();
            TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
            mapWritable.put(new BytesWritable(key), writable);
        }
        context.write(new BytesWritable(checksum), mapWritable);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), TermDocMetadata (org.solbase.lucenehbase.TermDocMetadata), BytesWritable (org.apache.hadoop.io.BytesWritable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), MapWritable (org.apache.hadoop.io.MapWritable), Document (org.apache.lucene.document.Document), ByteBuffer (java.nio.ByteBuffer), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable)
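
The docId bookkeeping in this mapper reserves IDs in blocks of SolbaseUtil.UNIQ_ID_CHUNK and hands them out by counting down, going back to the comparatively expensive generateUniqId() call only when a block is exhausted. The standalone sketch below illustrates just that allocation pattern; the class, the block size, and the local generateUniqId() stub are invented stand-ins for Solbase's real counter.

/**
 * Hypothetical, standalone illustration of the doc-id chunking used in map() above:
 * reserve a block of ids, hand them out by counting down, and only ask the generator
 * for a new block when the current one is exhausted. UNIQ_ID_CHUNK and generateUniqId()
 * stand in for SolbaseUtil's HBase-backed counter and are not the real implementation.
 */
public class ChunkedIdAllocator {
    private static final int UNIQ_ID_CHUNK = 10000; // assumed block size
    private Integer docId = null;
    private int idCounter = 0;
    private int nextBlockTop = UNIQ_ID_CHUNK;       // local stand-in for the distributed counter

    /** Placeholder for SolbaseUtil.generateUniqId(): returns the top id of a fresh block. */
    private int generateUniqId() {
        int top = nextBlockTop;
        nextBlockTop += UNIQ_ID_CHUNK;
        return top;
    }

    /** Same branch structure as the mapper: re-allocate when the block is used up, else count down. */
    public int nextId() {
        if (idCounter > UNIQ_ID_CHUNK - 1 || docId == null) {
            docId = generateUniqId();
            idCounter = 0;
        } else {
            docId--;
        }
        idCounter++;
        return docId;
    }
}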

Aggregations

Document (org.apache.lucene.document.Document): 2344
Directory (org.apache.lucene.store.Directory): 1374
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 798
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 752
IndexReader (org.apache.lucene.index.IndexReader): 598
Field (org.apache.lucene.document.Field): 480
IndexSearcher (org.apache.lucene.search.IndexSearcher): 470
Term (org.apache.lucene.index.Term): 456
BytesRef (org.apache.lucene.util.BytesRef): 415
StringField (org.apache.lucene.document.StringField): 403
TextField (org.apache.lucene.document.TextField): 389
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 325
IndexWriter (org.apache.lucene.index.IndexWriter): 312
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 278
TopDocs (org.apache.lucene.search.TopDocs): 270
TermQuery (org.apache.lucene.search.TermQuery): 237
FieldType (org.apache.lucene.document.FieldType): 231
DirectoryReader (org.apache.lucene.index.DirectoryReader): 226
Test (org.junit.Test): 222
RAMDirectory (org.apache.lucene.store.RAMDirectory): 211