Search in sources:

Example 1 with Document

use of org.apache.lucene.document.Document in project camel by apache.

the class LuceneSearcher method search.

/**
 * Runs a search for the given phrase and packages the matches into a {@link Hits} result.
 *
 * <p>The hit count returned by {@code doSearch} is stored on the result, then every
 * {@link ScoreDoc} in {@code hits} is converted into a {@link Hit}. Note that {@code hits}
 * is not declared in this method; presumably it is a field populated by {@code doSearch}
 * (confirm against the enclosing class).
 *
 * @param searchPhrase     phrase forwarded to {@code doSearch}
 * @param maxNumberOfHits  upper bound forwarded to {@code doSearch}
 * @param luceneVersion    Lucene version forwarded to {@code doSearch}
 * @param returnLuceneDocs when {@code true}, the Lucene document itself is attached to each hit
 * @return the populated result object
 * @throws Exception if the search or loading a document fails
 */
public Hits search(String searchPhrase, int maxNumberOfHits, Version luceneVersion, boolean returnLuceneDocs) throws Exception {
    Hits result = new Hits();
    result.setNumberOfHits(doSearch(searchPhrase, maxNumberOfHits, luceneVersion));
    for (ScoreDoc scoreDoc : hits) {
        int docId = scoreDoc.doc;
        Document luceneDoc = indexSearcher.doc(docId);
        Hit entry = new Hit();
        // The full Lucene document is only exposed on request.
        if (returnLuceneDocs) {
            entry.setDocument(luceneDoc);
        }
        entry.setHitLocation(docId);
        entry.setScore(scoreDoc.score);
        // The stored "contents" field is always copied onto the hit.
        entry.setData(luceneDoc.get("contents"));
        result.getHit().add(entry);
    }
    return result;
}
Also used : Hits(org.apache.camel.processor.lucene.support.Hits) Hit(org.apache.camel.processor.lucene.support.Hit) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc)

Example 2 with Document

use of org.apache.lucene.document.Document in project zeppelin by apache.

the class LuceneSearch method doSearch.

/**
 * Runs {@code query} against {@code searcher} (top 20 hits) and converts every hit that
 * carries an {@code ID_FIELD} value into a map with the keys {@code id}, {@code name},
 * {@code snippet}, {@code text} and {@code header}.
 *
 * <p>{@code snippet} and {@code header} are the first highlighted fragment of the stored
 * text and title fields respectively, or the empty string when there is none. Because
 * {@link ImmutableMap} rejects null values, a missing {@code title} or text field is
 * reported as the empty string instead of failing with a NullPointerException.
 *
 * <p>An {@link IOException} or {@link InvalidTokenOffsetsException} is logged and ends the
 * loop; the results collected up to that point are returned.
 *
 * @param searcher    searcher used to run the query and load the stored documents
 * @param query       query to execute
 * @param analyzer    analyzer used to build the token streams for highlighting
 * @param highlighter highlighter that produces the text fragments
 * @return the list of matches, possibly empty, never {@code null}
 */
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
    List<Map<String, String>> matchingParagraphs = Lists.newArrayList();
    try {
        ScoreDoc[] hits = searcher.search(query, 20).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            LOG.debug("doc={} score={}", hits[i].doc, hits[i].score);
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String path = doc.get(ID_FIELD);
            if (path == null) {
                LOG.info("{}. No {} for this document", i + 1, ID_FIELD);
                continue;
            }
            LOG.debug("{}. {}", i + 1, path);
            String title = doc.get("title");
            if (title != null) {
                LOG.debug("   Title: {}", title);
            }
            String text = doc.get(SEARCH_FIELD_TEXT);
            String header = doc.get(SEARCH_FIELD_TITLE);
            String fragment = "";
            if (text != null) {
                TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
                TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, true, 3);
                // Guard the array before touching it; the selection below treats null as "no fragments".
                if (frag != null) {
                    LOG.debug("    {} fragments found for query '{}'", frag.length, query);
                    for (int j = 0; j < frag.length; j++) {
                        if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                            LOG.debug("    Fragment: {}", frag[j].toString());
                        }
                    }
                }
                fragment = (frag != null && frag.length > 0 && frag[0] != null) ? frag[0].toString() : "";
            }
            if (header != null) {
                TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
                TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
                header = (frgTitle != null && frgTitle.length > 0 && frgTitle[0] != null) ? frgTitle[0].toString() : "";
            } else {
                header = "";
            }
            // ImmutableMap.of throws on null values, so absent fields are mapped to "".
            String name = (title != null) ? title : "";
            String fullText = (text != null) ? text : "";
            // "id" has the form <noteId>/paragraph/<paragraphId>
            matchingParagraphs.add(ImmutableMap.of(
                "id", path,
                "name", name,
                "snippet", fragment,
                "text", fullText,
                "header", header));
        }
    } catch (IOException | InvalidTokenOffsetsException e) {
        LOG.error("Exception on searching for {}", query, e);
    }
    return matchingParagraphs;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) TextFragment(org.apache.lucene.search.highlight.TextFragment) ScoreDoc(org.apache.lucene.search.ScoreDoc) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 3 with Document

use of org.apache.lucene.document.Document in project zeppelin by apache.

the class LuceneSearch method updateDoc.

/**
   * Updates the index entry for the given note: either the note name or a
   * single paragraph. If the paragraph is <code>null</code>, only the entry
   * for the note name is updated.
   *
   * <p>The document is replaced via the term {@code ID_FIELD = id}, where
   * {@code id} comes from {@code formatId(noteId, p)}, and the writer is
   * committed right away. An {@link IOException} raised by the writer is
   * logged here and not rethrown.
   *
   * @param noteId id of the note the entry belongs to
   * @param noteName name of the note, passed on to the new document
   * @param p paragraph to index, or <code>null</code> to index the note name only
   * @throws IOException kept in the signature for callers; failures of the
   *     writer update/commit are caught and logged inside this method
   */
private void updateDoc(String noteId, String noteName, Paragraph p) throws IOException {
    String id = formatId(noteId, p);
    Document doc = newDocument(id, noteName, p);
    try {
        writer.updateDocument(new Term(ID_FIELD, id), doc);
        writer.commit();
    } catch (IOException e) {
        LOG.error("Failed to update index of notebook {}", noteId, e);
    }
}
Also used : Term(org.apache.lucene.index.Term) IOException(java.io.IOException) Document(org.apache.lucene.document.Document)

Example 4 with Document

use of org.apache.lucene.document.Document in project zeppelin by apache.

the class LuceneSearch method newDocument.

/**
   * Builds the Lucene document for a note name or for one paragraph.
   *
   * <p>Every document stores the id under {@code ID_FIELD} and the note name
   * under {@code "title"}. Without a paragraph, the note name is also indexed
   * as the searchable text. With a paragraph, its text is indexed, its title
   * is indexed when present, and a non-stored {@code "modified"} field holds
   * the start date, falling back to the creation date.
   *
   * @param id id of the document, different for note name and paragraph
   * @param noteName name of the note
   * @param p paragraph to index, or <code>null</code> to index the note name
   * @return the newly built document
   */
private Document newDocument(String id, String noteName, Paragraph p) {
    Document document = new Document();
    document.add(new StringField(ID_FIELD, id, Field.Store.YES));
    document.add(new StringField("title", noteName, Field.Store.YES));
    if (p == null) {
        // No paragraph: the note name itself is the searchable text.
        document.add(new TextField(SEARCH_FIELD_TEXT, noteName, Field.Store.YES));
        return document;
    }
    document.add(new TextField(SEARCH_FIELD_TEXT, p.getText(), Field.Store.YES));
    String paragraphTitle = p.getTitle();
    if (paragraphTitle != null) {
        document.add(new TextField(SEARCH_FIELD_TITLE, paragraphTitle, Field.Store.YES));
    }
    // Prefer the start date; use the creation date when the paragraph has none.
    Date modified = p.getDateStarted();
    if (modified == null) {
        modified = p.getDateCreated();
    }
    document.add(new LongField("modified", modified.getTime(), Field.Store.NO));
    return document;
}
Also used : LongField(org.apache.lucene.document.LongField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) LongField(org.apache.lucene.document.LongField) StringField(org.apache.lucene.document.StringField) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) Date(java.util.Date)

Example 5 with Document

use of org.apache.lucene.document.Document in project crate by crate.

the class OrderedLuceneBatchIteratorBenchmark method createLuceneBatchIterator.

/**
 * Benchmark setup: builds an in-memory index and the objects used for ordered collection.
 *
 * <p>Indexes 10,000,000 documents, each with one {@link NumericDocValuesField} named
 * {@code "x"} whose value is the loop counter, commits, force-merges to a single segment
 * and opens an {@link IndexSearcher} on a reader obtained from the writer. It then
 * initialises the collector context, the field-type lookup, the reference for column
 * {@code "x"} and the {@link OrderBy} built from that reference.
 *
 * @throws Exception if indexing or opening the reader fails
 */
@Setup
public void createLuceneBatchIterator() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig writerConfig = new IndexWriterConfig(new StandardAnalyzer());
    IndexWriter writer = new IndexWriter(directory, writerConfig);
    dummyShardId = new ShardId("dummy", 1);
    columnName = "x";
    for (int value = 0; value < 10_000_000; value++) {
        Document document = new Document();
        document.add(new NumericDocValuesField(columnName, value));
        writer.addDocument(document);
    }
    writer.commit();
    writer.forceMerge(1, true);
    indexSearcher = new IndexSearcher(DirectoryReader.open(writer, true));

    CollectorFieldsVisitor fieldsVisitor = new CollectorFieldsVisitor(0);
    collectorContext = new CollectorContext(mock(IndexFieldDataService.class), fieldsVisitor);

    // Every column name is resolved to an integer field type carrying that name.
    fieldTypeLookup = name -> {
        IntegerFieldMapper.IntegerFieldType fieldType = new IntegerFieldMapper.IntegerFieldType();
        fieldType.setNames(new MappedFieldType.Names(name));
        return fieldType;
    };

    TableIdent table = new TableIdent(null, "dummyTable");
    ReferenceIdent ident = new ReferenceIdent(table, columnName);
    reference = new Reference(ident, RowGranularity.DOC, DataTypes.INTEGER);
    orderBy = new OrderBy(Collections.singletonList(reference), reverseFlags, nullsFirst);
}
Also used : OrderBy(io.crate.analyze.OrderBy) Reference(io.crate.metadata.Reference) TableIdent(io.crate.metadata.TableIdent) Document(org.apache.lucene.document.Document) IntegerFieldMapper(org.elasticsearch.index.mapper.core.IntegerFieldMapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) ReferenceIdent(io.crate.metadata.ReferenceIdent) ShardId(org.elasticsearch.index.shard.ShardId) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) MappedFieldType(org.elasticsearch.index.mapper.MappedFieldType) CollectorContext(io.crate.operation.reference.doc.lucene.CollectorContext) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

Document (org.apache.lucene.document.Document)2344 Directory (org.apache.lucene.store.Directory)1374 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)798 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)752 IndexReader (org.apache.lucene.index.IndexReader)598 Field (org.apache.lucene.document.Field)480 IndexSearcher (org.apache.lucene.search.IndexSearcher)470 Term (org.apache.lucene.index.Term)456 BytesRef (org.apache.lucene.util.BytesRef)415 StringField (org.apache.lucene.document.StringField)403 TextField (org.apache.lucene.document.TextField)389 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)325 IndexWriter (org.apache.lucene.index.IndexWriter)312 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)278 TopDocs (org.apache.lucene.search.TopDocs)270 TermQuery (org.apache.lucene.search.TermQuery)237 FieldType (org.apache.lucene.document.FieldType)231 DirectoryReader (org.apache.lucene.index.DirectoryReader)226 Test (org.junit.Test)222 RAMDirectory (org.apache.lucene.store.RAMDirectory)211