Search in sources:

Example 46 with IndexWriterConfig

use of org.apache.lucene.index.IndexWriterConfig in project gerrit by GerritCodeReview.

In class DocIndexer, the method index.

/**
 * Builds an in-memory Lucene index over the configured input files.
 *
 * <p>For each non-empty input file the first line (or the second line, when the
 * first is an AsciiDoc "[[tag]]" anchor) is indexed as the title, the whole file
 * body is indexed as text, and the mapped output file name is stored as the URL.
 *
 * @return a freshly populated {@link RAMDirectory}; the caller owns it
 * @throws IOException if reading an input file or writing the index fails
 */
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET));
    config.setOpenMode(OpenMode.CREATE);
    config.setCommitOnClose(true);
    try (IndexWriter iwriter = new IndexWriter(directory, config)) {
        for (String inputFile : inputFiles) {
            File file = new File(inputFile);
            if (file.length() == 0) {
                continue;
            }
            String title;
            try (BufferedReader titleReader = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), UTF_8))) {
                title = titleReader.readLine();
                if (title != null && title.startsWith("[[")) {
                    // Generally the first line of the txt is the title. In a few cases the
                    // first line is a "[[tag]]" and the second line is the title.
                    title = titleReader.readLine();
                }
            }
            if (title == null) {
                // The file held only a "[[tag]]" line (or nothing readable): there is
                // no title to index, and matching null below would throw an NPE.
                continue;
            }
            Matcher matcher = SECTION_HEADER.matcher(title);
            if (matcher.matches()) {
                title = matcher.group(1);
            }
            String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt);
            // Read the body as UTF-8, matching how the title was decoded above;
            // FileReader would silently use the platform default charset instead.
            try (InputStreamReader reader = new InputStreamReader(Files.newInputStream(file.toPath()), UTF_8)) {
                Document doc = new Document();
                doc.add(new TextField(Constants.DOC_FIELD, reader));
                doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
                doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
                iwriter.addDocument(doc);
            }
        }
    }
    return directory;
}
Also used : InputStreamReader(java.io.InputStreamReader) Matcher(java.util.regex.Matcher) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StringField(org.apache.lucene.document.StringField) BufferedReader(java.io.BufferedReader) TextField(org.apache.lucene.document.TextField) FileReader(java.io.FileReader) File(java.io.File) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 47 with IndexWriterConfig

use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.

In class TestSuggestField, the method iwcWithSuggestField.

/**
 * Creates an IndexWriterConfig whose codec routes every field named in
 * {@code suggestFields} to the completion postings format, while all other
 * fields keep the default format.
 */
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
    final IndexWriterConfig conf = newIndexWriterConfig(random(), analyzer);
    conf.setMergePolicy(newLogMergePolicy());
    // Wrap the default codec, overriding only per-field postings selection.
    conf.setCodec(new Lucene70Codec() {

        final PostingsFormat completionFormat = new Completion50PostingsFormat();

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return suggestFields.contains(field) ? completionFormat : super.getPostingsFormatForField(field);
        }
    });
    return conf;
}
Also used : Codec(org.apache.lucene.codecs.Codec) Lucene70Codec(org.apache.lucene.codecs.lucene70.Lucene70Codec) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) Lucene70Codec(org.apache.lucene.codecs.lucene70.Lucene70Codec) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 48 with IndexWriterConfig

use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.

In class TestSuggestField, the method testRandom.

// Randomized end-to-end test: indexes random (key, weight) suggestion documents,
// then compares SuggestIndexSearcher's top-N results for random prefixes against
// a brute-force expected list sorted by weight (ties broken by docID).
public void testRandom() throws Exception {
    int numDigits = TestUtil.nextInt(random(), 1, 6);
    Set<String> keys = new HashSet<>();
    int keyCount = TestUtil.nextInt(random(), 1, 20);
    if (numDigits == 1) {
        // With single-character keys there are at most 9 distinct simple strings,
        // so cap keyCount to keep the fill loop below from spinning forever.
        keyCount = Math.min(9, keyCount);
    }
    while (keys.size() < keyCount) {
        keys.add(randomSimpleString(numDigits, 10));
    }
    List<String> keysList = new ArrayList<>(keys);
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriterConfig iwc = iwcWithSuggestField(analyzer, "suggest_field");
    // we rely on docID order:
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    int docCount = TestUtil.nextInt(random(), 1, 200);
    // Entry records (key, weight, docID) so expectations can be computed without Lucene.
    Entry[] docs = new Entry[docCount];
    for (int i = 0; i < docCount; i++) {
        int weight = random().nextInt(40);
        String key = keysList.get(random().nextInt(keyCount));
        //System.out.println("KEY: " + key);
        docs[i] = new Entry(key, null, weight, i);
        Document doc = new Document();
        doc.add(new SuggestField("suggest_field", key, weight));
        iw.addDocument(doc);
        if (usually()) {
            // Occasional commits produce multiple segments, exercising merge paths.
            iw.commit();
        }
    }
    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
    int iters = atLeast(200);
    for (int iter = 0; iter < iters; iter++) {
        String prefix = randomSimpleString(numDigits, 2);
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        // slow but hopefully correct suggester:
        List<Entry> expected = new ArrayList<>();
        for (Entry doc : docs) {
            if (doc.output.startsWith(prefix)) {
                expected.add(doc);
            }
        }
        Collections.sort(expected, new Comparator<Entry>() {

            @Override
            public int compare(Entry a, Entry b) {
                // sort by higher score:
                int cmp = Float.compare(b.value, a.value);
                if (cmp == 0) {
                    // tie break by smaller docID:
                    cmp = Integer.compare(a.id, b.id);
                }
                return cmp;
            }
        });
        boolean dedup = random().nextBoolean();
        if (dedup) {
            // Keep only the first (highest-ranked) entry per distinct suggestion text,
            // mirroring the collector's dedup behavior tested below.
            List<Entry> deduped = new ArrayList<>();
            Set<String> seen = new HashSet<>();
            for (Entry entry : expected) {
                if (seen.contains(entry.output) == false) {
                    seen.add(entry.output);
                    deduped.add(entry);
                }
            }
            expected = deduped;
        }
        // TODO: re-enable this, except something is buggy about tie breaks at the topN threshold now:
        //int topN = TestUtil.nextInt(random(), 1, docCount+10);
        int topN = docCount;
        if (VERBOSE) {
            if (dedup) {
                System.out.println("  expected (dedup'd) topN=" + topN + ":");
            } else {
                System.out.println("  expected topN=" + topN + ":");
            }
            for (int i = 0; i < expected.size(); i++) {
                if (i >= topN) {
                    System.out.println("    leftover: " + i + ": " + expected.get(i));
                } else {
                    System.out.println("    " + i + ": " + expected.get(i));
                }
            }
        }
        expected = expected.subList(0, Math.min(topN, expected.size()));
        PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", prefix));
        TopSuggestDocsCollector collector = new TopSuggestDocsCollector(topN, dedup);
        searcher.suggest(query, collector);
        TopSuggestDocs actual = collector.get();
        if (VERBOSE) {
            System.out.println("  actual:");
            SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
            for (int i = 0; i < suggestScoreDocs.length; i++) {
                System.out.println("    " + i + ": " + suggestScoreDocs[i]);
            }
        }
        assertSuggestions(actual, expected.toArray(new Entry[expected.size()]));
    }
    reader.close();
    iw.close();
}
Also used : ArrayList(java.util.ArrayList) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SuggestScoreDoc(org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc) HashSet(java.util.HashSet) DirectoryReader(org.apache.lucene.index.DirectoryReader) Term(org.apache.lucene.index.Term) IntPoint(org.apache.lucene.document.IntPoint) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 49 with IndexWriterConfig

use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.

In class CollationTestBase, the method testFarsiTermRangeQuery.

/**
 * Indexes a single Farsi term, then verifies that a TermRangeQuery over the
 * collated bounds excludes the term for the first range and includes it for
 * the second.
 */
public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg, BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(new TextField("content", "ساب", Field.Store.YES));
    doc.add(new StringField("body", "body", Field.Store.YES));
    iw.addDocument(doc);
    iw.close();
    IndexReader ir = DirectoryReader.open(dir);
    IndexSearcher searcher = newSearcher(ir);
    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // collation orders U+0698 before U+0633, so the single indexed term must
    // NOT fall inside the first range when a Farsi Collator (or an Arabic one,
    // where Farsi is not supported) produced the bounds.
    Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("The index Term should not be included.", 0, hits.length);
    query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("The index Term should be included.", 1, hits.length);
    ir.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) IndexWriter(org.apache.lucene.index.IndexWriter) StringField(org.apache.lucene.document.StringField) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) IndexReader(org.apache.lucene.index.IndexReader) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) ScoreDoc(org.apache.lucene.search.ScoreDoc)

Example 50 with IndexWriterConfig

use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.

In class TestCompressingTermVectorsFormat, the method testChunkCleanup.

/**
   * writes some tiny segments with incomplete compressed blocks,
   * and ensures merge recompresses them.
   */
public void testChunkCleanup() throws IOException {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    config.setMergePolicy(NoMergePolicy.INSTANCE);
    // we have to enforce certain things like maxDocsPerChunk to cause dirty chunks to be created
    // by this test.
    config.setCodec(CompressingCodec.randomInstance(random(), 4 * 1024, 100, false, 8));
    IndexWriter writer = new IndexWriter(directory, config);
    DirectoryReader current = DirectoryReader.open(writer);
    for (int i = 0; i < 5; i++) {
        FieldType vectorsType = new FieldType(TextField.TYPE_NOT_STORED);
        vectorsType.setStoreTermVectors(true);
        Document doc = new Document();
        doc.add(new Field("text", "not very long at all", vectorsType));
        writer.addDocument(doc);
        // reopening the NRT reader forces a flush of the one-document segment
        DirectoryReader reopened = DirectoryReader.openIfChanged(current);
        assertNotNull(reopened);
        current.close();
        current = reopened;
        // each tiny flushed segment should hold exactly one (dirty) chunk
        for (LeafReaderContext leaf : reopened.leaves()) {
            CodecReader codecReader = (CodecReader) leaf.reader();
            CompressingTermVectorsReader tvReader = (CompressingTermVectorsReader) codecReader.getTermVectorsReader();
            assertEquals(1, tvReader.getNumChunks());
            assertEquals(1, tvReader.getNumDirtyChunks());
        }
    }
    writer.getConfig().setMergePolicy(newLogMergePolicy());
    writer.forceMerge(1);
    DirectoryReader merged = DirectoryReader.openIfChanged(current);
    assertNotNull(merged);
    current.close();
    current = merged;
    CodecReader codecReader = (CodecReader) getOnlyLeafReader(current);
    CompressingTermVectorsReader tvReader = (CompressingTermVectorsReader) codecReader.getTermVectorsReader();
    // we could get lucky, and have zero, but typically one.
    assertTrue(tvReader.getNumDirtyChunks() <= 1);
    current.close();
    writer.close();
    directory.close();
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CodecReader(org.apache.lucene.index.CodecReader) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) DirectoryReader(org.apache.lucene.index.DirectoryReader) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType)

Aggregations

IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)513 IndexWriter (org.apache.lucene.index.IndexWriter)362 Document (org.apache.lucene.document.Document)311 Directory (org.apache.lucene.store.Directory)289 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)162 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)143 IndexReader (org.apache.lucene.index.IndexReader)140 Term (org.apache.lucene.index.Term)116 IndexSearcher (org.apache.lucene.search.IndexSearcher)106 TextField (org.apache.lucene.document.TextField)93 DirectoryReader (org.apache.lucene.index.DirectoryReader)92 RAMDirectory (org.apache.lucene.store.RAMDirectory)89 IOException (java.io.IOException)88 BytesRef (org.apache.lucene.util.BytesRef)80 Field (org.apache.lucene.document.Field)78 Analyzer (org.apache.lucene.analysis.Analyzer)74 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)61 Test (org.junit.Test)61 StringField (org.apache.lucene.document.StringField)59 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)49