Search in sources :

Example 56 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project gerrit by GerritCodeReview.

the class DocIndexer method index.

/**
 * Builds an in-memory Lucene index with one document per non-empty input file.
 *
 * <p>Each document carries the file contents (indexed, not stored), the URL of the
 * rendered output file (stored, not tokenized) and the document title (stored).
 *
 * @return the {@link RAMDirectory} holding the committed index
 * @throws IOException if an input file cannot be read or the index cannot be written
 */
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET));
    config.setOpenMode(OpenMode.CREATE);
    config.setCommitOnClose(true);
    try (IndexWriter iwriter = new IndexWriter(directory, config)) {
        for (String inputFile : inputFiles) {
            File file = new File(inputFile);
            if (file.length() == 0) {
                continue;
            }
            String title;
            try (BufferedReader titleReader = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), UTF_8))) {
                title = titleReader.readLine();
                if (title != null && title.startsWith("[[")) {
                    // Generally the first line of the txt is the title. In a few cases the
                    // first line is a "[[tag]]" and the second line is the title.
                    title = titleReader.readLine();
                }
            }
            if (title == null) {
                // A file consisting solely of a "[[tag]]" line has no title line; index it
                // with an empty title instead of failing on a null.
                title = "";
            } else {
                Matcher matcher = SECTION_HEADER.matcher(title);
                if (matcher.matches()) {
                    title = matcher.group(1);
                }
            }
            String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt);
            // Decode the body as UTF-8, the same charset used for the title above, rather
            // than relying on the platform default charset.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), UTF_8))) {
                Document doc = new Document();
                doc.add(new TextField(Constants.DOC_FIELD, reader));
                doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
                doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
                iwriter.addDocument(doc);
            }
        }
    }
    return directory;
}
Also used : InputStreamReader(java.io.InputStreamReader) Matcher(java.util.regex.Matcher) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StringField(org.apache.lucene.document.StringField) BufferedReader(java.io.BufferedReader) TextField(org.apache.lucene.document.TextField) FileReader(java.io.FileReader) File(java.io.File) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 57 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project ddf by codice.

the class TestGeoNamesQueryLuceneIndex method initializeIndex.

/**
 * Creates a fresh in-memory index and populates it with the three fixture geo entries.
 *
 * @throws IOException if the index cannot be created or a document cannot be added
 */
private void initializeIndex() throws IOException {
    directory = new RAMDirectory();
    final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
    indexWriterConfig.setOpenMode(OpenMode.CREATE);
    // try-with-resources guarantees the writer (and its write lock) is released even when
    // adding a document fails.
    try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
        indexWriter.addDocument(createDocumentFromGeoEntry(GEO_ENTRY_1));
        indexWriter.addDocument(createDocumentFromGeoEntry(GEO_ENTRY_2));
        indexWriter.addDocument(createDocumentFromGeoEntry(GEO_ENTRY_3));
    }
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 58 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project ddf by codice.

the class GeoNamesQueryLuceneIndex method createQuery.

/**
 * Builds the scoring query for a free-text place-name search.
 *
 * <p>The query string is matched against the name field (as a phrase, as an OR of its
 * terms and as an AND of its terms) and against the alternate-names field (OR and AND).
 * The result is wrapped so that the boost stored in the boost field is applied.
 *
 * @param queryString the raw text to search for
 * @return the combined, boosted query
 * @throws ParseException if the query parser rejects {@code queryString}
 */
protected Query createQuery(final String queryString) throws ParseException {
    final StandardAnalyzer analyzer = new StandardAnalyzer();

    // ---- Name field: phrase, OR and AND variants; the best of the three wins. ----
    final QueryParser nameParser = new QueryParser(GeoNamesLuceneConstants.NAME_FIELD, analyzer);
    nameParser.setEnablePositionIncrements(false);

    // Quoting the input makes the parser treat it as a phrase. The phrase variant carries
    // the largest boost (3.2, found experimentally).
    final String quotedInput = "\"" + queryString + "\"";
    final Query namePhrase = new BoostQuery(nameParser.parse(quotedInput), 3.2f);

    // Parsed before the default operator is switched, so terms are combined with OR. It is
    // left unboosted (boost 1, found experimentally) as the weakest kind of match.
    final Query nameAnyTerm = nameParser.parse(queryString);

    // From here on the same parser combines terms with AND. This variant ranks between the
    // OR and phrase variants (boost 2, found experimentally).
    nameParser.setDefaultOperator(QueryParser.AND_OPERATOR);
    final Query nameAllTerms = new BoostQuery(nameParser.parse(queryString), 2f);

    // Tie-breaker 0: a document scores the maximum of the three name variants.
    final Query nameQuery =
            new DisjunctionMaxQuery(Arrays.asList(namePhrase, nameAnyTerm, nameAllTerms), 0);

    // ---- Alternate names: OR and AND variants, both boosted less than the name field ----
    // ---- because alternate names are generally less important.                       ----
    final QueryParser altParser =
            new QueryParser(GeoNamesLuceneConstants.ALTERNATE_NAMES_FIELD, analyzer);

    // OR variant (parser default) with the lower boost of 0.5, found experimentally.
    final Query altAnyTerm = new BoostQuery(altParser.parse(queryString), 0.5f);

    // AND variant, unboosted (boost 1, found experimentally).
    altParser.setDefaultOperator(QueryParser.AND_OPERATOR);
    final Query altAllTerms = altParser.parse(queryString);

    // Tie-breaker 0: a document scores the maximum of the two alternate-name variants.
    final Query alternateNamesQuery =
            new DisjunctionMaxQuery(Arrays.asList(altAnyTerm, altAllTerms), 0);

    // Tie-breaker 1.0 over two sub-queries: the lower score is added in full, so the result
    // is the sum of the name and alternate-name scores. The boosts above keep the name match
    // dominant in that sum.
    final DisjunctionMaxQuery combined =
            new DisjunctionMaxQuery(Arrays.asList(nameQuery, alternateNamesQuery), 1.0f);

    // Apply the per-document boost that was computed at index time.
    final FunctionQuery indexTimeBoost =
            new FunctionQuery(new FloatFieldSource(GeoNamesLuceneConstants.BOOST_FIELD));
    return new CustomScoreQuery(combined, indexTimeBoost);
}
Also used : QueryParser(org.apache.lucene.queryparser.classic.QueryParser) FunctionQuery(org.apache.lucene.queries.function.FunctionQuery) Query(org.apache.lucene.search.Query) FunctionQuery(org.apache.lucene.queries.function.FunctionQuery) CustomScoreQuery(org.apache.lucene.queries.CustomScoreQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) FloatFieldSource(org.apache.lucene.queries.function.valuesource.FloatFieldSource) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) CustomScoreQuery(org.apache.lucene.queries.CustomScoreQuery) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) BoostQuery(org.apache.lucene.search.BoostQuery)

Example 59 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project bigbluebutton by bigbluebutton.

the class Index method startIndex.

/**
 * Prepares the index for (re)indexing the given uid.
 *
 * <p>First clears any stale lock on the index directory and deletes every document whose
 * "uid" term equals {@code uid} ignoring case; then opens the shared {@code writer} on the
 * index. I/O failures in either phase are logged and do not propagate to the caller.
 *
 * @param uid identifier whose existing documents are removed before indexing starts
 */
public void startIndex(String uid) {
    try {
        IndexReader.unlock(FSDirectory.getDirectory(ConfigHandler.indexPath));
        if (logger.isInfoEnabled()) {
            logger.info("index file path " + ConfigHandler.indexPath);
        }
        reader = IndexReader.open(ConfigHandler.indexPath);
        try {
            TermEnum uidIter = reader.terms(new Term("uid"));
            try {
                // The enumeration starts at the first "uid" term but then continues into
                // whatever fields sort after it, so stop as soon as the field changes;
                // otherwise a matching term text in another field would delete documents.
                while (uidIter.term() != null && "uid".equals(uidIter.term().field())) {
                    if (uid.equalsIgnoreCase(uidIter.term().text())) {
                        reader.deleteDocuments(uidIter.term());
                    }
                    uidIter.next();
                }
            } finally {
                uidIter.close();
            }
        } finally {
            // Always release the reader, even when the deletion pass fails midway.
            reader.close();
        }
    } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are IOExceptions too.
        logger.error("Failed to remove existing documents for uid " + uid, e);
    }
    try {
        writer = new IndexWriter(ConfigHandler.indexPath, new StandardAnalyzer(), new IndexWriter.MaxFieldLength(1000000));
    } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are IOExceptions too.
        logger.error("Failed to open index writer on " + ConfigHandler.indexPath, e);
    }
}
Also used : LockObtainFailedException(org.apache.lucene.store.LockObtainFailedException) IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) TermEnum(org.apache.lucene.index.TermEnum)

Example 60 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project lucene-skos by behas.

the class SKOSStandardQueryParserTest method queryParserSearchWithBoosts.

/**
 * Indexes a single sentence and checks how the SKOS-aware standard query parser rewrites a
 * phrase query and a plain two-term query when a boost is set for alternative labels, then
 * compares against the same parser built on a plain {@link StandardAnalyzer}.
 */
@Test
public void queryParserSearchWithBoosts() throws IOException, QueryNodeException {
    // Fixture: one stored document containing the classic pangram.
    Document pangram = new Document();
    pangram.add(new Field("content", "The quick brown fox jumps over the lazy dog", TextField.TYPE_STORED));
    writer.addDocument(pangram);
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    SKOSStandardQueryParser skosParser = new SKOSStandardQueryParser(skosAnalyzer);
    skosParser.setBoost(SKOSType.ALT, 0.5f);

    // Phrase input: expanded terms appear inside the phrase, without any boost
    // (boosts do not work in phrase queries).
    Query phraseQuery = skosParser.parse("\"fox jumps\"", "content");
    assertEquals(1, searcher.search(phraseQuery, 1).totalHits);
    assertEquals("content:\"fox (jumps hops leaps)\"", phraseQuery.toString());
    assertEquals("org.apache.lucene.search.MultiPhraseQuery", phraseQuery.getClass().getName());

    // Plain terms: the expanded alternatives carry the configured 0.5 boost.
    Query termsQuery = skosParser.parse("fox jumps", "content");
    assertEquals(1, searcher.search(termsQuery, 1).totalHits);
    assertEquals("content:fox (content:jumps content:hops^0.5 content:leaps^0.5)", termsQuery.toString());
    assertEquals("org.apache.lucene.search.BooleanQuery", termsQuery.getClass().getName());

    // Same parser class on a plain StandardAnalyzer: no expansion takes place.
    SKOSStandardQueryParser plainParser = new SKOSStandardQueryParser(new StandardAnalyzer());
    Query plainQuery = plainParser.parse("fox jumps", "content");
    assertEquals(1, searcher.search(plainQuery, 1).totalHits);
    assertEquals("content:fox content:jumps", plainQuery.toString());
    assertEquals("org.apache.lucene.search.BooleanQuery", plainQuery.getClass().getName());
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Query(org.apache.lucene.search.Query) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Document(org.apache.lucene.document.Document) SKOSStandardQueryParser(at.ac.univie.mminf.luceneSKOS.queryparser.flexible.standard.SKOSStandardQueryParser) Test(org.junit.Test)

Aggregations

StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 112; Analyzer (org.apache.lucene.analysis.Analyzer): 37; IndexWriter (org.apache.lucene.index.IndexWriter): 36; Document (org.apache.lucene.document.Document): 29; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 29; IndexSearcher (org.apache.lucene.search.IndexSearcher): 24; Term (org.apache.lucene.index.Term): 22; RAMDirectory (org.apache.lucene.store.RAMDirectory): 21; Test (org.junit.Test): 21; Query (org.apache.lucene.search.Query): 20; BooleanQuery (org.apache.lucene.search.BooleanQuery): 19; TermQuery (org.apache.lucene.search.TermQuery): 19; IOException (java.io.IOException): 16; Before (org.junit.Before): 15; IndexReader (org.apache.lucene.index.IndexReader): 14; HashMap (java.util.HashMap): 13; Field (org.apache.lucene.document.Field): 13; ArrayList (java.util.ArrayList): 12; QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 12; Directory (org.apache.lucene.store.Directory): 12