Use of org.apache.lucene.index.IndexWriterConfig in project gerrit by GerritCodeReview.
In the class DocIndexer, the index method:
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException {
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET));
  config.setOpenMode(OpenMode.CREATE);
  config.setCommitOnClose(true);
  try (IndexWriter iwriter = new IndexWriter(directory, config)) {
    for (String inputFile : inputFiles) {
      File file = new File(inputFile);
      if (file.length() == 0) {
        continue;
      }
      String title;
      try (BufferedReader titleReader =
          new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), UTF_8))) {
        title = titleReader.readLine();
        if (title != null && title.startsWith("[[")) {
          // Generally the first line of the txt is the title. In a few cases the
          // first line is a "[[tag]]" and the second line is the title.
          title = titleReader.readLine();
        }
      }
      Matcher matcher = SECTION_HEADER.matcher(title);
      if (matcher.matches()) {
        title = matcher.group(1);
      }
      String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt);
      try (FileReader reader = new FileReader(file)) {
        Document doc = new Document();
        doc.add(new TextField(Constants.DOC_FIELD, reader));
        doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
        doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
        iwriter.addDocument(doc);
      }
    }
  }
  return directory;
}
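The RAMDirectory this method returns can then be opened and searched. The following is only an illustrative sketch, not part of the Gerrit code: it reuses the field names from the snippet, and the search term is made up.

try (DirectoryReader reader = DirectoryReader.open(directory)) {
  IndexSearcher searcher = new IndexSearcher(reader);
  // DOC_FIELD was indexed from the file contents, so a TermQuery on a
  // lower-cased token finds matching documentation pages.
  TopDocs hits = searcher.search(new TermQuery(new Term(Constants.DOC_FIELD, "replication")), 10);
  for (ScoreDoc sd : hits.scoreDocs) {
    Document hit = searcher.doc(sd.doc);
    // URL_FIELD and TITLE_FIELD were stored above, so they can be read back.
    System.out.println(hit.get(Constants.URL_FIELD) + " - " + hit.get(Constants.TITLE_FIELD));
  }
}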
Use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.
In the class TestSuggestField, the iwcWithSuggestField method:
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
  IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  Codec filterCodec = new Lucene70Codec() {
    PostingsFormat postingsFormat = new Completion50PostingsFormat();

    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (suggestFields.contains(field)) {
        return postingsFormat;
      }
      return super.getPostingsFormatForField(field);
    }
  };
  iwc.setCodec(filterCodec);
  return iwc;
}
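A brief sketch of how this config might be used on the indexing side (illustrative only, not from the test: the directory and field values are made up, and Collections.singleton is just one way to supply the Set expected by the signature above).

Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = iwcWithSuggestField(analyzer, Collections.singleton("suggest_field"));
// Documents added through this writer store "suggest_field" with the
// completion postings format wired in by the filter codec above.
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
doc.add(new SuggestField("suggest_field", "apple", 4));
iw.addDocument(doc);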
Use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.
In the class TestSuggestField, the testRandom method:
public void testRandom() throws Exception {
  int numDigits = TestUtil.nextInt(random(), 1, 6);
  Set<String> keys = new HashSet<>();
  int keyCount = TestUtil.nextInt(random(), 1, 20);
  if (numDigits == 1) {
    keyCount = Math.min(9, keyCount);
  }
  while (keys.size() < keyCount) {
    keys.add(randomSimpleString(numDigits, 10));
  }
  List<String> keysList = new ArrayList<>(keys);
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = iwcWithSuggestField(analyzer, "suggest_field");
  // we rely on docID order:
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  int docCount = TestUtil.nextInt(random(), 1, 200);
  Entry[] docs = new Entry[docCount];
  for (int i = 0; i < docCount; i++) {
    int weight = random().nextInt(40);
    String key = keysList.get(random().nextInt(keyCount));
    //System.out.println("KEY: " + key);
    docs[i] = new Entry(key, null, weight, i);
    Document doc = new Document();
    doc.add(new SuggestField("suggest_field", key, weight));
    iw.addDocument(doc);
    if (usually()) {
      iw.commit();
    }
  }
  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
  int iters = atLeast(200);
  for (int iter = 0; iter < iters; iter++) {
    String prefix = randomSimpleString(numDigits, 2);
    if (VERBOSE) {
      System.out.println("\nTEST: prefix=" + prefix);
    }
    // slow but hopefully correct suggester:
    List<Entry> expected = new ArrayList<>();
    for (Entry doc : docs) {
      if (doc.output.startsWith(prefix)) {
        expected.add(doc);
      }
    }
    Collections.sort(expected, new Comparator<Entry>() {
      @Override
      public int compare(Entry a, Entry b) {
        // sort by higher score:
        int cmp = Float.compare(b.value, a.value);
        if (cmp == 0) {
          // tie break by smaller docID:
          cmp = Integer.compare(a.id, b.id);
        }
        return cmp;
      }
    });
    boolean dedup = random().nextBoolean();
    if (dedup) {
      List<Entry> deduped = new ArrayList<>();
      Set<String> seen = new HashSet<>();
      for (Entry entry : expected) {
        if (seen.contains(entry.output) == false) {
          seen.add(entry.output);
          deduped.add(entry);
        }
      }
      expected = deduped;
    }
    // TODO: re-enable this, except something is buggy about tie breaks at the topN threshold now:
    //int topN = TestUtil.nextInt(random(), 1, docCount+10);
    int topN = docCount;
    if (VERBOSE) {
      if (dedup) {
        System.out.println(" expected (dedup'd) topN=" + topN + ":");
      } else {
        System.out.println(" expected topN=" + topN + ":");
      }
      for (int i = 0; i < expected.size(); i++) {
        if (i >= topN) {
          System.out.println(" leftover: " + i + ": " + expected.get(i));
        } else {
          System.out.println(" " + i + ": " + expected.get(i));
        }
      }
    }
    expected = expected.subList(0, Math.min(topN, expected.size()));
    PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", prefix));
    TopSuggestDocsCollector collector = new TopSuggestDocsCollector(topN, dedup);
    searcher.suggest(query, collector);
    TopSuggestDocs actual = collector.get();
    if (VERBOSE) {
      System.out.println(" actual:");
      SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
      for (int i = 0; i < suggestScoreDocs.length; i++) {
        System.out.println(" " + i + ": " + suggestScoreDocs[i]);
      }
    }
    assertSuggestions(actual, expected.toArray(new Entry[expected.size()]));
  }
  reader.close();
  iw.close();
}
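Continuing the illustrative sketch shown after iwcWithSuggestField above (and assuming a second suggestion, "apricot" with weight 2, was indexed the same way as "apple"), the query side reduces to the following. The results come back ordered by descending weight, with docID breaking ties, which is exactly the order the comparator in testRandom expects.

DirectoryReader reader = iw.getReader();
SuggestIndexSearcher suggestSearcher = new SuggestIndexSearcher(reader);
PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "ap"));
TopSuggestDocsCollector collector = new TopSuggestDocsCollector(2, false);
suggestSearcher.suggest(query, collector);
TopSuggestDocs top = collector.get();
// top.scoreDocs[0] is the "apple" entry (weight 4), top.scoreDocs[1] is "apricot" (weight 2).
reader.close();
iw.close();
dir.close();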
Use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.
In the class CollationTestBase, the testFarsiTermRangeQuery method:
public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg, BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
  Directory farsiIndex = newDirectory();
  IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(new TextField("content", "ساب", Field.Store.YES));
  doc.add(new StringField("body", "body", Field.Store.YES));
  writer.addDocument(doc);
  writer.close();
  IndexReader reader = DirectoryReader.open(farsiIndex);
  IndexSearcher search = newSearcher(reader);
  // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
  // orders the U+0698 character before the U+0633 character, so the single
  // index Term below should NOT be returned by a TermRangeQuery
  // with a Farsi Collator (or an Arabic one for the case when Farsi is
  // not supported).
  Query csrq = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
  ScoreDoc[] result = search.search(csrq, 1000).scoreDocs;
  assertEquals("The index Term should not be included.", 0, result.length);
  csrq = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
  result = search.search(csrq, 1000).scoreDocs;
  assertEquals("The index Term should be included.", 1, result.length);
  reader.close();
  farsiIndex.close();
}
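The analyzer and range bounds this test expects are typically built from a collator. One way to do that is sketched below; it leans on Lucene's ICU collation support (ICUCollationKeyAnalyzer from lucene-analyzers-icu and com.ibm.icu.text.Collator), which is not shown in this snippet, and the locale and boundary strings are illustrative assumptions. The second boundary pair would be built the same way.

// Sketch only: collation-key analyzer plus range bounds derived from the same collator.
Collator collator = Collator.getInstance(new Locale("ar"));
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
// Range bounds are the raw collation keys of the boundary strings.
BytesRef firstBeg = new BytesRef(collator.getCollationKey("\u062F").toByteArray());
BytesRef firstEnd = new BytesRef(collator.getCollationKey("\u0698").toByteArray());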
Use of org.apache.lucene.index.IndexWriterConfig in project lucene-solr by apache.
In the class TestCompressingTermVectorsFormat, the testChunkCleanup method:
/**
 * writes some tiny segments with incomplete compressed blocks,
 * and ensures merge recompresses them.
 */
public void testChunkCleanup() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwConf = newIndexWriterConfig(new MockAnalyzer(random()));
  iwConf.setMergePolicy(NoMergePolicy.INSTANCE);
  // we have to enforce certain things like maxDocsPerChunk to cause dirty chunks to be created
  // by this test.
  iwConf.setCodec(CompressingCodec.randomInstance(random(), 4 * 1024, 100, false, 8));
  IndexWriter iw = new IndexWriter(dir, iwConf);
  DirectoryReader ir = DirectoryReader.open(iw);
  for (int i = 0; i < 5; i++) {
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    doc.add(new Field("text", "not very long at all", ft));
    iw.addDocument(doc);
    // force flush
    DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
    assertNotNull(ir2);
    ir.close();
    ir = ir2;
    // examine dirty counts:
    for (LeafReaderContext leaf : ir2.leaves()) {
      CodecReader sr = (CodecReader) leaf.reader();
      CompressingTermVectorsReader reader = (CompressingTermVectorsReader) sr.getTermVectorsReader();
      assertEquals(1, reader.getNumChunks());
      assertEquals(1, reader.getNumDirtyChunks());
    }
  }
  iw.getConfig().setMergePolicy(newLogMergePolicy());
  iw.forceMerge(1);
  DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
  assertNotNull(ir2);
  ir.close();
  ir = ir2;
  CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
  CompressingTermVectorsReader reader = (CompressingTermVectorsReader) sr.getTermVectorsReader();
  // we could get lucky, and have zero, but typically one.
  assertTrue(reader.getNumDirtyChunks() <= 1);
  ir.close();
  iw.close();
  dir.close();
}
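The "force flush" step above relies on Lucene's near-real-time reopen pattern: a reader opened directly from the writer is refreshed with DirectoryReader.openIfChanged, which pushes the writer's pending documents into a new (possibly incomplete) chunk without a commit. A minimal sketch of that pattern on its own, assuming an existing Directory dir (illustrative, not part of the test):

IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
DirectoryReader ir = DirectoryReader.open(iw); // NRT reader tied to the writer
iw.addDocument(new Document());
DirectoryReader ir2 = DirectoryReader.openIfChanged(ir); // flushes buffered docs; null if nothing changed
if (ir2 != null) {
  ir.close();
  ir = ir2;
}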