Search in sources :

Example 86 with FieldType

Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

The class TestIndexWriterMerging, method testForceMergeDeletes.

// LUCENE-325: exercises forceMergeDeletes in the case where 2 singular merges
// are required. Indexes 10 documents, deletes the ones with id "0" and "7"
// while merging is disabled, then checks that forceMergeDeletes brings
// maxDoc down from 10 to 8.
public void testForceMergeDeletes() throws IOException {
    Directory directory = newDirectory();
    IndexWriter indexWriter = new IndexWriter(
        directory,
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setMaxBufferedDocs(2)
            .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH));

    // One type that is only stored, and one untokenized stored text type
    // carrying term vectors with positions and offsets.
    FieldType storedOnlyType = new FieldType();
    storedOnlyType.setStored(true);
    FieldType termVectorType = new FieldType(TextField.TYPE_STORED);
    termVectorType.setTokenized(false);
    termVectorType.setStoreTermVectors(true);
    termVectorType.setStoreTermVectorPositions(true);
    termVectorType.setStoreTermVectorOffsets(true);

    // The same Document instance is added 10 times; only the id value changes.
    Document template = new Document();
    Field id = newStringField("id", "", Field.Store.NO);
    template.add(id);
    template.add(newField("stored", "stored", storedOnlyType));
    template.add(newField("termVector", "termVector", termVectorType));
    for (int docId = 0; docId < 10; docId++) {
        id.setStringValue(Integer.toString(docId));
        indexWriter.addDocument(template);
    }
    indexWriter.close();

    // Nothing deleted yet: 10 documents, all live.
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(10, reader.maxDoc());
    assertEquals(10, reader.numDocs());
    reader.close();

    // Delete two documents with a writer that uses NoMergePolicy.
    IndexWriterConfig noMergeConfig = new IndexWriterConfig(new MockAnalyzer(random()));
    noMergeConfig.setMergePolicy(NoMergePolicy.INSTANCE);
    indexWriter = new IndexWriter(directory, noMergeConfig);
    indexWriter.deleteDocuments(new Term("id", "0"));
    indexWriter.deleteDocuments(new Term("id", "7"));
    indexWriter.close();

    reader = DirectoryReader.open(directory);
    assertEquals(8, reader.numDocs());
    reader.close();

    // Reopen with a log merge policy and force the deletes to be merged away.
    indexWriter = new IndexWriter(
        directory,
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setMergePolicy(newLogMergePolicy()));
    assertEquals(8, indexWriter.numDocs());
    assertEquals(10, indexWriter.maxDoc());
    indexWriter.forceMergeDeletes();
    assertEquals(8, indexWriter.numDocs());
    indexWriter.close();

    // After forceMergeDeletes, maxDoc matches the live document count.
    reader = DirectoryReader.open(directory);
    assertEquals(8, reader.maxDoc());
    assertEquals(8, reader.numDocs());
    reader.close();
    directory.close();
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)

Example 87 with FieldType

Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

The class TestBackwardsCompatibility, method addNoProxDoc.

/**
 * Adds one document with two fields, "content3" and "content4", both holding
 * the value "aaa" and both using {@code IndexOptions.DOCS}.
 *
 * @param writer the writer the document is added to
 * @throws IOException if the writer fails to add the document
 */
private void addNoProxDoc(IndexWriter writer) throws IOException {
    // "content3": the stored text type with index options overridden to DOCS.
    FieldType storedTextDocsOnly = new FieldType(TextField.TYPE_STORED);
    storedTextDocsOnly.setIndexOptions(IndexOptions.DOCS);

    // "content4": a type built from scratch, stored, with index options DOCS.
    FieldType storedDocsOnly = new FieldType();
    storedDocsOnly.setStored(true);
    storedDocsOnly.setIndexOptions(IndexOptions.DOCS);

    Document noProxDoc = new Document();
    noProxDoc.add(new Field("content3", "aaa", storedTextDocsOnly));
    noProxDoc.add(new Field("content4", "aaa", storedDocsOnly));
    writer.addDocument(noProxDoc);
}
Also used : SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) SortField(org.apache.lucene.search.SortField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) DoubleDocValuesField(org.apache.lucene.document.DoubleDocValuesField) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType)

Example 88 with FieldType

Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

The class DocMaker, method setConfig.

/**
 * Set the configuration parameters of this doc maker.
 *
 * <p>Reads the {@code doc.*} properties from {@code config}, builds the two
 * frozen field types ({@code valType} for general fields and
 * {@code bodyValType} for the body field), and resets the cached doc state.
 *
 * @param config configuration the {@code doc.*} properties are read from
 * @param source content source remembered by this doc maker
 */
public void setConfig(Config config, ContentSource source) {
    this.config = config;
    this.source = source;

    // Body settings fall back to the general setting where one is passed as
    // the default; body norms and body offsets have their own fixed defaults.
    final boolean storeFields = config.get("doc.stored", false);
    final boolean storeBody = config.get("doc.body.stored", storeFields);
    final boolean tokenizeFields = config.get("doc.tokenized", true);
    final boolean tokenizeBody = config.get("doc.body.tokenized", tokenizeFields);
    final boolean keepNorms = config.get("doc.tokenized.norms", false);
    final boolean keepBodyNorms = config.get("doc.body.tokenized.norms", true);
    final boolean bodyWithOffsets = config.get("doc.body.offsets", false);
    final boolean withTermVectors = config.get("doc.term.vector", false);
    final boolean withTvPositions = config.get("doc.term.vector.positions", false);
    final boolean withTvOffsets = config.get("doc.term.vector.offsets", false);

    // Type shared by every field except the body.
    final FieldType generalType = new FieldType(TextField.TYPE_NOT_STORED);
    generalType.setStored(storeFields);
    generalType.setTokenized(tokenizeFields);
    generalType.setOmitNorms(!keepNorms);
    generalType.setStoreTermVectors(withTermVectors);
    generalType.setStoreTermVectorPositions(withTvPositions);
    generalType.setStoreTermVectorOffsets(withTvOffsets);
    generalType.freeze();
    valType = generalType;

    // Type for the body field; offsets go into the postings only when the
    // body is tokenized and offsets were requested.
    final FieldType bodyType = new FieldType(TextField.TYPE_NOT_STORED);
    bodyType.setStored(storeBody);
    bodyType.setTokenized(tokenizeBody);
    bodyType.setOmitNorms(!keepBodyNorms);
    if (tokenizeBody && bodyWithOffsets) {
        bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    }
    bodyType.setStoreTermVectors(withTermVectors);
    bodyType.setStoreTermVectorPositions(withTvPositions);
    bodyType.setStoreTermVectorOffsets(withTvOffsets);
    bodyType.freeze();
    bodyValType = bodyType;

    storeBytes = config.get("doc.store.body.bytes", false);
    reuseFields = config.get("doc.reuse.fields", true);

    // Field settings may differ from one round to the next in a multi-round
    // run; replacing the ThreadLocal is the only way to drop the DocState
    // cached by every thread.
    docState = new ThreadLocal<>();

    indexProperties = config.get("doc.index.props", false);
    updateDocIDLimit = config.get("doc.random.id.limit", -1);
    if (updateDocIDLimit != -1) {
        // Fixed seed for the generator used when a random id limit is set.
        r = new Random(179);
    }
}
Also used : Random(java.util.Random) FieldType(org.apache.lucene.document.FieldType)

Example 89 with FieldType

Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

The class DocMaker, method createDocument.

/**
 * Creates a document from {@code docData}.
 *
 * <p>Uses only part of the body and writes the unused remainder back into
 * {@code docData} (or uses all of it when {@code size <= 0} or the body is no
 * longer than {@code size}). The props of {@code docData} are reset to
 * {@code null} once indexed so they are not added more than once.
 *
 * <p>Fields are added in this order: id, name, date string, date in
 * milliseconds, time of day in seconds, title, then optionally body, body
 * bytes and one field per property.
 *
 * @param docData source of the id, name, date, title, body and props; its body
 *     and props are modified by this call
 * @param size requested number of body characters; {@code <= 0} means use the
 *     whole body. The actual cut point may move to a nearby whitespace character.
 * @param cnt when non-negative, appended to the name as {@code "_" + cnt}
 * @return the populated document (the reused {@code ds.doc} when
 *     {@code reuseFields} is set, otherwise a new instance)
 * @throws UnsupportedEncodingException declared by the signature; NOTE(review):
 *     the byte conversion here uses {@code StandardCharsets.UTF_8}, so nothing
 *     visible in this body appears to throw it — confirm before removing
 */
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
    final DocState ds = getDocState();
    // Reuse the cached Document when field reuse is on; either way start empty.
    final Document doc = reuseFields ? ds.doc : new Document();
    doc.clear();
    // Set ID_FIELD
    // The id uses a copy of valType with stored forced to true.
    FieldType ft = new FieldType(valType);
    ft.setStored(true);
    Field idField = ds.getField(ID_FIELD, ft);
    int id;
    if (r != null) {
        // A random generator is present: draw the id from [0, updateDocIDLimit).
        id = r.nextInt(updateDocIDLimit);
    } else {
        id = docData.getID();
        if (id == -1) {
            // -1 is treated as "no id supplied": fall back to the running counter.
            id = numDocsCreated.getAndIncrement();
        }
    }
    idField.setStringValue(Integer.toString(id));
    doc.add(idField);
    // Set NAME_FIELD
    String name = docData.getName();
    if (name == null)
        name = "";
    // A non-negative cnt is appended as a suffix.
    name = cnt < 0 ? name : name + "_" + cnt;
    Field nameField = ds.getField(NAME_FIELD, valType);
    nameField.setStringValue(name);
    doc.add(nameField);
    // Set DATE_FIELD
    // Lazily create the DateUtil held in the dateParsers thread-local.
    DateUtil util = dateParsers.get();
    if (util == null) {
        util = new DateUtil();
        dateParsers.set(util);
    }
    Date date = null;
    String dateString = docData.getDate();
    if (dateString != null) {
        // Reset the shared parse position before parsing from the start of the string.
        util.pos.setIndex(0);
        date = util.parser.parse(dateString, util.pos);
    //System.out.println(dateString + " parsed to " + date);
    } else {
        dateString = "";
    }
    // The date text is indexed as given (or "" when absent).
    Field dateStringField = ds.getField(DATE_FIELD, valType);
    dateStringField.setStringValue(dateString);
    doc.add(dateStringField);
    if (date == null) {
        // just set to right now
        date = new Date();
    }
    // Numeric date: milliseconds as returned by Date.getTime().
    Field dateField = ds.getNumericField(DATE_MSEC_FIELD, Long.class);
    dateField.setLongValue(date.getTime());
    doc.add(dateField);
    // Numeric time of day: hours, minutes and seconds of util.cal folded into seconds.
    util.cal.setTime(date);
    final int sec = util.cal.get(Calendar.HOUR_OF_DAY) * 3600 + util.cal.get(Calendar.MINUTE) * 60 + util.cal.get(Calendar.SECOND);
    Field timeSecField = ds.getNumericField(TIME_SEC_FIELD, Integer.class);
    timeSecField.setIntValue(sec);
    doc.add(timeSecField);
    // Set TITLE_FIELD
    String title = docData.getTitle();
    Field titleField = ds.getField(TITLE_FIELD, valType);
    titleField.setStringValue(title == null ? "" : title);
    doc.add(titleField);
    // Set BODY_FIELD (skipped entirely when the body is null or empty)
    String body = docData.getBody();
    if (body != null && body.length() > 0) {
        String bdy;
        if (size <= 0 || size >= body.length()) {
            // use all
            bdy = body;
            // nothing left
            docData.setBody("");
        } else {
            // attempt not to break words - if whitespace found within next 20 chars...
            // The scan starts at index size - 1; the first whitespace found becomes the cut point.
            for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
                if (Character.isWhitespace(body.charAt(n))) {
                    size = n;
                    break;
                }
            }
            // use part
            bdy = body.substring(0, size);
            // some left
            docData.setBody(body.substring(size));
        }
        Field bodyField = ds.getField(BODY_FIELD, bodyValType);
        bodyField.setStringValue(bdy);
        doc.add(bodyField);
        if (storeBytes) {
            // Also store the UTF-8 bytes of the body portion that was used.
            Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED);
            bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8));
            doc.add(bytesField);
        }
    }
    if (indexProperties) {
        Properties props = docData.getProps();
        if (props != null) {
            // One field per property; keys and values are cast to String.
            for (final Map.Entry<Object, Object> entry : props.entrySet()) {
                Field f = ds.getField((String) entry.getKey(), valType);
                f.setStringValue((String) entry.getValue());
                doc.add(f);
            }
            // Clear the props so a later call on the same docData does not add them again.
            docData.setProps(null);
        }
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
}
Also used : StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) Properties(java.util.Properties) HashMap(java.util.HashMap) Map(java.util.Map) LongPoint(org.apache.lucene.document.LongPoint) DoublePoint(org.apache.lucene.document.DoublePoint) IntPoint(org.apache.lucene.document.IntPoint) FloatPoint(org.apache.lucene.document.FloatPoint) Date(java.util.Date) FieldType(org.apache.lucene.document.FieldType)

Example 90 with FieldType

Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

The class TestBackwardsCompatibility, method addDoc.

/**
 * Adds one document that exercises a broad mix of field kinds: plain text and
 * string fields, stored text fields with term vectors (including non-ASCII
 * values and a non-ASCII field name), doc-values fields, point fields in one
 * and two dimensions, and two fields with explicit index options.
 *
 * @param writer the writer the document is added to
 * @param id value stored in the "id" field and used to derive every numeric,
 *     doc-values and point value
 * @throws IOException if the writer fails to add the document
 */
private void addDoc(IndexWriter writer, int id) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("content", "aaa", Field.Store.NO));
    doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
    FieldType customType2 = new FieldType(TextField.TYPE_STORED);
    customType2.setStoreTermVectors(true);
    customType2.setStoreTermVectorPositions(true);
    customType2.setStoreTermVectorOffsets(true);
    // The non-ASCII test data is spelled with Unicode escapes so the values
    // (supplementary characters as surrogate pairs, a NUL, and BMP symbols)
    // do not depend on the encoding the source file is read with.
    doc.add(new Field("autf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
    doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
    doc.add(new Field("content2", "here is more content with aaa aaa aaa", customType2));
    doc.add(new Field("fie\u2C77ld", "field with non-ascii name", customType2));
    // add docvalues fields
    doc.add(new NumericDocValuesField("dvByte", (byte) id));
    // Big-endian 4-byte encoding of id, shared by the binary/sorted doc values and binary points.
    byte[] bytes = new byte[] { (byte) (id >>> 24), (byte) (id >>> 16), (byte) (id >>> 8), (byte) id };
    BytesRef ref = new BytesRef(bytes);
    doc.add(new BinaryDocValuesField("dvBytesDerefFixed", ref));
    doc.add(new BinaryDocValuesField("dvBytesDerefVar", ref));
    doc.add(new SortedDocValuesField("dvBytesSortedFixed", ref));
    doc.add(new SortedDocValuesField("dvBytesSortedVar", ref));
    doc.add(new BinaryDocValuesField("dvBytesStraightFixed", ref));
    doc.add(new BinaryDocValuesField("dvBytesStraightVar", ref));
    doc.add(new DoubleDocValuesField("dvDouble", (double) id));
    doc.add(new FloatDocValuesField("dvFloat", (float) id));
    doc.add(new NumericDocValuesField("dvInt", id));
    doc.add(new NumericDocValuesField("dvLong", id));
    doc.add(new NumericDocValuesField("dvPacked", id));
    doc.add(new NumericDocValuesField("dvShort", (short) id));
    doc.add(new SortedSetDocValuesField("dvSortedSet", ref));
    doc.add(new SortedNumericDocValuesField("dvSortedNumeric", id));
    // add point fields, each in a 1d and a 2d variant
    doc.add(new IntPoint("intPoint1d", id));
    doc.add(new IntPoint("intPoint2d", id, 2 * id));
    doc.add(new FloatPoint("floatPoint1d", (float) id));
    doc.add(new FloatPoint("floatPoint2d", (float) id, (float) 2 * id));
    doc.add(new LongPoint("longPoint1d", id));
    doc.add(new LongPoint("longPoint2d", id, 2 * id));
    doc.add(new DoublePoint("doublePoint1d", (double) id));
    doc.add(new DoublePoint("doublePoint2d", (double) id, (double) 2 * id));
    doc.add(new BinaryPoint("binaryPoint1d", bytes));
    doc.add(new BinaryPoint("binaryPoint2d", bytes, bytes));
    // a field with both offsets and term vectors for a cross-check
    FieldType customType3 = new FieldType(TextField.TYPE_STORED);
    customType3.setStoreTermVectors(true);
    customType3.setStoreTermVectorPositions(true);
    customType3.setStoreTermVectorOffsets(true);
    customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    doc.add(new Field("content5", "here is more content with aaa aaa aaa", customType3));
    // a field that omits only positions
    FieldType customType4 = new FieldType(TextField.TYPE_STORED);
    customType4.setStoreTermVectors(true);
    customType4.setStoreTermVectorPositions(false);
    customType4.setStoreTermVectorOffsets(true);
    customType4.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    doc.add(new Field("content6", "here is more content with aaa aaa aaa", customType4));
    // TODO: 
    //   index different norms types via similarity (we use a random one currently?!)
    //   remove any analyzer randomness, explicitly add payloads for certain fields.
    writer.addDocument(doc);
}
Also used : BinaryPoint(org.apache.lucene.document.BinaryPoint) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) LongPoint(org.apache.lucene.document.LongPoint) Document(org.apache.lucene.document.Document) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) FieldType(org.apache.lucene.document.FieldType) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) SortField(org.apache.lucene.search.SortField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) DoubleDocValuesField(org.apache.lucene.document.DoubleDocValuesField) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IntPoint(org.apache.lucene.document.IntPoint) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) FloatPoint(org.apache.lucene.document.FloatPoint) StringField(org.apache.lucene.document.StringField) DoubleDocValuesField(org.apache.lucene.document.DoubleDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) DoublePoint(org.apache.lucene.document.DoublePoint) TextField(org.apache.lucene.document.TextField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

FieldType (org.apache.lucene.document.FieldType)262 Document (org.apache.lucene.document.Document)229 Field (org.apache.lucene.document.Field)191 Directory (org.apache.lucene.store.Directory)172 TextField (org.apache.lucene.document.TextField)166 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)125 StringField (org.apache.lucene.document.StringField)72 StoredField (org.apache.lucene.document.StoredField)65 IndexReader (org.apache.lucene.index.IndexReader)49 IndexWriter (org.apache.lucene.index.IndexWriter)49 BytesRef (org.apache.lucene.util.BytesRef)47 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)45 IndexSearcher (org.apache.lucene.search.IndexSearcher)45 Term (org.apache.lucene.index.Term)40 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)38 RAMDirectory (org.apache.lucene.store.RAMDirectory)37 TermQuery (org.apache.lucene.search.TermQuery)33 TopDocs (org.apache.lucene.search.TopDocs)32 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)30 Analyzer (org.apache.lucene.analysis.Analyzer)27