Search in sources :

Example 76 with IntPoint

use of org.apache.lucene.document.IntPoint in project carbondata by apache.

the class LuceneDataMapWriter method onPageAdded.

/**
 * Add the column pages row to the datamap, order of pages is same as `indexColumns` in
 * DataMapMeta returned in DataMapFactory.
 * Implementation should copy the content of `pages` as needed, because `pages` memory
 * may be freed after this method returns, if using unsafe column page.
 *
 * <p>Every row of the page is first indexed into a temporary in-memory directory; once the
 * whole page has been indexed, the in-memory index is merged into the on-disk index writer.
 *
 * @param blockletId id of the blocklet the page belongs to, indexed and stored on every document
 * @param pageId id of the page, only indexed when the datamap is fine grained
 * @param pages column pages to index; the row count is taken from the first page
 * @throws IOException if writing to the in-memory index or merging it into the disk index fails
 */
public void onPageAdded(int blockletId, int pageId, ColumnPage[] pages) throws IOException {
    int columnsCount = pages.length;
    if (columnsCount <= 0) {
        // nothing to index: return before allocating any Lucene resources
        LOGGER.warn("empty data");
        return;
    }
    int pageSize = pages[0].getPageSize();
    // save index data into ram, write into disk after one page finished.
    // try-with-resources guarantees the ram directory and its writer are released
    // even when adding a document or merging the index throws.
    try (RAMDirectory ramDir = new RAMDirectory()) {
        try (IndexWriter ramIndexWriter =
                new IndexWriter(ramDir, new IndexWriterConfig(analyzer))) {
            for (int rowId = 0; rowId < pageSize; rowId++) {
                // create a new document
                Document doc = new Document();
                // add block id, save this id
                doc.add(new StringField(BLOCKID_NAME, blockId, Field.Store.YES));
                // add blocklet Id (point for querying, stored field for retrieval)
                doc.add(new IntPoint(BLOCKLETID_NAME, new int[] { blockletId }));
                doc.add(new StoredField(BLOCKLETID_NAME, blockletId));
                // add page id and row id in Fine Grain data map
                if (isFineGrain) {
                    // add page Id
                    doc.add(new IntPoint(PAGEID_NAME, new int[] { pageId }));
                    doc.add(new StoredField(PAGEID_NAME, pageId));
                    // add row id
                    doc.add(new IntPoint(ROWID_NAME, new int[] { rowId }));
                    doc.add(new StoredField(ROWID_NAME, rowId));
                }
                // add other fields, skipping columns whose value is null for this row
                for (int colIdx = 0; colIdx < columnsCount; colIdx++) {
                    if (!pages[colIdx].getNullBits().get(rowId)) {
                        addField(doc, pages[colIdx], rowId, Field.Store.NO);
                    }
                }
                // add this document
                ramIndexWriter.addDocument(doc);
            }
        }
        // the ram writer is closed at this point; add ram index data into disk
        indexWriter.addIndexes(new Directory[] { ramDir });
    }
}
Also used : IntPoint(org.apache.lucene.document.IntPoint) StoredField(org.apache.lucene.document.StoredField) IndexWriter(org.apache.lucene.index.IndexWriter) StringField(org.apache.lucene.document.StringField) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) LongPoint(org.apache.lucene.document.LongPoint) DoublePoint(org.apache.lucene.document.DoublePoint) IntPoint(org.apache.lucene.document.IntPoint) FloatPoint(org.apache.lucene.document.FloatPoint) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 77 with IntPoint

use of org.apache.lucene.document.IntPoint in project Anserini by castorini.

the class BibtexGenerator method createDocument.

/**
 * Builds a Lucene document from a BibTeX collection document.
 *
 * <p>The document always carries the collection docid (as a stored string field and as sorted
 * doc values), the BibTeX type and the contents field. Every BibTeX field except the number
 * field is then added according to its kind: single-token string, non-stemmed text, numeric
 * year, or default tokenized text.
 *
 * @param bibtexDoc the source BibTeX document
 * @return the Lucene document to index
 * @throws GeneratorException declared by the generator contract; an
 *     {@code EmptyDocumentException} is thrown when the contents are null or blank
 */
@Override
public Document createDocument(BibtexCollection.Document bibtexDoc) throws GeneratorException {
    String id = bibtexDoc.id();
    String content = bibtexDoc.contents();
    String type = bibtexDoc.type();
    BibTeXEntry bibtexEntry = bibtexDoc.bibtexEntry();
    if (content == null || content.trim().isEmpty()) {
        throw new EmptyDocumentException();
    }
    Document doc = new Document();
    // Store the collection docid.
    doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
    // This is needed to break score ties by docid.
    doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
    // Store the collection's bibtex type
    doc.add(new StringField(TYPE, type, Field.Store.YES));
    if (args.storeRaw) {
        doc.add(new StoredField(IndexArgs.RAW, bibtexDoc.raw()));
    }
    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeContents);
    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }
    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    doc.add(new Field(IndexArgs.CONTENTS, content, fieldType));
    for (Map.Entry<Key, Value> fieldEntry : bibtexEntry.getFields().entrySet()) {
        String fieldKey = fieldEntry.getKey().toString();
        String fieldValue = fieldEntry.getValue().toUserString();
        // not worth trying to parse/normalize all numbers at the moment
        if (fieldKey.equals(BibtexField.NUMBER.name)) {
            continue;
        }
        if (STRING_FIELD_NAMES.contains(fieldKey)) {
            // index field as single token
            doc.add(new StringField(fieldKey, fieldValue, Field.Store.YES));
        } else if (FIELDS_WITHOUT_STEMMING.contains(fieldKey)) {
            // index field without stemming but store original string value
            FieldType nonStemmedType = new FieldType(fieldType);
            nonStemmedType.setStored(true);
            // token stream to be indexed
            Analyzer nonStemmingAnalyzer = DefaultEnglishAnalyzer.newNonStemmingInstance(CharArraySet.EMPTY_SET);
            StringReader reader = new StringReader(fieldValue);
            TokenStream stream = nonStemmingAnalyzer.tokenStream(null, reader);
            Field field = new Field(fieldKey, fieldValue, nonStemmedType);
            field.setTokenStream(stream);
            doc.add(field);
            nonStemmingAnalyzer.close();
        } else if (fieldKey.equals(BibtexField.YEAR.name)) {
            // Compare by content: `!= ""` only compares references, so an empty year that is
            // not the interned literal would reach Integer.parseInt and fail.
            if (!fieldValue.isEmpty()) {
                // index as numeric value to allow range queries
                doc.add(new IntPoint(fieldKey, Integer.parseInt(fieldValue)));
            }
            // the original string is always stored, even when it is empty
            doc.add(new StoredField(fieldKey, fieldValue));
        } else {
            // default to normal Field with tokenization and stemming
            doc.add(new Field(fieldKey, fieldValue, fieldType));
        }
    }
    return doc;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) BibTeXEntry(org.jbibtex.BibTeXEntry) Document(org.apache.lucene.document.Document) Analyzer(org.apache.lucene.analysis.Analyzer) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) FieldType(org.apache.lucene.document.FieldType) StringField(org.apache.lucene.document.StringField) StoredField(org.apache.lucene.document.StoredField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Field(org.apache.lucene.document.Field) IntPoint(org.apache.lucene.document.IntPoint) StoredField(org.apache.lucene.document.StoredField) StringField(org.apache.lucene.document.StringField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Value(org.jbibtex.Value) StringReader(java.io.StringReader) Map(java.util.Map) BytesRef(org.apache.lucene.util.BytesRef) Key(org.jbibtex.Key)

Example 78 with IntPoint

use of org.apache.lucene.document.IntPoint in project Anserini by castorini.

the class TweetGenerator method createDocument.

/**
 * Builds a Lucene document from a tweet.
 *
 * <p>The tweet text is validated with the Twitter text parser, cut down to its valid range and,
 * unless {@code args.tweetKeepUrls} is set, stripped of URLs. Tweets that are listed in
 * {@code deletes}, have an id above {@code args.tweetMaxId}, or are retweets while
 * {@code args.tweetKeepRetweets} is off, are skipped.
 *
 * @param tweetDoc the source tweet
 * @return the Lucene document to index
 * @throws GeneratorException declared by the generator contract; this method throws
 *     {@code EmptyDocumentException} for blank text, {@code InvalidDocumentException} when the
 *     parser rejects the tweet, and {@code SkippedDocumentException} for filtered tweets
 */
@Override
public Document createDocument(TweetCollection.Document tweetDoc) throws GeneratorException {
    String id = tweetDoc.id();
    // trim once and reuse instead of re-trimming the contents for every step
    final String contents = tweetDoc.contents().trim();
    if (contents.isEmpty()) {
        throw new EmptyDocumentException();
    }
    final TwitterTextParseResults result = TwitterTextParser.parseTweet(contents);
    if (!result.isValid) {
        throw new InvalidDocumentException();
    }
    String text = contents.substring(result.validTextRange.start, result.validTextRange.end);
    if (!args.tweetKeepUrls) {
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            // Literal replacement: URLs contain regex metacharacters ('.', '?', '+', ...), so
            // replaceAll could match the wrong text or throw PatternSyntaxException.
            text = text.replace(url, "");
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        throw new EmptyDocumentException();
    }
    // Skip deleted tweet ids.
    if (deletes != null && deletes.contains(id)) {
        throw new SkippedDocumentException();
    }
    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        throw new SkippedDocumentException();
    }
    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        throw new SkippedDocumentException();
    }
    Document doc = new Document();
    doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
    // We need this to break scoring ties.
    doc.add(new LongPoint(TweetField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(TweetField.ID_LONG.name, tweetDoc.getIdLong()));
    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(TweetField.EPOCH.name, epoch)));
    doc.add(new StringField(TweetField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    // Each count goes into the field of the same name (friends/followers were swapped before).
    doc.add(new IntPoint(TweetField.FRIENDS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(TweetField.FOLLOWERS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(TweetField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));
    // The replied-to user id is only indexed when a replied-to status id is present.
    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(TweetField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.IN_REPLY_TO_USER_ID.name, ruid)));
    });
    // Retweet user id and retweet count are only indexed when a retweeted status id is present.
    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(TweetField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(TweetField.RETWEET_COUNT.name, rc)));
    });
    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(TweetField.LANG.name, lang, Field.Store.NO)));
    if (args.storeRaw) {
        // store the raw json string as one single field
        doc.add(new StoredField(IndexArgs.RAW, tweetDoc.getJsonString()));
    }
    FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeContents);
    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }
    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }
    doc.add(new Field(IndexArgs.CONTENTS, text, fieldType));
    return doc;
}
Also used : TwitterTextParseResults(com.twitter.twittertext.TwitterTextParseResults) LongPoint(org.apache.lucene.document.LongPoint) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) IntPoint(org.apache.lucene.document.IntPoint) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) StringField(org.apache.lucene.document.StringField) StoredField(org.apache.lucene.document.StoredField) Field(org.apache.lucene.document.Field) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) StringField(org.apache.lucene.document.StringField) Extractor(com.twitter.twittertext.Extractor)

Example 79 with IntPoint

use of org.apache.lucene.document.IntPoint in project cxf by apache.

the class TikaLuceneContentExtractor method addField.

/**
 * Adds the metadata entry {@code name}/{@code value} to the document, choosing the Lucene field
 * kind from the type registered for {@code name} in the document metadata.
 *
 * <p>Double, Float, Long and Integer/Byte typed fields are converted and added both as a point
 * field and as a stored field. Date typed fields are added as a stored string field holding the
 * converted date's string form, or the raw value when conversion yields {@code null}.
 * Everything else (no registered type, other number types, other classes) is added as a stored
 * string field holding the raw value.
 *
 * @param document the document to add fields to
 * @param documentMetadata supplies the field type and the converter provider
 * @param name the field name
 * @param value the raw string value
 */
private static void addField(final Document document, final LuceneDocumentMetadata documentMetadata, final String name, final String value) {
    final Class<?> type = documentMetadata.getFieldType(name);
    final ParamConverterProvider provider = documentMetadata.getFieldTypeConverter();

    // No type registered for this field: keep the raw string.
    if (type == null) {
        document.add(new StringField(name, value, Store.YES));
        return;
    }

    if (Number.class.isAssignableFrom(type)) {
        if (Double.class.isAssignableFrom(type)) {
            Double converted = ParamConverterUtils.getValue(Double.class, provider, value);
            document.add(new DoublePoint(name, converted));
            document.add(new StoredField(name, converted));
        } else if (Float.class.isAssignableFrom(type)) {
            Float converted = ParamConverterUtils.getValue(Float.class, provider, value);
            document.add(new FloatPoint(name, converted));
            document.add(new StoredField(name, converted));
        } else if (Long.class.isAssignableFrom(type)) {
            Long converted = ParamConverterUtils.getValue(Long.class, provider, value);
            document.add(new LongPoint(name, converted));
            document.add(new StoredField(name, converted));
        } else if (Integer.class.isAssignableFrom(type) || Byte.class.isAssignableFrom(type)) {
            // Byte typed fields are converted and indexed as Integer.
            Integer converted = ParamConverterUtils.getValue(Integer.class, provider, value);
            document.add(new IntPoint(name, converted));
            document.add(new StoredField(name, converted));
        } else {
            // Any other Number subtype is kept as its raw string.
            document.add(new StringField(name, value, Store.YES));
        }
        return;
    }

    if (Date.class.isAssignableFrom(type)) {
        final Date parsed = ParamConverterUtils.getValue(Date.class, provider, value);
        // Fall back to the raw value when the converter produced no date.
        final String text = parsed != null
            ? ParamConverterUtils.getString(Date.class, provider, parsed)
            : value;
        document.add(new StringField(name, text, Store.YES));
        return;
    }

    document.add(new StringField(name, value, Store.YES));
}
Also used : LongPoint(org.apache.lucene.document.LongPoint) Date(java.util.Date) IntPoint(org.apache.lucene.document.IntPoint) StringField(org.apache.lucene.document.StringField) StoredField(org.apache.lucene.document.StoredField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) StoredField(org.apache.lucene.document.StoredField) FloatPoint(org.apache.lucene.document.FloatPoint) ParamConverterProvider(javax.ws.rs.ext.ParamConverterProvider) StringField(org.apache.lucene.document.StringField) DoublePoint(org.apache.lucene.document.DoublePoint)

Example 80 with IntPoint

use of org.apache.lucene.document.IntPoint in project janusgraph by JanusGraph.

the class LuceneIndex method buildIndexFields.

/**
 * Derives the additional index fields (points, doc values, spatial fields) for every field of
 * the given document, based on the data type registered for the field's key.
 *
 * <p>The {@code DOCID} field is skipped. Doc-values fields are only added for keys with
 * {@code SINGLE} cardinality. Fields whose data type matches none of the handled categories
 * contribute nothing.
 *
 * @param doc the document whose fields are inspected
 * @param information lookup for the key information of each (original) field name
 * @return the list of derived fields, in document field order
 * @throws IllegalArgumentException if a geo field's WKT string cannot be parsed
 */
private List<IndexableField> buildIndexFields(final Document doc, final KeyInformation.StoreRetriever information) {
    final List<IndexableField> derived = new ArrayList<>();
    for (final IndexableField source : doc.getFields()) {
        final String name = source.name();
        if (name.equals(DOCID)) {
            continue;
        }
        final KeyInformation keyInfo = information.get(getOrigFieldName(name));
        // doc values are only added for single-valued keys
        final boolean sortable = keyInfo.getCardinality() == Cardinality.SINGLE;
        final Class<?> type = keyInfo.getDataType();

        if (AttributeUtils.isWholeNumber(type)) {
            final long number = source.numericValue().longValue();
            derived.add(new LongPoint(name, number));
            if (sortable) {
                derived.add(new NumericDocValuesField(name, number));
            }
            continue;
        }

        if (AttributeUtils.isDecimal(type)) {
            final double number = source.numericValue().doubleValue();
            derived.add(new DoublePoint(name, number));
            if (sortable) {
                derived.add(new DoubleDocValuesField(name, number));
            }
            continue;
        }

        if (AttributeUtils.isString(type)) {
            // strings only get a sorted doc-values field, and only for STRING/TEXTSTRING mappings
            final Mapping mapping = Mapping.getMapping(keyInfo);
            final boolean stringMapped = mapping == Mapping.STRING || mapping == Mapping.TEXTSTRING;
            if (stringMapped && sortable) {
                derived.add(new SortedDocValuesField(name, new BytesRef(source.stringValue())));
            }
            continue;
        }

        if (AttributeUtils.isGeo(type)) {
            if (log.isTraceEnabled()) {
                log.trace("Updating geo-indexes for key {}", name);
            }
            Shape geometry;
            try {
                // the stored string is the WKT text prefixed with GEOID
                geometry = Geoshape.fromWkt(source.stringValue().substring(GEOID.length())).getShape();
            } catch (java.text.ParseException e) {
                throw new IllegalArgumentException("Geoshape was not parsable", e);
            }
            final SpatialStrategy strategy = getSpatialStrategy(name, keyInfo);
            Collections.addAll(derived, strategy.createIndexableFields(geometry));
            continue;
        }

        if (type.equals(Date.class) || type.equals(Instant.class)) {
            final long timestamp = source.numericValue().longValue();
            derived.add(new LongPoint(name, timestamp));
            if (sortable) {
                derived.add(new NumericDocValuesField(name, timestamp));
            }
            continue;
        }

        if (type.equals(Boolean.class)) {
            final int raw = source.numericValue().intValue();
            // the point is normalized to 0/1; the doc value keeps the raw number
            derived.add(new IntPoint(name, raw == 1 ? 1 : 0));
            if (sortable) {
                derived.add(new NumericDocValuesField(name, raw));
            }
        }
    }
    return derived;
}
Also used : Shape(org.locationtech.spatial4j.shape.Shape) Instant(java.time.Instant) ArrayList(java.util.ArrayList) Mapping(org.janusgraph.core.schema.Mapping) LongPoint(org.apache.lucene.document.LongPoint) SpatialStrategy(org.apache.lucene.spatial.SpatialStrategy) Date(java.util.Date) KeyInformation(org.janusgraph.diskstorage.indexing.KeyInformation) IndexableField(org.apache.lucene.index.IndexableField) IntPoint(org.apache.lucene.document.IntPoint) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) DoubleDocValuesField(org.apache.lucene.document.DoubleDocValuesField) DoublePoint(org.apache.lucene.document.DoublePoint) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) ParseException(org.apache.lucene.queryparser.classic.ParseException) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

IntPoint (org.apache.lucene.document.IntPoint)81 Document (org.apache.lucene.document.Document)71 Directory (org.apache.lucene.store.Directory)47 LongPoint (org.apache.lucene.document.LongPoint)30 DoublePoint (org.apache.lucene.document.DoublePoint)29 FloatPoint (org.apache.lucene.document.FloatPoint)28 StoredField (org.apache.lucene.document.StoredField)27 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)25 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)24 IndexReader (org.apache.lucene.index.IndexReader)22 StringField (org.apache.lucene.document.StringField)21 IndexWriter (org.apache.lucene.index.IndexWriter)21 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)20 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)18 BytesRef (org.apache.lucene.util.BytesRef)18 BinaryPoint (org.apache.lucene.document.BinaryPoint)17 RAMDirectory (org.apache.lucene.store.RAMDirectory)16 Field (org.apache.lucene.document.Field)13 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)13 IndexSearcher (org.apache.lucene.search.IndexSearcher)12