Example usage of org.apache.lucene.document.IntPoint from the Apache CarbonData project,
in the class LuceneDataMapWriter, method onPageAdded.
/**
 * Add the column pages row to the datamap, order of pages is same as `indexColumns` in
 * DataMapMeta returned in DataMapFactory.
 * Implementation should copy the content of `pages` as needed, because `pages` memory
 * may be freed after this method returns, if using unsafe column page.
 *
 * Rows for the whole page are first buffered in an in-memory Lucene index
 * ({@code RAMDirectory}) and merged into the on-disk index in one shot once the
 * page is complete.
 *
 * @param blockletId id of the blocklet these pages belong to
 * @param pageId     id of the page within the blocklet
 * @param pages      one ColumnPage per indexed column; all share the same page size
 * @throws IOException if writing to the in-memory or on-disk index fails
 */
public void onPageAdded(int blockletId, int pageId, ColumnPage[] pages) throws IOException {
  int columnsCount = pages.length;
  // Check for empty input BEFORE allocating any index resources.
  if (columnsCount <= 0) {
    LOGGER.warn("empty data");
    return;
  }
  // try-with-resources guarantees the RAMDirectory is released even if an
  // exception is thrown while adding documents (the original code leaked both
  // the directory and the writer on any failure inside the loop).
  try (RAMDirectory ramDir = new RAMDirectory()) {
    IndexWriter ramIndexWriter = new IndexWriter(ramDir, new IndexWriterConfig(analyzer));
    try {
      int pageSize = pages[0].getPageSize();
      for (int rowId = 0; rowId < pageSize; rowId++) {
        // one Lucene document per row
        Document doc = new Document();
        // add block id (instance field), stored so it can be read back at query time
        doc.add(new StringField(BLOCKID_NAME, blockId, Field.Store.YES));
        // add blocklet id: an IntPoint for range/exact queries plus a StoredField
        // so the value can be retrieved from matching documents
        doc.add(new IntPoint(BLOCKLETID_NAME, new int[] { blockletId }));
        doc.add(new StoredField(BLOCKLETID_NAME, blockletId));
        // fine-grain datamaps additionally record page id and row id so a match
        // can be narrowed to a single row
        if (isFineGrain) {
          doc.add(new IntPoint(PAGEID_NAME, new int[] { pageId }));
          doc.add(new StoredField(PAGEID_NAME, pageId));
          doc.add(new IntPoint(ROWID_NAME, new int[] { rowId }));
          doc.add(new StoredField(ROWID_NAME, rowId));
        }
        // add the actual column values, skipping nulls
        for (int colIdx = 0; colIdx < columnsCount; colIdx++) {
          if (!pages[colIdx].getNullBits().get(rowId)) {
            addField(doc, pages[colIdx], rowId, Field.Store.NO);
          }
        }
        ramIndexWriter.addDocument(doc);
      }
    } finally {
      // The writer MUST be closed before addIndexes so the in-memory segment is
      // committed and readable; close it even when document addition failed.
      ramIndexWriter.close();
    }
    // merge the buffered page into the persistent index
    indexWriter.addIndexes(new Directory[] { ramDir });
  }
}
Example usage of org.apache.lucene.document.IntPoint from the Anserini project (castorini),
in the class BibtexGenerator, method createDocument.
/**
 * Converts a parsed BibTeX entry into a Lucene {@link Document}.
 *
 * The document always carries the collection docid (stored + doc-values for
 * tie-breaking), the BibTeX entry type, and the tokenized contents; every other
 * BibTeX field is indexed according to its kind (exact-match string, non-stemmed
 * text, numeric year, or default analyzed text).
 *
 * @param bibtexDoc source document from the BibTeX collection
 * @return the Lucene document ready for indexing
 * @throws GeneratorException if the document has no usable contents
 */
@Override
public Document createDocument(BibtexCollection.Document bibtexDoc) throws GeneratorException {
  String id = bibtexDoc.id();
  String content = bibtexDoc.contents();
  String type = bibtexDoc.type();
  BibTeXEntry bibtexEntry = bibtexDoc.bibtexEntry();
  if (content == null || content.trim().isEmpty()) {
    throw new EmptyDocumentException();
  }
  Document doc = new Document();
  // Store the collection docid.
  doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
  // This is needed to break score ties by docid.
  doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
  // Store the collection's bibtex type.
  doc.add(new StringField(TYPE, type, Field.Store.YES));
  if (args.storeRaw) {
    doc.add(new StoredField(IndexArgs.RAW, bibtexDoc.raw()));
  }
  FieldType fieldType = new FieldType();
  fieldType.setStored(args.storeContents);
  // Are we storing document vectors?
  if (args.storeDocvectors) {
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
  }
  // Are we building a "positional" or "count" index?
  if (args.storePositions) {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  } else {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  }
  doc.add(new Field(IndexArgs.CONTENTS, content, fieldType));
  for (Map.Entry<Key, Value> fieldEntry : bibtexEntry.getFields().entrySet()) {
    String fieldKey = fieldEntry.getKey().toString();
    String fieldValue = fieldEntry.getValue().toUserString();
    // not worth trying to parse/normalize all numbers at the moment
    if (fieldKey.equals(BibtexField.NUMBER.name)) {
      continue;
    }
    if (STRING_FIELD_NAMES.contains(fieldKey)) {
      // index field as single token
      doc.add(new StringField(fieldKey, fieldValue, Field.Store.YES));
    } else if (FIELDS_WITHOUT_STEMMING.contains(fieldKey)) {
      // index field without stemming but store original string value
      FieldType nonStemmedType = new FieldType(fieldType);
      nonStemmedType.setStored(true);
      // token stream to be indexed
      Analyzer nonStemmingAnalyzer = DefaultEnglishAnalyzer.newNonStemmingInstance(CharArraySet.EMPTY_SET);
      StringReader reader = new StringReader(fieldValue);
      TokenStream stream = nonStemmingAnalyzer.tokenStream(null, reader);
      Field field = new Field(fieldKey, fieldValue, nonStemmedType);
      field.setTokenStream(stream);
      doc.add(field);
      nonStemmingAnalyzer.close();
    } else if (fieldKey.equals(BibtexField.YEAR.name)) {
      // BUGFIX: the original guard was `fieldValue != ""`, a reference comparison
      // that is effectively always true, so Integer.parseInt("") could throw and
      // abort indexing of the whole document.
      if (!fieldValue.trim().isEmpty()) {
        try {
          // index as numeric value to allow range queries
          doc.add(new IntPoint(fieldKey, Integer.parseInt(fieldValue.trim())));
        } catch (NumberFormatException ignored) {
          // non-numeric year (e.g. "forthcoming"): skip the numeric point but
          // still store the raw value below
        }
      }
      doc.add(new StoredField(fieldKey, fieldValue));
    } else {
      // default to normal Field with tokenization and stemming
      doc.add(new Field(fieldKey, fieldValue, fieldType));
    }
  }
  return doc;
}
Example usage of org.apache.lucene.document.IntPoint from the Anserini project (castorini),
in the class TweetGenerator, method createDocument.
/**
 * Converts a tweet into a Lucene {@link Document}.
 *
 * Validates the tweet text via twitter-text, optionally strips URLs, applies the
 * configured skip rules (deleted ids, max id, retweets), then indexes the tweet's
 * metadata (numeric points for range queries) and its text contents.
 *
 * @param tweetDoc source tweet
 * @return the Lucene document ready for indexing
 * @throws GeneratorException if the tweet is empty, invalid, or filtered out
 */
@Override
public Document createDocument(TweetCollection.Document tweetDoc) throws GeneratorException {
  String id = tweetDoc.id();
  if (tweetDoc.contents().trim().isEmpty()) {
    throw new EmptyDocumentException();
  }
  final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.contents().trim());
  if (!result.isValid) {
    throw new InvalidDocumentException();
  }
  String text = tweetDoc.contents().trim().substring(result.validTextRange.start, result.validTextRange.end);
  if (!args.tweetKeepUrls) {
    final Extractor extractor = new Extractor();
    final List<String> urls = extractor.extractURLs(text);
    for (String url : urls) {
      // BUGFIX: replaceAll() interprets its first argument as a regex; URLs contain
      // metacharacters ('?', '.', '+') that would corrupt the match or throw
      // PatternSyntaxException. replace() removes all literal occurrences.
      text = text.replace(url, "");
    }
  }
  text = text.trim();
  if (text.isEmpty()) {
    throw new EmptyDocumentException();
  }
  // Skip deleted tweetids.
  if (deletes != null && deletes.contains(id)) {
    throw new SkippedDocumentException();
  }
  if (tweetDoc.getIdLong() > args.tweetMaxId) {
    throw new SkippedDocumentException();
  }
  if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
    throw new SkippedDocumentException();
  }
  Document doc = new Document();
  doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
  // We need this to break scoring ties.
  doc.add(new LongPoint(TweetField.ID_LONG.name, tweetDoc.getIdLong()));
  doc.add(new NumericDocValuesField(TweetField.ID_LONG.name, tweetDoc.getIdLong()));
  tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(TweetField.EPOCH.name, epoch)));
  doc.add(new StringField(TweetField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
  // BUGFIX: the original swapped these two — FRIENDS_COUNT was filled from
  // getFollowersCount() and FOLLOWERS_COUNT from getFriendsCount().
  doc.add(new IntPoint(TweetField.FRIENDS_COUNT.name, tweetDoc.getFriendsCount()));
  doc.add(new IntPoint(TweetField.FOLLOWERS_COUNT.name, tweetDoc.getFollowersCount()));
  doc.add(new IntPoint(TweetField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));
  tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
    doc.add(new LongPoint(TweetField.IN_REPLY_TO_STATUS_ID.name, rid));
    tweetDoc.getInReplyToUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.IN_REPLY_TO_USER_ID.name, ruid)));
  });
  tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
    doc.add(new LongPoint(TweetField.RETWEETED_STATUS_ID.name, rid));
    tweetDoc.getRetweetedUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.RETWEETED_USER_ID.name, ruid)));
    tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(TweetField.RETWEET_COUNT.name, rc)));
  });
  tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(TweetField.LANG.name, lang, Field.Store.NO)));
  if (args.storeRaw) {
    // store the raw json string as one single field
    doc.add(new StoredField(IndexArgs.RAW, tweetDoc.getJsonString()));
  }
  FieldType fieldType = new FieldType();
  fieldType.setStored(args.storeContents);
  // Are we storing document vectors?
  if (args.storeDocvectors) {
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
  }
  // Are we building a "positional" or "count" index?
  if (args.storePositions) {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  } else {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  }
  doc.add(new Field(IndexArgs.CONTENTS, text, fieldType));
  return doc;
}
Example usage of org.apache.lucene.document.IntPoint from the Apache CXF project,
in the class TikaLuceneContentExtractor, method addField.
/**
 * Adds a metadata value to the Lucene document, typed according to the field's
 * declared type in the document metadata.
 *
 * Numeric types get a point field (for range queries) plus a stored field;
 * dates are normalized through the converter and stored as strings; anything
 * else — including an unknown type or an unparseable value — falls back to a
 * stored StringField with the raw text.
 *
 * @param document         target Lucene document
 * @param documentMetadata per-field type information and converter provider
 * @param name             field name
 * @param value            raw string value extracted by Tika
 */
private static void addField(final Document document, final LuceneDocumentMetadata documentMetadata, final String name, final String value) {
    final Class<?> type = documentMetadata.getFieldType(name);
    final ParamConverterProvider provider = documentMetadata.getFieldTypeConverter();
    if (type != null) {
        if (Number.class.isAssignableFrom(type)) {
            if (Double.class.isAssignableFrom(type)) {
                final Double number = ParamConverterUtils.getValue(Double.class, provider, value);
                // Null-check before use: unboxing a failed conversion would NPE
                // (the Date branch below already guarded; the numeric ones did not).
                if (number != null) {
                    document.add(new DoublePoint(name, number));
                    document.add(new StoredField(name, number));
                    return;
                }
            } else if (Float.class.isAssignableFrom(type)) {
                final Float number = ParamConverterUtils.getValue(Float.class, provider, value);
                if (number != null) {
                    document.add(new FloatPoint(name, number));
                    document.add(new StoredField(name, number));
                    return;
                }
            } else if (Long.class.isAssignableFrom(type)) {
                final Long number = ParamConverterUtils.getValue(Long.class, provider, value);
                if (number != null) {
                    document.add(new LongPoint(name, number));
                    document.add(new StoredField(name, number));
                    return;
                }
            } else if (Integer.class.isAssignableFrom(type)
                    || Short.class.isAssignableFrom(type)
                    || Byte.class.isAssignableFrom(type)) {
                // Short was previously left out and silently indexed as a plain
                // string; it fits an int point just like Byte does.
                final Integer number = ParamConverterUtils.getValue(Integer.class, provider, value);
                if (number != null) {
                    document.add(new IntPoint(name, number));
                    document.add(new StoredField(name, number));
                    return;
                }
            }
            // Unrecognized Number subtype or failed conversion: index raw text.
            document.add(new StringField(name, value, Store.YES));
            return;
        } else if (Date.class.isAssignableFrom(type)) {
            final Date date = ParamConverterUtils.getValue(Date.class, provider, value);
            final Field field;
            if (date != null) {
                // store the normalized (round-tripped) date representation
                field = new StringField(name, ParamConverterUtils.getString(Date.class, provider, date), Store.YES);
            } else {
                field = new StringField(name, value, Store.YES);
            }
            document.add(field);
            return;
        }
    }
    // No declared type: default to an exact-match stored string field.
    document.add(new StringField(name, value, Store.YES));
}
Example usage of org.apache.lucene.document.IntPoint from the JanusGraph project,
in the class LuceneIndex, method buildIndexFields.
/**
 * Derives the full set of indexable Lucene fields for a document, based on the
 * JanusGraph data type registered for each field.
 *
 * For each stored field (except the internal docid) this emits the appropriate
 * point field for querying, plus a doc-values field for sorting when the key
 * has SINGLE cardinality.
 *
 * @param doc         the Lucene document whose fields are being (re)built
 * @param information resolver for per-field key information
 * @return the list of point / doc-values / spatial fields to index
 */
private List<IndexableField> buildIndexFields(final Document doc, final KeyInformation.StoreRetriever information) {
    final List<IndexableField> result = new ArrayList<>();
    for (final IndexableField sourceField : doc.getFields()) {
        final String name = sourceField.name();
        // the internal document id is never re-indexed
        if (DOCID.equals(name)) {
            continue;
        }
        final KeyInformation keyInfo = information.get(getOrigFieldName(name));
        // only SINGLE-cardinality keys can back a sort index
        final boolean sortable = keyInfo.getCardinality() == Cardinality.SINGLE;
        final Class<?> dataType = keyInfo.getDataType();
        if (AttributeUtils.isWholeNumber(dataType)) {
            final long longVal = sourceField.numericValue().longValue();
            result.add(new LongPoint(name, longVal));
            if (sortable) {
                result.add(new NumericDocValuesField(name, longVal));
            }
        } else if (AttributeUtils.isDecimal(dataType)) {
            final double doubleVal = sourceField.numericValue().doubleValue();
            result.add(new DoublePoint(name, doubleVal));
            if (sortable) {
                result.add(new DoubleDocValuesField(name, doubleVal));
            }
        } else if (AttributeUtils.isString(dataType)) {
            final Mapping mapping = Mapping.getMapping(keyInfo);
            // strings only gain a doc-values entry; querying goes through the
            // regular indexed field
            if (sortable && (mapping == Mapping.STRING || mapping == Mapping.TEXTSTRING)) {
                result.add(new SortedDocValuesField(name, new BytesRef(sourceField.stringValue())));
            }
        } else if (AttributeUtils.isGeo(dataType)) {
            if (log.isTraceEnabled()) {
                log.trace("Updating geo-indexes for key {}", name);
            }
            final Shape shape;
            try {
                // stored value is the WKT text prefixed with GEOID
                shape = Geoshape.fromWkt(sourceField.stringValue().substring(GEOID.length())).getShape();
            } catch (java.text.ParseException e) {
                throw new IllegalArgumentException("Geoshape was not parsable", e);
            }
            Collections.addAll(result, getSpatialStrategy(name, keyInfo).createIndexableFields(shape));
        } else if (dataType.equals(Date.class) || dataType.equals(Instant.class)) {
            // temporal values are indexed by their epoch-millis long
            final long epochVal = sourceField.numericValue().longValue();
            result.add(new LongPoint(name, epochVal));
            if (sortable) {
                result.add(new NumericDocValuesField(name, epochVal));
            }
        } else if (dataType.equals(Boolean.class)) {
            final int rawBool = sourceField.numericValue().intValue();
            // point value is normalized to 0/1; doc-values keeps the raw int
            result.add(new IntPoint(name, rawBool == 1 ? 1 : 0));
            if (sortable) {
                result.add(new NumericDocValuesField(name, rawBool));
            }
        }
    }
    return result;
}
End of aggregated IntPoint usage examples.