Use of org.apache.lucene.document.Fieldable in project polymap4-core by Polymap4.
The class LuceneRecordState, method put().
public <T> LuceneRecordState put(String key, T value) {
    assert key != null && key.length() > 0 : "Key must not be null or empty.";
    assert value != null : "Value must not be null.";
    checkCopyOnWrite();
    Fieldable old = doc.getFieldable(key);
    if (old != null) {
        // FIXME ValueCoder may have different/additional keys
        doc.removeField(key);
    }
    boolean indexed = store.getIndexFieldSelector().test(key);
    store.valueCoders.encode(doc, key, value, indexed);
    return this;
}
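Since put() returns the LuceneRecordState itself, calls can be chained; each call first removes any existing field for the key and then lets the store's ValueCoders encode the new value, indexed or not depending on the field selector. A minimal usage sketch, assuming a state instance is already available; the helper name, variable, and field names below are hypothetical:

// Hypothetical usage sketch: chained puts on an existing LuceneRecordState.
static LuceneRecordState fillRecord(LuceneRecordState record) {
    return record
            .put("name", "Mount Everest")   // overwrites any previously encoded "name" field
            .put("elevation", 8848);        // encoding is chosen by the store's ValueCoders
}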
Use of org.apache.lucene.document.Fieldable in project neo4j-mobile-android by neo4j-contrib.
The class IndexType, method clearDocument().
private void clearDocument(Document document) {
    Set<String> names = new HashSet<String>();
    for (Fieldable field : document.getFields()) {
        names.add(field.name());
    }
    names.remove(LuceneIndex.KEY_DOC_ID);
    for (String name : names) {
        document.removeFields(name);
    }
}
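The field names are collected into a set first and removed in a second pass, so the list returned by getFields() is never modified while it is being iterated, and the document-id field (LuceneIndex.KEY_DOC_ID) is the only one kept. A minimal standalone sketch of the same collect-then-remove pattern, using hypothetical field names in place of the Neo4j ones:

// Hypothetical sketch: strip every field except the identifier field.
Document doc = new Document();
doc.add(new Field("_id_", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("title", "old title", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("title", "alt title", Field.Store.YES, Field.Index.ANALYZED));
Set<String> names = new HashSet<String>();
for (Fieldable f : doc.getFields()) {
    names.add(f.name());
}
names.remove("_id_");              // keep the identifier field
for (String name : names) {
    doc.removeFields(name);        // removes both "title" values at once
}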
Use of org.apache.lucene.document.Fieldable in project jackrabbit by apache.
The class NodeIndexer, method createDoc().
/**
 * Creates a lucene Document.
 *
 * @return the lucene Document with the index layout.
 * @throws RepositoryException if an error occurs while reading property
 *             values from the <code>ItemStateProvider</code>.
 */
public Document createDoc() throws RepositoryException {
    doNotUseInExcerpt.clear();
    Document doc = new Document();
    doc.setBoost(getNodeBoost());
    // special fields
    // UUID
    doc.add(new IDField(node.getNodeId()));
    try {
        // parent UUID
        if (node.getParentId() == null) {
            // root node
            Field parent = new Field(FieldNames.PARENT, false, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO);
            parent.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
            doc.add(parent);
            addNodeName(doc, "", "");
        } else if (node.getSharedSet().isEmpty()) {
            addParentChildRelation(doc, node.getParentId());
        } else {
            // shareable node
            for (NodeId id : node.getSharedSet()) {
                addParentChildRelation(doc, id);
            }
            // mark shareable nodes
            doc.add(new Field(FieldNames.SHAREABLE_NODE, false, "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        }
    } catch (NoSuchItemStateException e) {
        throwRepositoryException(e);
    } catch (ItemStateException e) {
        throwRepositoryException(e);
    } catch (NamespaceException e) {
        // will never happen, because this.mappings will dynamically add
        // unknown uri<->prefix mappings
    }
    Set<Name> props = node.getPropertyNames();
    for (Name propName : props) {
        if (isIndexed(propName)) {
            PropertyId id = new PropertyId(node.getNodeId(), propName);
            try {
                PropertyState propState = (PropertyState) stateProvider.getItemState(id);
                // beginning with V2
                if (indexFormatVersion.getVersion() >= IndexFormatVersion.V2.getVersion()) {
                    addPropertyName(doc, propState.getName());
                }
                InternalValue[] values = propState.getValues();
                for (InternalValue value : values) {
                    addValue(doc, value, propState.getName());
                }
                if (values.length > 1) {
                    // real multi-valued
                    addMVPName(doc, propState.getName());
                }
            } catch (NoSuchItemStateException e) {
                throwRepositoryException(e);
            } catch (ItemStateException e) {
                throwRepositoryException(e);
            }
        }
    }
    // now add fields that are not used in excerpt (must go at the end)
    for (Fieldable field : doNotUseInExcerpt) {
        doc.add(field);
    }
    return doc;
}
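For the root node the parent reference is written as an empty, stored, not-analyzed field whose index options are restricted to DOCS_ONLY, so the postings for that field record only which documents contain the term and skip frequencies and positions. A minimal standalone sketch of a field built with the same Lucene 3.x flags; the field name and value are placeholders, not Jackrabbit's actual constants:

// Hypothetical sketch: identifier-style field with the same flags as FieldNames.PARENT above.
Field parentRef = new Field("parentUUID", "cafebabe-0000-0000-0000-000000000000",
        Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO);
parentRef.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY); // postings keep doc ids only
Document doc = new Document();
doc.add(parentRef);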
Use of org.apache.lucene.document.Fieldable in project Solbase by Photobucket.
The class DocumentLoader, method processDocument().
private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore) {
    try {
        @SuppressWarnings("unchecked")
        List<Fieldable> newFields = newDoc.getFields();
        boolean termVectorChanged = false;
        for (Fieldable field : newFields) {
            if (field.isIndexed() || field instanceof EmbeddedSortField) {
                termVectorChanged = true;
                break;
            }
        }
        // do diff on terms
        if (termVectorChanged) {
            Field docIdField = oldDoc.getField("docId");
            // cloning old doc, so it won't conflict with read
            oldDoc = new Document(oldDoc);
            oldDoc.removeField("docId");
            // parsing old doc to get all terms
            try {
                ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<Term> oldTerms = oldParsedDoc.getAllTerms();
                List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
                Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
                ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
                List<Term> newTerms = parsedDoc.getAllTerms();
                List<Term> updateList = new ArrayList<Term>(oldTerms);
                List<Term> deleteList = new ArrayList<Term>(oldTerms);
                List<Term> addList = new ArrayList<Term>(newTerms);
                Collections.copy(updateList, oldTerms);
                Collections.copy(deleteList, oldTerms);
                Collections.copy(addList, newTerms);
                updateList.retainAll(newTerms);
                deleteList.removeAll(newTerms);
                addList.removeAll(oldTerms);
                int shardNum = SolbaseShardUtil.getShardNum(indexName);
                int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
                int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
                // updating tv first
                for (TermDocMetadata termDocMeta : newTermDocMetas) {
                    Term term = termDocMeta.getTerm();
                    if (updateList.contains(term)) {
                        logger.debug("updating this term: " + term.toString());
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
                    } else if (addList.contains(term)) {
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
                    }
                }
                // clean up deletes
                if (deleteList.size() > 0) {
                    for (TermDocMetadata termDocMeta : oldTermDocMetas) {
                        Term term = termDocMeta.getTerm();
                        if (deleteList.contains(term)) {
                            ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
                        }
                    }
                }
                parsedDoc.getDocument().add(docIdField);
                return parsedDoc;
            } catch (NullPointerException e) {
                return null;
            }
        } else {
            Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
            ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
            return parsedDoc;
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (MemcachedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (TimeoutException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return null;
}
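The term diff above reduces to three set operations: terms present in both the old and the merged document are updates, terms only in the old document are deletes, and terms only in the merged document are adds. A minimal standalone sketch of that arithmetic with placeholder terms (the ReaderCache calls in the method above are what each resulting list feeds):

// Hypothetical sketch of the old/new term diff.
List<Term> oldTerms = Arrays.asList(new Term("f", "a"), new Term("f", "b"));
List<Term> newTerms = Arrays.asList(new Term("f", "b"), new Term("f", "c"));
List<Term> updateList = new ArrayList<Term>(oldTerms);
updateList.retainAll(newTerms);   // [f:b] -> UPDATE the term's doc metadata
List<Term> deleteList = new ArrayList<Term>(oldTerms);
deleteList.removeAll(newTerms);   // [f:a] -> DELETE this doc from the term's postings
List<Term> addList = new ArrayList<Term>(newTerms);
addList.removeAll(oldTerms);      // [f:c] -> ADD this doc to the term's postings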
Use of org.apache.lucene.document.Fieldable in project Solbase by Photobucket.
The class IndexWriter, method parseDoc().
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given doc, what are all of terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
    // need to hold onto TermDocMetaData, so it can return this array
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;
    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // reset the TokenStream to the first token
            tokens.reset();
            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            // store normalizations of field per term per document rather than per field;
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);
            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());
                allIndexedTerms.add(term);
                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }
                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);
                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }
                    positionVector.add(++position);
                }
                // term offsets
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }
                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }
                int order = 0;
                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);
                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();
                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }
            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();
                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }
                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                metadatas.add(data);
            }
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);
            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // last byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
                fieldCache.put(field.name(), newValue);
            }
        }
    }
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }
    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
        // last byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }
    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
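Stored values are written to HBase as the raw payload plus one trailing flag byte: Byte.MAX_VALUE marks a binary field, Byte.MIN_VALUE a string field. A minimal sketch of that encoding together with a matching decode; the decode side is an assumption inferred from the flag convention above, not code from the project:

// Hypothetical sketch of the stored-value layout: payload bytes + 1 trailing flag byte.
byte[] raw = Bytes.toBytes("hello");                // string payload
byte[] stored = new byte[raw.length + 1];
System.arraycopy(raw, 0, stored, 0, raw.length);
stored[stored.length - 1] = Byte.MIN_VALUE;         // MIN_VALUE = string, MAX_VALUE = binary

// Assumed decode: strip the flag byte and branch on it.
boolean isBinary = stored[stored.length - 1] == Byte.MAX_VALUE;
byte[] payload = Arrays.copyOf(stored, stored.length - 1);
String text = isBinary ? null : Bytes.toString(payload);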