Use of org.apache.lucene.document.Document in project Solbase by Photobucket.
The class DocumentLoader, method loadObject.
public CachedObjectWrapper<Document, Long> loadObject(Integer docNum, int start, int end, LayeredCache<Integer, Document, Long, ParsedDoc> cache) throws IOException {
    Document document = new Document();
    Get documentGet = new Get(SolbaseUtil.randomize(docNum));
    if (fieldNames == null || fieldNames.size() == 0) {
        // get all of the "field" columns (this skips the meta info)
        documentGet.addFamily(Bytes.toBytes("field"));
    } else {
        for (byte[] fieldName : fieldNames) {
            documentGet.addColumn(Bytes.toBytes("field"), fieldName);
        }
    }
    Result documentResult = null;
    // if docTable is set up, reuse the instance; otherwise create a brand new one and close it when done
    if (this.docTable == null) {
        HTableInterface docTable = null;
        try {
            docTable = SolbaseUtil.getDocTable();
            documentResult = docTable.get(documentGet);
        } finally {
            SolbaseUtil.releaseTable(docTable);
        }
    } else {
        documentResult = this.docTable.get(documentGet);
    }
    if (documentResult == null || documentResult.isEmpty()) {
        return null;
    }
    // TODO: read the version identifier from the result
    Long versionIdentifier = 0L;
    NavigableMap<byte[], byte[]> familyMap = documentResult.getFamilyMap(Bytes.toBytes("field"));
    for (Map.Entry<byte[], byte[]> fieldColumn : familyMap.entrySet()) {
        Field field = null;
        String fieldName = Bytes.toString(fieldColumn.getKey());
        byte[] value;
        ByteBuffer v = ByteBuffer.wrap(fieldColumn.getValue());
        int vlimit = v.limit() + v.arrayOffset();
        // the last byte of every stored value is a type marker:
        // Byte.MAX_VALUE tags binary payloads, Byte.MIN_VALUE tags strings
        if (v.array()[vlimit - 1] != Byte.MAX_VALUE && v.array()[vlimit - 1] != Byte.MIN_VALUE) {
            throw new CorruptIndexException("Solbase field is not properly encoded: " + docNum + " (" + fieldName + ")");
        } else if (v.array()[vlimit - 1] == Byte.MAX_VALUE) {
            // binary value: copy the payload without the trailing marker byte
            value = new byte[vlimit - 1];
            System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
            field = new Field(fieldName, value, Store.YES);
            document.add(field);
        } else if (v.array()[vlimit - 1] == Byte.MIN_VALUE) {
            // string value: copy the payload without the trailing marker byte
            value = new byte[vlimit - 1];
            System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
            // check for multi-valued fields, which are stored delimiter-joined in a single cell
            String fieldString = new String(value, "UTF-8");
            if (fieldString.indexOf(Bytes.toString(SolbaseUtil.delimiter)) >= 0) {
                StringTokenizer tok = new StringTokenizer(fieldString, Bytes.toString(SolbaseUtil.delimiter));
                while (tok.hasMoreTokens()) {
                    // update logic
                    if (schema != null) {
                        // note: getFieldOrNull can return null, so a field missing
                        // from the schema would throw a NullPointerException here
                        SchemaField sfield = schema.getFieldOrNull(fieldName);
                        if (sfield.getType() instanceof EmbeddedIndexedIntField) {
                            EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField) sfield.getType();
                            EmbeddedSortField sf = new EmbeddedSortField(fieldName, tok.nextToken(), Field.Store.YES, Field.Index.NO, eiif.getFieldNumber());
                            document.add(sf);
                        } else {
                            Field f = sfield.createField(tok.nextToken(), 1.0f);
                            if (f != null) {
                                // null fields are not added
                                document.add(f);
                            }
                        }
                    } else {
                        field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED);
                        document.add(field);
                    }
                }
            } else {
                // update logic
                if (schema != null) {
                    SchemaField sfield = schema.getFieldOrNull(fieldName);
                    if (sfield.getType() instanceof EmbeddedIndexedIntField) {
                        EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField) sfield.getType();
                        EmbeddedSortField sf = new EmbeddedSortField(fieldName, fieldString, Field.Store.YES, Field.Index.NO, eiif.getFieldNumber());
                        document.add(sf);
                    } else {
                        Field f = sfield.createField(fieldString, 1.0f);
                        if (f != null) {
                            // null fields are not added
                            document.add(f);
                        }
                    }
                } else {
                    field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED);
                    document.add(field);
                }
            }
        }
    }
    return new CachedObjectWrapper<Document, Long>(document, versionIdentifier, System.currentTimeMillis());
}
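loadObject decodes each stored cell by inspecting a trailing marker byte: Byte.MAX_VALUE tags a binary payload and Byte.MIN_VALUE a UTF-8 string. Below is a minimal sketch of the matching encoder under that assumption; the FieldValueCodec class and its method names are hypothetical, not taken from Solbase.

import java.nio.charset.Charset;

public class FieldValueCodec {
    // marker convention assumed from loadObject above
    static final byte STRING_MARKER = Byte.MIN_VALUE;
    static final byte BINARY_MARKER = Byte.MAX_VALUE;

    static byte[] encodeString(String s) {
        return withMarker(s.getBytes(Charset.forName("UTF-8")), STRING_MARKER);
    }

    static byte[] encodeBinary(byte[] payload) {
        return withMarker(payload, BINARY_MARKER);
    }

    // append the type marker so the decoder can strip it off again
    private static byte[] withMarker(byte[] payload, byte marker) {
        byte[] out = new byte[payload.length + 1];
        System.arraycopy(payload, 0, out, 0, payload.length);
        out[out.length - 1] = marker;
        return out;
    }
}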
Use of org.apache.lucene.document.Document in project Solbase by Photobucket.
The class DocumentLoader, method updateObject.
@Override
public void updateObject(CachedObjectWrapper<Document, Long> object, ParsedDoc modificationData, LayeredCache<Integer, Document, Long, ParsedDoc> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
    if (modType == ModificationType.DELETE) {
        // we don't want to delete eagerly, because someone else might still be
        // using the doc; just let it fall out of the LRU cache
        Document oldDoc = object.getValue();
        if (oldDoc != null) {
            // another cluster might have already deleted this doc, in which case the cache doesn't have it
            modificationData.copyFrom(deleteDocument(oldDoc, Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexName(), modificationData.getIndexWriter(), modificationData.getIndexUtil(), modificationData.getUpdateStore(), startDocId, endDocId));
        }
    } else if (modType == LayeredCache.ModificationType.UPDATE) {
        Document newDoc = modificationData.getDocument();
        Document oldDoc = object.getValue();
        logger.debug("processDocument() call in updateObject() for docId: " + Integer.parseInt(oldDoc.getField("docId").stringValue()));
        modificationData.copyFrom(processDocument(newDoc, oldDoc, modificationData.getIndexName(), Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexUtil(), modificationData.getIndexWriter(), modificationData.getUpdateStore()));
        object.setValue(modificationData.getDocument());
    } else if (modType == LayeredCache.ModificationType.ADD) {
        // TODO: it should never get here; a newly added doc is obviously not going to be in the cache
        Document oldDoc = object.getValue();
        logger.warn("should never get here; a newly added doc should never be in the cache: " + oldDoc.toString());
    }
}
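All three branches recover the document id by parsing the stored "docId" field. A self-contained illustration of that lookup against the Lucene 3.x API; the document built here is illustrative.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class DocIdLookupSketch {
    public static void main(String[] args) {
        // Solbase keeps the numeric id in a stored "docId" field
        Document doc = new Document();
        doc.add(new Field("docId", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
        int docId = Integer.parseInt(doc.getField("docId").stringValue());
        System.out.println(docId); // prints 42
    }
}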
Use of org.apache.lucene.document.Document in project Solbase by Photobucket.
The class DocumentLoader, method processDocument.
private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore) {
    try {
        @SuppressWarnings("unchecked")
        List<Fieldable> newFields = newDoc.getFields();
        boolean termVectorChanged = false;
        for (Fieldable field : newFields) {
            if (field.isIndexed() || field instanceof EmbeddedSortField) {
                termVectorChanged = true;
                break;
            }
        }
        // do a diff on the terms
        if (termVectorChanged) {
            Field docIdField = oldDoc.getField("docId");
            // clone the old doc so the modification won't conflict with concurrent reads
            oldDoc = new Document(oldDoc);
            oldDoc.removeField("docId");
            // parse the old doc to get all of its terms
            try {
                ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<Term> oldTerms = oldParsedDoc.getAllTerms();
                List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
                Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
                ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
                List<Term> newTerms = parsedDoc.getAllTerms();
                // the copy constructors give each list its own backing copy of the terms
                List<Term> updateList = new ArrayList<Term>(oldTerms);
                List<Term> deleteList = new ArrayList<Term>(oldTerms);
                List<Term> addList = new ArrayList<Term>(newTerms);
                updateList.retainAll(newTerms); // terms in both the old and the new doc
                deleteList.removeAll(newTerms); // terms only in the old doc
                addList.removeAll(oldTerms);    // terms only in the new doc
                int shardNum = SolbaseShardUtil.getShardNum(indexName);
                int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
                int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
                // update the term vectors first
                for (TermDocMetadata termDocMeta : newTermDocMetas) {
                    Term term = termDocMeta.getTerm();
                    if (updateList.contains(term)) {
                        logger.debug("updating this term: " + term.toString());
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
                    } else if (addList.contains(term)) {
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
                    }
                }
                // clean up the deletes
                if (deleteList.size() > 0) {
                    for (TermDocMetadata termDocMeta : oldTermDocMetas) {
                        Term term = termDocMeta.getTerm();
                        if (deleteList.contains(term)) {
                            ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
                        }
                    }
                }
                parsedDoc.getDocument().add(docIdField);
                return parsedDoc;
            } catch (NullPointerException e) {
                // a NullPointerException here means the doc could not be parsed
                // (e.g. a field missing from the schema); give up on the update
                return null;
            }
        } else {
            // nothing indexed changed, so merging the stored fields is enough
            Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
            ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
            return parsedDoc;
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    } catch (MemcachedException e) {
        e.printStackTrace();
    } catch (TimeoutException e) {
        e.printStackTrace();
    }
    // any of the exceptions above leaves the method without a parsed doc
    return null;
}
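The update/delete/add partition above is plain set arithmetic over the old and new term lists. A minimal self-contained sketch of the same diff, using strings in place of Lucene Term objects:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TermDiffSketch {
    public static void main(String[] args) {
        List<String> oldTerms = Arrays.asList("a", "b", "c");
        List<String> newTerms = Arrays.asList("b", "c", "d");

        List<String> updateList = new ArrayList<String>(oldTerms);
        updateList.retainAll(newTerms); // in both docs -> UPDATE: [b, c]

        List<String> deleteList = new ArrayList<String>(oldTerms);
        deleteList.removeAll(newTerms); // only in the old doc -> DELETE: [a]

        List<String> addList = new ArrayList<String>(newTerms);
        addList.removeAll(oldTerms);    // only in the new doc -> ADD: [d]

        System.out.println(updateList + " " + deleteList + " " + addList);
    }
}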
Use of org.apache.lucene.document.Document in project Solbase by Photobucket.
The class SolbaseIndexWriter, method editDoc.
/**
 * Edit logic lives here: instead of blindly inserting, we compare the new doc
 * with the old doc and apply the appropriate modifications to the term vectors
 * and the stored doc.
 * @param newDoc the new version of the document
 * @param indexName the index the document belongs to
 * @param docNumber the id of the document being edited
 * @param updateStore whether to write the change through to the backing store
 * @return true if the edit was dispatched or the doc no longer exists, false on failure
 */
public boolean editDoc(Document newDoc, String indexName, int docNumber, boolean updateStore) {
    try {
        CachedObjectWrapper<Document, Long> cachedObj = ReaderCache.getDocument(docNumber, null, indexName, 0, 0);
        if (cachedObj == null || cachedObj.getValue() == null) {
            // document doesn't exist, so let's just bail out here
            return true;
        }
        ParsedDoc parsedDoc = new ParsedDoc(newDoc);
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docNumber, parsedDoc, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
        return true;
    } catch (IOException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (InterruptedException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (MemcachedException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (TimeoutException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (SolbaseException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    }
    return false;
}
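A minimal usage sketch for editDoc; the writer instance, index name, the title field, and the id 12345 are illustrative, not taken from the Solbase sources.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class EditDocExample {
    static boolean edit(SolbaseIndexWriter writer) {
        // build the replacement version of doc 12345
        Document updated = new Document();
        updated.add(new Field("docId", "12345", Field.Store.YES, Field.Index.NOT_ANALYZED));
        updated.add(new Field("title", "revised title", Field.Store.YES, Field.Index.ANALYZED));
        // editDoc returns true when the update was dispatched or the doc no
        // longer exists, and false when one of the logged exceptions fired
        return writer.editDoc(updated, "exampleIndex", 12345, true);
    }
}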
Use of org.apache.lucene.document.Document in project Solbase by Photobucket.
The class SolbaseIndexWriter, method delete.
public void delete(DeleteUpdateCommand cmd) throws IOException {
    deleteByIdCommands.incrementAndGet();
    deleteByIdCommandsCumulative.incrementAndGet();
    if (!cmd.fromPending && !cmd.fromCommitted) {
        // neither pending nor committed: there is nothing to delete from
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "meaningless command: " + cmd);
    }
    if (!cmd.fromPending || !cmd.fromCommitted) {
        // deleting from only one of pending/committed is not supported
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "operation not supported: " + cmd);
    }
    // delete all terms/fields/etc
    String indexName = core.getName();
    writer.setIndexName(indexName);
    writer.setIndexUtil(indexUtil);
    int docId = Integer.parseInt(cmd.id);
    logger.info("deleting doc: " + docId);
    try {
        // pull the doc into the cache so the DELETE modification below can see it
        CachedObjectWrapper<Document, Long> wrapper = ReaderCache.getDocument(docId, null, indexName, 0, 0);
        boolean updateStore = cmd.getUpdateStore();
        ParsedDoc parsedDoc = new ParsedDoc();
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docId, parsedDoc, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
    } catch (InterruptedException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (MemcachedException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (TimeoutException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (SolbaseException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    }
}
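delete() accepts a command only when both flags are set: the first guard throws when fromPending and fromCommitted are both false, the second when either one is false. A hedged usage sketch follows; the public id/fromPending/fromCommitted fields are read directly by the code above, while the no-arg DeleteUpdateCommand constructor and the writer instance are assumptions.

import java.io.IOException;

public class DeleteDocExample {
    static void deleteDoc(SolbaseIndexWriter writer) throws IOException {
        DeleteUpdateCommand cmd = new DeleteUpdateCommand(); // no-arg constructor assumed
        cmd.id = "12345";         // delete() parses this with Integer.parseInt
        cmd.fromPending = true;   // both flags must be true to pass the
        cmd.fromCommitted = true; // two guard clauses above
        writer.delete(cmd);
    }
}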