Use of org.solbase.indexer.ParsedDoc in project Solbase by Photobucket.
From the class IndexWriter, method parseDoc:
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
// given a doc, collect all of the terms we indexed
List<Term> allIndexedTerms = new ArrayList<Term>();
Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
// need to hold onto the TermDocMetadata objects so this method can return them
List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
byte[] docId = Bytes.toBytes(docNumber);
int position = 0;
for (Fieldable field : (List<Fieldable>) doc.getFields()) {
// Indexed field
if (field.isIndexed() && field.isTokenized()) {
TokenStream tokens = field.tokenStreamValue();
if (tokens == null) {
tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
}
// collect term information per field
Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
int lastOffset = 0;
if (position > 0) {
position += analyzer.getPositionIncrementGap(field.name());
}
// reset the TokenStream to the first token
tokens.reset();
// offsets
OffsetAttribute offsetAttribute = null;
if (field.isStoreOffsetWithTermVector())
offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
// positions
PositionIncrementAttribute posIncrAttribute = null;
if (field.isStorePositionWithTermVector())
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
// store field norms per term per document rather than per field.
// this means more to write but less to read on the other side
Integer tokensInField = new Integer(0);
while (tokens.incrementToken()) {
tokensInField++;
Term term = new Term(field.name(), termAttribute.term());
allIndexedTerms.add(term);
// fetch all collected information for this term
Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
if (termInfo == null) {
termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
allTermInformation.put(term, termInfo);
}
// term frequency
List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
if (termFrequency == null) {
termFrequency = new ArrayList<Number>();
termFrequency.add(new Integer(0));
termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
}
// increment
termFrequency.set(0, termFrequency.get(0).intValue() + 1);
// position vector
if (field.isStorePositionWithTermVector()) {
position += (posIncrAttribute.getPositionIncrement() - 1);
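// together with the ++position below, this advances position by the full position increment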
List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
if (positionVector == null) {
positionVector = new ArrayList<Number>();
termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
}
positionVector.add(++position);
}
// term offsets
if (field.isStoreOffsetWithTermVector()) {
List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
if (offsetVector == null) {
offsetVector = new ArrayList<Number>();
termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
}
offsetVector.add(lastOffset + offsetAttribute.startOffset());
offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
List<Number> sortValues = new ArrayList<Number>();
// init sortValues
for (int i = 0; i < Scorer.numSort; i++) {
sortValues.add(new Integer(-1));
}
int order = 0;
// extract sort field value and store it in term doc metadata obj
for (String fieldName : sortFieldNames) {
Fieldable fieldable = doc.getFieldable(fieldName);
if (fieldable instanceof EmbeddedSortField) {
EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
int value = -1;
if (sortField.stringValue() != null) {
value = Integer.parseInt(sortField.stringValue());
}
int sortSlot = sortField.getSortSlot();
sortValues.set(sortSlot - 1, new Integer(value));
} else {
// TODO: this branch is used for real-time indexing.
// hacky: it depends on the order of the sort field names in the array
int value = -1;
if (fieldable.stringValue() != null) {
value = Integer.parseInt(fieldable.stringValue());
}
sortValues.set(order++, new Integer(value));
}
}
termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
}
List<Number> bnorm = null;
if (!field.getOmitNorms()) {
bnorm = new ArrayList<Number>();
float norm = doc.getBoost();
norm *= field.getBoost();
norm *= similarity.lengthNorm(field.name(), tokensInField);
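// encodeNorm packs the float norm into Lucene's single-byte norm representation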
bnorm.add(Similarity.encodeNorm(norm));
}
for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
Term tempTerm = term.getKey();
byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
// more writes but faster on read side.
if (!field.getOmitNorms()) {
term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
}
TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
metadatas.add(data);
}
}
// Untokenized fields go in without a termPosition
if (field.isIndexed() && !field.isTokenized()) {
Term term = new Term(field.name(), field.stringValue());
allIndexedTerms.add(term);
byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
metadatas.add(data);
}
// Stores each field as a column under this doc key
if (field.isStored()) {
byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
// the last byte flags whether the value is binary or not
byte[] value = new byte[_value.length + 1];
System.arraycopy(_value, 0, value, 0, _value.length);
value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
// logic to handle multiple fields w/ same name
byte[] currentValue = fieldCache.get(field.name());
if (currentValue == null) {
fieldCache.put(field.name(), value);
} else {
// append new data
byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
fieldCache.put(field.name(), newValue);
}
}
}
Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
// Store each field as a column under this docId
for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
}
// in case of real time update, we need to add back docId field
if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
// the last byte flags whether the value is binary; docId is stored as text
byte[] value = new byte[docIdStr.length + 1];
System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
value[value.length - 1] = (byte) (Byte.MIN_VALUE);
documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
}
// finally, store metadata so we can delete this document later
documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
return parsedDoc;
}
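The stored-field handling above packs every value as its raw bytes followed by a single trailing flag byte: Byte.MAX_VALUE marks a binary value, Byte.MIN_VALUE marks text, and repeated fields with the same name are joined with SolbaseUtil.delimiter. A minimal, self-contained sketch of that byte layout follows; encodeStoredValue is a made-up helper name, and plain UTF-8 stands in for HBase's Bytes.toBytes.

import java.nio.charset.StandardCharsets;

public class StoredFieldEncodingSketch {

    // payload followed by one flag byte: Byte.MAX_VALUE marks binary data, Byte.MIN_VALUE marks text
    static byte[] encodeStoredValue(byte[] raw, boolean isBinary) {
        byte[] value = new byte[raw.length + 1];
        System.arraycopy(raw, 0, value, 0, raw.length);
        value[value.length - 1] = isBinary ? Byte.MAX_VALUE : Byte.MIN_VALUE;
        return value;
    }

    public static void main(String[] args) {
        // Bytes.toBytes(String) in the real code is UTF-8, so getBytes(UTF_8) stands in for it here
        byte[] encoded = encodeStoredValue("hello".getBytes(StandardCharsets.UTF_8), false);
        System.out.println(encoded.length); // 6: five payload bytes plus the trailing flag byte
    }
}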
Use of org.solbase.indexer.ParsedDoc in project Solbase by Photobucket.
From the class SolbaseInitialIndexMapper, method map:
protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
context.getCounter(Counters.TOTAL_ROWS).increment(1);
context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
// global id is user_media row key
String globalId = Bytes.toString(row.get());
Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
if (doc == null) {
// validation must have failed if it returned null
return;
}
// grab a new chunk of doc ids when the current chunk is exhausted (or none has been allocated yet)
if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
docId = SolbaseUtil.generateUniqId();
this.idCounter = 0;
} else {
docId--;
}
// for us, docId is a globally unique id, which ties us to the 2-billion-document limit
// it doesn't hurt to add this field to the doc; it only really matters once sharding comes in and docs are fetched by their docId
indexerUtil.addFieldToDoc(doc, "docId", docId + "");
// incrementing chunking sequence (lucene doc id)
this.idCounter++;
try {
ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
MapWritable mapWritable = new MapWritable();
DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
for (TermDocMetadata metadata : metadatas) {
byte[] key = metadata.getFieldTermKey();
ByteBuffer buf = metadata.serialize();
TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
mapWritable.put(new BytesWritable(key), writable);
}
context.write(new BytesWritable(checksum), mapWritable);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
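map assigns document ids from pre-reserved chunks: SolbaseUtil.generateUniqId() returns the top id of a fresh block of SolbaseUtil.UNIQ_ID_CHUNK ids, and the mapper then counts down through that block before reserving another one. A simplified stand-in for the allocation pattern is sketched below; ChunkedIdAllocator, fetchNewChunkTop and the chunk size of 10000 are all illustrative, not Solbase APIs.

// Illustrative stand-in for the chunked id allocation in SolbaseInitialIndexMapper.map.
public class ChunkedIdAllocator {
    private static final int UNIQ_ID_CHUNK = 10000; // assumed chunk size; the real value lives in SolbaseUtil.UNIQ_ID_CHUNK
    private Integer docId = null; // current id, null until the first chunk is reserved
    private int idCounter = 0;    // how many ids of the current chunk have been handed out

    // stand-in for SolbaseUtil.generateUniqId(), which reserves a block and returns its top id
    private int fetchNewChunkTop() {
        return 2000000; // placeholder; the real call coordinates the reservation through the store
    }

    public int nextId() {
        if (idCounter > (UNIQ_ID_CHUNK - 1) || docId == null) {
            docId = fetchNewChunkTop(); // chunk exhausted (or never allocated): reserve a new one
            idCounter = 0;
        } else {
            docId--;                    // still inside the chunk: just count down
        }
        idCounter++;
        return docId;
    }
}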
Use of org.solbase.indexer.ParsedDoc in project Solbase by Photobucket.
From the class DocumentLoader, method processDocument:
private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore) {
try {
@SuppressWarnings("unchecked") List<Fieldable> newFields = newDoc.getFields();
boolean termVectorChanged = false;
for (Fieldable field : newFields) {
if (field.isIndexed() || field instanceof EmbeddedSortField) {
termVectorChanged = true;
break;
}
}
// do diff on terms
if (termVectorChanged) {
Field docIdField = oldDoc.getField("docId");
// cloning old doc, so it won't conflict with read
oldDoc = new Document(oldDoc);
oldDoc.removeField("docId");
// parsing old doc to get all terms
try {
ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
List<Term> oldTerms = oldParsedDoc.getAllTerms();
List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
List<Term> newTerms = parsedDoc.getAllTerms();
List<Term> updateList = new ArrayList<Term>(oldTerms);
List<Term> deleteList = new ArrayList<Term>(oldTerms);
List<Term> addList = new ArrayList<Term>(newTerms);
Collections.copy(updateList, oldTerms);
Collections.copy(deleteList, oldTerms);
Collections.copy(addList, newTerms);
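// note: the ArrayList copy constructors above already copied the term lists, so these Collections.copy calls are redundant but harmless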
updateList.retainAll(newTerms);
deleteList.removeAll(newTerms);
addList.removeAll(oldTerms);
int shardNum = SolbaseShardUtil.getShardNum(indexName);
int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
// update term vectors (tv) first
for (TermDocMetadata termDocMeta : newTermDocMetas) {
Term term = termDocMeta.getTerm();
if (updateList.contains(term)) {
logger.debug("updating this term: " + term.toString());
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
} else if (addList.contains(term)) {
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
}
}
// clean up deletes
if (deleteList.size() > 0) {
for (TermDocMetadata termDocMeta : oldTermDocMetas) {
Term term = termDocMeta.getTerm();
if (deleteList.contains(term)) {
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
}
}
}
parsedDoc.getDocument().add(docIdField);
return parsedDoc;
} catch (NullPointerException e) {
return null;
}
} else {
Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
return parsedDoc;
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (MemcachedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (TimeoutException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
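The heart of processDocument is a three-way diff between the old and new term lists: terms present in both documents get their metadata updated, terms only in the old document are deleted, and terms only in the new document are added. Stripped of the Solbase plumbing, the set arithmetic looks like this (the field and term values are made up):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Term;

public class TermDiffSketch {
    public static void main(String[] args) {
        List<Term> oldTerms = Arrays.asList(new Term("title", "red"), new Term("title", "car"));
        List<Term> newTerms = Arrays.asList(new Term("title", "blue"), new Term("title", "car"));

        List<Term> updateList = new ArrayList<Term>(oldTerms);
        updateList.retainAll(newTerms);   // terms in both docs -> ModificationType.UPDATE

        List<Term> deleteList = new ArrayList<Term>(oldTerms);
        deleteList.removeAll(newTerms);   // terms only in the old doc -> ModificationType.DELETE

        List<Term> addList = new ArrayList<Term>(newTerms);
        addList.removeAll(oldTerms);      // terms only in the new doc -> ModificationType.ADD

        System.out.println(updateList);   // [title:car]
        System.out.println(deleteList);   // [title:red]
        System.out.println(addList);      // [title:blue]
    }
}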
Use of org.solbase.indexer.ParsedDoc in project Solbase by Photobucket.
From the class SolbaseIndexWriter, method editDoc:
/**
* Edit logic lives here. Instead of blindly inserting, we compare the new doc with the old doc and apply the appropriate
* modifications to the term vectors and the stored document.
* @param newDoc the partial document carrying the updated field values
* @param indexName the name of the index the document belongs to
* @param docNumber the id of the document being edited
* @param updateStore whether to propagate the change to the underlying store
* @return true if the edit was applied (or the document no longer exists), false if it failed
*/
public boolean editDoc(Document newDoc, String indexName, int docNumber, boolean updateStore) {
try {
CachedObjectWrapper<Document, Long> cachedObj = ReaderCache.getDocument(docNumber, null, indexName, 0, 0);
if (cachedObj == null || cachedObj.getValue() == null) {
// document doesn't exist, so let's just bail out here
return true;
}
ParsedDoc parsedDoc = new ParsedDoc(newDoc);
parsedDoc.setIndexName(indexName);
parsedDoc.setIndexUtil(indexUtil);
parsedDoc.setIndexWriter(writer);
parsedDoc.setUpdateStore(updateStore);
int shardNum = SolbaseShardUtil.getShardNum(indexName);
int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
ReaderCache.updateDocument(docNumber, parsedDoc, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
return true;
} catch (IOException e) {
logger.info("edit doc failed: " + docNumber);
logger.info(e.toString());
} catch (InterruptedException e) {
logger.info("edit doc failed: " + docNumber);
logger.info(e.toString());
} catch (MemcachedException e) {
logger.info("edit doc failed: " + docNumber);
logger.info(e.toString());
} catch (TimeoutException e) {
logger.info("edit doc failed: " + docNumber);
logger.info(e.toString());
} catch (SolbaseException e) {
logger.info("edit doc failed: " + docNumber);
logger.info(e.toString());
}
return false;
}
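editDoc catches five exception types and logs the same message for each. On a Java 7+ source level the same behavior can be folded into a multi-catch; the project may well predate that and keep the separate blocks deliberately, so the following is only an illustration, with java.util.logging and generic exception types standing in for the Solbase-specific ones.

// Sketch of the same error handling expressed as a multi-catch; the Solbase-specific
// exception types (MemcachedException, SolbaseException) are omitted so the example compiles on its own.
import java.io.IOException;
import java.util.concurrent.TimeoutException;
import java.util.logging.Logger;

public class EditDocErrorHandlingSketch {
    private static final Logger logger = Logger.getLogger("SolbaseIndexWriter");

    interface Edit { void run() throws IOException, InterruptedException, TimeoutException; }

    public static boolean tryEdit(int docNumber, Edit edit) {
        try {
            edit.run();
            return true;
        } catch (IOException | InterruptedException | TimeoutException e) {
            // same outcome for every failure type: log which doc failed and why, report false
            logger.info("edit doc failed: " + docNumber);
            logger.info(e.toString());
            return false;
        }
    }
}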
Use of org.solbase.indexer.ParsedDoc in project Solbase by Photobucket.
From the class SolbaseIndexWriter, method delete:
public void delete(DeleteUpdateCommand cmd) throws IOException {
deleteByIdCommands.incrementAndGet();
deleteByIdCommandsCumulative.incrementAndGet();
if (!cmd.fromPending && !cmd.fromCommitted) {
numErrors.incrementAndGet();
numErrorsCumulative.incrementAndGet();
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "meaningless command: " + cmd);
}
if (!cmd.fromPending || !cmd.fromCommitted) {
numErrors.incrementAndGet();
numErrorsCumulative.incrementAndGet();
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "operation not supported" + cmd);
}
// Delete all terms/fields/etc
String indexName = core.getName();
writer.setIndexName(indexName);
writer.setIndexUtil(indexUtil);
int docId = Integer.parseInt(cmd.id);
logger.info("deleting doc: " + docId);
try {
CachedObjectWrapper<Document, Long> wrapper = ReaderCache.getDocument(docId, null, indexName, 0, 0);
boolean updateStore = cmd.getUpdateStore();
ParsedDoc parsedDoc = new ParsedDoc();
parsedDoc.setIndexName(indexName);
parsedDoc.setIndexUtil(indexUtil);
parsedDoc.setIndexWriter(writer);
parsedDoc.setUpdateStore(updateStore);
int shardNum = SolbaseShardUtil.getShardNum(indexName);
int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
ReaderCache.updateDocument(docId, parsedDoc, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
} catch (InterruptedException e) {
logger.info("delete doc failed: " + docId);
logger.info(e.toString());
} catch (MemcachedException e) {
logger.info("delete doc failed: " + docId);
logger.info(e.toString());
} catch (TimeoutException e) {
logger.info("delete doc failed: " + docId);
logger.info(e.toString());
} catch (SolbaseException e) {
logger.info("delete doc failed: " + docId);
logger.info(e.toString());
}
}
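The two checks at the top of delete only let a command through when both fromPending and fromCommitted are true: the first rejects commands with neither flag set, the second rejects commands with exactly one. The same gate in isolation looks like this; validateFlags is a made-up helper, and IllegalArgumentException stands in for SolrException.

// Isolated version of the flag gate at the top of SolbaseIndexWriter.delete.
public class DeleteFlagGateSketch {
    static void validateFlags(boolean fromPending, boolean fromCommitted) {
        if (!fromPending && !fromCommitted) {
            throw new IllegalArgumentException("meaningless command");     // neither flag set
        }
        if (!fromPending || !fromCommitted) {
            throw new IllegalArgumentException("operation not supported"); // exactly one flag set
        }
        // both flags set: the delete proceeds
    }

    public static void main(String[] args) {
        validateFlags(true, true);          // passes
        try {
            validateFlags(true, false);     // rejected: only one flag set
        } catch (IllegalArgumentException expected) {
            System.out.println(expected.getMessage());
        }
    }
}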