Search in sources:

Example 1 with ParsedDoc

Use of org.solbase.indexer.ParsedDoc in the project Solbase by Photobucket.

From the class IndexWriter, method parseDoc:

@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given the doc, collect all of the terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
    // hold onto the TermDocMetadata objects so they can be returned as part of the ParsedDoc
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;
    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // reset the TokenStream to the first token
            tokens.reset();
            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            // store normalizations of the field per term per document rather than per field;
            // this adds more to write but less to read on the other side
            int tokensInField = 0;
            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());
                allIndexedTerms.add(term);
                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }
                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);
                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }
                    positionVector.add(++position);
                }
                // term offsets
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }
                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }
                int order = 0;
                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);
                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();
                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }
            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();
                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }
                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                metadatas.add(data);
            }
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);
            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
                fieldCache.put(field.name(), newValue);
            }
        }
    }
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }
    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(Integer.toString(docNumber));
        // first byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }
    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), EmbeddedSortField (org.apache.lucene.document.EmbeddedSortField), Fieldable (org.apache.lucene.document.Fieldable), StringReader (java.io.StringReader), List (java.util.List), TermAttribute (org.apache.lucene.analysis.tokenattributes.TermAttribute), ConcurrentSkipListMap (java.util.concurrent.ConcurrentSkipListMap), Term (org.apache.lucene.index.Term), ByteBuffer (java.nio.ByteBuffer), Put (org.apache.hadoop.hbase.client.Put), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), ParsedDoc (org.solbase.indexer.ParsedDoc), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Map (java.util.Map)
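
The stored-field branch above writes each field value followed by a one-byte flag (Byte.MAX_VALUE for binary, Byte.MIN_VALUE for string) and joins repeated values of the same field with SolbaseUtil.delimiter, keeping only the final flag byte. The helper below is a hypothetical reader-side sketch of that layout; the class and method names are ours, and the delimiter is passed in rather than read from SolbaseUtil:

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public final class StoredFieldDecoder {

    private static final Charset UTF8 = Charset.forName("UTF-8");

    // The writer appends a single flag byte to the whole column value:
    // Byte.MAX_VALUE marks binary content, Byte.MIN_VALUE marks a string.
    public static boolean isBinary(byte[] column) {
        return column[column.length - 1] == Byte.MAX_VALUE;
    }

    // Multi-valued fields are concatenated with a delimiter and only the last flag
    // byte survives, so strip the flag first and then split on the delimiter.
    public static List<byte[]> values(byte[] column, byte[] delimiter) {
        byte[] payload = Arrays.copyOf(column, column.length - 1);
        List<byte[]> out = new ArrayList<byte[]>();
        if (delimiter.length == 0) {
            out.add(payload);
            return out;
        }
        int start = 0;
        for (int i = 0; i <= payload.length - delimiter.length; i++) {
            if (regionEquals(payload, i, delimiter)) {
                out.add(Arrays.copyOfRange(payload, start, i));
                start = i + delimiter.length;
                i = start - 1;
            }
        }
        out.add(Arrays.copyOfRange(payload, start, payload.length));
        return out;
    }

    // Bytes.toBytes(String) in HBase encodes UTF-8, so decoding mirrors that.
    public static String asString(byte[] value) {
        return new String(value, UTF8);
    }

    private static boolean regionEquals(byte[] data, int offset, byte[] pattern) {
        for (int i = 0; i < pattern.length; i++) {
            if (data[offset + i] != pattern[i]) {
                return false;
            }
        }
        return true;
    }
}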

Example 2 with ParsedDoc

Use of org.solbase.indexer.ParsedDoc in the project Solbase by Photobucket.

From the class SolbaseInitialIndexMapper, method map:

protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
    context.getCounter(Counters.TOTAL_ROWS).increment(1);
    context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
    // global id is user_media row key
    String globalId = Bytes.toString(row.get());
    Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
    byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
    if (doc == null) {
        // validation must have failed if it returned null
        return;
    }
    // allocate a new chunk of doc ids when the current one is exhausted (or on first use)
    if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
        docId = SolbaseUtil.generateUniqId();
        this.idCounter = 0;
    } else {
        docId--;
    }
    // for us, docId is a globally unique id, which ties us to the 2 billion docs limitation
    // it doesn't hurt to add this field to the doc, and it only really matters once sharding comes in and docs are fetched by their docId
    indexerUtil.addFieldToDoc(doc, "docId", docId + "");
    // incrementing chunking sequence (lucene doc id)
    this.idCounter++;
    try {
        ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
        List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
        MapWritable mapWritable = new MapWritable();
        DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
        mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
        for (TermDocMetadata metadata : metadatas) {
            byte[] key = metadata.getFieldTermKey();
            ByteBuffer buf = metadata.serialize();
            TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
            mapWritable.put(new BytesWritable(key), writable);
        }
        context.write(new BytesWritable(checksum), mapWritable);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), TermDocMetadata (org.solbase.lucenehbase.TermDocMetadata), BytesWritable (org.apache.hadoop.io.BytesWritable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), MapWritable (org.apache.hadoop.io.MapWritable), Document (org.apache.lucene.document.Document), ByteBuffer (java.nio.ByteBuffer), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable)
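
The id handling at the top of map() amounts to chunked allocation: SolbaseUtil.generateUniqId() appears to reserve a fresh block and return its top id, and the mapper then counts down until UNIQ_ID_CHUNK ids have been used. A minimal standalone sketch of that pattern, with a stand-in supplier instead of SolbaseUtil (class and interface names are ours):

// Hypothetical sketch of the chunked doc id allocation used in map().
public class ChunkedIdAllocator {

    // Stand-in for SolbaseUtil.generateUniqId(): assumed to reserve a block
    // of ids atomically and return the highest id in that block.
    public interface ChunkSupplier {
        int reserveChunkTop();
    }

    private final int chunkSize;          // plays the role of SolbaseUtil.UNIQ_ID_CHUNK
    private final ChunkSupplier supplier;
    private Integer current;              // last id handed out; null until the first call
    private int usedInChunk;              // how many ids of the current chunk have been used

    public ChunkedIdAllocator(int chunkSize, ChunkSupplier supplier) {
        this.chunkSize = chunkSize;
        this.supplier = supplier;
    }

    // Mirrors the branch at the top of map(): take a new chunk when the current
    // one is exhausted (or on first use), otherwise count down inside the chunk.
    public int next() {
        if (current == null || usedInChunk > chunkSize - 1) {
            current = supplier.reserveChunkTop();
            usedInChunk = 0;
        } else {
            current = current - 1;
        }
        usedInChunk++;
        return current;
    }
}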

Example 3 with ParsedDoc

Use of org.solbase.indexer.ParsedDoc in the project Solbase by Photobucket.

From the class DocumentLoader, method processDocument:

private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore) {
    try {
        @SuppressWarnings("unchecked") List<Fieldable> newFields = newDoc.getFields();
        boolean termVectorChanged = false;
        for (Fieldable field : newFields) {
            if (field.isIndexed() || field instanceof EmbeddedSortField) {
                termVectorChanged = true;
                break;
            }
        }
        // do diff on terms
        if (termVectorChanged) {
            Field docIdField = oldDoc.getField("docId");
            // cloning old doc, so it won't conflict with read
            oldDoc = new Document(oldDoc);
            oldDoc.removeField("docId");
            // parsing old doc to get all terms
            try {
                ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<Term> oldTerms = oldParsedDoc.getAllTerms();
                List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
                Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
                ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
                List<Term> newTerms = parsedDoc.getAllTerms();
                List<Term> updateList = new ArrayList<Term>(oldTerms);
                List<Term> deleteList = new ArrayList<Term>(oldTerms);
                List<Term> addList = new ArrayList<Term>(newTerms);
                Collections.copy(updateList, oldTerms);
                Collections.copy(deleteList, oldTerms);
                Collections.copy(addList, newTerms);
                updateList.retainAll(newTerms);
                deleteList.removeAll(newTerms);
                addList.removeAll(oldTerms);
                int shardNum = SolbaseShardUtil.getShardNum(indexName);
                int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
                int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
                // updating tv first
                for (TermDocMetadata termDocMeta : newTermDocMetas) {
                    Term term = termDocMeta.getTerm();
                    if (updateList.contains(term)) {
                        logger.debug("updating this term: " + term.toString());
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
                    } else if (addList.contains(term)) {
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
                    }
                }
                // clean up deletes
                if (deleteList.size() > 0) {
                    for (TermDocMetadata termDocMeta : oldTermDocMetas) {
                        Term term = termDocMeta.getTerm();
                        if (deleteList.contains(term)) {
                            ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
                        }
                    }
                }
                parsedDoc.getDocument().add(docIdField);
                return parsedDoc;
            } catch (NullPointerException e) {
                return null;
            }
        } else {
            Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
            ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
            return parsedDoc;
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (MemcachedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (TimeoutException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return null;
}
Also used: ArrayList (java.util.ArrayList), EmbeddedSortField (org.apache.lucene.document.EmbeddedSortField), Term (org.apache.lucene.index.Term), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), SolrInputDocument (org.apache.solr.common.SolrInputDocument), EmbeddedIndexedIntField (org.apache.solr.schema.EmbeddedIndexedIntField), SchemaField (org.apache.solr.schema.SchemaField), Field (org.apache.lucene.document.Field), ParsedDoc (org.solbase.indexer.ParsedDoc), Fieldable (org.apache.lucene.document.Fieldable), MemcachedException (net.rubyeye.xmemcached.exception.MemcachedException), TimeoutException (java.util.concurrent.TimeoutException)
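
The retainAll/removeAll calls above reduce the term-vector diff to three plain set operations: terms present in both documents are updated, terms only in the new document are added, and terms only in the old document are deleted. A minimal, self-contained sketch of just that diff (class and field names are ours; the real code works on org.apache.lucene.index.Term):

import java.util.ArrayList;
import java.util.List;

public class TermDiff {

    public final List<String> update; // present in both old and new -> ModificationType.UPDATE
    public final List<String> add;    // only in the new doc          -> ModificationType.ADD
    public final List<String> delete; // only in the old doc          -> ModificationType.DELETE

    public TermDiff(List<String> oldTerms, List<String> newTerms) {
        update = new ArrayList<String>(oldTerms);
        delete = new ArrayList<String>(oldTerms);
        add = new ArrayList<String>(newTerms);
        update.retainAll(newTerms); // intersection of old and new
        delete.removeAll(newTerms); // old minus new
        add.removeAll(oldTerms);    // new minus old
    }
}

For example, with oldTerms = [a, b] and newTerms = [b, c] this yields update = [b], delete = [a], add = [c], which is exactly how updateList, deleteList, and addList drive the three ReaderCache.updateTermDocsMetadata branches above. (The Collections.copy calls in the original are redundant, since the ArrayList copy constructors already duplicate the lists.)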

Example 4 with ParsedDoc

Use of org.solbase.indexer.ParsedDoc in the project Solbase by Photobucket.

From the class SolbaseIndexWriter, method editDoc:

/**
	 * Edit logic: instead of blindly inserting, we need to compare the new doc with the old doc and apply the
	 * appropriate modifications to the term vectors and the stored doc
	 * @param newDoc
	 * @param indexName
	 * @return
	 */
public boolean editDoc(Document newDoc, String indexName, int docNumber, boolean updateStore) {
    try {
        CachedObjectWrapper<Document, Long> cachedObj = ReaderCache.getDocument(docNumber, null, indexName, 0, 0);
        if (cachedObj == null || cachedObj.getValue() == null) {
            // document doesn't exist, so let's just bail out here
            return true;
        }
        ParsedDoc parsedDoc = new ParsedDoc(newDoc);
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docNumber, parsedDoc, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
        return true;
    } catch (IOException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (InterruptedException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (MemcachedException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (TimeoutException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    } catch (SolbaseException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    }
    return false;
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), SolbaseException (org.solbase.common.SolbaseException), AtomicLong (java.util.concurrent.atomic.AtomicLong), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), MemcachedException (net.rubyeye.xmemcached.exception.MemcachedException), TimeoutException (java.util.concurrent.TimeoutException)
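
A hypothetical caller of editDoc could look like the sketch below. Only the editDoc(Document, String, int, boolean) signature comes from the code above; the writer instance, the index name, and the field layout are assumptions, and the actual merge with the stored document happens downstream in DocumentLoader.processDocument (Example 3):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class EditDocExample {

    // Hypothetical helper: writer is an already-configured SolbaseIndexWriter.
    public static boolean retitle(SolbaseIndexWriter writer, String indexName, int docId, String newTitle) {
        Document newDoc = new Document();
        // Lucene 3.x-style field; the real schema fields are an assumption.
        newDoc.add(new Field("title", newTitle, Field.Store.YES, Field.Index.ANALYZED));
        // editDoc wraps the doc in a ParsedDoc and routes it through ReaderCache with ModificationType.UPDATE;
        // it logs and returns false on failure, and returns true if the doc no longer exists or the update went through.
        return writer.editDoc(newDoc, indexName, docId, true);
    }
}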

Example 5 with ParsedDoc

Use of org.solbase.indexer.ParsedDoc in the project Solbase by Photobucket.

From the class SolbaseIndexWriter, method delete:

public void delete(DeleteUpdateCommand cmd) throws IOException {
    deleteByIdCommands.incrementAndGet();
    deleteByIdCommandsCumulative.incrementAndGet();
    if (!cmd.fromPending && !cmd.fromCommitted) {
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "meaningless command: " + cmd);
    }
    // together with the check above, only a delete that is both fromPending and fromCommitted is accepted
    if (!cmd.fromPending || !cmd.fromCommitted) {
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "operation not supported" + cmd);
    }
    // Delete all terms/fields/etc
    String indexName = core.getName();
    writer.setIndexName(indexName);
    writer.setIndexUtil(indexUtil);
    int docId = Integer.parseInt(cmd.id);
    logger.info("deleting doc: " + docId);
    try {
        CachedObjectWrapper<Document, Long> wrapper = ReaderCache.getDocument(docId, null, indexName, 0, 0);
        boolean updateStore = cmd.getUpdateStore();
        ParsedDoc parsedDoc = new ParsedDoc();
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docId, parsedDoc, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
    } catch (InterruptedException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (MemcachedException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (TimeoutException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    } catch (SolbaseException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    }
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), SolbaseException (org.solbase.common.SolbaseException), AtomicLong (java.util.concurrent.atomic.AtomicLong), Document (org.apache.lucene.document.Document), SolrException (org.apache.solr.common.SolrException), MemcachedException (net.rubyeye.xmemcached.exception.MemcachedException), TimeoutException (java.util.concurrent.TimeoutException)
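
For completeness, a delete driven through this method might be set up as follows. The sketch assumes the Solr 1.x-era DeleteUpdateCommand with its public id/fromPending/fromCommitted fields (which the checks above rely on); the helper itself and the writer wiring are ours, and the updateStore flag is left at whatever cmd.getUpdateStore() returns by default in Solbase's fork:

import java.io.IOException;
import org.apache.solr.update.DeleteUpdateCommand;

public class DeleteExample {

    // Hypothetical helper: writer is an already-configured SolbaseIndexWriter.
    public static void deleteById(SolbaseIndexWriter writer, int docId) throws IOException {
        DeleteUpdateCommand cmd = new DeleteUpdateCommand();
        cmd.id = Integer.toString(docId); // delete() parses this back to an int doc id
        cmd.fromPending = true;           // both flags must be true, otherwise delete() throws BAD_REQUEST
        cmd.fromCommitted = true;
        writer.delete(cmd);
    }
}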

Aggregations

ParsedDoc (org.solbase.indexer.ParsedDoc): 7
Document (org.apache.lucene.document.Document): 6
TimeoutException (java.util.concurrent.TimeoutException): 5
MemcachedException (net.rubyeye.xmemcached.exception.MemcachedException): 5
IOException (java.io.IOException): 4
SolbaseException (org.solbase.common.SolbaseException): 3
ByteBuffer (java.nio.ByteBuffer): 2
ArrayList (java.util.ArrayList): 2
AtomicLong (java.util.concurrent.atomic.AtomicLong): 2
EmbeddedSortField (org.apache.lucene.document.EmbeddedSortField): 2
Fieldable (org.apache.lucene.document.Fieldable): 2
Term (org.apache.lucene.index.Term): 2
SolrInputDocument (org.apache.solr.common.SolrInputDocument): 2
SchemaField (org.apache.solr.schema.SchemaField): 2
TermDocMetadata (org.solbase.lucenehbase.TermDocMetadata): 2
StringReader (java.io.StringReader): 1
HashMap (java.util.HashMap): 1
List (java.util.List): 1
Map (java.util.Map): 1
ConcurrentSkipListMap (java.util.concurrent.ConcurrentSkipListMap): 1