
Example 91 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

In class DocumentLoader, method loadObject:

public CachedObjectWrapper<Document, Long> loadObject(Integer docNum, int start, int end, LayeredCache<Integer, Document, Long, ParsedDoc> cache) throws IOException {
    Document document = new Document();
    Get documentGet = new Get(SolbaseUtil.randomize(docNum));
    if (fieldNames == null || fieldNames.isEmpty()) {
        // get all columns of the "field" family (this skips the meta info)
        documentGet.addFamily(Bytes.toBytes("field"));
    } else {
        for (byte[] fieldName : fieldNames) {
            documentGet.addColumn(Bytes.toBytes("field"), fieldName);
        }
    }
    Result documentResult = null;
    // if docTable is already set, reuse that instance; otherwise create a new one and release it when done
    if (this.docTable == null) {
        HTableInterface docTable = null;
        try {
            docTable = SolbaseUtil.getDocTable();
            documentResult = docTable.get(documentGet);
        } finally {
            SolbaseUtil.releaseTable(docTable);
        }
    } else {
        documentResult = this.docTable.get(documentGet);
    }
    if (documentResult == null || documentResult.isEmpty()) {
        return null;
    }
    // TODO, get from result
    Long versionIdentifier = 0L;
    NavigableMap<byte[], byte[]> familyMap = documentResult.getFamilyMap(Bytes.toBytes("field"));
    for (Map.Entry<byte[], byte[]> fieldColumn : familyMap.entrySet()) {
        Field field = null;
        String fieldName = Bytes.toString(fieldColumn.getKey());
        byte[] value;
        ByteBuffer v = ByteBuffer.wrap(fieldColumn.getValue());
        int vlimit = v.limit() + v.arrayOffset();
        if (v.array()[vlimit - 1] != Byte.MAX_VALUE && v.array()[vlimit - 1] != Byte.MIN_VALUE) {
            throw new CorruptIndexException("Solbase field is not properly encoded: " + docNum + "(" + fieldName + ")");
        } else if (v.array()[vlimit - 1] == Byte.MAX_VALUE) {
            // Binary
            value = new byte[vlimit - 1];
            System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
            field = new Field(fieldName, value, Store.YES);
            document.add(field);
        } else if (v.array()[vlimit - 1] == Byte.MIN_VALUE) {
            // String
            value = new byte[vlimit - 1];
            System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
            // Check for multi-fields
            String fieldString = new String(value, "UTF-8");
            if (fieldString.indexOf(Bytes.toString(SolbaseUtil.delimiter)) >= 0) {
                StringTokenizer tok = new StringTokenizer(fieldString, Bytes.toString(SolbaseUtil.delimiter));
                while (tok.hasMoreTokens()) {
                    // update logic
                    if (schema != null) {
                        SchemaField sfield = schema.getFieldOrNull(fieldName);
                        if (sfield.getType() instanceof EmbeddedIndexedIntField) {
                            EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField) sfield.getType();
                            EmbeddedSortField sf = new EmbeddedSortField(fieldName, tok.nextToken(), Field.Store.YES, Field.Index.NO, eiif.getFieldNumber());
                            document.add(sf);
                        } else {
                            Field f = sfield.createField(tok.nextToken(), 1.0f);
                            if (f != null) {
                                // null fields are not added
                                document.add(f);
                            }
                        }
                    } else {
                        field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED);
                        document.add(field);
                    }
                }
            } else {
                // update logic
                if (schema != null) {
                    SchemaField sfield = schema.getFieldOrNull(fieldName);
                    if (sfield.getType() instanceof EmbeddedIndexedIntField) {
                        EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField) sfield.getType();
                        EmbeddedSortField sf = new EmbeddedSortField(fieldName, fieldString, Field.Store.YES, Field.Index.NO, eiif.getFieldNumber());
                        document.add(sf);
                    } else {
                        Field f = sfield.createField(fieldString, 1.0f);
                        if (f != null) {
                            // null fields are not added
                            document.add(f);
                        }
                    }
                } else {
                    field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED);
                    document.add(field);
                }
            }
        }
    }
    return new CachedObjectWrapper<Document, Long>(document, versionIdentifier, System.currentTimeMillis());
}
Also used: CachedObjectWrapper(org.solbase.cache.CachedObjectWrapper) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) EmbeddedSortField(org.apache.lucene.document.EmbeddedSortField) Document(org.apache.lucene.document.Document) SolrInputDocument(org.apache.solr.common.SolrInputDocument) HTableInterface(org.apache.hadoop.hbase.client.HTableInterface) ByteBuffer(java.nio.ByteBuffer) Result(org.apache.hadoop.hbase.client.Result) SchemaField(org.apache.solr.schema.SchemaField) EmbeddedIndexedIntField(org.apache.solr.schema.EmbeddedIndexedIntField) Field(org.apache.lucene.document.Field) StringTokenizer(java.util.StringTokenizer) Get(org.apache.hadoop.hbase.client.Get) Map(java.util.Map) NavigableMap(java.util.NavigableMap)
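
A minimal write-side sketch of the trailing-byte convention that loadObject decodes above: binary values end in Byte.MAX_VALUE, UTF-8 strings end in Byte.MIN_VALUE, and multi-valued fields are joined with SolbaseUtil.delimiter before encoding. The FieldValueCodec class and its method names are illustrative assumptions, not Solbase's actual writer:

import java.nio.charset.StandardCharsets;

final class FieldValueCodec {

    // append the binary marker so the reader takes the Byte.MAX_VALUE branch
    static byte[] encodeBinary(byte[] raw) {
        byte[] out = new byte[raw.length + 1];
        System.arraycopy(raw, 0, out, 0, raw.length);
        out[raw.length] = Byte.MAX_VALUE;
        return out;
    }

    // append the string marker so the reader takes the Byte.MIN_VALUE branch
    static byte[] encodeString(String value) {
        byte[] raw = value.getBytes(StandardCharsets.UTF_8);
        byte[] out = new byte[raw.length + 1];
        System.arraycopy(raw, 0, out, 0, raw.length);
        out[raw.length] = Byte.MIN_VALUE;
        return out;
    }
}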

Example 92 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

In class DocumentLoader, method updateObject:

@Override
public void updateObject(CachedObjectWrapper<Document, Long> object, ParsedDoc modificationData, LayeredCache<Integer, Document, Long, ParsedDoc> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
    if (modType == ModificationType.DELETE) {
        // don't delete here; another reader might still be using this doc, so just let it fall out of the LRU cache
        Document oldDoc = object.getValue();
        if (oldDoc != null) {
            // other cluster might have already deleted this doc and cache doesn't have this doc
            modificationData.copyFrom(deleteDocument(oldDoc, Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexName(), modificationData.getIndexWriter(), modificationData.getIndexUtil(), modificationData.getUpdateStore(), startDocId, endDocId));
        }
    } else if (modType == LayeredCache.ModificationType.UPDATE) {
        Document newDoc = modificationData.getDocument();
        Document oldDoc = object.getValue();
        logger.debug("process document() call in updateObject() for docId: " + Integer.parseInt(oldDoc.getField("docId").stringValue()));
        modificationData.copyFrom(processDocument(newDoc, oldDoc, modificationData.getIndexName(), Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexUtil(), modificationData.getIndexWriter(), modificationData.getUpdateStore()));
        object.setValue(modificationData.getDocument());
    } else if (modType == LayeredCache.ModificationType.ADD) {
        // TODO: this should never be hit; a newly added doc will not already be in the cache
        Document oldDoc = object.getValue();
        logger.warn("it should never hit here, newly added doc should never be in cache: " + oldDoc.toString());
    }
}
Also used: Document(org.apache.lucene.document.Document) SolrInputDocument(org.apache.solr.common.SolrInputDocument)
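
The DELETE branch above leans on LRU semantics: a deleted document stays cached until it naturally ages out. A self-contained sketch of that eviction behavior using only JDK classes (the capacity and the LruCache name are illustrative, not Solbase's LayeredCache):

import java.util.LinkedHashMap;
import java.util.Map;

final class LruCache<K, V> extends LinkedHashMap<K, V> {

    private final int capacity;

    LruCache(int capacity) {
        // access-order mode: every get() refreshes an entry's recency
        super(16, 0.75f, true);
        this.capacity = capacity;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
        // deleted docs are never evicted eagerly; they only leave the cache
        // once they become the least recently used entry over capacity
        return size() > capacity;
    }
}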

Example 93 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

In class DocumentLoader, method processDocument:

private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore) {
    try {
        @SuppressWarnings("unchecked") List<Fieldable> newFields = newDoc.getFields();
        boolean termVectorChanged = false;
        for (Fieldable field : newFields) {
            if (field.isIndexed() || field instanceof EmbeddedSortField) {
                termVectorChanged = true;
                break;
            }
        }
        // do diff on terms
        if (termVectorChanged) {
            Field docIdField = oldDoc.getField("docId");
            // cloning old doc, so it won't conflict with read
            oldDoc = new Document(oldDoc);
            oldDoc.removeField("docId");
            // parsing old doc to get all terms
            try {
                ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<Term> oldTerms = oldParsedDoc.getAllTerms();
                List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
                Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
                ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
                List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
                List<Term> newTerms = parsedDoc.getAllTerms();
                // the ArrayList copy constructor already copies the source lists,
                // so no extra Collections.copy pass is needed:
                // updates = old ∩ new, deletes = old \ new, adds = new \ old
                List<Term> updateList = new ArrayList<Term>(oldTerms);
                List<Term> deleteList = new ArrayList<Term>(oldTerms);
                List<Term> addList = new ArrayList<Term>(newTerms);
                updateList.retainAll(newTerms);
                deleteList.removeAll(newTerms);
                addList.removeAll(oldTerms);
                int shardNum = SolbaseShardUtil.getShardNum(indexName);
                int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
                int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
                // updating tv first
                for (TermDocMetadata termDocMeta : newTermDocMetas) {
                    Term term = termDocMeta.getTerm();
                    if (updateList.contains(term)) {
                        logger.debug("updating this term: " + term.toString());
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
                    } else if (addList.contains(term)) {
                        ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
                    }
                }
                // clean up deletes
                if (deleteList.size() > 0) {
                    for (TermDocMetadata termDocMeta : oldTermDocMetas) {
                        Term term = termDocMeta.getTerm();
                        if (deleteList.contains(term)) {
                            ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
                        }
                    }
                }
                parsedDoc.getDocument().add(docIdField);
                return parsedDoc;
            } catch (NullPointerException e) {
                // bail out and treat the doc as unparseable if parsing NPEs
                return null;
            }
        } else {
            Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
            ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
            return parsedDoc;
        }
    } catch (IOException | InterruptedException | MemcachedException | TimeoutException e) {
        // TODO: handle these failures properly instead of only printing the stack trace
        e.printStackTrace();
    }
    return null;
}
Also used: ArrayList(java.util.ArrayList) EmbeddedSortField(org.apache.lucene.document.EmbeddedSortField) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) SolrInputDocument(org.apache.solr.common.SolrInputDocument) EmbeddedIndexedIntField(org.apache.solr.schema.EmbeddedIndexedIntField) SchemaField(org.apache.solr.schema.SchemaField) Field(org.apache.lucene.document.Field) ParsedDoc(org.solbase.indexer.ParsedDoc) Fieldable(org.apache.lucene.document.Fieldable) MemcachedException(net.rubyeye.xmemcached.exception.MemcachedException) TimeoutException(java.util.concurrent.TimeoutException)
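
The three lists in processDocument implement a set diff over the old and new term vectors: updates = old ∩ new, deletes = old \ new, adds = new \ old. A self-contained illustration of the same retainAll/removeAll pattern with plain strings:

import java.util.ArrayList;
import java.util.List;

public class TermDiffDemo {
    public static void main(String[] args) {
        List<String> oldTerms = List.of("apple", "banana", "cherry");
        List<String> newTerms = List.of("banana", "cherry", "date");

        List<String> updateList = new ArrayList<String>(oldTerms);
        updateList.retainAll(newTerms); // [banana, cherry] -> ModificationType.UPDATE

        List<String> deleteList = new ArrayList<String>(oldTerms);
        deleteList.removeAll(newTerms); // [apple]          -> ModificationType.DELETE

        List<String> addList = new ArrayList<String>(newTerms);
        addList.removeAll(oldTerms);    // [date]           -> ModificationType.ADD

        System.out.println("update=" + updateList + " delete=" + deleteList + " add=" + addList);
    }
}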

Example 94 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

In class SolbaseIndexWriter, method editDoc:

/**
	 * Does the edit logic: instead of blindly inserting, compare the new doc with
	 * the old doc and apply the appropriate modifications to the term vectors and
	 * the stored document.
	 * @param newDoc the new version of the document
	 * @param indexName the name of the index the document belongs to
	 * @param docNumber the id of the document being edited
	 * @param updateStore whether to write the change through to the doc store
	 * @return true on success or if the document no longer exists, false on failure
	 */
public boolean editDoc(Document newDoc, String indexName, int docNumber, boolean updateStore) {
    try {
        CachedObjectWrapper<Document, Long> cachedObj = ReaderCache.getDocument(docNumber, null, indexName, 0, 0);
        if (cachedObj == null || cachedObj.getValue() == null) {
            // document doesn't exist, so let's just bail out here
            return true;
        }
        ParsedDoc parsedDoc = new ParsedDoc(newDoc);
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docNumber, parsedDoc, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
        return true;
    } catch (IOException | InterruptedException | MemcachedException | TimeoutException | SolbaseException e) {
        logger.info("edit doc failed: " + docNumber);
        logger.info(e.toString());
    }
    return false;
}
Also used: ParsedDoc(org.solbase.indexer.ParsedDoc) SolbaseException(org.solbase.common.SolbaseException) AtomicLong(java.util.concurrent.atomic.AtomicLong) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) MemcachedException(net.rubyeye.xmemcached.exception.MemcachedException) TimeoutException(java.util.concurrent.TimeoutException)

Example 95 with Document

Use of org.apache.lucene.document.Document in project Solbase by Photobucket.

In class SolbaseIndexWriter, method delete:

public void delete(DeleteUpdateCommand cmd) throws IOException {
    deleteByIdCommands.incrementAndGet();
    deleteByIdCommandsCumulative.incrementAndGet();
    if (!cmd.fromPending && !cmd.fromCommitted) {
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "meaningless command: " + cmd);
    }
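    // note: together with the check above, this lets only fromPending && fromCommitted proceed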
    if (!cmd.fromPending || !cmd.fromCommitted) {
        numErrors.incrementAndGet();
        numErrorsCumulative.incrementAndGet();
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "operation not supported: " + cmd);
    }
    // Delete all terms/fields/etc
    String indexName = core.getName();
    writer.setIndexName(indexName);
    writer.setIndexUtil(indexUtil);
    int docId = Integer.parseInt(cmd.id);
    logger.info("deleting doc: " + docId);
    try {
        CachedObjectWrapper<Document, Long> wrapper = ReaderCache.getDocument(docId, null, indexName, 0, 0);
        boolean updateStore = cmd.getUpdateStore();
        ParsedDoc parsedDoc = new ParsedDoc();
        parsedDoc.setIndexName(indexName);
        parsedDoc.setIndexUtil(indexUtil);
        parsedDoc.setIndexWriter(writer);
        parsedDoc.setUpdateStore(updateStore);
        int shardNum = SolbaseShardUtil.getShardNum(indexName);
        int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
        int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
        ReaderCache.updateDocument(docId, parsedDoc, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
    } catch (InterruptedException | MemcachedException | TimeoutException | SolbaseException e) {
        logger.info("delete doc failed: " + docId);
        logger.info(e.toString());
    }
}
Also used: ParsedDoc(org.solbase.indexer.ParsedDoc) SolbaseException(org.solbase.common.SolbaseException) AtomicLong(java.util.concurrent.atomic.AtomicLong) Document(org.apache.lucene.document.Document) SolrException(org.apache.solr.common.SolrException) MemcachedException(net.rubyeye.xmemcached.exception.MemcachedException) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

Document (org.apache.lucene.document.Document): 2344
Directory (org.apache.lucene.store.Directory): 1374
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 798
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 752
IndexReader (org.apache.lucene.index.IndexReader): 598
Field (org.apache.lucene.document.Field): 480
IndexSearcher (org.apache.lucene.search.IndexSearcher): 470
Term (org.apache.lucene.index.Term): 456
BytesRef (org.apache.lucene.util.BytesRef): 415
StringField (org.apache.lucene.document.StringField): 403
TextField (org.apache.lucene.document.TextField): 389
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 325
IndexWriter (org.apache.lucene.index.IndexWriter): 312
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 278
TopDocs (org.apache.lucene.search.TopDocs): 270
TermQuery (org.apache.lucene.search.TermQuery): 237
FieldType (org.apache.lucene.document.FieldType): 231
DirectoryReader (org.apache.lucene.index.DirectoryReader): 226
Test (org.junit.Test): 222
RAMDirectory (org.apache.lucene.store.RAMDirectory): 211