Use of org.apache.lucene.analysis.tokenattributes.TermAttribute in project Solbase by Photobucket.
The class IndexWriter, method parseDoc.
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given the doc, collect all of the terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
    // hold onto the TermDocMetadata objects so they can be returned in this list
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;
    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // reset the TokenStream to the first token
            tokens.reset();
            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            // store norms of a field per term per document rather than per field;
            // this adds more to write but less to read on the other side
            Integer tokensInField = new Integer(0);
            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());
                allIndexedTerms.add(term);
                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }
                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);
                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }
                    positionVector.add(++position);
                }
                // term offsets
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }
                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }
                int order = 0;
                // extract each sort field value and store it in the term doc metadata object
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);
                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();
                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real-time indexing.
                        // hacky: depends on the order of sort field names in the array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }
            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();
                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
                // more writes but faster on the read side
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }
                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                metadatas.add(data);
            }
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);
            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // the last byte flags whether the value is binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
            // logic to handle multiple fields w/ the same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
                fieldCache.put(field.name(), newValue);
            }
        }
    }
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }
    // in case of a real-time update, we need to add back the docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
        // the last byte flags whether the value is binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }
    // Finally, store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
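A minimal, self-contained sketch of the per-field tokenization pattern that parseDoc builds on, assuming the Lucene 3.0-era attribute API (TermAttribute.term(), addAttribute, reset/end/close). The class name FieldTokenSketch, the collectTermStats helper, and the sample text are illustrative and not part of Solbase.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class FieldTokenSketch {

    /**
     * Walks the TokenStream of a single field value and records, per term,
     * its frequency plus the position and character offsets of its first occurrence.
     */
    static Map<String, int[]> collectTermStats(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
        // term -> { frequency, first position, first start offset, first end offset }
        Map<String, int[]> stats = new HashMap<String, int[]>();
        TokenStream tokens = analyzer.tokenStream(fieldName, new StringReader(fieldValue));
        TermAttribute termAttribute = tokens.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncrAttribute = tokens.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAttribute = tokens.addAttribute(OffsetAttribute.class);
        tokens.reset();
        int position = 0;
        while (tokens.incrementToken()) {
            position += posIncrAttribute.getPositionIncrement();
            String term = termAttribute.term();
            int[] entry = stats.get(term);
            if (entry == null) {
                entry = new int[] { 0, position, offsetAttribute.startOffset(), offsetAttribute.endOffset() };
                stats.put(term, entry);
            }
            entry[0]++;
        }
        tokens.end();
        tokens.close();
        return stats;
    }

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        for (Map.Entry<String, int[]> e : collectTermStats(analyzer, "body", "Solbase stores term vectors per term per document").entrySet()) {
            int[] v = e.getValue();
            System.out.println(e.getKey() + " freq=" + v[0] + " pos=" + v[1] + " offsets=" + v[2] + "-" + v[3]);
        }
    }
}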
Use of org.apache.lucene.analysis.tokenattributes.TermAttribute in project libresonic by Libresonic.
The class SearchService, method analyzeQuery.
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
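A hedged, self-contained variant of the same prefix-expansion loop. The class name PrefixQuerySketch and the use of Version.LUCENE_30 in place of the project's LUCENE_VERSION constant are assumptions; it additionally ends and closes the stream, and the main method shows the expected output.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class PrefixQuerySketch {

    /** Folds diacritics and appends a wildcard to every token, mirroring analyzeQuery above. */
    static String toPrefixQuery(String query) throws IOException {
        StringBuilder result = new StringBuilder();
        TokenStream stream = new ASCIIFoldingFilter(new StandardTokenizer(Version.LUCENE_30, new StringReader(query)));
        TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            result.append(termAttribute.term()).append("* ");
        }
        stream.end();
        stream.close();
        return result.toString();
    }

    public static void main(String[] args) throws IOException {
        // Prints "Cafe* Deja* Vu* " -- diacritics are folded, but the tokens are not
        // lower-cased because there is no LowerCaseFilter in this chain.
        System.out.println(toPrefixQuery("Café Déjà Vu"));
    }
}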
Use of org.apache.lucene.analysis.tokenattributes.TermAttribute in project jackrabbit by apache.
The class AbstractExcerpt, method createTermPositionVector.
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
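createTermPositionVector is private, so the sketch below only shows how the read-only TermPositionVector it returns can be walked, for example when computing an excerpt. The class and method names are illustrative and assume the Lucene 3.x org.apache.lucene.index.TermPositionVector API (removed in Lucene 4).

import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

public class TermVectorDumper {

    /** Prints every term in the vector together with its frequency and character offsets. */
    static void dump(TermPositionVector vector) {
        String[] terms = vector.getTerms();
        int[] freqs = vector.getTermFrequencies();
        for (int i = 0; i < terms.length; i++) {
            StringBuilder line = new StringBuilder(terms[i]).append(" (freq=").append(freqs[i]).append("):");
            for (TermVectorOffsetInfo offset : vector.getOffsets(i)) {
                line.append(' ').append(offset.getStartOffset()).append('-').append(offset.getEndOffset());
            }
            System.out.println(line);
        }
    }
}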
Use of org.apache.lucene.analysis.tokenattributes.TermAttribute in project jackrabbit by apache.
The class MoreLikeThis, method addTermFrequencies.
/**
 * Adds term frequencies found by tokenizing text from the reader into the map of words.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
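Int is MoreLikeThis's small mutable counter. The reconstruction below is an assumption rather than the exact source, but it shows why the null branch above can simply put a fresh Int(): the counter starts at one. The main method mirrors the same counting idiom without any Lucene dependency.

import java.util.HashMap;
import java.util.Map;

public class MutableCounterSketch {

    // Hypothetical reconstruction of the Int counter used above; in MoreLikeThis it is
    // a private static inner class, and starting at 1 accounts for the first occurrence.
    static class Int {
        int x = 1;
    }

    public static void main(String[] args) {
        Map<String, Int> termFreqMap = new HashMap<String, Int>();
        for (String word : "more like this finds more documents like this".split(" ")) {
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        for (Map.Entry<String, Int> e : termFreqMap.entrySet()) {
            System.out.println(e.getKey() + " -> " + e.getValue().x);
        }
    }
}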
Use of org.apache.lucene.analysis.tokenattributes.TermAttribute in project ddf by codice.
The class ContextualEvaluator, method logTokens.
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName) throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    LOGGER.debug("----- {} tokens -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        LOGGER.debug(term);
    }
    LOGGER.debug("----- END: {} tokens -----", analyzerName);
}
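All of the snippets above use the Lucene 2.9/3.0-era TermAttribute, which was deprecated in 3.1 and removed in 4.0 in favour of CharTermAttribute. A minimal sketch of the equivalent debug loop against the newer attribute follows; the assumptions are a Lucene version recent enough to have a no-argument StandardAnalyzer constructor and System.out standing in for the LOGGER. Note that reset() is mandatory before incrementToken() on modern token streams.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ModernLogTokensSketch {

    /** Same debug walk as logTokens above, written against CharTermAttribute. */
    static void logTokens(Analyzer analyzer, String fieldName, String fullDocument) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset(); // mandatory before incrementToken() in Lucene 4+
        while (tokenStream.incrementToken()) {
            System.out.println(termAttribute.toString() + " [" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ")");
        }
        tokenStream.end();
        tokenStream.close();
    }

    public static void main(String[] args) throws IOException {
        logTokens(new StandardAnalyzer(), "contents", "hello token stream world");
    }
}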