use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project HongsCORE by ihongs.
the class DemoTest method main.
public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
        .withTokenizer("Name")
        .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        .build();
    StringReader sr = new StringReader(args[0]);
    TokenStream ts = az.tokenStream("", sr);
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    try {
        // Resets this stream to the beginning. (Required)
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        // Perform end-of-stream operations, e.g. set the final offset.
        ts.end();
    } finally {
        // Release resources associated with this stream.
        ts.close();
    }
}
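The same reset/incrementToken/end/close contract applies to any analyzer. For readers who do not have the HongsCORE "Name" tokenizer on the classpath, here is a minimal, self-contained sketch using Lucene's stock WhitespaceAnalyzer; the class name and sample input are illustrative, not part of the project code.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamContractDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer az = new WhitespaceAnalyzer();
             TokenStream ts = az.tokenStream("", "hello token stream")) {
            OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
            // reset() is required before the first call to incrementToken()
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(ta + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
            }
            // end() performs end-of-stream work such as recording the final offset
            ts.end();
        } // try-with-resources closes both the stream and the analyzer
    }
}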
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
the class SequentialDependenceModel method computeOrderedFrequencyScore.
private float computeOrderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    List<String> queryTokens = context.getQueryTokens();
    Map<String, String> queryPairMap = new HashMap<>();
    Map<String, Integer> phraseCountMap = new HashMap<>();
    Map<String, Integer> singleCountMap = new HashMap<>();
    // Build the count maps and a map of adjacent query-token pairs (x -> y)
    for (int i = 0; i < queryTokens.size() - 1; i++) {
        queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
        phraseCountMap.put(queryTokens.get(i), 0);
        // This will serve as our smoothing param
        singleCountMap.put(queryTokens.get(i), 1);
    }
    // Construct a token stream with offset 0
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // Reset the stream before consuming it (required by the TokenStream contract)
    stream.reset();
    float docSize = 0.0f;
    // Track which token we need to see next to increment a count;
    // the count is attributed to the first token of the pair
    String expectedToken = "";
    String tokenToIncrement = "";
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (token.equalsIgnoreCase(expectedToken)) {
            phraseCountMap.put(tokenToIncrement, phraseCountMap.get(tokenToIncrement) + 1);
        }
        // Check whether this token could be the start of an ordered phrase
        if (queryPairMap.containsKey(token)) {
            expectedToken = queryPairMap.get(token);
            singleCountMap.put(token, singleCountMap.get(token) + 1);
            tokenToIncrement = token;
        } else {
            expectedToken = "";
            tokenToIncrement = "";
        }
    }
    stream.end();
    stream.close();
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    for (String queryToken : phraseCountMap.keySet()) {
        score += Math.log((float) (phraseCountMap.get(queryToken) + 1) / docSize);
    }
    return score;
}
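The counting loop above rewards ordered adjacency: a count is added to the first token of a query bigram whenever the second token follows it in the document stream. A minimal, self-contained sketch of the same bookkeeping over plain string lists (class name and sample tokens are illustrative, no Lucene required):

import java.util.*;

public class OrderedPairCountSketch {
    public static void main(String[] args) {
        List<String> queryTokens = Arrays.asList("new", "york", "city");
        List<String> docTokens   = Arrays.asList("new", "york", "new", "jersey", "york", "city");

        Map<String, String>  queryPairMap   = new HashMap<>();
        Map<String, Integer> phraseCountMap = new HashMap<>();
        for (int i = 0; i < queryTokens.size() - 1; i++) {
            queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
            phraseCountMap.put(queryTokens.get(i), 0);
        }

        String expectedToken = "";
        String tokenToIncrement = "";
        for (String token : docTokens) {
            // Credit the first token of the pair when its expected successor appears
            if (token.equalsIgnoreCase(expectedToken)) {
                phraseCountMap.merge(tokenToIncrement, 1, Integer::sum);
            }
            // Could this token start an ordered pair?
            if (queryPairMap.containsKey(token)) {
                expectedToken = queryPairMap.get(token);
                tokenToIncrement = token;
            } else {
                expectedToken = "";
                tokenToIncrement = "";
            }
        }
        // "new york" and "york city" each occur once in order, so both counts are 1
        System.out.println(phraseCountMap);
    }
}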
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
the class UnigramFeatureExtractor method computeFullIndependenceScore.
/**
 * The single-term scoring function: lambda * log((1 - alpha) * tf / |D|)
 * @param doc the document being scored
 * @param terms the term vector of the document
 * @param context the reranker context holding the query tokens
 * @return the full-independence feature score
 */
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    // tf can be calculated by iterating over terms (number of times a term occurs in doc);
    // |D|, the total number of terms, can be calculated by iterating over the stream
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    List<String> queryTokenList = context.getQueryTokens();
    Map<String, Integer> termCount = new HashMap<>();
    for (String queryToken : queryTokenList) {
        termCount.put(queryToken, 0);
    }
    TokenStream stream = new TokenStreamFromTermVector(terms, -1);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    float docSize = 0;
    // Count all the tokens
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (termCount.containsKey(token)) {
            termCount.put(token, termCount.get(token) + 1);
        }
    }
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    // Only compute the score for what's in termCount; everything else contributes 0
    for (String queryToken : termCount.keySet()) {
        score += termCount.get(queryToken);
    }
    stream.end();
    stream.close();
    return score;
}
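When only per-term counts are needed, the term vector can also be read directly instead of replaying every token through a stream. The following is an illustrative alternative, not Anserini code; the class and method names are hypothetical, and it assumes the field was indexed with term vectors so that totalTermFreq() reflects the in-document count.

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class TermVectorCounts {
    /** Returns how often queryToken occurs in the document backing this term vector. */
    static long countQueryTerm(Terms terms, String queryToken) throws IOException {
        TermsEnum te = terms.iterator();
        if (te.seekExact(new BytesRef(queryToken))) {
            // For a single-document term vector this is the term's frequency in that document
            return te.totalTermFreq();
        }
        return 0L;
    }
}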
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
the class SimpleQueryParser method newPossiblyAnalyzedQuery.
/**
* Analyze the given string using its analyzer, constructing either a
* {@code PrefixQuery} or a {@code BooleanQuery} made up
* of {@code TermQuery}s and {@code PrefixQuery}s
*/
private Query newPossiblyAnalyzedQuery(String field, String termStr) {
    List<List<BytesRef>> tlist = new ArrayList<>();
    // get Analyzer from superclass and tokenize the term
    try (TokenStream source = getAnalyzer().tokenStream(field, termStr)) {
        source.reset();
        List<BytesRef> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        try {
            boolean hasMoreTokens = source.incrementToken();
            while (hasMoreTokens) {
                if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                    tlist.add(currentPos);
                    currentPos = new ArrayList<>();
                }
                final BytesRef term = getAnalyzer().normalize(field, termAtt.toString());
                currentPos.add(term);
                hasMoreTokens = source.incrementToken();
            }
            if (currentPos.isEmpty() == false) {
                tlist.add(currentPos);
            }
        } catch (IOException e) {
            // ignore
            // TODO: we should not ignore the exception and return a prefix query with the original term?
        }
    } catch (IOException e) {
        // Bail on any exceptions, going with a regular prefix query
        return new PrefixQuery(new Term(field, termStr));
    }
    if (tlist.size() == 0) {
        return null;
    }
    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return new PrefixQuery(new Term(field, tlist.get(0).get(0)));
    }
    // build a boolean query with a prefix on the last position only
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<BytesRef> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = new PrefixQuery(new Term(field, plist.get(0)));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            BooleanQuery.Builder innerBuilder = new BooleanQuery.Builder();
            for (BytesRef token : plist) {
                innerBuilder.add(new BooleanClause(new PrefixQuery(new Term(field, token)), BooleanClause.Occur.SHOULD));
            }
            posQuery = innerBuilder.setDisableCoord(true).build();
        }
        builder.add(new BooleanClause(posQuery, getDefaultOperator()));
    }
    return builder.build();
}
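The key trick in the analysis loop is grouping tokens by position: a positive position increment starts a new position, while an increment of 0 stacks a synonym onto the current one. A minimal, self-contained sketch of just that grouping step follows; the class name, field name, and sample input are illustrative and not part of the Elasticsearch code, and the exact token split depends on the analyzer (a recent StandardAnalyzer typically splits "Wi-Fi router" into wi, fi, router at three positions).

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class PositionGroupingSketch {
    static List<List<String>> groupByPosition(Analyzer analyzer, String field, String text) throws IOException {
        List<List<String>> positions = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
            ts.reset();
            List<String> current = new ArrayList<>();
            while (ts.incrementToken()) {
                // A positive increment starts a new position; 0 keeps stacking synonyms
                if (!current.isEmpty() && posAtt.getPositionIncrement() > 0) {
                    positions.add(current);
                    current = new ArrayList<>();
                }
                current.add(termAtt.toString());
            }
            if (!current.isEmpty()) {
                positions.add(current);
            }
            ts.end();
        }
        return positions;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(groupByPosition(new StandardAnalyzer(), "body", "Wi-Fi router"));
    }
}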
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
the class KeywordFieldMapper method parseCreateField.
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
    String value;
    if (context.externalValueSet()) {
        value = context.externalValue().toString();
    } else {
        XContentParser parser = context.parser();
        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
            value = fieldType().nullValueAsString();
        } else {
            value = parser.textOrNull();
        }
    }
    if (value == null || value.length() > ignoreAbove) {
        return;
    }
    final NamedAnalyzer normalizer = fieldType().normalizer();
    if (normalizer != null) {
        try (TokenStream ts = normalizer.tokenStream(name(), value)) {
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("The normalization token stream is "
                    + "expected to produce exactly 1 token, but got 0 for analyzer "
                    + normalizer + " and input \"" + value + "\"");
            }
            final String newValue = termAtt.toString();
            if (ts.incrementToken()) {
                throw new IllegalStateException("The normalization token stream is "
                    + "expected to produce exactly 1 token, but got 2+ for analyzer "
                    + normalizer + " and input \"" + value + "\"");
            }
            ts.end();
            value = newValue;
        }
    }
    if (context.includeInAll(includeInAll, this)) {
        context.allEntries().addText(fieldType().name(), value, fieldType().boost());
    }
    // convert to UTF-8 only once before feeding postings/doc values/stored fields
    final BytesRef binaryValue = new BytesRef(value);
    if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) {
        Field field = new Field(fieldType().name(), binaryValue, fieldType());
        fields.add(field);
    }
    if (fieldType().hasDocValues()) {
        fields.add(new SortedSetDocValuesField(fieldType().name(), binaryValue));
    }
}
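The "exactly 1 token" check above assumes the normalizer never splits its input, which holds for analyzers built from a keyword tokenizer plus character-level filters. A minimal sketch of such a normalizer using Lucene's CustomAnalyzer follows; the class name, field name, and sample value are illustrative and not taken from Elasticsearch.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class KeywordNormalizeSketch {
    static String normalize(Analyzer normalizer, String field, String value) throws IOException {
        try (TokenStream ts = normalizer.tokenStream(field, value)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (!ts.incrementToken()) {
                throw new IllegalStateException("normalizer produced no tokens for: " + value);
            }
            String normalized = termAtt.toString();
            if (ts.incrementToken()) {
                throw new IllegalStateException("normalizer produced more than one token for: " + value);
            }
            ts.end();
            return normalized;
        }
    }

    public static void main(String[] args) throws IOException {
        // keyword tokenizer keeps the whole value as one token; lowercase filter normalizes it
        Analyzer normalizer = CustomAnalyzer.builder()
            .withTokenizer("keyword")
            .addTokenFilter("lowercase")
            .build();
        System.out.println(normalize(normalizer, "tag", "Foo Bar")); // prints "foo bar"
    }
}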