Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
The class SimplePreAnalyzedParser, method toFormattedString.
@Override
public String toFormattedString(Field f) throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append(VERSION + " ");
  if (f.fieldType().stored()) {
    String s = f.stringValue();
    if (s != null) {
      // encode the equals sign
      s = s.replaceAll("=", "\\\\=");
      sb.append('=');
      sb.append(s);
      sb.append('=');
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    StringBuilder tok = new StringBuilder();
    boolean next = false;
    while (ts.incrementToken()) {
      if (next) {
        sb.append(' ');
      } else {
        next = true;
      }
      tok.setLength(0);
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = escape(catt.buffer(), catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
          tTerm = escape(tTermChars, tTermChars.length);
        } else {
          if (tok.length() > 0) {
            tok.append(',');
          }
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
            } else if (tok.length() > 0) {
              // remove the last comma
              tok.setLength(tok.length() - 1);
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.append("y=" + escape(((TypeAttribute) att).type()));
          } else {
            tok.append(cl.getName() + "=" + escape(att.toString()));
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        if (tok.length() > 0) {
          tok.insert(0, term + ",");
        } else {
          tok.insert(0, term);
        }
      }
      sb.append(tok);
    }
  }
  return sb.toString();
}
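The loop above is the generic pattern for consuming token attributes: every TokenStream exposes its attributes through addAttribute/getAttribute, and TypeAttribute carries a free-form lexical type string for the current token. A minimal, self-contained sketch of that consumption loop (not part of the Solr source; the field name and sample text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeDemo {
  public static void main(String[] args) throws IOException {
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "Solr 7.0 rocks")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
      ts.reset();  // mandatory before the first incrementToken()
      while (ts.incrementToken()) {
        // StandardAnalyzer assigns types such as <ALPHANUM> and <NUM>
        System.out.println(termAtt + " -> " + typeAtt.type());
      }
      ts.end();    // mandatory after the last incrementToken()
    }
  }
}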
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project elasticsearch by elastic.
The class NoisyChannelSpellChecker, method getCorrections.
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
    int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
  final List<CandidateSet> candidateSetsList = new ArrayList<>();
  DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
    CandidateSet currentSet = null;
    private TypeAttribute typeAttribute;
    private final BytesRefBuilder termsRef = new BytesRefBuilder();
    private boolean anyUnigram = false;
    private boolean anyTokens = false;

    @Override
    public void reset(TokenStream stream) {
      super.reset(stream);
      typeAttribute = stream.addAttribute(TypeAttribute.class);
    }

    @Override
    public void nextToken() throws IOException {
      anyTokens = true;
      BytesRef term = fillBytesRef(termsRef);
      if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
        return;
      }
      anyUnigram = true;
      if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
        assert currentSet != null;
        long freq = 0;
        if ((freq = generator.frequency(term)) > 0) {
          currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
        }
      } else {
        if (currentSet != null) {
          candidateSetsList.add(currentSet);
        }
        currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
      }
    }

    @Override
    public void end() {
      if (currentSet != null) {
        candidateSetsList.add(currentSet);
      }
      if (requireUnigram && !anyUnigram && anyTokens) {
        throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
      }
    }
  });
  if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
    return Result.EMPTY;
  }
  for (CandidateSet candidateSet : candidateSetsList) {
    generator.drawCandidates(candidateSet);
  }
  double cutoffScore = Double.MIN_VALUE;
  CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
  CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
  if (confidence > 0.0) {
    Candidate[] candidates = new Candidate[candidateSets.length];
    for (int i = 0; i < candidates.length; i++) {
      candidates[i] = candidateSets[i].originalTerm;
    }
    double inputPhraseScore = scorer.score(candidates, candidateSets);
    cutoffScore = inputPhraseScore * confidence;
  }
  Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
  return new Result(bestCandidates, cutoffScore);
}
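The requireUnigram check above works because ShingleFilter tags every combined gram with ShingleFilter.DEFAULT_TOKEN_TYPE ("shingle") while unigrams keep their original type, and SynonymFilter marks injected tokens with TYPE_SYNONYM. A minimal sketch of the shingle tagging the spell checker relies on (not from the Elasticsearch source; the tokenizer and input are illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class ShingleTypeDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar baz"));
    try (ShingleFilter shingles = new ShingleFilter(tokenizer, 2, 2)) {
      CharTermAttribute termAtt = shingles.addAttribute(CharTermAttribute.class);
      TypeAttribute typeAtt = shingles.addAttribute(TypeAttribute.class);
      shingles.reset();
      while (shingles.incrementToken()) {
        // unigrams keep the default type "word"; combined grams are typed
        // "shingle" (ShingleFilter.DEFAULT_TOKEN_TYPE) and get skipped
        // by getCorrections when requireUnigram is set
        System.out.println(termAtt + " -> " + typeAtt.type());
      }
      shingles.end();
    }
  }
}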
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project jackrabbit by apache.
The class JackrabbitQueryParser, method getPrefixQuery.
/**
* {@inheritDoc}
*/
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
  // only create a prefix query when the term is a single word / token
  Analyzer a = getAnalyzer();
  TokenStream ts = a.tokenStream(field, new StringReader(termStr));
  int count = 0;
  boolean isCJ = false;
  try {
    TypeAttribute t = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      count++;
      isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
    }
    ts.end();
  } catch (IOException e) {
    throw new ParseException(e.getMessage());
  } finally {
    try {
      ts.close();
    } catch (IOException e) {
      // ignore
    }
  }
  if (count > 1 && isCJ) {
    return getFieldQuery(field, termStr);
  } else {
    return getWildcardQuery(field, termStr + "*");
  }
}
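The method analyzes the prefix term once and inspects each token's type: when the analyzer splits the term into several tokens whose last one is CJ (Chinese/Japanese), a trailing wildcard would apply only to the final fragment, so a field query is built instead. A distilled sketch of that token counting (Jackrabbit builds against an older Lucene whose StandardTokenizer still defines the CJ type; the modern analyzer below reports CJK characters as <IDEOGRAPHIC> instead, and the field name and input are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class PrefixTermInspector {
  /** Counts the tokens produced for termStr and prints each token's type. */
  static int countTokens(Analyzer a, String field, String termStr) throws IOException {
    int count = 0;
    try (TokenStream ts = a.tokenStream(field, termStr)) {
      TypeAttribute t = ts.addAttribute(TypeAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        count++;
        System.out.println("token type: " + t.type());
      }
      ts.end();
    }
    return count;
  }

  public static void main(String[] args) throws IOException {
    // "日本語" splits into several single-ideograph tokens, so a wildcard
    // suffix would be meaningless and a field query fits better
    System.out.println(countTokens(new StandardAnalyzer(), "text", "日本語"));
  }
}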
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
The class SimpleQueryConverter, method convert.
@Override
public Collection<Token> convert(String origQuery) {
  Collection<Token> result = new HashSet<>();
  WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      Token tok = new Token();
      tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      tok.setFlags(flagsAtt.getFlags());
      tok.setPayload(payloadAtt.getPayload());
      tok.setPositionIncrement(posIncAtt.getPositionIncrement());
      tok.setType(typeAtt.type());
      result.add(tok);
    }
    ts.end();
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
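Because Token itself implements all six copied attribute interfaces, including TypeAttribute, the converted tokens retain the type string; WhitespaceAnalyzer leaves it at the default "word". A hypothetical usage sketch (assuming SimpleQueryConverter, which ships with Solr's spell-checking test sources, is on the classpath; the query string is illustrative):

import org.apache.lucene.analysis.Token;
import org.apache.solr.spelling.SimpleQueryConverter;

public class ConvertDemo {
  public static void main(String[] args) {
    SimpleQueryConverter converter = new SimpleQueryConverter();
    for (Token tok : converter.convert("hello world")) {
      // Token implements TypeAttribute, so the copied type survives the conversion
      System.out.println(tok + " type=" + tok.type()
          + " offsets=" + tok.startOffset() + "-" + tok.endOffset());
    }
  }
}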
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
The class JsonPreAnalyzedParser, method toFormattedString.
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String, Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String, Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String, Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute) att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
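For reference, a stored field analyzed into two whitespace-separated words would serialize to JSON shaped roughly like the sample below; the single-letter keys correspond to the *_KEY constants above ("t" token text, "s"/"e" offsets, "i" position increment, "y" type) as documented for Solr's PreAnalyzedField format, and the values here are illustrative:

{"v":"1","str":"hello world","tokens":[
  {"t":"hello","s":0,"e":5,"i":1,"y":"word"},
  {"t":"world","s":6,"e":11,"i":1,"y":"word"}]}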