Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class CompositeBytesReference, method toBytesRef:
@Override
public BytesRef toBytesRef() {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.grow(length());
    BytesRef spare;
    BytesRefIterator iterator = iterator();
    try {
        while ((spare = iterator.next()) != null) {
            builder.append(spare);
        }
    } catch (IOException ex) {
        // this is really an error since we don't do IO in our BytesReference implementations
        throw new AssertionError("won't happen", ex);
    }
    return builder.toBytesRef();
}
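The interesting detail here is allocation behavior: builder.grow(length()) sizes the backing array once up front, so the append loop copies each fragment without intermediate reallocations, and the final toBytesRef() hands back a single contiguous copy. A minimal standalone sketch of the same pattern (the concat helper below is hypothetical, not part of Elasticsearch):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

public class ConcatExample {
    // Concatenate several BytesRef slices into one contiguous BytesRef,
    // pre-sizing the builder once so the append loop never reallocates.
    static BytesRef concat(BytesRef... slices) {
        int total = 0;
        for (BytesRef slice : slices) {
            total += slice.length;
        }
        BytesRefBuilder builder = new BytesRefBuilder();
        builder.grow(total);             // single up-front allocation, like toBytesRef() above
        for (BytesRef slice : slices) {
            builder.append(slice);
        }
        return builder.toBytesRef();     // copies the accumulated bytes into a fresh BytesRef
    }

    public static void main(String[] args) {
        BytesRef joined = concat(new BytesRef("foo"), new BytesRef("bar"));
        System.out.println(joined.utf8ToString()); // prints "foobar"
    }
}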
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class TermsQueryBuilder, method convert:
/**
 * Convert the list in a way that optimizes storage in the case that all
 * elements are either integers or {@link String}s/{@link BytesRef}s. This
 * is useful to reduce garbage-collection pressure for use-cases that involve
 * sending very large terms queries to Elasticsearch. If the list does not
 * contain only integers or {@link String}s, a list is returned in which all
 * {@link String}s have been replaced with {@link BytesRef}s.
 */
static List<?> convert(List<?> list) {
    if (list.isEmpty()) {
        return Collections.emptyList();
    }
    final boolean allNumbers = list.stream().allMatch(o -> o != null && INTEGER_TYPES.contains(o.getClass()));
    if (allNumbers) {
        final long[] elements = list.stream().mapToLong(o -> ((Number) o).longValue()).toArray();
        return new AbstractList<Object>() {
            @Override
            public Object get(int index) {
                return elements[index];
            }

            @Override
            public int size() {
                return elements.length;
            }
        };
    }
    final boolean allStrings = list.stream().allMatch(o -> o != null && STRING_TYPES.contains(o.getClass()));
    if (allStrings) {
        final BytesRefBuilder builder = new BytesRefBuilder();
        try (BytesStreamOutput bytesOut = new BytesStreamOutput()) {
            final int[] endOffsets = new int[list.size()];
            int i = 0;
            for (Object o : list) {
                BytesRef b;
                if (o instanceof BytesRef) {
                    b = (BytesRef) o;
                } else {
                    builder.copyChars(o.toString());
                    b = builder.get();
                }
                bytesOut.writeBytes(b.bytes, b.offset, b.length);
                if (i == 0) {
                    endOffsets[0] = b.length;
                } else {
                    endOffsets[i] = Math.addExact(endOffsets[i - 1], b.length);
                }
                ++i;
            }
            final BytesReference bytes = bytesOut.bytes();
            return new AbstractList<Object>() {
                @Override
                public Object get(int i) {
                    final int startOffset = i == 0 ? 0 : endOffsets[i - 1];
                    final int endOffset = endOffsets[i];
                    return bytes.slice(startOffset, endOffset - startOffset).toBytesRef();
                }

                @Override
                public int size() {
                    return endOffsets.length;
                }
            };
        }
    }
    return list.stream().map(o -> o instanceof String ? new BytesRef(o.toString()) : o).collect(Collectors.toList());
}
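The string branch is the clever part: rather than keeping list.size() separate BytesRef objects alive, it streams all the bytes into one buffer and records only cumulative end offsets, so get(i) can slice a value back out on demand. A self-contained sketch of the same packing idea using only JDK types (the pack helper and its names are my own, not from Elasticsearch):

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.AbstractList;
import java.util.Arrays;
import java.util.List;

public class PackedStrings {
    // Pack a list of strings into one contiguous byte[] plus an array of end
    // offsets, then expose them as a lazy List that slices on demand. This is
    // the same idea convert() uses: n strings cost two allocations instead of n.
    static List<byte[]> pack(List<String> strings) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int[] endOffsets = new int[strings.size()];
        int i = 0;
        for (String s : strings) {
            byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
            out.write(utf8, 0, utf8.length);
            endOffsets[i] = (i == 0 ? 0 : endOffsets[i - 1]) + utf8.length;
            i++;
        }
        byte[] bytes = out.toByteArray();
        return new AbstractList<byte[]>() {
            @Override
            public byte[] get(int index) {
                int start = index == 0 ? 0 : endOffsets[index - 1];
                return Arrays.copyOfRange(bytes, start, endOffsets[index]);
            }

            @Override
            public int size() {
                return endOffsets.length;
            }
        };
    }
}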
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class PhraseSuggester, method innerExecute:
/*
 * More ideas:
 * - add the ability to find whitespace problems -> we can build a poor man's decompounder with our index based on an automaton?
 * - add the ability to build different error models, maybe based on a confusion matrix?
 * - try to combine a token with its subsequent token to find / detect word splits (optional)
 *   - for this to work we need some way to define the position length of a candidate
 * - phonetic filters could be interesting here too for candidate selection
 */
@Override
public Suggestion<? extends Entry<? extends Option>> innerExecute(String name, PhraseSuggestionContext suggestion,
        IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
    final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
    final IndexReader indexReader = searcher.getIndexReader();
    List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
    final int numGenerators = generators.size();
    final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
    for (int i = 0; i < numGenerators; i++) {
        PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
        DirectSpellChecker directSpellChecker = generator.createDirectSpellChecker();
        Terms terms = MultiFields.getTerms(indexReader, generator.field());
        if (terms != null) {
            gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(),
                    indexReader, realWordErrorLikelihood, generator.size(), generator.preFilter(),
                    generator.postFilter(), terms));
        }
    }
    final String suggestField = suggestion.getField();
    final Terms suggestTerms = MultiFields.getTerms(indexReader, suggestField);
    if (gens.size() > 0 && suggestTerms != null) {
        final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood,
                suggestion.getRequireUnigram(), suggestion.getTokenLimit());
        final BytesRef separator = suggestion.separator();
        WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField,
                realWordErrorLikelihood, separator);
        Result checkerResult;
        try (TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare,
                suggestion.getField())) {
            checkerResult = checker.getCorrections(stream,
                    new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
                            gens.toArray(new CandidateGenerator[gens.size()])),
                    suggestion.maxErrors(), suggestion.getShardSize(), wordScorer, suggestion.confidence(),
                    suggestion.gramSize());
        }
        PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
        response.addTerm(resultEntry);
        final BytesRefBuilder byteSpare = new BytesRefBuilder();
        final Function<Map<String, Object>, ExecutableScript> collateScript = suggestion.getCollateQueryScript();
        final boolean collatePrune = (collateScript != null) && suggestion.collatePrune();
        for (int i = 0; i < checkerResult.corrections.length; i++) {
            Correction correction = checkerResult.corrections[i];
            spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, null, null));
            boolean collateMatch = true;
            if (collateScript != null) {
                // Check whether the collate template query yields any documents from
                // the index for this correction, and update collateMatch accordingly.
                final Map<String, Object> vars = suggestion.getCollateScriptParams();
                vars.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
                QueryShardContext shardContext = suggestion.getShardContext();
                final ExecutableScript executable = collateScript.apply(vars);
                final BytesReference querySource = (BytesReference) executable.run();
                try (XContentParser parser = XContentFactory.xContent(querySource)
                        .createParser(shardContext.getXContentRegistry(), querySource)) {
                    QueryBuilder innerQueryBuilder = shardContext.newParseContext(parser).parseInnerQueryBuilder();
                    final ParsedQuery parsedQuery = shardContext.toQuery(innerQueryBuilder);
                    collateMatch = Lucene.exists(searcher, parsedQuery.query());
                }
            }
            if (!collateMatch && !collatePrune) {
                continue;
            }
            Text phrase = new Text(spare.toString());
            Text highlighted = null;
            if (suggestion.getPreTag() != null) {
                spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, suggestion.getPreTag(),
                        suggestion.getPostTag()));
                highlighted = new Text(spare.toString());
            }
            if (collatePrune) {
                resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score),
                        collateMatch));
            } else {
                resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
            }
        }
    } else {
        response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
    }
    return response;
}
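Note how a single BytesRefBuilder (byteSpare) is allocated before the loop and reused for every correction.join(...) call, so each candidate phrase is materialized without allocating a fresh buffer. A sketch of that reuse pattern, with a hypothetical join helper loosely modeled on Correction.join (not the actual Elasticsearch method):

import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

public class JoinExample {
    // Join candidate tokens with a separator into a reusable scratch builder,
    // mirroring how innerExecute() reuses byteSpare across corrections.
    static BytesRef join(List<BytesRef> tokens, BytesRef separator, BytesRefBuilder scratch) {
        scratch.clear();                 // reset length, keep the backing array
        for (int i = 0; i < tokens.size(); i++) {
            if (i > 0) {
                scratch.append(separator);
            }
            scratch.append(tokens.get(i));
        }
        return scratch.get();            // view over the scratch bytes, no copy
    }

    public static void main(String[] args) {
        BytesRefBuilder scratch = new BytesRefBuilder();
        BytesRef joined = join(
                List.of(new BytesRef("quick"), new BytesRef("fox")),
                new BytesRef(" "), scratch);
        System.out.println(joined.utf8ToString()); // prints "quick fox"
    }
}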
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class PercolatorFieldMapper, method processQuery:
void processQuery(Query query, ParseContext context) {
    ParseContext.Document doc = context.doc();
    FieldType pft = (FieldType) this.fieldType();
    QueryAnalyzer.Result result;
    try {
        result = QueryAnalyzer.analyze(query);
    } catch (QueryAnalyzer.UnsupportedQueryException e) {
        doc.add(new Field(pft.extractionResultField.name(), EXTRACTION_FAILED, extractionResultField.fieldType()));
        return;
    }
    for (Term term : result.terms) {
        BytesRefBuilder builder = new BytesRefBuilder();
        builder.append(new BytesRef(term.field()));
        builder.append(FIELD_VALUE_SEPARATOR);
        builder.append(term.bytes());
        doc.add(new Field(queryTermsField.name(), builder.toBytesRef(), queryTermsField.fieldType()));
    }
    if (result.verified) {
        doc.add(new Field(extractionResultField.name(), EXTRACTION_COMPLETE, extractionResultField.fieldType()));
    } else {
        doc.add(new Field(extractionResultField.name(), EXTRACTION_PARTIAL, extractionResultField.fieldType()));
    }
}
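Each extracted term is indexed under a composite key of the form field + separator + value, built with a throwaway BytesRefBuilder. A small sketch of that encoding; the SEPARATOR constant below is an assumption standing in for the mapper's FIELD_VALUE_SEPARATOR, whose exact value lives in PercolatorFieldMapper:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

public class QueryTermKey {
    // Assumed separator: a single zero byte, chosen because it cannot occur
    // inside a valid UTF-8 field name. Stand-in for FIELD_VALUE_SEPARATOR.
    static final BytesRef SEPARATOR = new BytesRef(new byte[] { 0 });

    // Encode a (field, value) pair as one indexed key, the same
    // field + separator + term layout processQuery() writes per extracted term.
    static BytesRef encode(String field, BytesRef value) {
        BytesRefBuilder builder = new BytesRefBuilder();
        builder.append(new BytesRef(field));
        builder.append(SEPARATOR);
        builder.append(value);
        return builder.toBytesRef();
    }
}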
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class CollectionUtils, method sortAndDedup:
public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
    final BytesRefBuilder scratch = new BytesRefBuilder();
    final BytesRefBuilder scratch1 = new BytesRefBuilder();
    final int numValues = bytes.size();
    assert indices.length >= numValues;
    if (numValues <= 1) {
        return numValues;
    }
    sort(scratch, scratch1, bytes, indices);
    int uniqueCount = 1;
    BytesRefBuilder previous = scratch;
    BytesRefBuilder current = scratch1;
    bytes.get(previous, indices[0]);
    for (int i = 1; i < numValues; ++i) {
        bytes.get(current, indices[i]);
        if (!previous.get().equals(current.get())) {
            indices[uniqueCount++] = indices[i];
        }
        BytesRefBuilder tmp = previous;
        previous = current;
        current = tmp;
    }
    return uniqueCount;
}
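The two scratch builders are ping-ponged between previous and current, so the dedup pass reads each stored value exactly once and never copies it out of the BytesRefArray. A hedged usage sketch, assuming the caller pre-fills indices with 0..size-1 as the Elasticsearch call sites do:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
// Assumed import for the class shown above.
import org.elasticsearch.common.util.CollectionUtils;

public class SortAndDedupExample {
    public static void main(String[] args) {
        BytesRefArray values = new BytesRefArray(Counter.newCounter());
        values.append(new BytesRef("b"));
        values.append(new BytesRef("a"));
        values.append(new BytesRef("b"));

        // One index slot per stored value; sortAndDedup rearranges these in
        // place and returns how many of them point at distinct values.
        int[] indices = new int[] { 0, 1, 2 };
        int unique = CollectionUtils.sortAndDedup(values, indices);
        System.out.println(unique); // 2, for the distinct values "a" and "b"
    }
}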