Use of com.apple.foundationdb.record.provider.common.text.TextTokenizer in the project fdb-record-layer by FoundationDB:
the class TextScan, method getTokenList.
// Resolve the comparand into a list of token strings, tokenizing the raw
// query string with the index's tokenizer when the comparison has not
// already produced a token list. When removeStopWords is set, stop words
// (represented by the empty string) are stripped from the result.
private List<String> getTokenList(@Nonnull FDBRecordStoreBase<?> store, @Nonnull EvaluationContext context, boolean removeStopWords) {
    final Object comparand = textComparison.getComparand(store, context);
    final List<String> tokens;
    if (comparand instanceof String) {
        // A raw string: tokenize it here using the index's configured tokenizer and version.
        final TextTokenizer tokenizer = TextIndexMaintainer.getTokenizer(index);
        final int tokenizerVersion = TextIndexMaintainer.getIndexTokenizerVersion(index);
        tokens = tokenizer.tokenizeToList((String) comparand, tokenizerVersion, TextTokenizer.TokenizerMode.QUERY);
    } else if (comparand instanceof List<?>) {
        // Already a token list: just normalize each element to a string.
        tokens = ((List<?>) comparand).stream().map(Object::toString).collect(Collectors.toList());
    } else {
        throw new RecordCoreException("Comparand for text query of incompatible type: " + (comparand == null ? "null" : comparand.getClass()));
    }
    if (!removeStopWords || !tokens.contains("")) {
        return tokens;
    }
    // The empty string marks a stop word; drop every occurrence.
    return tokens.stream().filter(token -> !token.isEmpty()).collect(Collectors.toList());
}
Use of com.apple.foundationdb.record.provider.common.text.TextTokenizer in the project fdb-record-layer by FoundationDB:
the class TextIndexTest, method queryScanEquivalence.
/**
 * Generate random documents and then make sure that querying them using the index
 * produces the same result as performing a full scan of all records.
 *
 * @param index the text index (with its tokenizer options) under test
 * @throws Exception if saving records or executing either scan fails
 */
@MethodSource("indexArguments")
@ParameterizedTest
public void queryScanEquivalence(@Nonnull Index index) throws Exception {
    final Random r = new Random(0xba5eba1L + index.getName().hashCode());
    final int recordCount = 100;
    final int recordBatch = 25;
    final int queryCount = 25;
    final List<String> lexicon = getStandardLexicon();
    TextTokenizerRegistryImpl.instance().register(FILTERING_TOKENIZER);
    final TextTokenizer tokenizer = TextIndexMaintainer.getTokenizer(index);
    final RecordMetaDataHook hook = metaDataBuilder -> {
        metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
        metaDataBuilder.addIndex(SIMPLE_DOC, index);
    };
    // Log the seed so a failing run can be reproduced deterministically.
    long seed = r.nextLong();
    LOGGER.info(KeyValueLogMessage.of("initializing random number generator", TestLogMessageKeys.SEED, seed));
    r.setSeed(seed);
    // Save the random documents in batches, one transaction per batch.
    for (int i = 0; i < recordCount; i += recordBatch) {
        List<SimpleDocument> records = getRandomRecords(r, recordBatch, lexicon);
        LOGGER.info(KeyValueLogMessage.of("creating and saving random records", TestLogMessageKeys.BATCH_SIZE, recordBatch));
        try (FDBRecordContext context = openContext()) {
            openRecordStore(context, hook);
            records.forEach(recordStore::saveRecord);
            commit(context);
        }
    }
    double[] proportions = getZipfProportions(lexicon);
    long totalScanningTime = 0;
    long totalQueryingTime = 0;
    long totalResults = 0;
    for (int i = 0; i < queryCount; i++) {
        // Generate a random text query, choosing the predicate type at random.
        List<String> tokens = getRandomWords(r, lexicon, proportions, 6, 3);
        String tokenString = String.join(" ", tokens);
        double filterChoice = r.nextDouble();
        final QueryComponent filter;
        if (filterChoice < 0.2) {
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString);
        } else if (filterChoice < 0.4) {
            filter = Query.field("text").text(tokenizer.getName()).containsAny(tokenString);
        } else if (filterChoice < 0.6) {
            filter = Query.field("text").text(tokenizer.getName()).containsPhrase(tokenString);
        } else if (filterChoice < 0.8) {
            int maxDistance = r.nextInt(10) + tokens.size();
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString, maxDistance);
        } else if (filterChoice < 0.9) {
            filter = Query.field("text").text(tokenizer.getName()).containsAnyPrefix(tokenString);
        } else if (filterChoice < 0.95) {
            filter = Query.field("text").text(tokenizer.getName()).containsAllPrefixes(tokenString);
        } else {
            if (tokens.isEmpty()) {
                continue;
            }
            // Choose the first non-empty token from the iterator
            Iterator<? extends CharSequence> tokenIterator = tokenizer.tokenize(tokenString, tokenizer.getMaxVersion(), TextTokenizer.TokenizerMode.QUERY);
            String firstToken = null;
            while (tokenIterator.hasNext()) {
                String nextToken = tokenIterator.next().toString();
                if (!nextToken.isEmpty()) {
                    firstToken = nextToken;
                    break;
                }
            }
            if (firstToken == null) {
                continue;
            }
            // Use a random non-empty prefix of the token for a containsPrefix query.
            int prefixEnd;
            if (firstToken.length() > 1) {
                prefixEnd = r.nextInt(firstToken.length() - 1) + 1;
            } else {
                prefixEnd = 1;
            }
            filter = Query.field("text").text(tokenizer.getName()).containsPrefix(firstToken.substring(0, prefixEnd));
        }
        LOGGER.info(KeyValueLogMessage.of("generated random filter", TestLogMessageKeys.ITERATION, i, LogMessageKeys.FILTER, filter));
        // Manual scan all of the records
        long startTime = System.nanoTime();
        final Set<Long> manualRecordIds = performQueryWithRecordStoreScan(hook, filter);
        long endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("manual scan completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalScanningTime += endTime - startTime;
        // Generate a query and use the index
        startTime = System.nanoTime();
        final Set<Long> queryRecordIds = performQueryWithIndexScan(hook, index, filter);
        endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("query completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalQueryingTime += endTime - startTime;
        if (!manualRecordIds.equals(queryRecordIds)) {
            // Log the symmetric difference before the assertion below fails.
            Set<Long> onlyManual = new HashSet<>(manualRecordIds);
            onlyManual.removeAll(queryRecordIds);
            Set<Long> onlyQuery = new HashSet<>(queryRecordIds);
            // BUGFIX: was "onlyManual.removeAll(manualRecordIds)", which emptied
            // onlyManual and left onlyQuery unfiltered, so the log counts were wrong.
            onlyQuery.removeAll(manualRecordIds);
            LOGGER.warn(KeyValueLogMessage.of("results did not match", LogMessageKeys.FILTER, filter, TestLogMessageKeys.MANUAL_RESULT_COUNT, manualRecordIds.size(), TestLogMessageKeys.QUERY_RESULT_COUNT, queryRecordIds.size(), TestLogMessageKeys.ONLY_MANUAL_COUNT, onlyManual.size(), TestLogMessageKeys.ONLY_QUERY_COUNT, onlyQuery.size()));
        }
        assertEquals(manualRecordIds, queryRecordIds);
        LOGGER.info(KeyValueLogMessage.of("results matched", LogMessageKeys.FILTER, filter, TestLogMessageKeys.RESULT_COUNT, manualRecordIds.size()));
        totalResults += queryRecordIds.size();
    }
    LOGGER.info(KeyValueLogMessage.of("test completed", TestLogMessageKeys.TOTAL_SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(totalScanningTime, TimeUnit.NANOSECONDS), TestLogMessageKeys.TOTAL_QUERY_MILLIS, TimeUnit.MILLISECONDS.convert(totalQueryingTime, TimeUnit.NANOSECONDS), TestLogMessageKeys.TOTAL_RESULT_COUNT, totalResults));
}
Use of com.apple.foundationdb.record.provider.common.text.TextTokenizer in the project fdb-record-layer by FoundationDB:
the class TextIndexMaintainerFactory, method getIndexValidator.
/**
 * Validates that the index provided is valid for text indexes. This means that
 * the index must:
 *
 * <ul>
 *     <li>Not be a unique index.</li>
 *     <li>Not include a {@link com.apple.foundationdb.record.metadata.expressions.VersionKeyExpression#VERSION} expression in its root expression.</li>
 *     <li>Have a key expression whose first column is of type <code>string</code> (possibly with grouping columns
 *         before the tokenized text column) and is not repeated.</li> <!--Maybe we should support FanType.Concatenate?-->
 *     <li>Specify a valid tokenizer and tokenizer version through the index options (possibly using the defaults).</li>
 *     <li>Not define a value expression.</li>
 * </ul>
 *
 * @param index the index to validate
 * @return a validator to run against the index
 * @throws KeyExpression.InvalidExpressionException if the expression does not contain a string as its first ungrouped column
 * @throws com.apple.foundationdb.record.metadata.MetaDataException if the tokenizer is not defined, if the tokenizer version
 *                                                                  is out of range, or if the index is marked as unique
 */
@Nonnull
@Override
public IndexValidator getIndexValidator(Index index) {
    return new IndexValidator(index) {
        @Override
        public void validate(@Nonnull MetaDataValidator metaDataValidator) {
            super.validate(metaDataValidator);
            validateNotVersion();
            validateNotUnique();
            // TODO: allow value expressions for covering text indexes
            validateNoValue();
            // Validate that the tokenizer exists and that the version is in a valid range.
            TextTokenizer tokenizer = TextIndexMaintainer.getTokenizer(index);
            int tokenizerVersion = TextIndexMaintainer.getIndexTokenizerVersion(index);
            tokenizer.validateVersion(tokenizerVersion);
        }

        @Override
        public void validateIndexForRecordType(@Nonnull RecordType recordType, @Nonnull MetaDataValidator metaDataValidator) {
            final List<Descriptors.FieldDescriptor> fields = metaDataValidator.validateIndexForRecordType(index, recordType);
            int textFieldPosition = TextIndexMaintainer.textFieldPosition(index.getRootExpression());
            // BUGFIX: was "textFieldPosition > fields.size()"; when the position equals
            // the list size, fields.get(textFieldPosition) would throw a raw
            // IndexOutOfBoundsException instead of the intended validation error.
            if (textFieldPosition >= fields.size()) {
                throw new KeyExpression.InvalidExpressionException("text index does not have text field after grouped fields");
            } else {
                Descriptors.FieldDescriptor textFieldDescriptor = fields.get(textFieldPosition);
                if (!textFieldDescriptor.getType().equals(Descriptors.FieldDescriptor.Type.STRING)) {
                    throw new KeyExpression.InvalidExpressionException(String.format("text index has non-string type %s as text field", textFieldDescriptor.getLiteJavaType()));
                }
                if (textFieldDescriptor.isRepeated()) {
                    throw new KeyExpression.InvalidExpressionException("text index does not allow a repeated field for text body");
                }
            }
        }

        /**
         * Validate any options that have changed. There are several options unique to text indexes which
         * may change without requiring the index be rebuilt. They are:
         *
         * <ul>
         *     <li>{@link IndexOptions#TEXT_TOKENIZER_VERSION_OPTION} which can be increased (but not decreased)</li>
         *     <li>{@link IndexOptions#TEXT_ADD_AGGRESSIVE_CONFLICT_RANGES_OPTION} which only affects what conflict ranges
         *         are added at index update time and thus has no impact on the on-disk representation</li>
         *     <li>{@link IndexOptions#TEXT_OMIT_POSITIONS_OPTION} which changes whether the position lists are included
         *         in index entries</li>
         * </ul>
         *
         * <p>
         * Note that the {@link IndexOptions#TEXT_TOKENIZER_NAME_OPTION} is <em>not</em> allowed to change
         * (without rebuilding the index).
         * </p>
         *
         * @param oldIndex an older version of this index
         * @param changedOptions the set of changed options
         */
        @Override
        protected void validateChangedOptions(@Nonnull Index oldIndex, @Nonnull Set<String> changedOptions) {
            for (String changedOption : changedOptions) {
                switch (changedOption) {
                    case IndexOptions.TEXT_ADD_AGGRESSIVE_CONFLICT_RANGES_OPTION:
                    case IndexOptions.TEXT_OMIT_POSITIONS_OPTION:
                        // These options may change freely without breaking compatibility.
                        break;
                    case IndexOptions.TEXT_TOKENIZER_NAME_OPTION:
                        // The tokenizer itself may never change without a rebuild.
                        String oldTokenizerName = TextIndexMaintainer.getTokenizer(oldIndex).getName();
                        String newTokenizerName = TextIndexMaintainer.getTokenizer(index).getName();
                        if (!oldTokenizerName.equals(newTokenizerName)) {
                            throw new MetaDataException("text tokenizer changed", LogMessageKeys.INDEX_NAME, index.getName());
                        }
                        break;
                    case IndexOptions.TEXT_TOKENIZER_VERSION_OPTION:
                        // The tokenizer version should always go up.
                        int oldTokenizerVersion = TextIndexMaintainer.getIndexTokenizerVersion(oldIndex);
                        int newTokenizerVersion = TextIndexMaintainer.getIndexTokenizerVersion(index);
                        if (oldTokenizerVersion > newTokenizerVersion) {
                            throw new MetaDataException("text tokenizer version downgraded", LogMessageKeys.INDEX_NAME, index.getName(), LogMessageKeys.OLD_VERSION, oldTokenizerVersion, LogMessageKeys.NEW_VERSION, newTokenizerVersion);
                        }
                        break;
                    default:
                        // Changed options that are not text options will be handled by super class
                        if (TEXT_OPTIONS.contains(changedOption)) {
                            throw new MetaDataException("index option changed", LogMessageKeys.INDEX_NAME, index.getName(), LogMessageKeys.INDEX_OPTION, changedOption, LogMessageKeys.OLD_OPTION, oldIndex.getOption(changedOption), LogMessageKeys.NEW_OPTION, index.getOption(changedOption));
                        }
                }
            }
            // Hand any remaining (non-text) changed options to the base validator.
            changedOptions.removeAll(TEXT_OPTIONS);
            super.validateChangedOptions(oldIndex, changedOptions);
        }
    };
}
Aggregations