Use of com.apple.foundationdb.record.provider.foundationdb.indexes.TextIndexTestUtils.SIMPLE_DOC in project fdb-record-layer by FoundationDB.
From class TextIndexTest, method performQueryWithRecordStoreScan:
@Nonnull
private Set<Long> performQueryWithRecordStoreScan(@Nonnull RecordMetaDataHook hook, @Nonnull QueryComponent filter) throws Exception {
    final ScanProperties scanProperties = new ScanProperties(ExecuteProperties.newBuilder().setTimeLimit(3000).build());
    Set<Long> results = new HashSet<>();
    byte[] continuation = null;
    do {
        try (FDBRecordContext context = openContext()) {
            openRecordStore(context);
            try (RecordCursor<Long> cursor = recordStore.scanRecords(continuation, scanProperties)
                    .filter(record -> record.getRecordType().getName().equals(SIMPLE_DOC))
                    .filter(record -> filter.eval(recordStore, EvaluationContext.EMPTY, record) == Boolean.TRUE)
                    .map(record -> record.getPrimaryKey().getLong(0))) {
                cursor.forEach(results::add).get();
                RecordCursorResult<Long> noNextResult = cursor.getNext();
                continuation = noNextResult.getContinuation().toBytes();
            }
        }
    } while (continuation != null);
    return results;
}
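This method pages through the store with continuations and evaluates the filter against every SIMPLE_DOC record in Java, restarting the scan in a fresh transaction whenever the time limit is hit. For contrast, here is a minimal sketch of what its index-side counterpart, performQueryWithIndexScan (used by queryScanEquivalence below), might look like. This is an illustration, not necessarily the test's actual implementation; it assumes the planner field provided by the test base class.

@Nonnull
private Set<Long> performQueryWithIndexScan(@Nonnull RecordMetaDataHook hook, @Nonnull Index index, @Nonnull QueryComponent filter) throws Exception {
    // Plan the same filter with the query planner; for a text predicate this
    // should result in a scan of the given text index rather than a full scan.
    RecordQuery query = RecordQuery.newBuilder()
            .setRecordType(SIMPLE_DOC)
            .setFilter(filter)
            .build();
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, hook);
        RecordQueryPlan plan = planner.plan(query);
        try (RecordCursor<Long> cursor = recordStore.executeQuery(plan)
                .map(record -> record.getPrimaryKey().getLong(0))) {
            return new HashSet<>(cursor.asList().get());
        }
    }
}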
Use of com.apple.foundationdb.record.provider.foundationdb.indexes.TextIndexTestUtils.SIMPLE_DOC in project fdb-record-layer by FoundationDB.
From class TextIndexTest, method querySimpleDocumentsWithoutPositions:
@Test
public void querySimpleDocumentsWithoutPositions() throws Exception {
    final List<SimpleDocument> documents = TextIndexTestUtils.toSimpleDocuments(Arrays.asList(
            TextSamples.ANGSTROM, TextSamples.AETHELRED, TextSamples.ROMEO_AND_JULIET_PROLOGUE, TextSamples.FRENCH));
    // Save the documents with an index that omits position information and make
    // sure queries are planned appropriately
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addIndex(SIMPLE_DOC, SIMPLE_TEXT_NO_POSITIONS);
        });
        documents.forEach(recordStore::saveRecord);
        // Queries that *don't* require position information should be planned to use the index
        assertEquals(Arrays.asList(1L, 2L, 3L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAny("king civil récu"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Collections.singletonList(2L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAll("unclean verona"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Arrays.asList(0L, 1L, 2L, 3L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsPrefix("th"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        // Queries that *do* require position information must be planned as scans
        assertEquals(Collections.singletonList(2L),
                querySimpleDocumentsWithScan(Query.field("text").text().containsPhrase("civil blood makes civil hands unclean"), 0));
        assertEquals(Collections.singletonList(3L),
                querySimpleDocumentsWithScan(Query.field("text").text().containsAll("France Napoleons", 3), 0));
        commit(context);
    }
    final List<SimpleDocument> newDocuments = documents.stream()
            .map(doc -> doc.toBuilder().setDocId(doc.getDocId() + documents.size()).build())
            .collect(Collectors.toList());
    // Upgrade to writing position information
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addIndex(SIMPLE_DOC, new Index(SIMPLE_TEXT_NO_POSITIONS.getName(), SIMPLE_TEXT_NO_POSITIONS.getRootExpression(), IndexTypes.TEXT));
        });
        newDocuments.forEach(recordStore::saveRecord);
        // Queries that *don't* require position information produce the same plan
        assertEquals(Arrays.asList(1L, 2L, 3L, 5L, 6L, 7L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAny("king civil récu"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Arrays.asList(2L, 6L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAll("unclean verona"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Arrays.asList(0L, 1L, 2L, 4L, 5L, 6L, 3L, 7L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsPrefix("th"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        // Queries that *do* require position information now use the index, but documents
        // written before the upgrade show up in the results spuriously because their
        // index entries lack positions
        assertEquals(Arrays.asList(2L, 6L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsPhrase("civil blood makes civil hands unclean"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Collections.singletonList(2L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsPhrase("unclean verona"), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Arrays.asList(3L, 7L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAll("France Napoleons", 3), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        assertEquals(Collections.singletonList(3L),
                querySimpleDocumentsWithIndex(Query.field("text").text().containsAll("Thiers Napoleons", 3), SIMPLE_TEXT_NO_POSITIONS.getName(), 0, true));
        commit(context);
    }
}
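The behavior above hinges on how SIMPLE_TEXT_NO_POSITIONS is defined. That constant lives elsewhere in TextIndexTest; the following is a sketch consistent with its use here, assuming the index name and the Record Layer's textOmitPositions index option.

// Assumed definition for illustration: a TEXT index on the text field that
// omits per-token position lists, so phrase and proximity predicates cannot
// be answered from the index alone.
private static final Index SIMPLE_TEXT_NO_POSITIONS = new Index(
        "Simple$text_no_positions",
        field("text"),
        IndexTypes.TEXT,
        ImmutableMap.of(IndexOptions.TEXT_OMIT_POSITIONS_OPTION, "true"));

The second half of the test swaps in a plain TEXT index under the same name, which is why documents written before the upgrade can spuriously match position-sensitive queries until their entries are rewritten.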
Use of com.apple.foundationdb.record.provider.foundationdb.indexes.TextIndexTestUtils.SIMPLE_DOC in project fdb-record-layer by FoundationDB.
From class TextIndexTest, method queryMultiTypeDocuments:
@Test
public void queryMultiTypeDocuments() throws Exception {
    final List<String> bothTypes = Arrays.asList(SIMPLE_DOC, COMPLEX_DOC);
    final List<String> simpleTypes = Collections.singletonList(SIMPLE_DOC);
    final List<String> complexTypes = Collections.singletonList(COMPLEX_DOC);
    final List<String> textSamples = Arrays.asList(
            TextSamples.ROMEO_AND_JULIET_PROLOGUE,
            TextSamples.ROMEO_AND_JULIET_PROLOGUE,
            TextSamples.ANGSTROM,
            TextSamples.AETHELRED,
            TextSamples.FRENCH,
            TextSamples.GERMAN);
    final List<Message> documents = IntStream.range(0, textSamples.size())
            .mapToObj(i -> {
                final String text = textSamples.get(i);
                if (i % 2 == 0) {
                    return SimpleDocument.newBuilder().setDocId(i).setText(text).setGroup(i % 4).build();
                } else {
                    return ComplexDocument.newBuilder().setDocId(i).setText(text).setGroup(i % 4).build();
                }
            })
            .collect(Collectors.toList());
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.getRecordType(COMPLEX_DOC).setPrimaryKey(field("doc_id"));
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addMultiTypeIndex(Arrays.asList(metaDataBuilder.getRecordType(SIMPLE_DOC), metaDataBuilder.getRecordType(COMPLEX_DOC)), MULTI_TYPE_INDEX);
        });
        documents.forEach(recordStore::saveRecord);
        assertEquals(Arrays.asList(0L, 1L),
                queryMultiTypeDocuments(Query.field("text").text().containsPhrase("where we lay our scene"), bothTypes, 1755757799));
        assertEquals(Collections.singletonList(0L),
                queryMultiTypeDocuments(Query.field("text").text().containsPhrase("where we lay our scene"), simpleTypes, -1489953261));
        assertEquals(Collections.singletonList(1L),
                queryMultiTypeDocuments(Query.field("text").text().containsPhrase("where we lay our scene"), complexTypes, -1333764399));
        assertEquals(Arrays.asList(2L, 4L, 5L),
                queryMultiTypeDocuments(Query.field("text").text().containsPrefix("na"), bothTypes, -714642562));
        assertEquals(Arrays.asList(2L, 4L),
                queryMultiTypeDocuments(Query.field("text").text().containsPrefix("na"), simpleTypes, 334613674));
        assertEquals(Collections.singletonList(5L),
                queryMultiTypeDocuments(Query.field("text").text().containsPrefix("na"), complexTypes, 490802536));
        commit(context);
    }
}
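MULTI_TYPE_INDEX is defined elsewhere in the test class. Since both record types share a text field, a multi-type text index is an ordinary TEXT index attached to both types via addMultiTypeIndex, as the metadata hook above does. A sketch of a plausible definition (the index name here is an assumption):

// Assumed definition for illustration: one TEXT index on the shared text
// field, maintained over both SimpleDocument and ComplexDocument records.
private static final Index MULTI_TYPE_INDEX = new Index("Simple&Complex$text", field("text"), IndexTypes.TEXT);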
Use of com.apple.foundationdb.record.provider.foundationdb.indexes.TextIndexTestUtils.SIMPLE_DOC in project fdb-record-layer by FoundationDB.
From class TextIndexTest, method queryScanEquivalence:
/**
* Generate random documents and then make sure that querying them using the index
* produces the same result as performing a full scan of all records.
*/
@ParameterizedTest
@MethodSource("indexArguments")
public void queryScanEquivalence(@Nonnull Index index) throws Exception {
    final Random r = new Random(0xba5eba1L + index.getName().hashCode());
    final int recordCount = 100;
    final int recordBatch = 25;
    final int queryCount = 25;
    final List<String> lexicon = getStandardLexicon();
    TextTokenizerRegistryImpl.instance().register(FILTERING_TOKENIZER);
    final TextTokenizer tokenizer = TextIndexMaintainer.getTokenizer(index);
    final RecordMetaDataHook hook = metaDataBuilder -> {
        metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
        metaDataBuilder.addIndex(SIMPLE_DOC, index);
    };
    long seed = r.nextLong();
    LOGGER.info(KeyValueLogMessage.of("initializing random number generator", TestLogMessageKeys.SEED, seed));
    r.setSeed(seed);
    for (int i = 0; i < recordCount; i += recordBatch) {
        List<SimpleDocument> records = getRandomRecords(r, recordBatch, lexicon);
        LOGGER.info(KeyValueLogMessage.of("creating and saving random records", TestLogMessageKeys.BATCH_SIZE, recordBatch));
        try (FDBRecordContext context = openContext()) {
            openRecordStore(context, hook);
            records.forEach(recordStore::saveRecord);
            commit(context);
        }
    }
    double[] proportions = getZipfProportions(lexicon);
    long totalScanningTime = 0;
    long totalQueryingTime = 0;
    long totalResults = 0;
    for (int i = 0; i < queryCount; i++) {
        // Generate a random text query
        List<String> tokens = getRandomWords(r, lexicon, proportions, 6, 3);
        String tokenString = String.join(" ", tokens);
        double filterChoice = r.nextDouble();
        final QueryComponent filter;
        if (filterChoice < 0.2) {
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString);
        } else if (filterChoice < 0.4) {
            filter = Query.field("text").text(tokenizer.getName()).containsAny(tokenString);
        } else if (filterChoice < 0.6) {
            filter = Query.field("text").text(tokenizer.getName()).containsPhrase(tokenString);
        } else if (filterChoice < 0.8) {
            int maxDistance = r.nextInt(10) + tokens.size();
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString, maxDistance);
        } else if (filterChoice < 0.9) {
            filter = Query.field("text").text(tokenizer.getName()).containsAnyPrefix(tokenString);
        } else if (filterChoice < 0.95) {
            filter = Query.field("text").text(tokenizer.getName()).containsAllPrefixes(tokenString);
        } else {
            if (tokens.isEmpty()) {
                continue;
            }
            // Choose the first non-empty token from the iterator
            Iterator<? extends CharSequence> tokenIterator = tokenizer.tokenize(tokenString, tokenizer.getMaxVersion(), TextTokenizer.TokenizerMode.QUERY);
            String firstToken = null;
            while (tokenIterator.hasNext()) {
                String nextToken = tokenIterator.next().toString();
                if (!nextToken.isEmpty()) {
                    firstToken = nextToken;
                    break;
                }
            }
            if (firstToken == null) {
                continue;
            }
            int prefixEnd;
            if (firstToken.length() > 1) {
                prefixEnd = r.nextInt(firstToken.length() - 1) + 1;
            } else {
                prefixEnd = 1;
            }
            filter = Query.field("text").text(tokenizer.getName()).containsPrefix(firstToken.substring(0, prefixEnd));
        }
        LOGGER.info(KeyValueLogMessage.of("generated random filter", TestLogMessageKeys.ITERATION, i, LogMessageKeys.FILTER, filter));
        // Manually scan all of the records
        long startTime = System.nanoTime();
        final Set<Long> manualRecordIds = performQueryWithRecordStoreScan(hook, filter);
        long endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("manual scan completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalScanningTime += endTime - startTime;
        // Generate a query and use the index
        startTime = System.nanoTime();
        final Set<Long> queryRecordIds = performQueryWithIndexScan(hook, index, filter);
        endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("query completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalQueryingTime += endTime - startTime;
        if (!manualRecordIds.equals(queryRecordIds)) {
            Set<Long> onlyManual = new HashSet<>(manualRecordIds);
            onlyManual.removeAll(queryRecordIds);
            Set<Long> onlyQuery = new HashSet<>(queryRecordIds);
            onlyQuery.removeAll(manualRecordIds);
            LOGGER.warn(KeyValueLogMessage.of("results did not match",
                    LogMessageKeys.FILTER, filter,
                    TestLogMessageKeys.MANUAL_RESULT_COUNT, manualRecordIds.size(),
                    TestLogMessageKeys.QUERY_RESULT_COUNT, queryRecordIds.size(),
                    TestLogMessageKeys.ONLY_MANUAL_COUNT, onlyManual.size(),
                    TestLogMessageKeys.ONLY_QUERY_COUNT, onlyQuery.size()));
        }
        assertEquals(manualRecordIds, queryRecordIds);
        LOGGER.info(KeyValueLogMessage.of("results matched", LogMessageKeys.FILTER, filter, TestLogMessageKeys.RESULT_COUNT, manualRecordIds.size()));
        totalResults += queryRecordIds.size();
    }
    LOGGER.info(KeyValueLogMessage.of("test completed",
            TestLogMessageKeys.TOTAL_SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(totalScanningTime, TimeUnit.NANOSECONDS),
            TestLogMessageKeys.TOTAL_QUERY_MILLIS, TimeUnit.MILLISECONDS.convert(totalQueryingTime, TimeUnit.NANOSECONDS),
            TestLogMessageKeys.TOTAL_RESULT_COUNT, totalResults));
}
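The indexArguments method source supplies the index variants to run the test against. A minimal sketch of such a provider, reusing index constants that appear elsewhere in this section (which variants the real test actually covers is an assumption; Stream and Arguments are java.util.stream.Stream and org.junit.jupiter.params.provider.Arguments):

// Assumed provider for illustration: each argument runs the equivalence
// test against a different text index configuration.
private static Stream<Arguments> indexArguments() {
    return Stream.of(SIMPLE_TEXT_NO_POSITIONS, SIMPLE_TEXT_FILTERING)
            .map(Arguments::of);
}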
Use of com.apple.foundationdb.record.provider.foundationdb.indexes.TextIndexTestUtils.SIMPLE_DOC in project fdb-record-layer by FoundationDB.
From class TextIndexTest, method saveSimpleDocumentsWithFilter:
@Test
public void saveSimpleDocumentsWithFilter() throws Exception {
    final SimpleDocument russianDocument = SimpleDocument.newBuilder()
            .setDocId(1547L)
            .setText(TextSamples.RUSSIAN)
            .build();
    try (FDBRecordContext context = openContext()) {
        // Should fail because the filtering tokenizer has not been registered yet
        assertThrows(MetaDataException.class, () -> openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addIndex(SIMPLE_DOC, SIMPLE_TEXT_FILTERING);
        }));
    }
    TextTokenizerRegistryImpl.instance().register(FILTERING_TOKENIZER);
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addIndex(SIMPLE_DOC, SIMPLE_TEXT_FILTERING);
        });
        recordStore.saveRecord(russianDocument);
        // Note that достопримечательности has been filtered out, so it's probably a
        // lot less interesting to visit.
        assertEquals(4, getSaveIndexKeyCount(recordStore));
        List<Map.Entry<Tuple, List<Integer>>> entryList = scanMapEntries(recordStore, SIMPLE_TEXT_FILTERING, Tuple.from("достопримечательности"));
        assertEquals(Collections.emptyList(), entryList);
        entryList = scanMapEntries(recordStore, SIMPLE_TEXT_FILTERING, Tuple.from("москвы"));
        assertEquals(Collections.singletonList(entryOf(Tuple.from(1547L), Collections.singletonList(4))), entryList);
        commit(context);
    }
}
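FILTERING_TOKENIZER is registered above but defined elsewhere in the test class. The assertions (достопримечательности absent from the index, only four index keys written) are consistent with a length-based filter; here is a sketch under that assumption, using the Record Layer's FilteringTextTokenizer to wrap the default tokenizer (the tokenizer name and length cutoff are assumptions):

// Assumed definition for illustration: drop any token longer than
// 10 characters, so a long word like "достопримечательности" is never indexed
// while short tokens like "москвы" still are.
private static final TextTokenizerFactory FILTERING_TOKENIZER = FilteringTextTokenizer.create(
        "filter_by_length",
        new DefaultTextTokenizerFactory(),
        (token, version) -> token.length() <= 10);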