Example usage of com.apple.foundationdb.record.TestRecordsTextProto.SimpleDocument in the fdb-record-layer project by FoundationDB: the TextIndexTest method saveSimpleDocumentsWithNoPositions.
@Test
public void saveSimpleDocumentsWithNoPositions() throws Exception {
    final SimpleDocument shakespeareDoc = SimpleDocument.newBuilder()
            .setDocId(1623L)
            .setText(TextSamples.ROMEO_AND_JULIET_PROLOGUE)
            .build();
    final SimpleDocument germanDoc = SimpleDocument.newBuilder()
            .setDocId(1066L)
            .setText(TextSamples.GERMAN)
            .build();

    // First pass: save both documents under the default (position-tracking)
    // text index and capture how many index value bytes each contributes.
    final int shakespeareBytesWithPositions;
    final int germanBytesWithPositions;
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context);
        recordStore.saveRecord(shakespeareDoc);
        shakespeareBytesWithPositions = getSaveIndexValueBytes(recordStore);
        recordStore.saveRecord(germanDoc);
        germanBytesWithPositions = getSaveIndexValueBytes(recordStore) - shakespeareBytesWithPositions;
        commit(context);
    }

    // Second pass: swap in an index that omits position lists and verify the
    // same documents now consume strictly fewer index value bytes.
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, metaDataBuilder -> {
            metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
            metaDataBuilder.addIndex(SIMPLE_DOC, SIMPLE_TEXT_NO_POSITIONS);
        });
        recordStore.deleteAllRecords();
        recordStore.saveRecord(shakespeareDoc);
        assertThat(getSaveIndexValueBytes(recordStore), lessThan(shakespeareBytesWithPositions));
        final int shakespeareBytesWithoutPositions = getSaveIndexValueBytes(recordStore);
        recordStore.saveRecord(germanDoc);
        assertThat(getSaveIndexValueBytes(recordStore) - shakespeareBytesWithoutPositions,
                lessThan(germanBytesWithPositions));

        // With positions dropped, each indexed token should map to an empty offset list.
        List<Map.Entry<Tuple, List<Integer>>> entries =
                scanMapEntries(recordStore, SIMPLE_TEXT_NO_POSITIONS, Tuple.from("gewonnen"));
        assertEquals(Collections.singletonList(entryOf(Tuple.from(1066L), Collections.emptyList())), entries);
        entries = scanMapEntries(recordStore, SIMPLE_TEXT_NO_POSITIONS, Tuple.from("dignity"));
        assertEquals(Collections.singletonList(entryOf(Tuple.from(1623L), Collections.emptyList())), entries);
        commit(context);
    }
}
Example usage of com.apple.foundationdb.record.TestRecordsTextProto.SimpleDocument in the fdb-record-layer project by FoundationDB: the TextIndexTest method textIndexPerf1000SerialInsertNoBatching.
@Tag(Tags.Performance)
@Test
public void textIndexPerf1000SerialInsertNoBatching() throws Exception {
    // Measure wall-clock time to insert 1000 random documents, one record
    // per transaction (no batching), then report resource usage.
    final Random random = new Random();
    final List<SimpleDocument> documents = getRandomRecords(random, 1000);
    final long startTime = System.nanoTime();
    for (SimpleDocument document : documents) {
        try (FDBRecordContext context = openContext()) {
            openRecordStore(context);
            recordStore.saveRecord(document);
            commit(context);
        }
    }
    final long endTime = System.nanoTime();
    LOGGER.info("performed 1000 serial insertions in {} seconds.", (endTime - startTime) * 1e-9);
    printUsage();
}
Example usage of com.apple.foundationdb.record.TestRecordsTextProto.SimpleDocument in the fdb-record-layer project by FoundationDB: the TextIndexTest method queryScanEquivalence.
/**
 * Generate random documents and then make sure that querying them using the index
 * produces the same result as performing a full scan of all records.
 */
@MethodSource("indexArguments")
@ParameterizedTest
public void queryScanEquivalence(@Nonnull Index index) throws Exception {
    final Random r = new Random(0xba5eba1L + index.getName().hashCode());
    final int recordCount = 100;
    final int recordBatch = 25;
    final int queryCount = 25;
    final List<String> lexicon = getStandardLexicon();
    TextTokenizerRegistryImpl.instance().register(FILTERING_TOKENIZER);
    final TextTokenizer tokenizer = TextIndexMaintainer.getTokenizer(index);
    final RecordMetaDataHook hook = metaDataBuilder -> {
        metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
        metaDataBuilder.addIndex(SIMPLE_DOC, index);
    };
    // Re-seed with a fresh random seed and log it so failures are reproducible.
    long seed = r.nextLong();
    LOGGER.info(KeyValueLogMessage.of("initializing random number generator", TestLogMessageKeys.SEED, seed));
    r.setSeed(seed);
    // Populate the store in batches of recordBatch records per transaction.
    for (int i = 0; i < recordCount; i += recordBatch) {
        List<SimpleDocument> records = getRandomRecords(r, recordBatch, lexicon);
        LOGGER.info(KeyValueLogMessage.of("creating and saving random records", TestLogMessageKeys.BATCH_SIZE, recordBatch));
        try (FDBRecordContext context = openContext()) {
            openRecordStore(context, hook);
            records.forEach(recordStore::saveRecord);
            commit(context);
        }
    }
    double[] proportions = getZipfProportions(lexicon);
    long totalScanningTime = 0;
    long totalQueryingTime = 0;
    long totalResults = 0;
    for (int i = 0; i < queryCount; i++) {
        // Generate a random text query, choosing the predicate type by a
        // weighted coin flip over the available text-query operators.
        List<String> tokens = getRandomWords(r, lexicon, proportions, 6, 3);
        String tokenString = String.join(" ", tokens);
        double filterChoice = r.nextDouble();
        final QueryComponent filter;
        if (filterChoice < 0.2) {
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString);
        } else if (filterChoice < 0.4) {
            filter = Query.field("text").text(tokenizer.getName()).containsAny(tokenString);
        } else if (filterChoice < 0.6) {
            filter = Query.field("text").text(tokenizer.getName()).containsPhrase(tokenString);
        } else if (filterChoice < 0.8) {
            int maxDistance = r.nextInt(10) + tokens.size();
            filter = Query.field("text").text(tokenizer.getName()).containsAll(tokenString, maxDistance);
        } else if (filterChoice < 0.9) {
            filter = Query.field("text").text(tokenizer.getName()).containsAnyPrefix(tokenString);
        } else if (filterChoice < 0.95) {
            filter = Query.field("text").text(tokenizer.getName()).containsAllPrefixes(tokenString);
        } else {
            if (tokens.isEmpty()) {
                continue;
            }
            // Choose the first non-empty token from the iterator
            Iterator<? extends CharSequence> tokenIterator = tokenizer.tokenize(tokenString, tokenizer.getMaxVersion(), TextTokenizer.TokenizerMode.QUERY);
            String firstToken = null;
            while (tokenIterator.hasNext()) {
                String nextToken = tokenIterator.next().toString();
                if (!nextToken.isEmpty()) {
                    firstToken = nextToken;
                    break;
                }
            }
            if (firstToken == null) {
                continue;
            }
            // Take a random non-empty prefix of that token.
            int prefixEnd;
            if (firstToken.length() > 1) {
                prefixEnd = r.nextInt(firstToken.length() - 1) + 1;
            } else {
                prefixEnd = 1;
            }
            filter = Query.field("text").text(tokenizer.getName()).containsPrefix(firstToken.substring(0, prefixEnd));
        }
        LOGGER.info(KeyValueLogMessage.of("generated random filter", TestLogMessageKeys.ITERATION, i, LogMessageKeys.FILTER, filter));
        // Manual scan all of the records
        long startTime = System.nanoTime();
        final Set<Long> manualRecordIds = performQueryWithRecordStoreScan(hook, filter);
        long endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("manual scan completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalScanningTime += endTime - startTime;
        // Generate a query and use the index
        startTime = System.nanoTime();
        final Set<Long> queryRecordIds = performQueryWithIndexScan(hook, index, filter);
        endTime = System.nanoTime();
        LOGGER.info(KeyValueLogMessage.of("query completed", TestLogMessageKeys.SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS)));
        totalQueryingTime += endTime - startTime;
        if (!manualRecordIds.equals(queryRecordIds)) {
            // Compute the two set differences for the diagnostic log message.
            Set<Long> onlyManual = new HashSet<>(manualRecordIds);
            onlyManual.removeAll(queryRecordIds);
            Set<Long> onlyQuery = new HashSet<>(queryRecordIds);
            // FIX: was onlyManual.removeAll(manualRecordIds), which emptied
            // onlyManual and left onlyQuery unreduced, so the logged counts were wrong.
            onlyQuery.removeAll(manualRecordIds);
            LOGGER.warn(KeyValueLogMessage.of("results did not match", LogMessageKeys.FILTER, filter, TestLogMessageKeys.MANUAL_RESULT_COUNT, manualRecordIds.size(), TestLogMessageKeys.QUERY_RESULT_COUNT, queryRecordIds.size(), TestLogMessageKeys.ONLY_MANUAL_COUNT, onlyManual.size(), TestLogMessageKeys.ONLY_QUERY_COUNT, onlyQuery.size()));
        }
        assertEquals(manualRecordIds, queryRecordIds);
        LOGGER.info(KeyValueLogMessage.of("results matched", LogMessageKeys.FILTER, filter, TestLogMessageKeys.RESULT_COUNT, manualRecordIds.size()));
        totalResults += queryRecordIds.size();
    }
    LOGGER.info(KeyValueLogMessage.of("test completed", TestLogMessageKeys.TOTAL_SCAN_MILLIS, TimeUnit.MILLISECONDS.convert(totalScanningTime, TimeUnit.NANOSECONDS), TestLogMessageKeys.TOTAL_QUERY_MILLIS, TimeUnit.MILLISECONDS.convert(totalQueryingTime, TimeUnit.NANOSECONDS), TestLogMessageKeys.TOTAL_RESULT_COUNT, totalResults));
}
Example usage of com.apple.foundationdb.record.TestRecordsTextProto.SimpleDocument in the fdb-record-layer project by FoundationDB: the TextIndexTest method scan.
/**
 * Exercise the text index scan helpers over a store of 50 random documents:
 * zero-limit scans, scans with record-scan limits, continuation-based scans,
 * and scans with skip — each in both forward and reverse direction.
 */
@Test
public void scan() throws Exception {
    final Random r = new Random(0x5ca1ab1e);
    List<SimpleDocument> records = getRandomRecords(r, 50);
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context);
        records.forEach(recordStore::saveRecord);
        commit(context);
    }
    final Index index = recordStore.getRecordMetaData().getIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
    // existing token, forward then reverse
    scanWithZeroScanRecordLimit(index, "angstrom", false);
    scanWithZeroScanRecordLimit(index, "angstrom", true);
    // non-existent token, forward then reverse
    scanWithZeroScanRecordLimit(index, "asdfasdf", false);
    scanWithZeroScanRecordLimit(index, "asdfasdf", true);
    final List<Integer> limits = Arrays.asList(0, 1, 2, Integer.MAX_VALUE);
    final List<Integer> skips = Arrays.asList(0, 1, 10, 1000);
    final List<String> tokens = Arrays.asList("angstrom", "the", "not_a_token_in_the_lexicon", "שפראך");
    for (int limit : limits) {
        scanMultipleWithScanRecordLimits(index, tokens, limit, false);
        scanMultipleWithScanRecordLimits(index, tokens, limit, true);
        scanWithContinuations(index, "достопримечательности", limit, false);
        // FIX: the second call previously also passed false, so the reverse
        // direction was never exercised with continuations; every other scan
        // helper in this test is run as a forward/reverse pair.
        scanWithContinuations(index, "достопримечательности", limit, true);
        for (int skip : skips) {
            scanWithSkip(index, "toil", skip, limit, false);
            scanWithSkip(index, "toil", skip, limit, true);
        }
    }
}
Example usage of com.apple.foundationdb.record.TestRecordsTextProto.SimpleDocument in the fdb-record-layer project by FoundationDB: the TextIndexTest method saveSimpleDocumentsWithFilter.
@Test
public void saveSimpleDocumentsWithFilter() throws Exception {
    final SimpleDocument russianDoc = SimpleDocument.newBuilder()
            .setDocId(1547L)
            .setText(TextSamples.RUSSIAN)
            .build();
    // Hook that swaps the default text index for the filtering-tokenizer index.
    final RecordMetaDataHook filteringHook = metaDataBuilder -> {
        metaDataBuilder.removeIndex(TextIndexTestUtils.SIMPLE_DEFAULT_NAME);
        metaDataBuilder.addIndex(SIMPLE_DOC, SIMPLE_TEXT_FILTERING);
    };
    try (FDBRecordContext context = openContext()) {
        // Opening must fail: the filtering tokenizer is not registered yet.
        assertThrows(MetaDataException.class, () -> openRecordStore(context, filteringHook));
    }
    TextTokenizerRegistryImpl.instance().register(FILTERING_TOKENIZER);
    try (FDBRecordContext context = openContext()) {
        openRecordStore(context, filteringHook);
        recordStore.saveRecord(russianDoc);
        // Note that достопримечательности has been filtered out, so it's probably a
        // lot less interesting to visit.
        assertEquals(4, getSaveIndexKeyCount(recordStore));
        List<Map.Entry<Tuple, List<Integer>>> scanned =
                scanMapEntries(recordStore, SIMPLE_TEXT_FILTERING, Tuple.from("достопримечательности"));
        assertEquals(Collections.emptyList(), scanned);
        scanned = scanMapEntries(recordStore, SIMPLE_TEXT_FILTERING, Tuple.from("москвы"));
        assertEquals(Collections.singletonList(entryOf(Tuple.from(1547L), Collections.singletonList(4))), scanned);
        commit(context);
    }
}
Aggregations