use of com.cinchapi.concourse.server.model.Position in project concourse by cinchapi.
the class Database method search.
/**
 * Search the values stored under {@code key} for the phrase described by
 * {@code query}.
 * <p>
 * The query is lower-cased and split on whitespace. Stopwords are skipped,
 * and every remaining word must appear at the expected position relative to
 * the previous matched word (skipped stopwords widen the expected gap by one
 * position each).
 * </p>
 *
 * @param key the key whose corpus is searched
 * @param query the word or phrase to look for
 * @return the matching record ids, ordered by descending number of matches;
 *         ids with the same number of matches are ordered by their unsigned
 *         numeric value
 */
@Override
public Set<Long> search(String key, String query) {
    // NOTE: Locking must happen here since CorpusRecords are not cached and
    // search potentially works across multiple ones.
    masterLock.readLock().lock();
    try {
        Text locator = Text.wrapCached(key);
        // Get each word in the query separately to ensure that multi word
        // search works.
        String[] words = query.toLowerCase().split(TStrings.REGEX_GROUP_OF_ONE_OR_MORE_WHITESPACE_CHARS);
        // Positions (per record) where the most recently processed word
        // matched as part of the phrase seen so far.
        Multimap<Identifier, Integer> reference = ImmutableMultimap.of();
        boolean initial = true;
        int offset = 0;
        for (String word : words) {
            if (GlobalState.STOPWORDS.contains(word)) {
                // When skipping a stop word, we must record an offset to
                // correctly determine if the next term match is in the
                // correct relative position to the previous term match
                ++offset;
                continue;
            }
            Text term = Text.wrap(word);
            CorpusRecord corpus = getCorpusRecord(locator, term);
            Set<Position> appearances = corpus.get(term);
            Multimap<Identifier, Integer> temp = HashMultimap.create();
            for (Position appearance : appearances) {
                Identifier record = appearance.getIdentifier();
                int position = appearance.getIndex();
                // The first searched word matches anywhere; every later word
                // must sit exactly (1 + offset) positions after a position
                // where the previous word matched in the same record.
                if (initial || reference.get(record).contains(position - 1 - offset)) {
                    temp.put(record, position);
                }
            }
            initial = false;
            reference = temp;
            offset = 0;
            if (reference.isEmpty()) {
                // No record can satisfy the remaining words, so the result
                // is already known to be empty; skip the remaining lookups.
                break;
            }
        }
        // Result Scoring: the score of a record is the number of positions
        // that remain mapped from it in #reference after all the words have
        // been processed [e.g. reference.get(record).size()]. Higher scores
        // come first.
        Multimap<Integer, Long> sorted = TreeMultimap.create(Collections.<Integer>reverseOrder(), Long::compareUnsigned);
        for (Entry<Identifier, Collection<Integer>> entry : reference.asMap().entrySet()) {
            sorted.put(entry.getValue().size(), entry.getKey().longValue());
        }
        // Copy into a LinkedHashSet to preserve the sorted iteration order.
        return new LinkedHashSet<>(sorted.values());
    } finally {
        masterLock.readLock().unlock();
    }
}
use of com.cinchapi.concourse.server.model.Position in project concourse by cinchapi.
the class CorpusChunk method prepare.
/**
 * Calculate all possible substrings for {@code term} and
 * {@link SearchIndexer#enqueue(SearchIndex, CountUpLatch, Text, String, Position, long, Action)
 * enqueue} work that will store a revision for the {@code term} at
 * {@code position} for {@code key} in {@code record} at {@code version}.
 * <p>
 * Nothing is enqueued when {@code term} itself is a stopword. Substrings
 * that are empty after trimming, that are stopwords, or that were already
 * produced for this {@code term} are skipped.
 * </p>
 *
 * @param tracker a {@link CountUpLatch} that is associated with each of the
 *            tasks that are
 *            {@link SearchIndexer#enqueue(SearchIndex, CountUpLatch, Text, String, Position, long, Action)
 *            enqueued} by this method; when each index task completes, it
 *            {@link CountUpLatch#countUp() increments} the tracker
 * @param key the key under which the substrings are indexed
 * @param term the term whose substrings are indexed
 * @param record the record that, together with {@code position}, forms the
 *            {@link Position} stored for every substring
 * @param position the index of {@code term} used to build the
 *            {@link Position}
 * @param version the version passed along with every enqueued task
 * @param type the {@link Action} passed along with every enqueued task
 * @param artifacts a collection where each {@link Chunk.Artifact} that is
 *            generated from the
 *            {@link #index(Text, Text, Position, long, Action, Collection)}
 *            job is stored
 * @return the number of inserts that have been enqueued so that the caller
 *         can {@link CountUpLatch#await(int) await} all related inserts
 *         to finish.
 */
private int prepare(CountUpLatch tracker, Text key, String term, Identifier record, int position, long version, Action type, Collection<CorpusArtifact> artifacts) {
    if (GlobalState.STOPWORDS.contains(term)) {
        return 0;
    }
    Position pos = Position.of(record, position);
    int length = term.length();
    int upperBound = upperBoundOfPossibleSubstrings(length);
    // Detect if the #term is large enough to likely cause OOMs when
    // indexing and prepare the appropriate precautions.
    boolean isLargeTerm = upperBound > 5000000;
    // Substring length is only limited when the configuration has a value
    // for GlobalState#MAX_SEARCH_SUBSTRING_LENGTH that is greater than 0.
    // This is read on every call instead of being cached.
    boolean shouldLimitSubstringLength = GlobalState.MAX_SEARCH_SUBSTRING_LENGTH > 0;
    // The set of substrings that have been indexed from {@code term} at
    // {@code position} for {@code key} in {@code record} at {@code
    // version}. This is used to ensure that we do not add duplicate
    // indexes (i.e. 'abrakadabra')
    // @formatter:off
    Set<Text> indexed = isLargeTerm ? OffHeapTextSet.create(upperBound) : Sets.newHashSetWithExpectedSize(upperBound);
    // @formatter:on
    final char[] chars = isLargeTerm ? term.toCharArray() : null;
    int count = 0;
    try {
        for (int i = 0; i < length; ++i) {
            int start = i + 1;
            // NOTE(review): when a limit is configured, (j - i) can reach
            // MAX_SEARCH_SUBSTRING_LENGTH + 1 with this bound -- confirm
            // that this is the intended maximum substring length.
            int limit = (shouldLimitSubstringLength ? Math.min(length, start + GlobalState.MAX_SEARCH_SUBSTRING_LENGTH) : length) + 1;
            for (int j = start; j < limit; ++j) {
                // @formatter:off
                Text infix = (isLargeTerm ? Text.wrap(chars, i, j) : Text.wrap(term.substring(i, j))).trim();
                // @formatter:on
                if (!infix.isEmpty() && !STOPWORDS.contains(infix) && indexed.add(infix)) {
                    INDEXER.enqueue(this, tracker, key, infix, pos, version, type, artifacts);
                    ++count;
                }
            }
        }
    } finally {
        // Release the set even if enqueueing fails part way through, so a
        // closeable (e.g. off-heap) set is never left open.
        PossibleCloseables.tryCloseQuietly(indexed);
    }
    return count;
}
use of com.cinchapi.concourse.server.model.Position in project concourse by cinchapi.
the class SegmentTest method testDataDeduplication.
/**
 * Verify that the revisions exposed by the segment's table, index and corpus
 * reuse the same locator/key/value instances, even though every acquired
 * {@link Write} was built from freshly created or freshly deserialized
 * objects.
 */
@Test
public void testDataDeduplication() {
    final String key = "name";
    final TObject value = Convert.javaToThrift("Fonamey");
    final long record = 1;

    // Writes created while the server is running: each one carries its own
    // intermediate TObject.
    Write added = Write.add(key, Convert.javaToThrift("Fonamey"), record);
    Write removed = Write.remove(key, Convert.javaToThrift("Fonamey"), record);
    Assert.assertNotSame(added.getValue(), removed.getValue());
    segment.acquire(added);
    segment.acquire(removed);

    // Writes loaded from disk: deserializing creates new intermediates
    // because values are not cached when read.
    added = Write.fromByteBuffer(added.getBytes());
    removed = Write.fromByteBuffer(removed.getBytes());
    Assert.assertNotSame(added.getValue(), removed.getValue());
    segment.acquire(added);
    segment.acquire(removed);

    // Keep alternating between removing and adding the same data, always
    // round-tripping the write through its byte representation first.
    final int total = TestData.getScaleCount();
    for (int n = 0; n < total; ++n) {
        Write fresh;
        if (Numbers.isEven(n)) {
            fresh = Write.remove(key, value, record);
        } else {
            fresh = Write.add(key, value, record);
        }
        segment.acquire(Write.fromByteBuffer(fresh.getBytes()));
    }

    // The first instance seen of each component; every later revision must
    // reference exactly the same instance.
    Identifier sharedRecord = null;
    Text sharedKey = null;
    Value sharedValue = null;
    Position sharedPosition = null;

    for (Iterator<Revision<Identifier, Text, Value>> rows = segment.table().iterator(); rows.hasNext();) {
        Revision<Identifier, Text, Value> row = rows.next();
        sharedRecord = sharedRecord == null ? row.getLocator() : sharedRecord;
        sharedKey = sharedKey == null ? row.getKey() : sharedKey;
        sharedValue = sharedValue == null ? row.getValue() : sharedValue;
        Assert.assertSame(sharedRecord, row.getLocator());
        Assert.assertSame(sharedKey, row.getKey());
        Assert.assertSame(sharedValue, row.getValue());
    }

    for (Iterator<Revision<Text, Value, Identifier>> entries = segment.index().iterator(); entries.hasNext();) {
        Revision<Text, Value, Identifier> entry = entries.next();
        sharedRecord = sharedRecord == null ? entry.getValue() : sharedRecord;
        sharedKey = sharedKey == null ? entry.getLocator() : sharedKey;
        sharedValue = sharedValue == null ? entry.getKey() : sharedValue;
        Assert.assertSame(sharedRecord, entry.getValue());
        Assert.assertSame(sharedKey, entry.getLocator());
        Assert.assertSame(sharedValue, entry.getKey());
    }

    for (Iterator<Revision<Text, Text, Position>> hits = segment.corpus().iterator(); hits.hasNext();) {
        Revision<Text, Text, Position> hit = hits.next();
        sharedPosition = sharedPosition == null ? hit.getValue() : sharedPosition;
        sharedKey = sharedKey == null ? hit.getLocator() : sharedKey;
        Assert.assertSame(sharedPosition, hit.getValue());
        Assert.assertSame(sharedKey, hit.getLocator());
        // Only the corpus term that spells "name" is expected to be the same
        // instance as the locator.
        if (hit.getKey().toString().equals("name")) {
            Assert.assertSame(sharedKey, hit.getKey());
        }
    }
}
use of com.cinchapi.concourse.server.model.Position in project concourse by cinchapi.
the class CorpusChunkTest method testDataDeduplication.
/**
 * Verify that inserting equal data through distinct instances results in
 * corpus revisions that all reference the instances from the first insert.
 */
@Test
public void testDataDeduplication() {
    // Two equal-but-distinct instances of every component.
    final Text firstLocator = Text.wrap("name");
    final Text secondLocator = Text.wrap("name");
    final Value firstKey = Value.wrap(Convert.javaToThrift("Fonamey"));
    final Value secondKey = Value.wrap(Convert.javaToThrift("Fonamey"));
    final Identifier firstValue = Identifier.of(1);
    final Identifier secondValue = Identifier.of(1);

    // The "second" instances are inserted first, so they are the ones that
    // every revision is expected to reference.
    final CorpusChunk corpus = (CorpusChunk) chunk;
    corpus.insert(secondLocator, secondKey, secondValue, Time.now(), Action.ADD);
    corpus.insert(firstLocator, firstKey, firstValue, Time.now(), Action.ADD);

    Position shared = null;
    for (Iterator<Revision<Text, Text, Position>> revisions = corpus.iterator(); revisions.hasNext();) {
        Revision<Text, Text, Position> current = revisions.next();
        shared = shared == null ? current.getValue() : shared;
        Assert.assertSame(secondLocator, current.getLocator());
        if (current.getKey().toString().equals("name")) {
            Assert.assertSame(secondLocator, current.getKey());
        }
        Assert.assertSame(shared, current.getValue());
    }
}
use of com.cinchapi.concourse.server.model.Position in project concourse by cinchapi.
the class CorpusChunkTest method testShiftExisting.
/**
 * Verify that, after the chunk is shifted to off-heap memory, iterating it
 * yields revisions equal to those that were held in its {@code revisions}
 * field before the shift, in the same order.
 */
@SuppressWarnings("unchecked")
@Test
@Override
public void testShiftExisting() {
    final int total = TestData.getScaleCount();
    final CorpusChunk corpus = (CorpusChunk) chunk;
    for (int n = 0; n < total; ++n) {
        Text locator = Text.wrap(TestData.getString());
        Value value = Value.wrap(Convert.javaToThrift(TestData.getString()));
        Identifier record = Identifier.of(Time.now());
        corpus.insert(locator, value, record, Time.now(), Action.ADD);
    }

    // Grab the iterator over the in-memory revisions BEFORE shifting.
    Iterable<Revision<Text, Text, Position>> revisions = (Iterable<Revision<Text, Text, Position>>) Reflection.get("revisions", chunk);
    Iterator<Revision<Text, Text, Position>> expected = revisions.iterator();

    OffHeapMemory memory = OffHeapMemory.allocateDirect(total * 3 * Write.MINIMUM_SIZE);
    chunk.shift(memory);

    // Walk both sequences in lock step, driven by the expected one.
    for (Iterator<Revision<Text, Text, Position>> actual = chunk.iterator(); expected.hasNext();) {
        Revision<Text, Text, Position> before = expected.next();
        Revision<Text, Text, Position> after = actual.next();
        Assert.assertEquals(before, after);
    }
}
Aggregations