Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
In class TestTokenInfoDictionary, method testEnumerateAll:
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  InputOutput<Long> mapping;
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    IntsRef input = mapping.input;
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char) input.ints[input.offset + i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order, terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset + i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;
      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }
      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }
      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);
      matrix.get(rightId, leftId);
      tid.getWordCost(wordId);
      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));
      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));
      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}
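The enumeration idiom above (an IntsRefFSTEnum walking an FST's entries in sorted order) works on any FST, not just the Kuromoji dictionary. Below is a minimal self-contained sketch of the same pattern, assuming the pre-9.0 org.apache.lucene.util.fst.Builder API used throughout lucene-solr; the keys and values are illustrative:

static void enumerateTinyFst() throws IOException {
  // build a tiny FST mapping sorted UTF-8 keys to long ordinals
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  String[] keys = { "cat", "dog", "dogs" }; // must be added in sorted order
  for (int i = 0; i < keys.length; i++) {
    builder.add(Util.toIntsRef(new BytesRef(keys[i]), scratchInts), (long) i);
  }
  FST<Long> fst = builder.finish();
  // walk every entry in sorted order, exactly as testEnumerateAll does
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  IntsRefFSTEnum.InputOutput<Long> mapping;
  while ((mapping = fstEnum.next()) != null) {
    IntsRef input = mapping.input; // the key, one int per input byte
    Long output = mapping.output;  // the ordinal stored for that key
  }
}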
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
In class TestGraphTokenizers, method toPathStrings:
/** Returns all paths */
private Set<String> toPathStrings(Automaton a) {
  BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
  Set<String> paths = new HashSet<>();
  for (IntsRef ir : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
  }
  return paths;
}
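A hypothetical call site, assuming a token stream whose graph carries a stacked synonym; the helper returns one string per accepted path, with POS_SEP already replaced by a space:

Automaton a = new TokenStreamToAutomaton().toAutomaton(tokenStream);
Set<String> paths = toPathStrings(a);
// e.g. with a synonym wifi -> hotspot over "fast wifi", paths would
// contain both "fast wifi" and "fast hotspot"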
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
In class DocValuesOrdinalsReader, method getReader:
@Override
public OrdinalsSegmentReader getReader(LeafReaderContext context) throws IOException {
  BinaryDocValues values0 = context.reader().getBinaryDocValues(field);
  if (values0 == null) {
    values0 = DocValues.emptyBinary();
  }
  final BinaryDocValues values = values0;
  return new OrdinalsSegmentReader() {
    private int lastDocID;

    @Override
    public void get(int docID, IntsRef ordinals) throws IOException {
      if (docID < lastDocID) {
        throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " vs docID=" + docID);
      }
      lastDocID = docID;
      if (docID > values.docID()) {
        values.advance(docID);
      }
      final BytesRef bytes;
      if (values.docID() == docID) {
        bytes = values.binaryValue();
      } else {
        bytes = new BytesRef(BytesRef.EMPTY_BYTES);
      }
      decode(bytes, ordinals);
    }
  };
}
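A sketch of how a caller might drive the returned per-segment reader. The field name and the sequential docID loop below are illustrative, and docIDs must be visited in non-decreasing order, per the AssertionError above:

OrdinalsReader ordsReader = new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
OrdinalsReader.OrdinalsSegmentReader segReader = ordsReader.getReader(leafContext);
IntsRef scratch = new IntsRef();
for (int docID = 0; docID < leafContext.reader().maxDoc(); docID++) {
  segReader.get(docID, scratch); // decodes this doc's taxonomy ordinals into scratch
  for (int i = 0; i < scratch.length; i++) {
    int ordinal = scratch.ints[scratch.offset + i];
  }
}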
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
In class TaxonomyFacetSumValueSource, method sumValues:
private void sumValues(List<MatchingDocs> matchingDocs, boolean keepScores, DoubleValuesSource valueSource) throws IOException {
  IntsRef scratch = new IntsRef();
  for (MatchingDocs hits : matchingDocs) {
    OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
    DoubleValues scores = keepScores ? scores(hits) : null;
    DoubleValues functionValues = valueSource.getValues(hits.context, scores);
    DocIdSetIterator docs = hits.bits.iterator();
    int doc;
    while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      ords.get(doc, scratch);
      if (functionValues.advanceExact(doc)) {
        float value = (float) functionValues.doubleValue();
        for (int i = 0; i < scratch.length; i++) {
          values[scratch.ints[scratch.offset + i]] += value;
        }
      }
    }
  }
  rollup();
}
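This aggregation runs when the facets object is constructed. For context, a hedged sketch of typical end-to-end usage, assuming an index with a taxonomy sidecar and a numeric "price" field (the query, field, and dimension names are illustrative):

FacetsCollector fc = new FacetsCollector();
FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
Facets facets = new TaxonomyFacetSumValueSource(
    taxoReader, config, fc, DoubleValuesSource.fromLongField("price"));
FacetResult result = facets.getTopChildren(10, "Category"); // sums "price" per category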
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
In class BaseTokenStreamTestCase, method getGraphStrings:
/** Returns all paths accepted by the token stream graph produced by the already initialized {@link TokenStream}. */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
  Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
  Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
  BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
  Set<String> paths = new HashSet<>();
  for (IntsRef ir : actualStringPaths) {
    paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
  }
  return paths;
}
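A hypothetical test usage, assuming an analyzer whose SynonymGraphFilter expands wifi to hotspot (the analyzer, field name, and synonym are illustrative):

try (TokenStream ts = analyzer.tokenStream("body", "fast wifi")) {
  Set<String> paths = getGraphStrings(ts);
  // paths would be {"fast wifi", "fast hotspot"}: one string per path
  // through the token graph, positions joined by ' '
}

BaseTokenStreamTestCase also provides assertGraphStrings helpers that compare these paths against expected strings directly.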