Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
From class TestFSTsMisc, method testRandomWords.
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
  Random random = new Random(random().nextLong());
  for (int iter = 0; iter < numIter; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter " + iter);
    }
    for (int inputMode = 0; inputMode < 2; inputMode++) {
      final int numWords = random.nextInt(maxNumWords + 1);
      Set<IntsRef> termsSet = new HashSet<>();
      while (termsSet.size() < numWords) {
        final String term = getRandomString(random);
        termsSet.add(toIntsRef(term, inputMode));
      }
      doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
    }
  }
}
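The test relies on getRandomString and a toIntsRef(String, int) helper that this listing does not include. Below is a plausible sketch of toIntsRef using Lucene's own conversion utilities (the helper actually defined in the test classes may differ): inputMode 0 labels FST arcs with UTF-8 bytes, while inputMode 1 labels them with UTF-32 code points.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Util;

static IntsRef toIntsRef(String s, int inputMode) {
  IntsRefBuilder scratch = new IntsRefBuilder();
  if (inputMode == 0) {
    // one FST label per UTF-8 byte
    return Util.toIntsRef(new BytesRef(s), scratch);
  } else {
    // one FST label per Unicode code point
    return Util.toUTF32(s, scratch);
  }
}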
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
From class TestTokenInfoDictionary, method testEnumerateAll.
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  InputOutput<Long> mapping;
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    IntsRef input = mapping.input;
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char) input.ints[input.offset + i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order; terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset + i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;
      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }
      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }
      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);
      matrix.get(rightId, leftId);
      tid.getWordCost(wordId);
      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));
      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));
      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}
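The enumeration pattern above generalizes to any FST. Here is a minimal, self-contained sketch (hypothetical data, not from the Kuromoji dictionary) that builds a small FST mapping sorted terms to longs and then walks every input/output pair with IntsRefFSTEnum:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class IntsRefFSTEnumDemo {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    String[] terms = {"cat", "dog", "dogs"}; // inputs must be added in sorted order
    for (int i = 0; i < terms.length; i++) {
      builder.add(Util.toIntsRef(new BytesRef(terms[i]), scratch), (long) i);
    }
    FST<Long> fst = builder.finish();
    IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
    IntsRefFSTEnum.InputOutput<Long> pair;
    while ((pair = fstEnum.next()) != null) {
      // convert the IntsRef input (one int per byte) back to a String
      String term = Util.toBytesRef(pair.input, new BytesRefBuilder()).utf8ToString();
      System.out.println(term + " -> " + pair.output);
    }
  }
}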
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
From class TestGraphTokenizers, method toPathStrings.
/** Returns all paths */
private Set<String> toPathStrings(Automaton a) {
  BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
  Set<String> paths = new HashSet<>();
  for (IntsRef ir : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
  }
  return paths;
}
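Hypothetical usage (the analyzer, field name, and text are illustrative, not taken from the test): pull an Automaton off a TokenStream with TokenStreamToAutomaton, then collect every accepted path. Because toPathStrings replaces POS_SEP with a space, each path reads like a phrase, one token per position.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.automaton.Automaton;

void printAllPaths(String text) throws IOException {
  Analyzer analyzer = new StandardAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("field", text)) {
    // toAutomaton resets and consumes the stream itself
    Automaton a = new TokenStreamToAutomaton().toAutomaton(ts);
    for (String path : toPathStrings(a)) {
      System.out.println(path);
    }
  }
}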
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
From class TestFSTs, method main.
// TODO: try experiment: reverse terms before
// compressing -- how much smaller?
// TODO: can FST be used to index all internal substrings,
// mapping to term?
// java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
  int prune = 0;
  int limit = Integer.MAX_VALUE;
  // utf8
  int inputMode = 0;
  boolean storeOrds = false;
  boolean storeDocFreqs = false;
  boolean verify = true;
  boolean noArcArrays = false;
  Path wordsFileIn = null;
  Path dirOut = null;
  int idx = 0;
  while (idx < args.length) {
    if (args[idx].equals("-prune")) {
      prune = Integer.parseInt(args[1 + idx]);
      idx++;
    } else if (args[idx].equals("-limit")) {
      limit = Integer.parseInt(args[1 + idx]);
      idx++;
    } else if (args[idx].equals("-utf8")) {
      inputMode = 0;
    } else if (args[idx].equals("-utf32")) {
      inputMode = 1;
    } else if (args[idx].equals("-docFreq")) {
      storeDocFreqs = true;
    } else if (args[idx].equals("-noArcArrays")) {
      noArcArrays = true;
    } else if (args[idx].equals("-ords")) {
      storeOrds = true;
    } else if (args[idx].equals("-noverify")) {
      verify = false;
    } else if (args[idx].startsWith("-")) {
      System.err.println("Unrecognized option: " + args[idx]);
      System.exit(-1);
    } else {
      if (wordsFileIn == null) {
        wordsFileIn = Paths.get(args[idx]);
      } else if (dirOut == null) {
        dirOut = Paths.get(args[idx]);
      } else {
        System.err.println("Too many arguments, expected: input [output]");
        System.exit(-1);
      }
    }
    idx++;
  }
  if (wordsFileIn == null) {
    System.err.println("No input file.");
    System.exit(-1);
  }
  if (storeOrds && storeDocFreqs) {
    // Store both ord & docFreq:
    final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
    final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
    final PairOutputs<Long, Long> outputs = new PairOutputs<>(o1, o2);
    new VisitTerms<PairOutputs.Pair<Long, Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
      Random rand;

      @Override
      public PairOutputs.Pair<Long, Long> getOutput(IntsRef input, int ord) {
        if (ord == 0) {
          rand = new Random(17);
        }
        return outputs.newPair((long) ord, (long) TestUtil.nextInt(rand, 1, 5000));
      }
    }.run(limit, verify, false);
  } else if (storeOrds) {
    // Store only ords
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
      @Override
      public Long getOutput(IntsRef input, int ord) {
        return (long) ord;
      }
    }.run(limit, verify, true);
  } else if (storeDocFreqs) {
    // Store only docFreq
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
      Random rand;

      @Override
      public Long getOutput(IntsRef input, int ord) {
        if (ord == 0) {
          rand = new Random(17);
        }
        return (long) TestUtil.nextInt(rand, 1, 5000);
      }
    }.run(limit, verify, false);
  } else {
    // Store nothing
    final NoOutputs outputs = NoOutputs.getSingleton();
    final Object NO_OUTPUT = outputs.getNoOutput();
    new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
      @Override
      public Object getOutput(IntsRef input, int ord) {
        return NO_OUTPUT;
      }
    }.run(limit, verify, false);
  }
}
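A hedged sketch (not part of TestFSTs) of how the pair-output FST built by the storeOrds && storeDocFreqs branch can be read back: Util.get returns the output mapped to an exact input, here a PairOutputs.Pair holding the ord in output1 and the synthetic docFreq in output2, or null when the input is not accepted.

import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.Util;

static void printOrdAndDocFreq(FST<PairOutputs.Pair<Long, Long>> fst, IntsRef term) throws IOException {
  PairOutputs.Pair<Long, Long> pair = Util.get(fst, term);
  if (pair == null) {
    System.out.println("term not in FST");
  } else {
    System.out.println("ord=" + pair.output1 + " docFreq=" + pair.output2);
  }
}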
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
From class Test2BFST, method test.
public void test() throws Exception {
  assumeWorkingMMapOnWindows();
  int[] ints = new int[7];
  IntsRef input = new IntsRef(ints, 0, ints.length);
  long seed = random().nextLong();
  Directory dir = new MMapDirectory(createTempDir("2BFST"));
  for (int iter = 0; iter < 1; iter++) {
    // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
    {
      System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
      Outputs<Object> outputs = NoOutputs.getSingleton();
      Object NO_OUTPUT = outputs.getNoOutput();
      final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
      int count = 0;
      Random r = new Random(seed);
      int[] ints2 = new int[200];
      IntsRef input2 = new IntsRef(ints2, 0, ints2.length);
      while (true) {
        //System.out.println("add: " + input + " -> " + output);
        for (int i = 10; i < ints2.length; i++) {
          ints2[i] = r.nextInt(256);
        }
        b.add(input2, NO_OUTPUT);
        count++;
        if (count % 100000 == 0) {
          System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
        }
        if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
          break;
        }
        nextInput(r, ints2);
      }
      FST<Object> fst = b.finish();
      for (int verify = 0; verify < 2; verify++) {
        System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
        Arrays.fill(ints2, 0);
        r = new Random(seed);
        for (int i = 0; i < count; i++) {
          if (i % 1000000 == 0) {
            System.out.println(i + "...: ");
          }
          for (int j = 10; j < ints2.length; j++) {
            ints2[j] = r.nextInt(256);
          }
          assertEquals(NO_OUTPUT, Util.get(fst, input2));
          nextInput(r, ints2);
        }
        System.out.println("\nTEST: enum all input/outputs");
        IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<>(fst);
        Arrays.fill(ints2, 0);
        r = new Random(seed);
        int upto = 0;
        while (true) {
          IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
          if (pair == null) {
            break;
          }
          for (int j = 10; j < ints2.length; j++) {
            ints2[j] = r.nextInt(256);
          }
          assertEquals(input2, pair.input);
          assertEquals(NO_OUTPUT, pair.output);
          upto++;
          nextInput(r, ints2);
        }
        assertEquals(count, upto);
        if (verify == 0) {
          System.out.println("\nTEST: save/load FST and re-verify");
          IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
          fst.save(out);
          out.close();
          IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
          fst = new FST<>(in, outputs);
          in.close();
        } else {
          dir.deleteFile("fst");
        }
      }
    }
    // Build FST w/ ByteSequenceOutputs and stop when FST
    // size = 3GB
    {
      System.out.println("\nTEST: 3 GB size; outputs=bytes");
      Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
      final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
      byte[] outputBytes = new byte[20];
      BytesRef output = new BytesRef(outputBytes);
      Arrays.fill(ints, 0);
      int count = 0;
      Random r = new Random(seed);
      while (true) {
        r.nextBytes(outputBytes);
        //System.out.println("add: " + input + " -> " + output);
        b.add(input, BytesRef.deepCopyOf(output));
        count++;
        if (count % 1000000 == 0) {
          System.out.println(count + "...: " + b.fstRamBytesUsed() + " bytes");
        }
        if (b.fstRamBytesUsed() > LIMIT) {
          break;
        }
        nextInput(r, ints);
      }
      FST<BytesRef> fst = b.finish();
      for (int verify = 0; verify < 2; verify++) {
        System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
        r = new Random(seed);
        Arrays.fill(ints, 0);
        for (int i = 0; i < count; i++) {
          if (i % 1000000 == 0) {
            System.out.println(i + "...: ");
          }
          r.nextBytes(outputBytes);
          assertEquals(output, Util.get(fst, input));
          nextInput(r, ints);
        }
        System.out.println("\nTEST: enum all input/outputs");
        IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<>(fst);
        Arrays.fill(ints, 0);
        r = new Random(seed);
        int upto = 0;
        while (true) {
          IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
          if (pair == null) {
            break;
          }
          assertEquals(input, pair.input);
          r.nextBytes(outputBytes);
          assertEquals(output, pair.output);
          upto++;
          nextInput(r, ints);
        }
        assertEquals(count, upto);
        if (verify == 0) {
          System.out.println("\nTEST: save/load FST and re-verify");
          IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
          fst.save(out);
          out.close();
          IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
          fst = new FST<>(in, outputs);
          in.close();
        } else {
          dir.deleteFile("fst");
        }
      }
    }
    // Build FST w/ PositiveIntOutputs and stop when FST
    // size = 3GB
    {
      System.out.println("\nTEST: 3 GB size; outputs=long");
      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
      final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
      long output = 1;
      Arrays.fill(ints, 0);
      int count = 0;
      Random r = new Random(seed);
      while (true) {
        //System.out.println("add: " + input + " -> " + output);
        b.add(input, output);
        output += 1 + r.nextInt(10);
        count++;
        if (count % 1000000 == 0) {
          System.out.println(count + "...: " + b.fstRamBytesUsed() + " bytes");
        }
        if (b.fstRamBytesUsed() > LIMIT) {
          break;
        }
        nextInput(r, ints);
      }
      FST<Long> fst = b.finish();
      for (int verify = 0; verify < 2; verify++) {
        System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
        Arrays.fill(ints, 0);
        output = 1;
        r = new Random(seed);
        for (int i = 0; i < count; i++) {
          if (i % 1000000 == 0) {
            System.out.println(i + "...: ");
          }
          // forward lookup:
          assertEquals(output, Util.get(fst, input).longValue());
          // reverse lookup:
          assertEquals(input, Util.getByOutput(fst, output));
          output += 1 + r.nextInt(10);
          nextInput(r, ints);
        }
        System.out.println("\nTEST: enum all input/outputs");
        IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
        Arrays.fill(ints, 0);
        r = new Random(seed);
        int upto = 0;
        output = 1;
        while (true) {
          IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
          if (pair == null) {
            break;
          }
          assertEquals(input, pair.input);
          assertEquals(output, pair.output.longValue());
          output += 1 + r.nextInt(10);
          upto++;
          nextInput(r, ints);
        }
        assertEquals(count, upto);
        if (verify == 0) {
          System.out.println("\nTEST: save/load FST and re-verify");
          IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
          fst.save(out);
          out.close();
          IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
          fst = new FST<>(in, outputs);
          in.close();
        } else {
          dir.deleteFile("fst");
        }
      }
    }
  }
  dir.close();
}
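All three passes above advance the input through a nextInput(Random, int[]) helper that this listing omits. Here is a sketch consistent with how it is used (an assumption, not necessarily the test's exact code): treat the first seven ints as a base-256 counter and bump it by a random step, so that re-seeding the Random with the same seed replays the identical input sequence during each verify pass.

import java.util.Random;

private static void nextInput(Random r, int[] ints) {
  int downTo = 6;
  while (downTo >= 0) {
    // add a random step in [1..255]; on overflow keep the low byte and carry left
    int v = ints[downTo] + 1 + r.nextInt(255);
    if (v <= 255) {
      ints[downTo] = v;
      break;
    } else {
      ints[downTo] = v & 255;
      downTo--;
    }
  }
}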