Search in sources :

Example 11 with WikipediaTitles

use of org.trie4j.test.WikipediaTitles in project trie4j by takawitter.

the class CreateTail method main.

/**
 * Inserts every Wikipedia title from the bundled dump into a {@link TailPatriciaTrie}, builds a
 * {@link TailLOUDSTrie} from it using a {@link ConcatTailArrayBuilder}, and then opens (creating
 * or truncating) the tail output file.
 *
 * <p>The code that would write the tail characters is disabled, so nothing is written to the
 * file; it is only opened and closed.
 *
 * @param args unused
 * @throws Exception if reading the titles or opening/closing the output file fails
 */
public static void main(String[] args) throws Exception {
    TailPatriciaTrie patricia = new TailPatriciaTrie();
    WikipediaTitles titles = new WikipediaTitles("data/jawiki-20120220-all-titles-in-ns0.gz");
    for (String title : titles) {
        patricia.insert(title);
    }

    ConcatTailArrayBuilder tailBuilder = new ConcatTailArrayBuilder(patricia.size());
    // The resulting trie is discarded; presumably the constructor is invoked only to
    // populate tailBuilder -- TODO confirm against TailLOUDSTrie.
    new TailLOUDSTrie(patricia, tailBuilder);

    OutputStream tailOut = new FileOutputStream("data/jawiki-20120220-tail");
    try {
        // Disabled: dump of the concatenated tails as UTF-16 bytes.
        //   CharSequence seq = tailBuilder.build().getTails();
        //   byte[] bytes = seq.toString().getBytes("UTF16");
        //   System.out.println(seq.length() + "chars.");
        //   System.out.println(bytes.length + "bytes.");
        //   tailOut.write(bytes);
    } finally {
        tailOut.close();
    }
}
Also used : TailPatriciaTrie(org.trie4j.patricia.TailPatriciaTrie) TailLOUDSTrie(org.trie4j.louds.TailLOUDSTrie) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FileOutputStream(java.io.FileOutputStream) WikipediaTitles(org.trie4j.test.WikipediaTitles)

Example 12 with WikipediaTitles

use of org.trie4j.test.WikipediaTitles in project trie4j by takawitter.

the class AbstractMapTrieWikipediaTest method test.

/**
 * Builds a map trie from the Wikipedia titles (each title mapped to its zero-based insertion
 * index), derives a second trie from it, and checks that the second trie returns the expected
 * index for every title. Timings for each phase are printed to standard output.
 *
 * <p>A verification mismatch is reported on standard output and stops the verification loop;
 * it does not throw.
 *
 * @throws Exception if inserting a word fails (the trie is dumped first) or reading titles fails
 */
@Test
public void test() throws Exception {
    MapTrie<Integer> trie = createFirstTrie();
    System.out.println("building first trie: " + trie.getClass().getName());
    int c = 0, chars = 0;
    long b = 0;
    LapTimer t = new LapTimer();
    for (String word : new WikipediaTitles(wikipediaFilename)) {
        try {
            // Time only the insert call itself, not the iteration over the titles.
            t.reset();
            trie.insert(word, c);
            b += t.lapNanos();
        } catch (Exception e) {
            System.out.println("exception at " + c + "th word: " + word);
            trie.dump(new PrintWriter(System.out));
            throw e;
        }
        c++;
        chars += word.length();
    }
    System.out.println(String.format("done in %d millis with %d words and %d chars.", (b / 1000000), c, chars));
    // NOTE(review): the second trie is built here and again (timed) inside the try block
    // below when the concrete class declares buildSecondTrie(Trie). The first build looks
    // redundant in that case, but it is kept because it is the only build performed when
    // the reflective lookup fails -- confirm the intent before removing either call.
    MapTrie<Integer> second = buildSecondTrie(trie);
    try {
        getClass().getDeclaredMethod("buildSecondTrie", Trie.class);
        System.out.print("building second trie: ");
        t.reset();
        second = buildSecondTrie(trie);
        System.out.println("done in " + t.lapMillis() + "millis.");
    } catch (NoSuchMethodException ignored) {
        // The concrete test class does not declare buildSecondTrie(Trie); skip the timed
        // rebuild and verify the trie built above.
    }
    System.out.println("verifying trie.");
    long sum = 0;
    c = 0;
    for (String word : new WikipediaTitles(wikipediaFilename)) {
        t.reset();
        // Keep the boxed value: unboxing a null result (word missing from the trie) would
        // throw NullPointerException instead of reaching the failure report below.
        Integer actual = second.get(word);
        boolean found = actual != null && actual == c;
        sum += t.lapNanos();
        c++;
        if (!found) {
            // c is now the 1-based ordinal of the word; the id it was inserted with is c - 1.
            System.out.println(String.format("verification failed.  trie not contains %d th word: [%s]" + " with id: [%d] actual: [%d].", c, word, c - 1, actual));
            break;
        }
    }
    System.out.println("done in " + (sum / 1000000) + " millis with " + c + " words.");
    afterVerification(second);
}
Also used : WikipediaTitles(org.trie4j.test.WikipediaTitles) LapTimer(org.trie4j.test.LapTimer) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 13 with WikipediaTitles

use of org.trie4j.test.WikipediaTitles in project trie4j by takawitter.

the class SaveLOUDSTrie method main.

/**
 * Builds a frozen {@link TailLOUDSTrie} from the Wikipedia titles and writes the trie and
 * several of its components to separate files in the working directory:
 * {@code louds.dat}, {@code louds-bv.dat}, {@code louds-labels.dat}, {@code louds-tails.dat},
 * {@code louds-tailIndex.dat} and {@code louds-term.dat}.
 *
 * @param args unused
 * @throws Exception if reading the titles or writing any of the files fails
 */
public static void main(String[] args) throws Exception {
    TailPatriciaTrie patricia = new TailPatriciaTrie();
    for (String title : new WikipediaTitles("data/jawiki-20120220-all-titles-in-ns0.gz")) {
        patricia.insert(title);
    }
    System.out.println(patricia.size() + "nodes.");

    SBVConcatTailArrayBuilder tailBuilder = new SBVConcatTailArrayBuilder(patricia.size());
    TailLOUDSTrie louds = new TailLOUDSTrie(patricia, tailBuilder);
    System.out.println(louds.size() + "nodes.");
    louds.freeze();

    // Whole trie, via its own externalization routine.
    OutputStream trieFile = new FileOutputStream("louds.dat");
    try {
        ObjectOutputStream out = new ObjectOutputStream(trieFile);
        louds.writeExternal(out);
        out.flush();
    } finally {
        trieFile.close();
    }

    // Bit-vector tree, Java-serialized.
    OutputStream bvFile = new FileOutputStream("louds-bv.dat");
    try {
        ObjectOutputStream out = new ObjectOutputStream(bvFile);
        out.writeObject(louds.getBvTree());
        out.flush();
    } finally {
        bvFile.close();
    }

    // Labels, written one char at a time (two bytes each via DataOutputStream.writeChar).
    OutputStream labelsFile = new FileOutputStream("louds-labels.dat");
    try {
        DataOutputStream out = new DataOutputStream(labelsFile);
        for (char label : louds.getLabels()) {
            out.writeChar(label);
        }
        out.flush();
    } finally {
        labelsFile.close();
    }

    // NOTE(review): the next two files both receive the same tailBuilder object, so their
    // contents are identical -- confirm whether one of them was meant to hold something else.
    OutputStream tailsFile = new FileOutputStream("louds-tails.dat");
    try {
        ObjectOutputStream out = new ObjectOutputStream(tailsFile);
        out.writeObject(tailBuilder);
        out.flush();
    } finally {
        tailsFile.close();
    }

    OutputStream tailIndexFile = new FileOutputStream("louds-tailIndex.dat");
    try {
        ObjectOutputStream out = new ObjectOutputStream(tailIndexFile);
        out.writeObject(tailBuilder);
        out.flush();
    } finally {
        tailIndexFile.close();
    }

    // Result of getTerm(), Java-serialized.
    OutputStream termFile = new FileOutputStream("louds-term.dat");
    try {
        ObjectOutputStream out = new ObjectOutputStream(termFile);
        out.writeObject(louds.getTerm());
        out.flush();
    } finally {
        termFile.close();
    }
}
Also used : TailPatriciaTrie(org.trie4j.patricia.TailPatriciaTrie) DataOutputStream(java.io.DataOutputStream) OutputStream(java.io.OutputStream) DataOutputStream(java.io.DataOutputStream) FileOutputStream(java.io.FileOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) FileOutputStream(java.io.FileOutputStream) SBVConcatTailArrayBuilder(org.trie4j.tail.SBVConcatTailArrayBuilder) WikipediaTitles(org.trie4j.test.WikipediaTitles) ObjectOutputStream(java.io.ObjectOutputStream)

Example 14 with WikipediaTitles

use of org.trie4j.test.WikipediaTitles in project trie4j by takawitter.

the class TestIO method verify.

/**
 * Checks that the given trie contains the Wikipedia titles, stopping after {@code maxCount}
 * words or at the first missing word, and prints the total elapsed time and the time spent
 * inside {@code contains} calls.
 *
 * <p>A missing word is reported on standard output and ends the loop; it does not throw.
 *
 * @param da the trie to verify
 * @throws Exception if reading the titles fails
 */
private static void verify(Trie da) throws Exception {
    System.out.println("verifying double array...");
    int c = 0;
    // Accumulate in nanoseconds: a single lookup normally takes far less than a
    // millisecond, so summing per-call millisecond laps would add up mostly zeros.
    long sumNanos = 0;
    LapTimer t1 = new LapTimer();
    LapTimer t = new LapTimer();
    for (String word : new WikipediaTitles()) {
        if (c == maxCount)
            break;
        t.reset();
        boolean found = da.contains(word);
        sumNanos += t.lapNanos();
        c++;
        if (!found) {
            System.out.println("verification failed.  trie not contains " + c + " th word: [" + word + "]");
            break;
        }
    }
    System.out.println("done " + c + "words in " + t1.lapMillis() + " millis.");
    System.out.println("contains time: " + (sumNanos / 1000000) + " millis.");
}
Also used : WikipediaTitles(org.trie4j.test.WikipediaTitles) LapTimer(org.trie4j.test.LapTimer)

Example 15 with WikipediaTitles

use of org.trie4j.test.WikipediaTitles in project trie4j by takawitter.

the class AbstractWikipediaSerializeTest method test.

/**
 * Round-trips a trie through Java serialization in memory and prints the serialized size
 * together with the write, read and verification timings.
 *
 * <p>The trie is produced by {@code firstTrie()} filled with the Wikipedia titles and then
 * converted by {@code secondTrie(...)}; the deserialized copy is checked with
 * {@code assertAllContains}.
 *
 * @throws Exception if building, serializing, deserializing or verifying the trie fails
 */
@Test
public void test() throws Exception {
    WikipediaTitles titles = new WikipediaTitles();
    Trie source = secondTrie(titles.insertTo(firstTrie()));

    // Serialize into memory; the timer is created after the stream so that stream setup
    // is not included in the write measurement.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    ObjectOutputStream objectOut = new ObjectOutputStream(buffer);
    LapTimer timer = new LapTimer();
    objectOut.writeObject(source);
    objectOut.flush();
    long writeMillis = timer.lapMillis();
    byte[] payload = buffer.toByteArray();

    // Deserialize; stream construction is part of the measured read time.
    timer.reset();
    ObjectInputStream objectIn = new ObjectInputStream(new ByteArrayInputStream(payload));
    Trie restored = (Trie) objectIn.readObject();
    long readMillis = timer.lapMillis();

    long verifyMillis = titles.assertAllContains(restored);
    String report = String.format("%s%s%s, size: %d, write(ms): %d, read(ms): %d, verify(ms): %d.", source.getClass().getSimpleName(), getBvTreeClassName(source), getTailClassName(source), payload.length, writeMillis, readMillis, verifyMillis);
    System.out.println(report);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) WikipediaTitles(org.trie4j.test.WikipediaTitles) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) TailPatriciaTrie(org.trie4j.patricia.TailPatriciaTrie) TailLOUDSTrie(org.trie4j.louds.TailLOUDSTrie) LapTimer(org.trie4j.test.LapTimer) ObjectInputStream(java.io.ObjectInputStream) Test(org.junit.Test)

Aggregations

WikipediaTitles (org.trie4j.test.WikipediaTitles)17 LapTimer (org.trie4j.test.LapTimer)12 Test (org.junit.Test)8 TailPatriciaTrie (org.trie4j.patricia.TailPatriciaTrie)7 PrintWriter (java.io.PrintWriter)5 Trie (org.trie4j.Trie)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 ObjectOutputStream (java.io.ObjectOutputStream)4 TailLOUDSTrie (org.trie4j.louds.TailLOUDSTrie)4 FileOutputStream (java.io.FileOutputStream)3 ObjectInputStream (java.io.ObjectInputStream)3 OutputStream (java.io.OutputStream)3 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 PatriciaTrie (org.trie4j.patricia.PatriciaTrie)3 Node (org.trie4j.Node)2 NodeVisitor (org.trie4j.NodeVisitor)2 BytesSuccinctBitVector (org.trie4j.bv.BytesSuccinctBitVector)2 ConcatTailBuilder (org.trie4j.tail.builder.ConcatTailBuilder)2 DataOutputStream (java.io.DataOutputStream)1