Search in sources :

Example 1 with PatriciaTrie

use of org.trie4j.bytes.PatriciaTrie in project trie4j by takawitter.

From the class TestWikipedia, method investigate.

/**
 * Prints statistics about {@code trie} and then verifies it against the
 * Wikipedia title dump.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>visit every node and count the terminal ones;</li>
 *   <li>visit again, counting terminal vs. non-terminal nodes and summing the
 *       length of each node's letters;</li>
 *   <li>re-read the gzip-compressed title file line by line and check that
 *       {@code trie.contains(word)} holds for each line, stopping at the first
 *       miss or once {@code maxCount} lines have been checked;</li>
 *   <li>start a background thread that sleeps and then queries the trie once
 *       more.</li>
 * </ol>
 *
 * @param trie      the trie to inspect and verify
 * @param charCount the character total computed by the caller; only printed
 *                  here for comparison with the total found in the trie
 * @throws Exception if the title file cannot be opened or read
 */
private static void investigate(PatriciaTrie trie, int charCount) throws Exception {
    System.out.println("-- count elements.");
    final AtomicInteger count = new AtomicInteger();
    trie.visit(new TrieVisitor() {

        public void accept(Node node, int nest) {
            if (node.isTerminate()) {
                count.incrementAndGet();
            }
        }
    });
    System.out.println(count.intValue() + " elements.");
    //*
    System.out.println("-- list elements.");
    final AtomicInteger n = new AtomicInteger();
    final AtomicInteger l = new AtomicInteger();
    // Never incremented by the visitor below, so "label node" always prints 0.
    final AtomicInteger ln = new AtomicInteger();
    final AtomicInteger chars = new AtomicInteger();
    trie.visit(new TrieVisitor() {

        public void accept(Node node, int nest) {
            if (node.isTerminate()) {
                l.incrementAndGet();
            } else {
                n.incrementAndGet();
            }
            chars.addAndGet(node.getLetters().length);
        }
    });
    System.out.println("node: " + n.intValue());
    System.out.println("leaf: " + l.intValue());
    System.out.println("label node: " + ln.intValue());
    System.out.println("total char count: " + charCount);
    System.out.println("total char count in trie: " + chars.intValue());
    System.out.println("verifying trie...");
    BufferedReader r = new BufferedReader(new InputStreamReader(
            // new GZIPInputStream(new FileInputStream("jawiki-20120220-all-titles-in-ns0.gz"))
            new GZIPInputStream(new FileInputStream("enwiki-20120403-all-titles-in-ns0.gz")), CharsetUtil.newUTF8Decoder()));
    long lap = System.currentTimeMillis();
    int c = 0;
    // long, because it accumulates differences of currentTimeMillis() values.
    long sum = 0;
    try {
        String word;
        while ((word = r.readLine()) != null) {
            if (c == maxCount) {
                break;
            }
            // Time only the lookup itself, not the file reading.
            long d = System.currentTimeMillis();
            boolean found = trie.contains(word);
            sum += System.currentTimeMillis() - d;
            if (!found) {
                System.out.println("trie not contains [" + word + "]");
                break;
            }
            if (c % 100000 == 0) {
                System.out.println(c + " elements done.");
            }
            c++;
        }
    } finally {
        // Release the file handle even when reading or the lookup fails.
        r.close();
    }
    System.out.println("done in " + (System.currentTimeMillis() - lap) + " millis.");
    System.out.println("contains time: " + sum + " millis.");
    System.out.println(trie.getRoot().getChildren().length + "children in root");
    final PatriciaTrie t = trie;
    // NOTE(review): presumably this delayed lookup exists to keep the trie
    // reachable for a while after the method returns - confirm intent.
    new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                Thread.sleep(100000);
                t.contains("hello");
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of silently dropping it.
                Thread.currentThread().interrupt();
            }
        }
    }).start();
//*/
}
Also used : InputStreamReader(java.io.InputStreamReader) TrieVisitor(org.trie4j.bytes.TrieVisitor) Node(org.trie4j.bytes.Node) PatriciaTrie(org.trie4j.bytes.PatriciaTrie) FileInputStream(java.io.FileInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BufferedReader(java.io.BufferedReader)

Example 2 with PatriciaTrie

use of org.trie4j.bytes.PatriciaTrie in project trie4j by takawitter.

From the class Test, method main.

/**
 * Entry point: announces the Patricia trie run and hands a newly created
 * byte-based {@link PatriciaTrie} to {@code go}.
 *
 * @param args command-line arguments (not used)
 * @throws Exception whatever {@code go} throws
 */
public static void main(String[] args) throws Exception {
    System.out.println("--- patricia trie ---");
    PatriciaTrie patricia = new PatriciaTrie();
    go(patricia);
    // Disabled alternative run against the hash-set based trie:
//		System.out.println("--- hash trie ---");
//		go(new HashSetTrie());
}
Also used : PatriciaTrie(org.trie4j.bytes.PatriciaTrie)

Example 3 with PatriciaTrie

use of org.trie4j.bytes.PatriciaTrie in project trie4j by takawitter.

From the class TestWikipedia, method main.

/**
 * Loads Wikipedia titles from a gzip-compressed dump into a byte-based
 * {@link PatriciaTrie}, printing memory and timing figures along the way, and
 * then passes the filled trie to {@code investigate}.
 *
 * <p>Every 100000 entries a CSV-style line is printed:
 * {@code entries,freeMemory,maxMemory,millisSinceLastReport}. Reading stops at
 * end of file or once {@code maxCount} entries have been inserted.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the dump cannot be opened or read, or if
 *                   {@code investigate} fails
 */
public static void main(String[] args) throws Exception {
    System.out.println("--- recursive patricia trie ---");
    PatriciaTrie trie = new org.trie4j.bytes.PatriciaTrie();
    int c = 0;
    long sum = 0;
    int charCount = 0;
    // You can download archive from http://dumps.wikimedia.org/jawiki/latest/
    BufferedReader r = new BufferedReader(new InputStreamReader(
            // new GZIPInputStream(new FileInputStream("jawiki-20120220-all-titles-in-ns0.gz"))
            new GZIPInputStream(new FileInputStream("enwiki-20120403-all-titles-in-ns0.gz")), CharsetUtil.newUTF8Decoder()));
    try {
        String word = null;
        // Settle the heap before taking the baseline free-memory reading.
        System.gc();
        Thread.sleep(1000);
        System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
        long lap = System.currentTimeMillis();
        while ((word = r.readLine()) != null) {
            byte[] bytes = word.getBytes("UTF-8");
            // Time only the insert call, not reading or encoding.
            long d = System.currentTimeMillis();
            trie.insert(bytes);
            sum += System.currentTimeMillis() - d;
            charCount += word.length();
            if (c % 100000 == 0) {
                d = System.currentTimeMillis() - lap;
                long free = Runtime.getRuntime().freeMemory();
                System.out.println(c + "," + free + "," + Runtime.getRuntime().maxMemory() + "," + d);
                lap = System.currentTimeMillis();
            }
            c++;
            if (c == maxCount) {
                break;
            }
        }
    } finally {
        // Release the file handle before the investigation phase starts,
        // and also when reading or inserting fails.
        r.close();
    }
    System.out.println(c + "entries in ja wikipedia titles.");
    System.out.println("insert time: " + sum + " millis.");
    System.out.println("-- insert done.");
    System.gc();
    Thread.sleep(1000);
    System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
    investigate(trie, charCount);
/*
//		dump(trie);
		System.out.println("-- pack");
		lap = System.currentTimeMillis();
		if(trie instanceof MultilayerPatriciaTrie){
			MultilayerPatriciaTrie mt = (MultilayerPatriciaTrie)trie;
			mt.pack();
			System.out.println("-- pack done in " + (System.currentTimeMillis() - lap) + " millis.");
	//		dump(trie);
			System.gc();
			Thread.sleep(1000);
			System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
			investigate(mt, charCount);
		}
//*/
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) InputStreamReader(java.io.InputStreamReader) PatriciaTrie(org.trie4j.bytes.PatriciaTrie) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream)

Aggregations

PatriciaTrie (org.trie4j.bytes.PatriciaTrie): 3, BufferedReader (java.io.BufferedReader): 2, FileInputStream (java.io.FileInputStream): 2, InputStreamReader (java.io.InputStreamReader): 2, GZIPInputStream (java.util.zip.GZIPInputStream): 2, AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1, Node (org.trie4j.bytes.Node): 1, TrieVisitor (org.trie4j.bytes.TrieVisitor): 1