Search in sources :

Example 86 with GZIPInputStream

use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp.

the class JointParser method main.

/**
 * Command-line entry point for the joint segmentation / parsing run.
 *
 * <p>Options are parsed into a {@link Properties} object: {@code v} (verbose flag),
 * {@code t} (optional test treebank path), {@code l} (maximum gold sentence length,
 * unbounded by default) and {@code o} (input is a gzip-compressed object stream).
 * The first positional argument is the training treebank path.
 *
 * <p>Input is read from {@code System.in}, unless the {@code eclipse} system property
 * is set, in which case it is read from the file {@code debug.2.xml}.
 *
 * @param args command-line arguments; usage is logged and the JVM exits with status -1
 *             when the number of arguments does not match {@code MIN_ARGS}
 */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }

    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    boolean verbose = PropertiesUtils.getBool(options, "v", false);
    File testTreebank = null;
    if (options.containsKey("t")) {
        testTreebank = new File(options.getProperty("t"));
    }
    int maxGoldSentLen = PropertiesUtils.getInt(options, "l", Integer.MAX_VALUE);
    boolean serializedInput = PropertiesUtils.getBool(options, "o", false);

    // Positional (non-option) arguments are stored under the empty key.
    String[] positional = options.getProperty("", "").split("\\s+");
    if (positional.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    File trainTreebank = new File(positional[0]);

    Date startTime = new Date();
    log.info("###################################");
    log.info("### Joint Segmentation / Parser ###");
    log.info("###################################");
    System.err.printf("Start time: %s\n", startTime);

    JointParsingModel parsingModel = new JointParsingModel();
    parsingModel.setVerbose(verbose);
    parsingModel.setMaxEvalSentLen(maxGoldSentLen);
    parsingModel.setSerInput(serializedInput);

    // Pick the raw byte source first, then wrap it if the input is a serialized stream.
    InputStream inputStream = null;
    try {
        InputStream rawInput;
        if (System.getProperty("eclipse") == null) {
            rawInput = System.in;
        } else {
            // Debugging hook: read from a fixed local file instead of stdin.
            rawInput = new FileInputStream(new File("debug.2.xml"));
        }
        if (serializedInput) {
            inputStream = new ObjectInputStream(new GZIPInputStream(rawInput));
        } else {
            inputStream = rawInput;
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException, which is handled identically.
        e.printStackTrace();
        System.exit(-1);
    }

    if (!trainTreebank.exists()) {
        log.info("Training treebank does not exist!\n  " + trainTreebank.getPath());
    } else if (testTreebank != null && !testTreebank.exists()) {
        log.info("Test treebank does not exist!\n  " + testTreebank.getPath());
    } else if (parsingModel.run(trainTreebank, testTreebank, inputStream)) {
        log.info("Successful shutdown!");
    } else {
        log.error("Parsing model failure.");
    }

    Date stopTime = new Date();
    long elapsedMillis = stopTime.getTime() - startTime.getTime();
    log.info();
    log.info();
    System.err.printf("Completed processing at %s\n", stopTime);
    System.err.printf("Elapsed time: %d seconds\n", (int) (elapsedMillis / 1000F));
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) ObjectInputStream(java.io.ObjectInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) Properties(java.util.Properties) Date(java.util.Date) FileInputStream(java.io.FileInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) File(java.io.File) ObjectInputStream(java.io.ObjectInputStream)

Example 87 with GZIPInputStream

use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp.

the class CoNLLBenchmark method getEmbeddings.

/**
 * Returns the embeddings for the tokens that occur in {@code sentences}, backed by a
 * gzip-compressed, Java-serialized cache file.
 *
 * <p>If {@code cacheFilename} exists, the map is deserialized from it. Otherwise the
 * full embedding set is loaded from {@code ../google-300.txt}, trimmed to the tokens
 * present in {@code sentences}, written to {@code cacheFilename}, and returned.
 *
 * @param cacheFilename path of the cache file to read, or to create when absent
 * @param sentences sentences whose tokens select which embeddings are kept
 * @return map from token to its embedding vector
 * @throws IOException if the cache file cannot be read or written
 * @throws ClassNotFoundException if the cached object cannot be deserialized
 */
@SuppressWarnings("unchecked")
public Map<String, double[]> getEmbeddings(String cacheFilename, List<CoNLLSentence> sentences) throws IOException, ClassNotFoundException {
    File f = new File(cacheFilename);
    Map<String, double[]> trimmedSet;
    if (!f.exists()) {
        trimmedSet = new HashMap<>();
        Map<String, double[]> massiveSet = loadEmbeddingsFromFile("../google-300.txt");
        log.info("Got massive embedding set size " + massiveSet.size());
        for (CoNLLSentence sentence : sentences) {
            for (String token : sentence.token) {
                if (massiveSet.containsKey(token)) {
                    trimmedSet.put(token, massiveSet.get(token));
                }
            }
        }
        log.info("Got trimmed embedding set size " + trimmedSet.size());
        f.createNewFile();
        // Each stream is its own resource so that all of them are closed even when a
        // wrapping constructor or the write itself fails.
        try (FileOutputStream fos = new FileOutputStream(cacheFilename);
             GZIPOutputStream gos = new GZIPOutputStream(fos);
             ObjectOutputStream oos = new ObjectOutputStream(gos)) {
            oos.writeObject(trimmedSet);
        }
        log.info("Wrote trimmed set to file");
    } else {
        // The original never closed this stream chain, leaking the file handle.
        try (FileInputStream fis = new FileInputStream(cacheFilename);
             GZIPInputStream gis = new GZIPInputStream(fis);
             ObjectInputStream ois = new ObjectInputStream(gis)) {
            trimmedSet = (Map<String, double[]>) ois.readObject();
        }
    }
    return trimmedSet;
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream)

Example 88 with GZIPInputStream

use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp.

the class CacheParseHypotheses method convertToTrees.

/**
 * Decodes a gzip-compressed, Java-serialized byte array into a list of trees.
 *
 * <p>The stream is read as one integer count followed by that many tree strings.
 * Each string is parsed with {@code Tree.valueOf(rawTree, trf)} and has its spans
 * set before being added to the result.
 *
 * @param input gzip-compressed serialized data
 * @return the decoded trees, in stream order
 * @throws RuntimeIOException if reading or decompressing the data fails
 * @throws RuntimeException if a serialized class cannot be resolved
 */
public static List<Tree> convertToTrees(byte[] input) {
    // try-with-resources closes the streams on failure too; the original closed
    // them only on the success path.
    try (ByteArrayInputStream bis = new ByteArrayInputStream(input);
         GZIPInputStream gis = new GZIPInputStream(bis);
         ObjectInputStream ois = new ObjectInputStream(gis)) {
        List<Tree> output = new ArrayList<>();
        int size = ErasureUtils.<Integer>uncheckedCast(ois.readObject());
        for (int i = 0; i < size; ++i) {
            String rawTree = ErasureUtils.uncheckedCast(ois.readObject());
            Tree tree = Tree.valueOf(rawTree, trf);
            tree.setSpans();
            output.add(tree);
        }
        return output;
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) ArrayList(java.util.ArrayList) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) GZIPInputStream(java.util.zip.GZIPInputStream) Tree(edu.stanford.nlp.trees.Tree)

Example 89 with GZIPInputStream

use of java.util.zip.GZIPInputStream in project trie4j by takawitter.

the class TestWikipedia method investigate.

/**
 * Prints statistics about {@code trie} and verifies it against the title list in
 * {@code enwiki-20120403-all-titles-in-ns0.gz}.
 *
 * <p>Steps: count terminating nodes; count terminating vs. non-terminating nodes and
 * the total number of letters stored; then read titles line by line (stopping after
 * {@code maxCount} lines or at the first title the trie does not contain) and report
 * the time spent in {@code contains}. Finally a background thread is started that
 * sleeps and then queries the trie once.
 *
 * @param trie the trie to inspect
 * @param charCount total character count of the source data, printed for comparison
 * @throws Exception if the title file cannot be opened or read
 */
private static void investigate(PatriciaTrie trie, int charCount) throws Exception {
    System.out.println("-- count elements.");
    final AtomicInteger count = new AtomicInteger();
    trie.visit(new TrieVisitor() {

        public void accept(Node node, int nest) {
            if (node.isTerminate())
                count.incrementAndGet();
        }
    });
    System.out.println(count.intValue() + " elements.");
    //*
    System.out.println("-- list elements.");
    final AtomicInteger n = new AtomicInteger();
    final AtomicInteger l = new AtomicInteger();
    // NOTE(review): ln is never incremented, so "label node" always prints 0.
    final AtomicInteger ln = new AtomicInteger();
    final AtomicInteger chars = new AtomicInteger();
    trie.visit(new TrieVisitor() {

        public void accept(Node node, int nest) {
            if (node.isTerminate()) {
                l.incrementAndGet();
            } else {
                n.incrementAndGet();
            }
            chars.addAndGet(node.getLetters().length);
        }
    });
    System.out.println("node: " + n.intValue());
    System.out.println("leaf: " + l.intValue());
    System.out.println("label node: " + ln.intValue());
    System.out.println("total char count: " + charCount);
    System.out.println("total char count in trie: " + chars.intValue());
    System.out.println("verifying trie...");
    // Alternative corpus: new GZIPInputStream(new FileInputStream("jawiki-20120220-all-titles-in-ns0.gz"))
    BufferedReader r = new BufferedReader(new InputStreamReader(
            new GZIPInputStream(new FileInputStream("enwiki-20120403-all-titles-in-ns0.gz")), CharsetUtil.newUTF8Decoder()));
    // The original never closed the reader; try/finally releases the file handle on
    // every exit path, including the early breaks and read errors.
    try {
        long lap = System.currentTimeMillis();
        int c = 0;
        int sum = 0;
        String word = null;
        while ((word = r.readLine()) != null) {
            if (c == maxCount)
                break;
            long d = System.currentTimeMillis();
            boolean found = trie.contains(word);
            sum += System.currentTimeMillis() - d;
            if (!found) {
                System.out.println("trie not contains [" + word + "]");
                break;
            }
            if (c % 100000 == 0) {
                System.out.println(c + " elements done.");
            }
            c++;
        }
        System.out.println("done in " + (System.currentTimeMillis() - lap) + " millis.");
        System.out.println("contains time: " + sum + " millis.");
    } finally {
        r.close();
    }
    System.out.println(trie.getRoot().getChildren().length + "children in root");
    final PatriciaTrie t = trie;
    // Presumably keeps the trie reachable for a while after this method returns
    // (e.g. for memory inspection) — TODO confirm intent.
    new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                Thread.sleep(100000);
                t.contains("hello");
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of silently swallowing it.
                Thread.currentThread().interrupt();
            }
        }
    }).start();
//*/
}
Also used : InputStreamReader(java.io.InputStreamReader) TrieVisitor(org.trie4j.bytes.TrieVisitor) Node(org.trie4j.bytes.Node) PatriciaTrie(org.trie4j.bytes.PatriciaTrie) FileInputStream(java.io.FileInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) BufferedReader(java.io.BufferedReader)

Example 90 with GZIPInputStream

use of java.util.zip.GZIPInputStream in project commons by twitter.

the class Base64ZlibCodec method decompress.

/**
 * Inflates {@code compressed}, choosing the decoder from the data's leading bytes.
 *
 * <p>Data starting with {@code GZIP_HEADER_PREFIX} is read through a
 * {@link GZIPInputStream}; data starting with {@code ZLIB_HEADER_PREFIX} is read
 * through an {@link InflaterInputStream}.
 *
 * @param compressed the compressed bytes
 * @return the decompressed bytes
 * @throws InvalidDataException if the data starts with neither header prefix, or if
 *         an {@link IOException} occurs while decoding
 */
private static byte[] decompress(byte[] compressed) throws InvalidDataException {
    try {
        final InputStream source = new ByteArrayInputStream(compressed);
        final boolean gzip = startsWith(compressed, GZIP_HEADER_PREFIX);
        if (!gzip && !startsWith(compressed, ZLIB_HEADER_PREFIX)) {
            throw new Base64ZlibCodec.InvalidDataException("Value doesn't start with either GZIP or zlib header");
        }
        final InputStream inflating = gzip
                ? new GZIPInputStream(source)
                : new InflaterInputStream(source);
        try {
            return ByteStreams.toByteArray(inflating);
        } finally {
            // Always release the inflater, even when reading fails.
            inflating.close();
        }
    } catch (IOException e) {
        throw new Base64ZlibCodec.InvalidDataException("zlib/GZIP decoding error", e);
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) InflaterInputStream(java.util.zip.InflaterInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) InflaterInputStream(java.util.zip.InflaterInputStream) IOException(java.io.IOException)

Aggregations

GZIPInputStream (java.util.zip.GZIPInputStream)376 InputStream (java.io.InputStream)144 IOException (java.io.IOException)125 ByteArrayInputStream (java.io.ByteArrayInputStream)120 FileInputStream (java.io.FileInputStream)98 ByteArrayOutputStream (java.io.ByteArrayOutputStream)77 InputStreamReader (java.io.InputStreamReader)57 File (java.io.File)56 BufferedReader (java.io.BufferedReader)45 BufferedInputStream (java.io.BufferedInputStream)41 Test (org.junit.Test)41 FileOutputStream (java.io.FileOutputStream)30 URL (java.net.URL)25 InflaterInputStream (java.util.zip.InflaterInputStream)25 OutputStream (java.io.OutputStream)24 GZIPOutputStream (java.util.zip.GZIPOutputStream)21 ObjectInputStream (java.io.ObjectInputStream)19 HttpURLConnection (java.net.HttpURLConnection)19 URLConnection (java.net.URLConnection)17 HashMap (java.util.HashMap)15