Use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp, class JointParser, method main.
/**
 * Entry point for the joint segmentation / parser.
 *
 * @param args command-line options followed by the training treebank path
 */
public static void main(String[] args) {
  if (args.length < MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  // Process command-line options
  Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
  File testTreebank = options.containsKey("t") ? new File(options.getProperty("t")) : null;
  int maxGoldSentLen = PropertiesUtils.getInt(options, "l", Integer.MAX_VALUE);
  boolean SER_INPUT = PropertiesUtils.getBool(options, "o", false);
  String[] parsedArgs = options.getProperty("", "").split("\\s+");
  if (parsedArgs.length != MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  File trainTreebank = new File(parsedArgs[0]);
  Date startTime = new Date();
  log.info("###################################");
  log.info("### Joint Segmentation / Parser ###");
  log.info("###################################");
  System.err.printf("Start time: %s\n", startTime);
  JointParsingModel parsingModel = new JointParsingModel();
  parsingModel.setVerbose(VERBOSE);
  parsingModel.setMaxEvalSentLen(maxGoldSentLen);
  parsingModel.setSerInput(SER_INPUT);
  //WSGDEBUG -- Some stuff for eclipse debugging
  InputStream inputStream = null;
  try {
    // Read from stdin (optionally a gzipped, serialized object stream), or from a
    // local debug file when running under Eclipse
    if (System.getProperty("eclipse") == null)
      inputStream = (SER_INPUT) ? new ObjectInputStream(new GZIPInputStream(System.in)) : System.in;
    else {
      FileInputStream fileStream = new FileInputStream(new File("debug.2.xml"));
      inputStream = (SER_INPUT) ? new ObjectInputStream(new GZIPInputStream(fileStream)) : fileStream;
    }
  } catch (FileNotFoundException e) {
    e.printStackTrace();
    System.exit(-1);
  } catch (IOException e) {
    e.printStackTrace();
    System.exit(-1);
  }
  if (!trainTreebank.exists())
    log.info("Training treebank does not exist!\n " + trainTreebank.getPath());
  else if (testTreebank != null && !testTreebank.exists())
    log.info("Test treebank does not exist!\n " + testTreebank.getPath());
  else if (parsingModel.run(trainTreebank, testTreebank, inputStream))
    log.info("Successful shutdown!");
  else
    log.error("Parsing model failure.");
  Date stopTime = new Date();
  long elapsedTime = stopTime.getTime() - startTime.getTime();
  log.info();
  log.info();
  System.err.printf("Completed processing at %s\n", stopTime);
  System.err.printf("Elapsed time: %d seconds\n", (int) (elapsedTime / 1000F));
}
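When the -o flag is set, this main expects its standard input to carry a gzip-compressed object stream. A minimal sketch of a producer for that path, under the assumption that some serializable payload is piped into the parser process (the class name and payload here are hypothetical, not part of CoreNLP), could look like this:

import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Arrays;
import java.util.zip.GZIPOutputStream;

// Hypothetical producer: writes a gzipped, serialized payload to stdout so it can be
// piped into a process that wraps System.in in new ObjectInputStream(new GZIPInputStream(...)).
public class GzipObjectProducer {
  public static void main(String[] args) throws IOException {
    try (ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(System.out))) {
      // Placeholder payload; the real consumer defines what it expects to read
      oos.writeObject(Arrays.asList("sentence one", "sentence two"));
    }
  }
}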
Use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp, class CoNLLBenchmark, method getEmbeddings.
@SuppressWarnings("unchecked")
public Map<String, double[]> getEmbeddings(String cacheFilename, List<CoNLLSentence> sentences) throws IOException, ClassNotFoundException {
  File f = new File(cacheFilename);
  Map<String, double[]> trimmedSet;
  if (!f.exists()) {
    // No cache yet: load the full embedding file and keep only tokens that occur in the data
    trimmedSet = new HashMap<>();
    Map<String, double[]> massiveSet = loadEmbeddingsFromFile("../google-300.txt");
    log.info("Got massive embedding set size " + massiveSet.size());
    for (CoNLLSentence sentence : sentences) {
      for (String token : sentence.token) {
        if (massiveSet.containsKey(token)) {
          trimmedSet.put(token, massiveSet.get(token));
        }
      }
    }
    log.info("Got trimmed embedding set size " + trimmedSet.size());
    // Serialize the trimmed map to a gzipped cache file
    f.createNewFile();
    ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(cacheFilename)));
    oos.writeObject(trimmedSet);
    oos.close();
    log.info("Wrote trimmed set to file");
  } else {
    // Cache hit: read the trimmed map back from the gzipped cache file
    ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(cacheFilename)));
    trimmedSet = (Map<String, double[]>) ois.readObject();
    ois.close();
  }
  return trimmedSet;
}
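The compute-then-cache pattern above generalizes beyond embeddings. A minimal, self-contained sketch of the same idea (the helper class, its name, and the generic signature are assumptions for illustration, not CoreNLP API) might be:

import java.io.*;
import java.util.function.Supplier;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

// Hypothetical helper: return a cached, gzip-serialized object if the cache file exists,
// otherwise compute it, write it through a GZIPOutputStream, and return it.
public final class GzipObjectCache {
  @SuppressWarnings("unchecked")
  public static <T extends Serializable> T getOrCompute(File cacheFile, Supplier<T> compute)
      throws IOException, ClassNotFoundException {
    if (cacheFile.exists()) {
      try (ObjectInputStream ois =
               new ObjectInputStream(new GZIPInputStream(new FileInputStream(cacheFile)))) {
        return (T) ois.readObject();
      }
    }
    T value = compute.get();
    try (ObjectOutputStream oos =
             new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(cacheFile)))) {
      oos.writeObject(value);
    }
    return value;
  }
}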
Use of java.util.zip.GZIPInputStream in project CoreNLP by stanfordnlp, class CacheParseHypotheses, method convertToTrees.
public static List<Tree> convertToTrees(byte[] input) {
  try {
    List<Tree> output = new ArrayList<>();
    // The byte array holds a gzipped object stream: an Integer count followed by
    // that many bracketed tree strings
    ByteArrayInputStream bis = new ByteArrayInputStream(input);
    GZIPInputStream gis = new GZIPInputStream(bis);
    ObjectInputStream ois = new ObjectInputStream(gis);
    int size = ErasureUtils.<Integer>uncheckedCast(ois.readObject());
    for (int i = 0; i < size; ++i) {
      String rawTree = ErasureUtils.uncheckedCast(ois.readObject());
      Tree tree = Tree.valueOf(rawTree, trf);
      tree.setSpans();
      output.add(tree);
    }
    ois.close();
    gis.close();
    bis.close();
    return output;
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException(e);
  }
}
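Reading this format implies a matching writer that emits the count followed by one bracketed tree string per tree. A sketch of such an encoder, reconstructed here from the reader's layout rather than copied from CoreNLP (the wrapper class is hypothetical), could be:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import edu.stanford.nlp.trees.Tree;

// Hypothetical inverse of convertToTrees: gzip-compress an Integer count
// followed by one bracketed tree string per tree.
public class TreeBytes {
  public static byte[] convertToBytes(List<Tree> trees) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(bos));
    oos.writeObject(Integer.valueOf(trees.size()));
    for (Tree tree : trees) {
      oos.writeObject(tree.toString());
    }
    // Closing finishes the gzip trailer before the bytes are extracted
    oos.close();
    return bos.toByteArray();
  }
}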
Use of java.util.zip.GZIPInputStream in project trie4j by takawitter, class TestWikipedia, method investigate.
private static void investigate(PatriciaTrie trie, int charCount) throws Exception {
  System.out.println("-- count elements.");
  final AtomicInteger count = new AtomicInteger();
  trie.visit(new TrieVisitor() {
    public void accept(Node node, int nest) {
      if (node.isTerminate())
        count.incrementAndGet();
    }
  });
  System.out.println(count.intValue() + " elements.");
  //*
  System.out.println("-- list elements.");
  final AtomicInteger n = new AtomicInteger();
  final AtomicInteger l = new AtomicInteger();
  final AtomicInteger ln = new AtomicInteger();
  final AtomicInteger chars = new AtomicInteger();
  trie.visit(new TrieVisitor() {
    public void accept(Node node, int nest) {
      if (node.isTerminate()) {
        l.incrementAndGet();
      } else {
        n.incrementAndGet();
      }
      chars.addAndGet(node.getLetters().length);
    }
  });
  System.out.println("node: " + n.intValue());
  System.out.println("leaf: " + l.intValue());
  System.out.println("label node: " + ln.intValue());
  System.out.println("total char count: " + charCount);
  System.out.println("total char count in trie: " + chars.intValue());
  System.out.println("verifying trie...");
  // Stream the gzipped Wikipedia title dump and check that every title is in the trie
  BufferedReader r = new BufferedReader(new InputStreamReader(// new GZIPInputStream(new FileInputStream("jawiki-20120220-all-titles-in-ns0.gz"))
      new GZIPInputStream(new FileInputStream("enwiki-20120403-all-titles-in-ns0.gz")), CharsetUtil.newUTF8Decoder()));
  long lap = System.currentTimeMillis();
  int c = 0;
  int sum = 0;
  String word = null;
  while ((word = r.readLine()) != null) {
    if (c == maxCount)
      break;
    long d = System.currentTimeMillis();
    boolean found = trie.contains(word);
    sum += System.currentTimeMillis() - d;
    if (!found) {
      System.out.println("trie not contains [" + word + "]");
      break;
    }
    if (c % 100000 == 0) {
      System.out.println(c + " elements done.");
    }
    c++;
  }
  System.out.println("done in " + (System.currentTimeMillis() - lap) + " millis.");
  System.out.println("contains time: " + sum + " millis.");
  System.out.println(trie.getRoot().getChildren().length + " children in root");
  // Hold a reference to the trie and touch it again after 100 seconds so it stays reachable
  final PatriciaTrie t = trie;
  new Thread(new Runnable() {

    @Override
    public void run() {
      try {
        Thread.sleep(100000);
        t.contains("hello");
      } catch (InterruptedException e) {
      }
    }
  }).start();
  //*/
}
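The verification loop above depends only on reading a gzipped title dump line by line. A stripped-down sketch of just that reading pattern (the file name, argument handling, and charset choice are assumptions, not taken from trie4j) is:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;

// Hypothetical standalone reader: stream lines out of a .gz file
// without decompressing it to disk first.
public class GzipLineCount {
  public static void main(String[] args) throws IOException {
    String path = args.length > 0 ? args[0] : "titles.gz"; // assumed file name
    long lines = 0;
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
        new GZIPInputStream(new FileInputStream(path)), StandardCharsets.UTF_8))) {
      while (reader.readLine() != null) {
        lines++;
      }
    }
    System.out.println(lines + " lines read from " + path);
  }
}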
Use of java.util.zip.GZIPInputStream in project commons by twitter, class Base64ZlibCodec, method decompress.
private static byte[] decompress(byte[] compressed) throws InvalidDataException {
  byte[] bytes;
  try {
    final InputStream bin = new ByteArrayInputStream(compressed);
    final InputStream zin;
    // Sniff the leading magic bytes to pick the matching decompressor
    if (startsWith(compressed, GZIP_HEADER_PREFIX)) {
      zin = new GZIPInputStream(bin);
    } else if (startsWith(compressed, ZLIB_HEADER_PREFIX)) {
      zin = new InflaterInputStream(bin);
    } else {
      throw new Base64ZlibCodec.InvalidDataException("Value doesn't start with either GZIP or zlib header");
    }
    try {
      bytes = ByteStreams.toByteArray(zin);
    } finally {
      zin.close();
    }
  } catch (IOException e) {
    throw new Base64ZlibCodec.InvalidDataException("zlib/GZIP decoding error", e);
  }
  return bytes;
}
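The dispatch above keys off magic bytes: GZIP streams begin with 0x1f 0x8b, while zlib streams produced with default settings begin with 0x78. A dependency-free sketch of the same idea (the class, constants, and helper here are assumptions for illustration; the twitter/commons codec defines its own prefixes and uses Guava's ByteStreams) could be:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

// Hypothetical sniffing helper: choose GZIPInputStream or InflaterInputStream
// based on the compressed data's header bytes.
public final class CompressedStreams {
  private static final byte[] GZIP_MAGIC = {(byte) 0x1f, (byte) 0x8b};
  private static final byte[] ZLIB_MAGIC = {(byte) 0x78};

  public static InputStream open(byte[] data) throws IOException {
    if (startsWith(data, GZIP_MAGIC)) {
      return new GZIPInputStream(new ByteArrayInputStream(data));
    } else if (startsWith(data, ZLIB_MAGIC)) {
      return new InflaterInputStream(new ByteArrayInputStream(data));
    }
    throw new IOException("Unknown compression header");
  }

  private static boolean startsWith(byte[] data, byte[] prefix) {
    if (data.length < prefix.length) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (data[i] != prefix[i]) {
        return false;
      }
    }
    return true;
  }
}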