Search in sources :

Example 11 with CBZip2InputStream

use of org.apache.tools.bzip2.CBZip2InputStream in project Anserini by castorini.

the class IndexTweets method main.

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids").create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweets.class.getName(), options);
        System.exit(-1);
    }
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);
    }
    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);
    LongOpenHashSet deletes = null;
    if (cmdline.hasOption(DELETES_OPTION)) {
        deletes = new LongOpenHashSet();
        File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
        if (!deletesFile.exists()) {
            System.err.println("Error: " + deletesFile + " does not exist!");
            System.exit(-1);
        }
        LOG.info("Reading deletes from " + deletesFile);
        FileInputStream fin = new FileInputStream(deletesFile);
        byte[] ignoreBytes = new byte[2];
        // "B", "Z" bytes from commandline tools
        fin.read(ignoreBytes);
        BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)));
        String s;
        while ((s = br.readLine()) != null) {
            if (s.contains("\t")) {
                deletes.add(Long.parseLong(s.split("\t")[0]));
            } else {
                deletes.add(Long.parseLong(s));
            }
        }
        br.close();
        fin.close();
        LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
    }
    long maxId = Long.MAX_VALUE;
    if (cmdline.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
        LOG.info("index: " + maxId);
    }
    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }
    StatusStream stream = new JsonStatusCorpusReader(file);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {
            if (status.getText() == null) {
                continue;
            }
            // Skip deleted tweetids.
            if (deletes != null && deletes.contains(status.getId())) {
                continue;
            }
            if (status.getId() > maxId) {
                continue;
            }
            cnt++;
            Document doc = new Document();
            doc.add(new LongPoint(StatusField.ID.name, status.getId()));
            doc.add(new StoredField(StatusField.ID.name, status.getId()));
            doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));
            doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
            }
            String lang = status.getLang();
            if (!lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
            }
            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    LOG.warn("Error parsing retweet fields of " + status.getId());
                }
            }
            writer.addDocument(doc);
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " statuses indexed");
            }
        }
        LOG.info(String.format("Total of %s statuses added", cnt));
        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }
        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}
Also used : IndexOptions(org.apache.lucene.index.IndexOptions) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) StatusStream(io.anserini.document.twitter.StatusStream) JsonStatusCorpusReader(io.anserini.document.twitter.JsonStatusCorpusReader) LongOpenHashSet(it.unimi.dsi.fastutil.longs.LongOpenHashSet) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Status(io.anserini.document.twitter.Status) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) IndexWriter(org.apache.lucene.index.IndexWriter) BufferedReader(java.io.BufferedReader) File(java.io.File) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
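The essential technique in this example is how the bzip2-compressed deletes file is read: Ant's CBZip2InputStream does not consume the two-byte "BZ" magic header itself, so the caller reads and discards those bytes before wrapping the stream. Below is a minimal, self-contained sketch of that pattern; the ReadDeletes class name and the use of a plain java.util.HashSet in place of fastutil's LongOpenHashSet are illustrative assumptions, not part of the Anserini code.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.tools.bzip2.CBZip2InputStream;

public class ReadDeletes {
    // Reads one tweet id per line from a bzip2-compressed deletes file.
    public static Set<Long> readDeletes(String path) throws IOException {
        Set<Long> deletes = new HashSet<>();
        try (InputStream fin = new FileInputStream(path)) {
            // CBZip2InputStream expects the caller to consume the two-byte "BZ" magic first.
            byte[] magic = new byte[2];
            if (fin.read(magic) != 2 || magic[0] != 'B' || magic[1] != 'Z') {
                throw new IOException(path + " does not look like a bzip2 file");
            }
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new CBZip2InputStream(fin)));
            String line;
            while ((line = br.readLine()) != null) {
                // Lines are either "tweetid<TAB>..." or a bare tweetid.
                String id = line.contains("\t") ? line.split("\t")[0] : line;
                deletes.add(Long.parseLong(id));
            }
        }
        return deletes;
    }
}

Closing the underlying FileInputStream via try-with-resources releases the file handle; the original code closes the reader and stream explicitly instead.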

Example 12 with CBZip2InputStream

use of org.apache.tools.bzip2.CBZip2InputStream in project ant by apache.

the class BZip2Test method testRealTest.

@Test
public void testRealTest() throws IOException {
    buildRule.executeTarget("realTest");
    // doesn't work: Depending on the compression engine used,
    // compressed bytes may differ. False errors would be
    // reported.
    // assertTrue("File content mismatch",
    // FILE_UTILS.contentEquals(project.resolveFile("expected/asf-logo-huge.tar.bz2"),
    // project.resolveFile("asf-logo-huge.tar.bz2")));
    // We have to compare the decompressed content instead:
    File originalFile = buildRule.getProject().resolveFile("expected/asf-logo-huge.tar.bz2");
    File actualFile = new File(outputDir, "asf-logo-huge.tar.bz2");
    InputStream originalIn = new BufferedInputStream(new FileInputStream(originalFile));
    assertEquals((byte) 'B', originalIn.read());
    assertEquals((byte) 'Z', originalIn.read());
    InputStream actualIn = new BufferedInputStream(new FileInputStream(actualFile));
    assertEquals((byte) 'B', actualIn.read());
    assertEquals((byte) 'Z', actualIn.read());
    originalIn = new CBZip2InputStream(originalIn);
    actualIn = new CBZip2InputStream(actualIn);
    while (true) {
        int expected = originalIn.read();
        int actual = actualIn.read();
        if (expected >= 0) {
            if (expected != actual) {
                fail("File content mismatch");
            }
        } else {
            if (actual >= 0) {
                fail("File content mismatch");
            }
            break;
        }
    }
    originalIn.close();
    actualIn.close();
}
Also used : BufferedInputStream(java.io.BufferedInputStream) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) InputStream(java.io.InputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) File(java.io.File) FileInputStream(java.io.FileInputStream) Test(org.junit.Test)
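The test's byte-by-byte comparison loop generalizes to a small helper that checks whether two bzip2 files decompress to the same content. This is a sketch only; the Bzip2Compare class and bz2ContentEquals method are assumed names and are not part of the Ant test suite.

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tools.bzip2.CBZip2InputStream;

public final class Bzip2Compare {
    // Returns true if two bzip2 files decompress to identical byte streams.
    public static boolean bz2ContentEquals(File a, File b) throws IOException {
        try (InputStream ia = openBz2(a); InputStream ib = openBz2(b)) {
            int x;
            int y;
            do {
                x = ia.read();
                y = ib.read();
                if (x != y) {
                    return false;
                }
            } while (x >= 0);
            // Both streams reached end-of-file at the same position.
            return true;
        }
    }

    private static InputStream openBz2(File f) throws IOException {
        InputStream in = new BufferedInputStream(new FileInputStream(f));
        // Consume the "BZ" magic bytes before handing the stream to CBZip2InputStream.
        if (in.read() != 'B' || in.read() != 'Z') {
            in.close();
            throw new IOException(f + " is not a bzip2 file");
        }
        return new CBZip2InputStream(in);
    }
}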

Example 13 with CBZip2InputStream

use of org.apache.tools.bzip2.CBZip2InputStream in project gradle by gradle.

the class Bzip2Archiver method read.

@Override
public InputStream read() {
    InputStream input = new BufferedInputStream(resource.read());
    try {
        // CBZip2InputStream expects the opening "BZ" to be skipped
        byte[] skip = new byte[2];
        input.read(skip);
        return new CBZip2InputStream(input);
    } catch (Exception e) {
        IoActions.closeQuietly(input);
        throw ResourceExceptions.readFailed(resource.getDisplayName(), e);
    }
}
Also used : BufferedInputStream(java.io.BufferedInputStream) BufferedInputStream(java.io.BufferedInputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) InputStream(java.io.InputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream)
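Stripped of Gradle's resource abstractions (IoActions, ResourceExceptions), the same decorator pattern reduces to: buffer the raw stream, consume the "BZ" magic, and return a CBZip2InputStream, closing the source if anything goes wrong. A minimal sketch follows, assuming a plain InputStream as input; the Bzip2Streams class and decompress method are illustrative names, not Gradle API.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tools.bzip2.CBZip2InputStream;

public final class Bzip2Streams {
    // Wraps an already-opened stream, consuming the "BZ" magic and returning a decompressing stream.
    public static InputStream decompress(InputStream raw) throws IOException {
        InputStream input = new BufferedInputStream(raw);
        try {
            byte[] magic = new byte[2];
            if (input.read(magic) != 2 || magic[0] != 'B' || magic[1] != 'Z') {
                throw new IOException("Not a bzip2 stream");
            }
            return new CBZip2InputStream(input);
        } catch (IOException e) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Close quietly, mirroring IoActions.closeQuietly in the Gradle version.
            }
            throw e;
        }
    }
}

The BufferedInputStream wrapper matters because CBZip2InputStream pulls compressed data from the underlying stream in very small reads.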

Aggregations

InputStream (java.io.InputStream): 12
CBZip2InputStream (org.apache.tools.bzip2.CBZip2InputStream): 12
FileInputStream (java.io.FileInputStream): 10
BufferedInputStream (java.io.BufferedInputStream): 9
GZIPInputStream (java.util.zip.GZIPInputStream): 6
File (java.io.File): 5
InputStreamReader (java.io.InputStreamReader): 4
OsmBaseStorage (net.osmand.osm.io.OsmBaseStorage): 4
IOException (java.io.IOException): 3
Reader (java.io.Reader): 3
BufferedReader (java.io.BufferedReader): 2
ByteArrayInputStream (java.io.ByteArrayInputStream): 2
OutputStream (java.io.OutputStream): 2
SAXParser (javax.xml.parsers.SAXParser): 2
ConsoleProgressImplementation (net.osmand.impl.ConsoleProgressImplementation): 2
Entity (net.osmand.osm.edit.Entity): 2
InputSource (org.xml.sax.InputSource): 2
TLongObjectHashMap (gnu.trove.map.hash.TLongObjectHashMap): 1
JsonStatusCorpusReader (io.anserini.document.twitter.JsonStatusCorpusReader): 1
Status (io.anserini.document.twitter.Status): 1