Search in sources :

Example 1 with LongOpenHashBigSet

use of it.unimi.dsi.fastutil.longs.LongOpenHashBigSet in project gridss by PapenfussLab.

the class SubsetToMissing method doWork.

@Override
protected int doWork() {
    long stop = Long.MAX_VALUE;
    if (STOP_AFTER != null && (long) STOP_AFTER > 0) {
        stop = STOP_AFTER;
    }
    log.debug("Setting language-neutral locale");
    java.util.Locale.setDefault(Locale.ROOT);
    if (TMP_DIR == null || TMP_DIR.size() == 0) {
        TMP_DIR = Lists.newArrayList(new File("."));
    }
    SamReaderFactory factory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    SamReader input = factory.open(INPUT);
    Iterator<SAMRecord> intputit = new AsyncBufferedIterator<SAMRecord>(input.iterator(), 2, 16384);
    SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(input.getFileHeader(), true, OUTPUT);
    LongSet hashtable;
    if (PREALLOCATE != null) {
        log.info("Preallocating hash table");
        hashtable = new LongOpenHashBigSet(PREALLOCATE);
    } else {
        hashtable = new LongOpenHashBigSet();
    }
    for (File file : LOOKUP) {
        log.info("Loading lookup hashes for " + file.getAbsolutePath());
        SamReader lookup = factory.open(file);
        AsyncBufferedIterator<SAMRecord> it = new AsyncBufferedIterator<SAMRecord>(lookup.iterator(), 2, 16384);
        File cache = new File(file.getAbsolutePath() + ".SubsetToMissing.cache");
        if (cache.exists()) {
            log.info("Loading lookup hashes from cache");
            long n = stop;
            DataInputStream dis = null;
            try {
                long loadCount = 0;
                dis = new DataInputStream(new BufferedInputStream(new FileInputStream(cache)));
                while (n-- > 0) {
                    hashtable.add(dis.readLong());
                    if (loadCount % 10000000 == 0) {
                        log.info(String.format("Loaded %d from cache", loadCount));
                    }
                }
            } catch (EOFException e) {
                try {
                    if (dis != null)
                        dis.close();
                } catch (IOException e1) {
                    log.error(e1);
                }
            } catch (IOException e) {
                log.error(e);
            }
        } else {
            long n = stop;
            ProgressLoggingSAMRecordIterator loggedit = new ProgressLoggingSAMRecordIterator(it, new ProgressLogger(log));
            try {
                DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(cache)));
                while (loggedit.hasNext() && n-- > 0) {
                    long recordhash = hash(loggedit.next());
                    hashtable.add(recordhash);
                    dos.writeLong(recordhash);
                }
                dos.close();
            } catch (Exception e) {
                log.error(e, "Failed to load lookup. Running with partial results");
            }
            loggedit.close();
        }
        it.close();
    }
    long filtered = 0;
    log.info("Processing input");
    intputit = new ProgressLoggingSAMRecordIterator(intputit, new ProgressLogger(log));
    long n = stop;
    while (intputit.hasNext() && n-- > 0) {
        SAMRecord r = intputit.next();
        if (!hashtable.contains(hash(r))) {
            out.addAlignment(r);
        } else {
            filtered++;
            if (filtered % 1000000 == 0) {
                log.info(String.format("Filtered %d reads", filtered));
            }
        }
    }
    log.info("Closing output");
    out.close();
    return 0;
}
Also used : SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMFileWriter(htsjdk.samtools.SAMFileWriter) DataOutputStream(java.io.DataOutputStream) LongSet(it.unimi.dsi.fastutil.longs.LongSet) SAMFileWriterFactory(htsjdk.samtools.SAMFileWriterFactory) ProgressLogger(htsjdk.samtools.util.ProgressLogger) IOException(java.io.IOException) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) EOFException(java.io.EOFException) SamReader(htsjdk.samtools.SamReader) LongOpenHashBigSet(it.unimi.dsi.fastutil.longs.LongOpenHashBigSet) BufferedInputStream(java.io.BufferedInputStream) SAMRecord(htsjdk.samtools.SAMRecord) FileOutputStream(java.io.FileOutputStream) EOFException(java.io.EOFException) ProgressLoggingSAMRecordIterator(au.edu.wehi.idsv.ProgressLoggingSAMRecordIterator) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream)

Aggregations

ProgressLoggingSAMRecordIterator (au.edu.wehi.idsv.ProgressLoggingSAMRecordIterator)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMFileWriterFactory (htsjdk.samtools.SAMFileWriterFactory)1 SAMRecord (htsjdk.samtools.SAMRecord)1 SamReader (htsjdk.samtools.SamReader)1 SamReaderFactory (htsjdk.samtools.SamReaderFactory)1 ProgressLogger (htsjdk.samtools.util.ProgressLogger)1 LongOpenHashBigSet (it.unimi.dsi.fastutil.longs.LongOpenHashBigSet)1 LongSet (it.unimi.dsi.fastutil.longs.LongSet)1 BufferedInputStream (java.io.BufferedInputStream)1 BufferedOutputStream (java.io.BufferedOutputStream)1 DataInputStream (java.io.DataInputStream)1 DataOutputStream (java.io.DataOutputStream)1 EOFException (java.io.EOFException)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1