Search in sources :

Example 1 with KahanSummation

Use of org.gridfour.util.KahanSummation in the project gridfour by gwlucastrig.

The method process of the class EntropyTabulator.

/**
 * Processes the specified GVRS file, computes the entropy (in bits per
 * sample) of the values stored in its first element, and writes a report
 * to the specified print stream.
 * <p>
 * The method works in two passes. The first pass reads every cell of the
 * source grid and increments a counter for the 32-bit pattern of the sample
 * (the raw IEEE-754 bits for floating-point elements, the integer value
 * otherwise). The counters are kept in a temporary, uncompressed GVRS file
 * created in the same directory as the input file; the upper 16 bits of the
 * pattern select the row and the lower 16 bits select the column of a
 * 65536-by-65536 grid. The second pass scans that grid and accumulates
 * {@code -p * ln(p)} for every non-zero count using Kahan summation; the
 * sum is then divided by {@code ln(2)} to express the result in bits.
 * <p>
 * If configured to do so, this method will write progress reports to the
 * specified print stream. The temporary counts file is removed before the
 * method returns, whether it completes normally or fails.
 *
 * @param ps a valid print stream, {@code System.out} is a valid candidate
 * @param inputFile a reference to a GVRS file
 * @param showProgress indicates if progress reports are to be printed during
 * processing
 * @return on successful completion, a valid floating-point value; otherwise,
 * a {@code Double.NaN}.
 */
public double process(PrintStream ps, File inputFile, boolean showProgress) {
    // Number of rows and of columns in the counts grid: 2^16 each, so that
    // one cell exists for every possible 32-bit sample pattern.
    final int countsGridSize = 65536;

    double entropy = Double.NaN;
    ps.format("%nEntropy tabulation for GVRS files%n");
    Locale locale = Locale.getDefault();
    Date date = new Date();
    SimpleDateFormat sdFormat = new SimpleDateFormat("dd MMM yyyy HH:mm z", locale);
    ps.format("Date of Execution: %s%n", sdFormat.format(date));
    String inputPath = inputFile.getPath();
    ps.format("Input file:  %s%n", inputPath);
    File parent = inputFile.getParentFile();
    File countsFile = new File(parent, TEMP_COUNT_FILE_NAME);
    // Define the specs for the entropy stats file
    GvrsFileSpecification countsSpec = new GvrsFileSpecification(countsGridSize, countsGridSize, 256, 256);
    countsSpec.setDataCompressionEnabled(false);
    GvrsElementSpecificationInt countsElementSpec = new GvrsElementSpecificationInt("counts", 0);
    countsSpec.addElementSpecification(countsElementSpec);
    try (GvrsFile source = new GvrsFile(inputFile, "r");
        GvrsFile counts = new GvrsFile(countsFile, countsSpec)) {
        GvrsFileSpecification sourceSpec = source.getSpecification();
        int nRowsInSource = sourceSpec.getRowsInGrid();
        int nColsInSource = sourceSpec.getColumnsInGrid();
        int nRowsOfTilesInSource = sourceSpec.getRowsOfTilesInGrid();
        int nColsOfTilesInSource = sourceSpec.getColumnsOfTilesInGrid();
        int nRowsInTile = sourceSpec.getRowsInTile();
        int nColsInTile = sourceSpec.getColumnsInTile();
        // Only the first element of the source file is surveyed.
        GvrsElement sourceElement = source.getElements().get(0);
        GvrsElementType sourceDataType = sourceElement.getDataType();
        GvrsElement countsElement = counts.getElement("counts");
        long nSamples = 0;
        long nSymbols = 0;
        ps.println("Source File " + inputFile.getName());
        ps.format("   Rows:      %8d%n", nRowsInSource);
        ps.format("   Columns:   %8d%n", nColsInSource);
        source.setTileCacheSize(GvrsCacheSize.Small);
        counts.setTileCacheSize(2000);
        long time0 = System.currentTimeMillis();
        if (showProgress) {
            ps.format("Initializing temporary entropy tabulation file %s%n", countsFile.getPath());
            ps.flush();
        }
        // Package the data
        if (showProgress) {
            ps.format("Initialization done in %d ms%n", System.currentTimeMillis() - time0);
            ps.println("Beginning tabulation");
        }
        time0 = System.currentTimeMillis();
        // First pass: visit the source one tile at a time and, within each
        // tile, one cell at a time, counting occurrences of each bit pattern.
        for (int iTileRow = 0; iTileRow < nRowsOfTilesInSource; iTileRow++) {
            if (showProgress && iTileRow > 0) {
                long time1 = System.currentTimeMillis();
                double deltaT = time1 - time0;
                // Tile rows per millis. At this point exactly iTileRow tile
                // rows have been completed (indices 0 .. iTileRow-1).
                double rate = iTileRow / deltaT;
                int nRemaining = nRowsOfTilesInSource - iTileRow;
                // If deltaT is zero the rate is infinite and the remaining
                // time evaluates to zero, which is an acceptable estimate.
                long remainingT = (long) (nRemaining / rate);
                Date d = new Date(time1 + remainingT);
                ps.format("Surveyed %d rows, %4.1f%% of total, est completion at %s%n", iTileRow * nRowsInTile, 100.0 * (double) iTileRow / nRowsOfTilesInSource, d);
                ps.flush();
            }
            // Clip the tile's row range to the grid; the last row of tiles
            // may extend past the end of the grid.
            int row0 = iTileRow * nRowsInTile;
            int row1 = row0 + nRowsInTile;
            if (row1 > nRowsInSource) {
                row1 = nRowsInSource;
            }
            for (int iTileCol = 0; iTileCol < nColsOfTilesInSource; iTileCol++) {
                // Clip the tile's column range in the same way.
                int col0 = iTileCol * nColsInTile;
                int col1 = col0 + nColsInTile;
                if (col1 > nColsInSource) {
                    col1 = nColsInSource;
                }
                for (int iRow = row0; iRow < row1; iRow++) {
                    for (int iCol = col0; iCol < col1; iCol++) {
                        int bits;
                        if (sourceDataType == GvrsElementType.FLOAT) {
                            // Use the raw bit pattern so that each distinct
                            // float representation is its own symbol.
                            float sample = sourceElement.readValue(iRow, iCol);
                            bits = Float.floatToRawIntBits(sample);
                        } else {
                            bits = sourceElement.readValueInt(iRow, iCol);
                        }
                        // Treat the 32 bits as an unsigned index, then split
                        // it into a (row, column) address in the counts grid.
                        long longIndex = ((long) bits) & 0x00ffffffffL;
                        long longRow = longIndex / countsGridSize;
                        long longCol = longIndex - longRow * countsGridSize;
                        int count = countsElement.readValueInt((int) longRow, (int) longCol);
                        // NOTE(review): count + 1 wraps to a negative value if a
                        // single symbol occurs more than Integer.MAX_VALUE times;
                        // such a cell would be ignored by the count > 0 test in
                        // the second pass.
                        countsElement.writeValueInt((int) longRow, (int) longCol, count + 1);
                        nSamples++;
                        if (count == 0) {
                            // first time this bit pattern has been seen
                            nSymbols++;
                        }
                    }
                }
            }
        }
        counts.flush();
        long time1 = System.currentTimeMillis();
        double timeToProcess = (time1 - time0) / 1000.0;
        if (showProgress) {
            ps.format("Finished surveying source file in %4.1f seconds%n", timeToProcess);
            ps.format("Performing tabulation of count data%n");
            ps.flush();
        }
        time0 = System.currentTimeMillis();
        double nSamplesDouble = (double) nSamples;
        int maxCount = 0;
        long nUnique = 0;
        long nRepeated = 0;
        KahanSummation ks = new KahanSummation();
        // Second pass: scan every cell of the counts grid and accumulate the
        // entropy contribution of each symbol that was observed.
        for (int iRow = 0; iRow < countsGridSize; iRow++) {
            // Report progress once every 1024 rows.
            if (showProgress && (iRow & 1023) == 0 && iRow > 0) {
                time1 = System.currentTimeMillis();
                double deltaT = time1 - time0;
                // Rows per millis. At this point exactly iRow rows have been
                // completed (indices 0 .. iRow-1).
                double rate = iRow / deltaT;
                int nRemaining = countsGridSize - iRow;
                long remainingT = (long) (nRemaining / rate);
                Date d = new Date(time1 + remainingT);
                ps.format("Tabulated %d rows, %4.1f%% of total, est completion at %s%n", iRow, 100.0 * (double) iRow / countsGridSize, d);
                ps.flush();
            }
            for (int iCol = 0; iCol < countsGridSize; iCol++) {
                int count = countsElement.readValueInt(iRow, iCol);
                if (count > 0) {
                    // p is the observed probability of this symbol
                    double p = (double) count / nSamplesDouble;
                    double s = -p * Math.log(p);
                    ks.add(s);
                    if (count > maxCount) {
                        maxCount = count;
                    }
                    if (count == 1) {
                        nUnique++;
                    } else {
                        nRepeated++;
                    }
                }
            }
        }
        // get sum of entropy calculations, and then apply
        // adjustment for base 2 (natural log was used above).
        entropy = ks.getSum() / Math.log(2.0);
        time1 = System.currentTimeMillis();
        double timeToTabulate = (time1 - time0) / 1000.0;
        ps.format("Finished processing file in %4.1f seconds%n", timeToTabulate);
        ps.format("Size of Counts File %12d%n", countsFile.length());
        ps.format("Samples:            %12d%n", nSamples);
        ps.format("Unique Symbols:     %12d%n", nUnique);
        ps.format("Repeated Symbols:   %12d%n", nRepeated);
        ps.format("Total symbols:      %12d%n", nSymbols);
        ps.format("Max count:          %12d%n", maxCount);
        ps.format("Entropy:            %9.5f%n ", entropy);
    } catch (IOException ioex) {
        ps.println("IOException accessing " + inputFile.getPath() + ", " + ioex.getMessage());
        ioex.printStackTrace(ps);
    } finally {
        // The try-with-resources statement has already closed both files by
        // the time this clause runs. Deleting here (rather than after the
        // try statement) ensures the temporary file is also removed when an
        // unchecked exception propagates out of the method.
        if (!countsFile.delete() && countsFile.exists()) {
            ps.format("Warning: unable to delete temporary file %s%n", countsFile.getPath());
        }
    }
    return entropy;
}
Also used : Locale(java.util.Locale) GvrsElementSpecificationInt(org.gridfour.gvrs.GvrsElementSpecificationInt) GvrsElement(org.gridfour.gvrs.GvrsElement) IOException(java.io.IOException) Date(java.util.Date) GvrsElementType(org.gridfour.gvrs.GvrsElementType) GvrsFileSpecification(org.gridfour.gvrs.GvrsFileSpecification) KahanSummation(org.gridfour.util.KahanSummation) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File) GvrsFile(org.gridfour.gvrs.GvrsFile) GvrsFile(org.gridfour.gvrs.GvrsFile)

Aggregations

File (java.io.File)1 IOException (java.io.IOException)1 SimpleDateFormat (java.text.SimpleDateFormat)1 Date (java.util.Date)1 Locale (java.util.Locale)1 GvrsElement (org.gridfour.gvrs.GvrsElement)1 GvrsElementSpecificationInt (org.gridfour.gvrs.GvrsElementSpecificationInt)1 GvrsElementType (org.gridfour.gvrs.GvrsElementType)1 GvrsFile (org.gridfour.gvrs.GvrsFile)1 GvrsFileSpecification (org.gridfour.gvrs.GvrsFileSpecification)1 KahanSummation (org.gridfour.util.KahanSummation)1