Search in sources :

Example 16 with IntIterator

use of it.unimi.dsi.fastutil.ints.IntIterator in project druid by druid-io.

the class StringDimensionMergerV9 method mergeBitmaps.

/**
 * Builds the merged bitmap for one dictionary entry of a dimension, writes it out and, for spatial
 * dimensions, also inserts it into the R-tree.
 *
 * <p>For every adapter whose seeker resolves {@code dictId}, the adapter's bitmap index for that
 * entry is wrapped together with the matching row-number conversion buffer. The resulting iterators
 * are merged in ascending order, de-duplicated, stripped of {@code IndexMerger.INVALID_ROW} entries
 * and added to a fresh mutable bitmap.
 *
 * @param segmentRowNumConversions per-adapter row number conversion buffers, indexed like {@code adapters}
 * @param dimVals                  the merged dictionary values of the dimension
 * @param bmpFactory               factory used to create the mutable and the immutable bitmap
 * @param tree                     R-tree receiving the coordinates when {@code hasSpatial} is set
 * @param hasSpatial               whether the dimension value should also be inserted into {@code tree}
 * @param dictIdSeeker             per-adapter seekers translating {@code dictId} to the adapter's own id
 * @param dictId                   dictionary id being processed
 * @param adapters                 the adapters being merged
 * @param dimensionName            name of the dimension whose bitmap index is read from the adapters
 * @param nullRowsBitmap           bitmap OR-ed into the result when {@code dictId} is 0 and the first
 *                                 dictionary value is null
 * @param bitmapWriter             writer receiving the immutable merged bitmap
 * @throws IOException if writing the bitmap fails
 */
static void mergeBitmaps(List<IntBuffer> segmentRowNumConversions, Indexed<String> dimVals, BitmapFactory bmpFactory, RTree tree, boolean hasSpatial, IndexSeeker[] dictIdSeeker, int dictId, List<IndexableAdapter> adapters, String dimensionName, MutableBitmap nullRowsBitmap, GenericIndexedWriter<ImmutableBitmap> bitmapWriter) throws IOException {
    // Phase 1: collect the converted inverted index of every adapter that knows this dictionary entry.
    final int adapterCount = adapters.size();
    final List<ConvertingIndexedInts> perAdapterIndexes = new ArrayList<>(adapterCount);
    for (int adapterIdx = 0; adapterIdx < adapterCount; adapterIdx++) {
        final int adapterDictId = dictIdSeeker[adapterIdx].seek(dictId);
        if (adapterDictId == IndexSeeker.NOT_EXIST) {
            continue;
        }
        perAdapterIndexes.add(new ConvertingIndexedInts(adapters.get(adapterIdx).getBitmapIndex(dimensionName, adapterDictId), segmentRowNumConversions.get(adapterIdx)));
    }

    final MutableBitmap mergedIndexes = bmpFactory.makeEmptyMutableBitmap();

    // Phase 2: obtain one iterator per collected index.
    final List<IntIterator> rowIterators = new ArrayList<>(perAdapterIndexes.size());
    for (ConvertingIndexedInts converted : perAdapterIndexes) {
        rowIterators.add(converted.iterator());
    }

    // Phase 3: walk the rows in ascending order and set each distinct valid row once. Feeding the
    // bitmap in ascending order matters because some compacting MutableBitmap implementations are
    // very inefficient when bits are added out of order.
    final IntIterator ascendingRows = IntIteratorUtils.mergeAscending(rowIterators);
    int lastRow = IndexMerger.INVALID_ROW;
    while (ascendingRows.hasNext()) {
        final int currentRow = ascendingRows.nextInt();
        final boolean isDuplicate = currentRow == lastRow;
        final boolean isInvalid = currentRow == IndexMerger.INVALID_ROW;
        if (!isDuplicate && !isInvalid) {
            mergedIndexes.add(currentRow);
        }
        lastRow = currentRow;
    }

    // The first dictionary entry additionally absorbs the null rows when it is the null value.
    if (dictId == 0 && Iterables.getFirst(dimVals, "") == null) {
        mergedIndexes.or(nullRowsBitmap);
    }
    bitmapWriter.write(bmpFactory.makeImmutableBitmap(mergedIndexes));

    if (!hasSpatial) {
        return;
    }
    final String spatialValue = dimVals.get(dictId);
    if (spatialValue == null) {
        return;
    }
    // Spatial values are split into their coordinate components and parsed as floats.
    final List<String> coordinateTokens = Lists.newArrayList(SPLITTER.split(spatialValue));
    final float[] coords = new float[coordinateTokens.size()];
    int axis = 0;
    for (String token : coordinateTokens) {
        coords[axis++] = Float.parseFloat(token);
    }
    tree.insert(coords, mergedIndexes);
}
Also used : AbstractIntIterator(it.unimi.dsi.fastutil.ints.AbstractIntIterator) IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) MutableBitmap(io.druid.collections.bitmap.MutableBitmap) ArrayList(java.util.ArrayList)

Example 17 with IntIterator

use of it.unimi.dsi.fastutil.ints.IntIterator in project pinot by linkedin.

the class ObjectCustomSerDe method serializeIntOpenHashSet.

/**
 * Serializes an {@link IntOpenHashSet} into a byte array.
 *
 * <p>The layout is the element count followed by every element of the set in the set's iteration
 * order, each written with {@link DataOutputStream#writeInt(int)}.
 *
 * @param intOpenHashSet the set to serialize
 * @return the serialized bytes
 * @throws IOException if writing to the underlying stream fails
 */
private static byte[] serializeIntOpenHashSet(IntOpenHashSet intOpenHashSet) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(buffer);

    // Length prefix first, so a reader knows how many values follow.
    out.writeInt(intOpenHashSet.size());
    for (IntIterator elements = intOpenHashSet.iterator(); elements.hasNext(); ) {
        out.writeInt(elements.nextInt());
    }
    return buffer.toByteArray();
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) DataOutputStream(java.io.DataOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream)

Example 18 with IntIterator

use of it.unimi.dsi.fastutil.ints.IntIterator in project elki by elki-project.

the class XSplitter method minimumOverlapSplit.

/**
 * Perform a minimum overlap split. The
 * {@link #chooseMinimumOverlapSplit(int, int, int, boolean) minimum overlap
 * split} calculates the partition for the split dimension determined by
 * {@link #chooseSplitAxis(Iterable, int, int) chooseSplitAxis}
 * <code>(common split
 * history, minFanout, maxEntries - minFanout + 1)</code> with the minimum
 * overlap. This range may have been tested before (by the
 * {@link #topologicalSplit()}), but for the minimum overlap test we need to
 * test that anew. Note that this method returns <code>null</code>, if the
 * minimum overlap split has a volume which is larger than the allowed
 * <code>maxOverlap</code> ratio or if the tree's minimum fanout is not larger
 * than the minimum directory size.
 * <p>
 * <code>null</code> is also returned when the node's entries are
 * {@link XTreeDirectoryEntry} instances that share no common split dimension.
 *
 * @return distribution resulting from the minimum overlap split
 * @throws IllegalArgumentException if the node's first entry is a leaf
 *         entry, if the node has fewer than two entries, or if the node has
 *         fewer entries than the directory capacity minus one
 */
public SplitSorting minimumOverlapSplit() {
    // Precondition checks: only overfull directory nodes may be split here.
    if (node.getEntry(0) instanceof LeafEntry) {
        throw new IllegalArgumentException("The minimum overlap split will only be performed on directory nodes");
    }
    if (node.getNumEntries() < 2) {
        throw new IllegalArgumentException("Splitting less than two entries is pointless.");
    }
    int maxEntries = tree.getDirCapacity() - 1;
    int minFanout = tree.get_min_fanout();
    // NOTE(review): the condition tests '<' while the message prints '<=' —
    // confirm which one is intended.
    if (node.getNumEntries() < maxEntries) {
        throw new IllegalArgumentException("This entry list has not yet reached the maximum limit: " + node.getNumEntries() + "<=" + maxEntries);
    }
    // Repeats the leaf-entry check from the top of the method as an assertion.
    assert !(node.getEntry(0) instanceof LeafEntry);
    if (minFanout >= tree.getDirMinimum()) {
        // minFanout not set for allowing underflowing nodes
        return null;
    }
    // Candidate split dimensions handed to chooseSplitAxis below.
    IntIterator dimensionListing;
    if (node.getEntry(0) instanceof XTreeDirectoryEntry) {
        // filter common split dimensions
        dimensionListing = getCommonSplitDimensions(node);
        if (!dimensionListing.hasNext()) {
            // no common dimensions
            return null;
        }
    } else {
        // test all dimensions
        dimensionListing = new IntegerRangeIterator(0, node.getEntry(0).getDimensionality());
    }
    // Remember the split axis from before chooseSplitAxis is called, so the
    // comparison below can tell whether the chosen axis changed.
    int formerSplitAxis = this.splitAxis;
    // = maximum left-hand size
    maxEntries = maxEntries + 1 - minFanout;
    chooseSplitAxis(dimensionListing, minFanout, maxEntries);
    // find the best split point
    if (formerSplitAxis == this.splitAxis && tree.getDirMinimum() > minFanout) {
        // remember: this follows an unsuccessful topological split
        // avoid duplicate computations of {minEntries, ..., maxEntries}
        // Snapshot of the field pastOverlap taken before the two calls below;
        // it is used as the best overlap so far and written back to the field
        // whenever the first range (ret1) is the one returned.
        double minOverlap = pastOverlap;
        // test {minFanout, ..., minEntries - 1}
        SplitSorting ret1 = chooseMinimumOverlapSplit(this.splitAxis, minFanout, tree.getDirMinimum() - 1, false);
        if (ret1 != null && pastOverlap < minOverlap) {
            // this is a valid choice
            minOverlap = pastOverlap;
        }
        // test {maxEntries - minEntries + 2, ..., maxEntries - minFanout + 1}
        SplitSorting ret2 = chooseMinimumOverlapSplit(this.splitAxis, minFanout, tree.getDirMinimum() - 1, true);
        if (ret2 == null) {
            // accept first range regardless of whether or not there is one
            pastOverlap = minOverlap;
            return ret1;
        }
        if (pastOverlap < minOverlap) {
            // the second range is better
            return ret2;
        }
        // the first range is better
        pastOverlap = minOverlap;
        return ret1;
    } else {
        // Split axis changed (or no extra range to test): evaluate the whole
        // range {minFanout, ..., maxEntries} in one call.
        return chooseMinimumOverlapSplit(this.splitAxis, minFanout, maxEntries, false);
    }
}
Also used : XTreeDirectoryEntry(de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.xtree.XTreeDirectoryEntry) IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) LeafEntry(de.lmu.ifi.dbs.elki.index.tree.LeafEntry)

Example 19 with IntIterator

use of it.unimi.dsi.fastutil.ints.IntIterator in project elki by elki-project.

the class RandomProjectedNeighborsAndDensities method computeSetsBounds.

/**
 * Create random projections, project points and put points into sets of size
 * about minSplitSize/2
 * <p>
 * The method first projects every point of {@code ptList} onto a number of
 * random unit-length directions, then repeatedly permutes these projections
 * and calls {@code splitupNoSort} to partition the point set. The resulting
 * state is stored in the fields of this object.
 *
 * @param points points to process
 * @param minSplitSize minimum size for which a point set is further
 *        partitioned (roughly corresponds to minPts in OPTICS)
 * @param ptList Points that are to be projected
 */
public void computeSetsBounds(Relation<V> points, int minSplitSize, DBIDs ptList) {
    this.minSplitSize = minSplitSize;
    final int size = points.size();
    final int dim = RelationUtil.dimensionality(points);
    this.points = points;
    // The product size * dim is formed in double precision: as an int
    // multiplication it silently wraps around for large relations, which would
    // hand a wrapped (possibly negative) value to the logarithm.
    final double logCells = MathUtil.log2((double) size * dim + 1);
    // perform O(log N+log dim) splits of the entire point sets projections
    int nPointSetSplits = (int) (logOProjectionConst * logCells);
    // perform O(log N+log dim) projections of the point set onto a random line
    int nProject1d = (int) (logOProjectionConst * logCells);
    LOG.statistics(new LongStatistic(PREFIX + ".partition-size", nPointSetSplits));
    LOG.statistics(new LongStatistic(PREFIX + ".num-projections", nProject1d));
    splitsets = new ArrayList<>();
    // perform projections of points
    projectedPoints = new DoubleDataStore[nProject1d];
    DoubleDataStore[] tmpPro = new DoubleDataStore[nProject1d];
    Random rand = rnd.getSingleThreadedRandom();
    FiniteProgress projp = LOG.isVerbose() ? new FiniteProgress("Random projections", nProject1d, LOG) : null;
    for (int j = 0; j < nProject1d; j++) {
        // Draw a random direction with components in [-0.5, 0.5) and scale it to
        // unit Euclidean length.
        double[] currRp = new double[dim];
        double sum = 0;
        for (int i = 0; i < dim; i++) {
            double fl = rand.nextDouble() - 0.5;
            currRp[i] = fl;
            sum += fl * fl;
        }
        sum = FastMath.sqrt(sum);
        for (int i = 0; i < dim; i++) {
            currRp[i] /= sum;
        }
        WritableDoubleDataStore currPro = DataStoreUtil.makeDoubleStorage(ptList, DataStoreFactory.HINT_HOT);
        for (DBIDIter it = ptList.iter(); it.valid(); it.advance()) {
            NumberVector vecPt = points.get(it);
            // Dot product:
            double sum2 = 0;
            for (int i = 0; i < dim; i++) {
                sum2 += currRp[i] * vecPt.doubleValue(i);
            }
            currPro.put(it, sum2);
        }
        projectedPoints[j] = currPro;
        LOG.incrementProcessed(projp);
    }
    LOG.ensureCompleted(projp);
    // Log the number of scalar projections performed.
    long numprod = nProject1d * (long) ptList.size();
    LOG.statistics(new LongStatistic(PREFIX + ".num-scalar-products", numprod));
    // split entire point set, reuse projections by shuffling them
    IntArrayList proind = new IntArrayList(nProject1d);
    for (int j = 0; j < nProject1d; j++) {
        proind.add(j);
    }
    FiniteProgress splitp = LOG.isVerbose() ? new FiniteProgress("Splitting data", nPointSetSplits, LOG) : null;
    for (int avgP = 0; avgP < nPointSetSplits; avgP++) {
        // Snapshot the current projection order before permuting it in place.
        System.arraycopy(projectedPoints, 0, tmpPro, 0, nProject1d);
        // Shuffle axes.
        // NOTE(review): rand.nextInt(i) never returns i, so position i is always
        // swapped with an earlier position and never left in place; a uniform
        // Fisher-Yates shuffle would draw from nextInt(i + 1). Left unchanged
        // because altering it changes the results for a given seed — confirm
        // whether this is intended.
        for (int i = 1; i < nProject1d; i++) {
            final int j = rand.nextInt(i);
            // Swap i,j
            proind.set(i, proind.set(j, proind.getInt(i)));
        }
        // Apply the permutation: the i-th snapshot entry moves to slot proind[i].
        IntIterator it = proind.iterator();
        int i = 0;
        while (it.hasNext()) {
            int cind = it.nextInt();
            projectedPoints[cind] = tmpPro[i];
            i++;
        }
        // split point set
        // NOTE(review): the end index is points.size(), not ptList.size() —
        // presumably both are expected to be equal; verify against callers.
        splitupNoSort(DBIDUtil.newArray(ptList), 0, size, 0, rand);
        LOG.incrementProcessed(splitp);
    }
    LOG.ensureCompleted(splitp);
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Random(java.util.Random) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 20 with IntIterator

use of it.unimi.dsi.fastutil.ints.IntIterator in project elki by elki-project.

the class LinearDiscriminantAnalysisFilter method computeProjectionMatrix.

/**
 * Computes the projection matrix from the between-class and the within-class
 * covariance of the labeled input vectors.
 * <p>
 * Steps: partition the row indexes by class label, compute one centroid per
 * class, build the covariance of the centroids (sigmaB) and the covariance of
 * all vectors after subtracting their class centroid (sigmaI), then decompose
 * inverse(sigmaI) * sigmaB into eigenpairs and return the transposed
 * eigenvectors selected via {@code tdim} (defined outside this method).
 *
 * @param vectorcolumn the input vectors
 * @param classcolumn the class label of each input vector
 * @param dim dimensionality passed to the covariance and centroid computations
 * @return the transposed matrix of the selected eigenvectors
 */
@Override
protected double[][] computeProjectionMatrix(List<V> vectorcolumn, List<? extends ClassLabel> classcolumn, int dim) {
    final Map<ClassLabel, IntList> membersByLabel = partition(classcolumn);
    // Copy the labels into a list so classes and centroids share one index.
    final List<ClassLabel> labels = new ArrayList<>(membersByLabel.keySet());
    final List<Centroid> classMeans = computeCentroids(dim, vectorcolumn, labels, membersByLabel);

    // Between classes covariance: covariance of the class centroids.
    final CovarianceMatrix betweenCov = new CovarianceMatrix(dim);
    for (Centroid mean : classMeans) {
        betweenCov.put(mean);
    }
    final double[][] sigmaB = betweenCov.destroyToSampleMatrix();

    // (Average) within class variance: every vector minus its class centroid.
    // TODO: different weighting strategies? Sampling?
    final CovarianceMatrix withinCov = new CovarianceMatrix(dim);
    final int labelCount = labels.size();
    for (int k = 0; k < labelCount; k++) {
        final double[] center = classMeans.get(k).getArrayRef();
        final IntIterator rows = membersByLabel.get(labels.get(k)).iterator();
        while (rows.hasNext()) {
            final double[] centered = minusEquals(vectorcolumn.get(rows.nextInt()).toArray(), center);
            withinCov.put(centered);
        }
    }
    final double[][] sigmaI = withinCov.destroyToSampleMatrix();
    // A zero determinant means sigmaI cannot be inverted as-is; nudge the
    // diagonal by a tiny amount before inverting.
    if (new LUDecomposition(sigmaI).det() == 0) {
        for (int d = 0; d < dim; d++) {
            sigmaI[d][d] += 1e-10;
        }
    }

    final double[][] ratio = times(inverse(sigmaI), sigmaB);
    final EigenvalueDecomposition eigen = new EigenvalueDecomposition(ratio);
    final SortedEigenPairs eigenPairs = new SortedEigenPairs(eigen, false);
    return transpose(eigenPairs.eigenVectors(tdim));
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) EigenvalueDecomposition(de.lmu.ifi.dbs.elki.math.linearalgebra.EigenvalueDecomposition) ArrayList(java.util.ArrayList) LUDecomposition(de.lmu.ifi.dbs.elki.math.linearalgebra.LUDecomposition) IntList(it.unimi.dsi.fastutil.ints.IntList) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) SortedEigenPairs(de.lmu.ifi.dbs.elki.math.linearalgebra.pca.SortedEigenPairs)

Aggregations

IntIterator (it.unimi.dsi.fastutil.ints.IntIterator)31 ArrayList (java.util.ArrayList)5 Test (org.junit.Test)4 IntOpenHashSet (it.unimi.dsi.fastutil.ints.IntOpenHashSet)3 Centroid (de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid)2 IntArrayList (it.unimi.dsi.fastutil.ints.IntArrayList)2 BitmapBackedSelection (tech.tablesaw.selection.BitmapBackedSelection)2 Selection (tech.tablesaw.selection.Selection)2 PartitionKey (com.tencent.angel.PartitionKey)1 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)1 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)1 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)1 Model (de.lmu.ifi.dbs.elki.data.model.Model)1 GeneratorSingleCluster (de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster)1 LeafEntry (de.lmu.ifi.dbs.elki.index.tree.LeafEntry)1 XTreeDirectoryEntry (de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.xtree.XTreeDirectoryEntry)1 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)1 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)1 CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix)1 EigenvalueDecomposition (de.lmu.ifi.dbs.elki.math.linearalgebra.EigenvalueDecomposition)1