Use of de.lmu.ifi.dbs.elki.data.BitVector in project elki (elki-project):
class BitVectorLabelParser, method parseLineInternal.
@Override
protected boolean parseLineInternal() {
  // Number of bit columns consumed so far; doubles as the vector dimensionality.
  int dim = 0;
  for (; tokenizer.valid(); tokenizer.advance()) {
    try {
      final int wordindex = dim >>> 6; // Index of the 64-bit word holding this bit.
      final int bitindex = dim & 0x3F; // Bit position inside that word.
      // Grow the word buffer on demand (at most one word per token).
      if (wordindex >= buf.size()) {
        buf.add(0L);
      }
      // Any positive integer token sets the bit; zero leaves it cleared.
      if (tokenizer.getIntBase10() > 0) {
        buf.set(wordindex, buf.getLong(wordindex) | (1L << bitindex));
      }
      dim++;
    } catch (NumberFormatException e) {
      // Non-numeric token: collect it as a label instead.
      labels.add(tokenizer.getSubstring());
    }
  }
  if (dim == 0) {
    // No bits at all — probably a label-only row.
    return false;
  }
  curvec = new BitVector(buf.toLongArray(), dim);
  curlbl = LabelList.make(labels);
  buf.clear();
  labels.clear();
  return true;
}
Use of de.lmu.ifi.dbs.elki.data.BitVector in project elki (elki-project):
class APRIORI, method frequentItemsetsSparse.
/**
 * Returns the frequent BitSets out of the given BitSets with respect to the
 * given database. Optimized implementation for SparseItemset: candidates are
 * assumed to be sorted, so each transaction is matched with a sequence of
 * binary searches instead of a full containedIn() scan.
 *
 * @param candidates the candidates to be evaluated (must be sorted)
 * @param relation the database to evaluate the candidates on
 * @param needed Minimum support needed
 * @param ids Objects to process
 * @param survivors Output: objects that had at least two 1-frequent items.
 * @param length Itemset length
 * @return Itemsets with sufficient support
 */
protected List<SparseItemset> frequentItemsetsSparse(List<SparseItemset> candidates, Relation<BitVector> relation, int needed, DBIDs ids, ArrayModifiableDBIDs survivors, int length) {
  int[] scratchi = new int[length], iters = new int[length];
  SparseItemset scratch = new SparseItemset(scratchi);
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    BitVector bv = relation.get(iditer);
    if (!initializeSearchItemset(bv, scratchi, iters)) {
      continue; // Transaction has fewer than `length` set bits.
    }
    int lives = 0;
    // BUGFIX: the search interval must be reset for every transaction;
    // previously begin/end were declared outside this loop, so after the
    // first transaction the interval was exhausted and no further support
    // was ever counted.
    for (int begin = 0, end = candidates.size(); begin < end;) {
      begin = binarySearch(candidates, scratch, begin, end);
      // BUGFIX: index 0 is a valid hit; the old test `begin > 0` treated a
      // match at position 0 as a miss (binary-search contract: >= 0 is a hit,
      // a negative value encodes -(insertionPoint) - 1).
      if (begin >= 0) {
        candidates.get(begin).increaseSupport();
        ++lives;
      } else {
        begin = (-begin) - 1; // Decode insertion point; resume search there.
      }
      if (begin >= end || !nextSearchItemset(bv, scratchi, iters)) {
        break; // Interval exhausted, or no further subset of this transaction.
      }
    }
    // BUGFIX: removed the additional containedIn() scan over all candidates
    // that followed here — it duplicated the support counting already done by
    // the binary-search loop above (double-counting every match); that linear
    // scan belongs to the dense frequentItemsets() variant only.
    if (lives > length) {
      survivors.add(iditer);
    }
  }
  // Retain only those with minimum support:
  List<SparseItemset> frequent = new ArrayList<>(candidates.size());
  for (SparseItemset candidate : candidates) {
    if (candidate.getSupport() >= needed) {
      frequent.add(candidate);
    }
  }
  return frequent;
}
Use of de.lmu.ifi.dbs.elki.data.BitVector in project elki (elki-project):
class DiSHPreferenceVectorIndex, method determinePreferenceVectorByApriori.
/**
 * Determines the preference vector with the apriori strategy: encode each
 * object as a bit vector of the dimensions in which it is a neighbor, run
 * APRIORI on that transaction database, and pick the best frequent itemset.
 *
 * @param relation the database storing the objects
 * @param neighborIDs the list of ids of the neighbors in each dimension
 * @param msg a string buffer for debug messages, may be {@code null}
 * @return the preference vector
 */
private long[] determinePreferenceVectorByApriori(Relation<V> relation, ModifiableDBIDs[] neighborIDs, StringBuilder msg) {
  int dimensionality = neighborIDs.length;
  // Temporary database for apriori: bit d of an object is set iff the
  // object is a neighbor in dimension d.
  UpdatableDatabase apriori_db = new HashmapDatabase();
  SimpleTypeInformation<?> bitmeta = VectorFieldTypeInformation.typeRequest(BitVector.class, dimensionality, dimensionality);
  for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
    long[] bits = BitsUtil.zero(dimensionality);
    boolean allFalse = true;
    for (int d = 0; d < dimensionality; d++) {
      if (neighborIDs[d].contains(it)) {
        BitsUtil.setI(bits, d);
        allFalse = false;
      }
    }
    // All-zero transactions carry no information for APRIORI; skip them.
    if (!allFalse) {
      SingleObjectBundle oaa = new SingleObjectBundle();
      oaa.append(bitmeta, new BitVector(bits, dimensionality));
      apriori_db.insert(oaa);
    }
  }
  APRIORI apriori = new APRIORI(minpts);
  FrequentItemsetsResult aprioriResult = apriori.run(apriori_db);
  // result of apriori
  List<Itemset> frequentItemsets = aprioriResult.getItemsets();
  if (msg != null) {
    msg.append("\n Frequent itemsets: ").append(frequentItemsets);
  }
  // Pick the itemset of maximum cardinality, breaking ties by maximum support.
  int maxSupport = 0;
  int maxCardinality = 0;
  long[] preferenceVector = BitsUtil.zero(dimensionality);
  for (Itemset itemset : frequentItemsets) {
    // BUGFIX: the tie-break previously tested `maxSupport == itemset.getSupport()`,
    // which could never replace the current best with a HIGHER-support itemset
    // of equal length; the intended comparison is `<`.
    if ((maxCardinality < itemset.length()) || (maxCardinality == itemset.length() && maxSupport < itemset.getSupport())) {
      preferenceVector = Itemset.toBitset(itemset, BitsUtil.zero(dimensionality));
      maxCardinality = itemset.length();
      maxSupport = itemset.getSupport();
    }
  }
  if (msg != null) {
    //
    msg.append("\n preference ").append(//
        BitsUtil.toStringLow(preferenceVector, dimensionality)).append('\n');
    LOG.debugFine(msg.toString());
  }
  return preferenceVector;
}
Use of de.lmu.ifi.dbs.elki.data.BitVector in project elki (elki-project):
class APRIORI, method frequentItemsets.
/**
 * Returns the frequent BitSets out of the given BitSets with respect to the
 * given database.
 *
 * @param candidates the candidates to be evaluated
 * @param relation the database to evaluate the candidates on
 * @param needed Minimum support needed
 * @param ids Objects to process
 * @param survivors Output: objects that had at least two 1-frequent items.
 * @param length Itemset length
 * @return Itemsets with sufficient support
 */
protected List<? extends Itemset> frequentItemsets(List<? extends Itemset> candidates, Relation<BitVector> relation, int needed, DBIDs ids, ArrayModifiableDBIDs survivors, int length) {
  if (candidates.isEmpty()) {
    return Collections.emptyList();
  }
  // Heuristic: with very many candidates the linear containedIn() scan per
  // transaction becomes the bottleneck, and the sorted sparse search pays off.
  if (candidates.size() > length * length * length * 100 && candidates.get(0) instanceof SparseItemset) {
    // Assume that all itemsets are sparse itemsets!
    @SuppressWarnings("unchecked")
    List<SparseItemset> sparse = (List<SparseItemset>) candidates;
    return frequentItemsetsSparse(sparse, relation, needed, ids, survivors, length);
  }
  for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
    final BitVector transaction = relation.get(it);
    // TODO: exploit that the candidate set it sorted?
    int matches = 0;
    for (Itemset candidate : candidates) {
      if (candidate.containedIn(transaction)) {
        candidate.increaseSupport();
        matches++;
      }
    }
    if (matches > length) {
      survivors.add(it); // Transaction can still contribute at longer lengths.
    }
  }
  // Retain only those with minimum support:
  List<Itemset> frequent = new ArrayList<>(candidates.size());
  for (Itemset candidate : candidates) {
    if (candidate.getSupport() >= needed) {
      frequent.add(candidate);
    }
  }
  return frequent;
}
Use of de.lmu.ifi.dbs.elki.data.BitVector in project elki (elki-project):
class APRIORI, method run.
/**
 * Performs the APRIORI algorithm on the given database: frequent 1-itemsets,
 * then 2-itemsets, then level-wise candidate generation and support counting
 * until {@code maxlength} is reached or too few candidates remain.
 *
 * @param relation the Relation to process
 * @return the AprioriResult learned by this APRIORI
 */
public FrequentItemsetsResult run(Relation<BitVector> relation) {
  DBIDs ids = relation.getDBIDs();
  List<Itemset> solution = new ArrayList<>();
  final int size = ids.size();
  final int needed = getMinimumSupport(size);
  // TODO: we don't strictly require a vector field.
  // We could work with knowing just the maximum dimensionality beforehand.
  VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
  if (size > 0) {
    final int dim = meta.getDimensionality();
    // Level 1: count frequent single items.
    Duration timeone = LOG.newDuration(STAT + "1-items.time").begin();
    List<OneItemset> oneitems = buildFrequentOneItemsets(relation, dim, needed);
    LOG.statistics(timeone.end());
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(STAT + "1-items.frequent", oneitems.size()));
      LOG.statistics(new LongStatistic(STAT + "1-items.transactions", ids.size()));
    }
    if (LOG.isDebuggingFine()) {
      LOG.debugFine(debugDumpCandidates(new StringBuilder(), oneitems, meta));
    }
    // Only report levels >= minlength in the result.
    if (minlength <= 1) {
      solution.addAll(oneitems);
    }
    // Level 2 is built directly from the frequent 1-itemsets.
    if (oneitems.size() >= 2 && maxlength >= 2) {
      Duration timetwo = LOG.newDuration(STAT + "2-items.time").begin();
      ArrayModifiableDBIDs survivors = DBIDUtil.newArray(ids.size());
      List<? extends Itemset> candidates = buildFrequentTwoItemsets(oneitems, relation, dim, needed, ids, survivors);
      // Continue with reduced set of transactions.
      ids = survivors;
      LOG.statistics(timetwo.end());
      if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(STAT + "2-items.frequent", candidates.size()));
        LOG.statistics(new LongStatistic(STAT + "2-items.transactions", ids.size()));
      }
      if (LOG.isDebuggingFine()) {
        LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
      }
      if (minlength <= 2) {
        solution.addAll(candidates);
      }
      // Levels >= 3: generate candidates by joining, then count support.
      // Loop exits once fewer than `length` candidates remain (no join possible).
      for (int length = 3; length <= maxlength && candidates.size() >= length; length++) {
        Duration timel = LOG.newDuration(STAT + length + "-items.time").begin();
        // Join to get the new candidates
        candidates = aprioriGenerate(candidates, length, dim);
        if (LOG.isDebuggingFinest()) {
          LOG.debugFinest(debugDumpCandidates(new StringBuilder().append("Before pruning: "), candidates, meta));
        }
        survivors = DBIDUtil.newArray(ids.size());
        candidates = frequentItemsets(candidates, relation, needed, ids, survivors, length);
        // Continue with reduced set of transactions.
        ids = survivors;
        LOG.statistics(timel.end());
        if (LOG.isStatistics()) {
          LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", candidates.size()));
          LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", ids.size()));
        }
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
        }
        solution.addAll(candidates);
      }
    }
  }
  return new FrequentItemsetsResult("APRIORI", "apriori", solution, meta, size);
}
Aggregations