Use of com.bakdata.conquery.models.preproc.parser.specific.string.MapTypeGuesser in the project conquery by bakdata.
Class StringParser, method decideType.
/**
 * Chooses the {@link StringStore} used to hold the values collected in {@code strings}.
 *
 * <ol>
 *   <li>No values: returns {@link EmptyStore#INSTANCE}.</li>
 *   <li>Exactly one distinct value: returns a {@link StringTypeSingleton}.</li>
 *   <li>Otherwise: strips the shared prefix/suffix from every key of {@code strings}
 *       (replacing the map), calls {@code decode()}, asks every guesser for a guess and
 *       takes the smallest one by natural order. If anything was stripped, the result is
 *       wrapped in a {@link StringTypePrefixSuffix} that carries the stripped parts.</li>
 * </ol>
 *
 * <p>NOTE(review): this assumes {@code prefix} and {@code suffix} are a common prefix and
 * common suffix of all keys; how they are computed is not visible here - confirm.
 *
 * @return the selected store, never {@code null}
 * @throws java.util.NoSuchElementException if no guesser produces a non-null guess
 */
@Override
protected StringStore decideType() {
	// Nothing collected at all.
	if (strings.isEmpty()) {
		return EmptyStore.INSTANCE;
	}

	// Is this a singleton?
	if (strings.size() == 1) {
		return new StringTypeSingleton(strings.keySet().iterator().next(), BitSetStore.create(getLines()));
	}

	// Treat a missing prefix/suffix as empty so the length arithmetic below cannot NPE.
	final String commonPrefix = prefix == null ? "" : prefix;
	final String rawSuffix = suffix == null ? "" : suffix;

	// Prefix and suffix may overlap inside a short key (e.g. keys "a" and "aa" share both the
	// prefix "a" and the suffix "a"). Stripping both would then ask for a substring whose end
	// lies before its start. Trim the suffix to the longest length that still fits behind the
	// prefix in every key; a tail of a common suffix is still a common suffix.
	int suffixLength = rawSuffix.length();
	for (String key : strings.keySet()) {
		suffixLength = Math.min(suffixLength, Math.max(0, key.length() - commonPrefix.length()));
	}
	final String commonSuffix = rawSuffix.substring(rawSuffix.length() - suffixLength);

	// One flag drives both the stripping and the wrapping, so the two can never disagree.
	final boolean hasAffix = !commonPrefix.isEmpty() || !commonSuffix.isEmpty();

	// remove prefix and suffix
	if (hasAffix) {
		log.debug("Reduced strings by the '{}' prefix and '{}' suffix", commonPrefix, commonSuffix);
		final Object2IntMap<String> oldStrings = strings;
		strings = new Object2IntOpenHashMap<>(oldStrings.size());
		for (Object2IntMap.Entry<String> e : oldStrings.object2IntEntrySet()) {
			final String key = e.getKey();
			strings.put(key.substring(commonPrefix.length(), key.length() - commonSuffix.length()), e.getIntValue());
		}
	}

	decode();

	// Try all guesses and select the least memory intensive one.
	// TODO FK: Simplify this, the guessers do a lot of weird lazy computation but implicit.
	Guess guess = Stream.of(new TrieTypeGuesser(this), new MapTypeGuesser(this), new NumberTypeGuesser(this, getConfig()))
						.map(StringTypeGuesser::createGuess)
						.filter(Objects::nonNull)
						.min(Comparator.naturalOrder())
						.orElseThrow();

	log.debug("\tUsing {}(est. {})", guess.getGuesser(), BinaryByteUnit.format(guess.estimate()));

	StringStore result = guess.getType();

	// wrap in prefix suffix, using exactly the parts that were stripped above
	if (hasAffix) {
		result = new StringTypePrefixSuffix(result, commonPrefix, commonSuffix);
	}
	return result;
}
Use of com.bakdata.conquery.models.preproc.parser.specific.string.MapTypeGuesser in the project conquery by bakdata.
Class Preprocessed, method encodePrimaryDictionary.
/**
 * Builds the dictionary for the primary column.
 *
 * <p>Applies UTF-8 encoding to {@code primaryColumn}, then takes the guess of a
 * {@link MapTypeGuesser} for that column and returns the dictionary underlying the
 * guessed type.
 *
 * @return the dictionary underlying the map-based guess for the primary column
 */
private Dictionary encodePrimaryDictionary() {
	log.debug("Encode primary Dictionary");

	// The encoding is applied before the guesser is created and queried.
	primaryColumn.applyEncoding(StringTypeEncoded.Encoding.UTF8);

	final Dictionary dictionary =
			new MapTypeGuesser(primaryColumn)
					.createGuess()
					.getType()
					.getUnderlyingDictionary();

	log.trace("\tPrimaryColumn -> {}", dictionary);

	return dictionary;
}
Aggregations