use of io.airlift.slice.SliceUtf8.offsetOfCodePoint in project presto by prestodb.
the class CharacterStringCasts method varcharToCharSaturatedFloorCast.
/**
 * Saturated floor cast from {@code varchar(x)} to {@code char(y)}.
 *
 * <p>The result is built from the input with its spaces trimmed (via {@code trimSpaces}):
 * <ul>
 *   <li>when the trimmed code point count plus the number of trimmed bytes reaches {@code y},
 *       the trimmed text is truncated to {@code y} via {@code truncateToLength};</li>
 *   <li>otherwise the trimmed text loses its last code point, or the empty slice is returned
 *       when nothing is left after trimming.</li>
 * </ul>
 *
 * <p>The value produced this way is not necessarily the largest Char(y) value that is smaller
 * than the original Varchar(x) value. This is fine though for usage in TupleDomainTranslator.
 *
 * @param y target char length
 * @param slice the varchar value to cast
 * @return the floored char value
 */
@ScalarOperator(OperatorType.SATURATED_FLOOR_CAST)
@SqlType("char(y)")
@LiteralParameters({ "x", "y" })
public static Slice varcharToCharSaturatedFloorCast(@LiteralParameter("y") Long y, @SqlType("varchar(x)") Slice slice) {
    Slice stripped = trimSpaces(slice);
    int strippedCodePoints = countCodePoints(stripped);
    // byte-length difference, taken as the number of spaces removed by trimSpaces
    int removedSpaces = slice.length() - stripped.length();

    if (strippedCodePoints + removedSpaces < y) {
        // The value (spaces included) is shorter than y: all spaces are already gone,
        // so there is nothing to return when no text remains ...
        if (strippedCodePoints == 0) {
            return EMPTY_SLICE;
        }
        // ... otherwise drop one more trailing code point to get a smaller Char(y) value.
        int lastCodePointOffset = offsetOfCodePoint(stripped, strippedCodePoints - 1);
        return stripped.slice(0, lastCodePointOffset);
    }

    // The value (spaces included) is at least y long, so plain truncation is enough.
    return truncateToLength(stripped, y.intValue());
}
use of io.airlift.slice.SliceUtf8.offsetOfCodePoint in project presto by prestodb.
the class StringFunctions method pad.
/**
 * Brings {@code text} to exactly {@code targetLength} code points: returns it unchanged when it
 * already has that length, truncates it when it is longer, and otherwise fills the missing code
 * points by cycling through the code points of {@code padString}.
 *
 * @param text the string to pad or truncate
 * @param targetLength desired length in code points; checked to be in [0..Integer.MAX_VALUE]
 * @param padString the padding pattern; checked to be non-empty
 * @param paddingOffset byte offset in the result buffer at which the padding is written
 *        (NOTE(review): presumably 0 for left padding and text.length() for right padding;
 *        confirm against the callers, which are not visible here)
 * @return the padded or truncated string
 */
private static Slice pad(Slice text, long targetLength, Slice padString, int paddingOffset) {
checkCondition(0 <= targetLength && targetLength <= Integer.MAX_VALUE, INVALID_FUNCTION_ARGUMENT, "Target length must be in the range [0.." + Integer.MAX_VALUE + "]");
checkCondition(padString.length() > 0, INVALID_FUNCTION_ARGUMENT, "Padding string must not be empty");
// lengths below are in code points; the cast is safe because of the range check above
int textLength = countCodePoints(text);
int resultLength = (int) targetLength;
// if our target length is the same as our string then return our string
if (textLength == resultLength) {
return text;
}
// if our string is bigger than requested then truncate
if (textLength > resultLength) {
return SliceUtf8.substring(text, 0, resultLength);
}
// number of bytes in each code point of the pad string
int padStringLength = countCodePoints(padString);
int[] padStringCounts = new int[padStringLength];
for (int i = 0; i < padStringLength; ++i) {
padStringCounts[i] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, i));
}
// preallocate the result: bytes of the text plus the bytes of every pad code point that
// will be appended, cycling through the pad string
int bufferSize = text.length();
for (int i = 0; i < resultLength - textLength; ++i) {
bufferSize += padStringCounts[i % padStringLength];
}
Slice buffer = Slices.allocate(bufferSize);
// fill in the existing string
// countBytes is the total number of padding bytes. The modulo makes the text start at
// countBytes when paddingOffset is 0, and wrap to 0 when paddingOffset equals text.length()
// (the sum is then exactly bufferSize).
int countBytes = bufferSize - text.length();
int startPointOfExistingText = (paddingOffset + countBytes) % bufferSize;
buffer.setBytes(startPointOfExistingText, text);
// assign the pad string while there's enough space for it
int byteIndex = paddingOffset;
for (int i = 0; i < countBytes / padString.length(); ++i) {
buffer.setBytes(byteIndex, padString);
byteIndex += padString.length();
}
// handle the tail: at most we assign padStringLength - 1 code points
// (the remaining byte count is paddingOffset + countBytes - byteIndex, i.e. a prefix of padString)
buffer.setBytes(byteIndex, padString.getBytes(0, paddingOffset + countBytes - byteIndex));
return buffer;
}
use of io.airlift.slice.SliceUtf8.offsetOfCodePoint in project presto by prestodb.
the class StringFunctions method fromUtf8.
/**
 * Decodes {@code slice} as UTF-8, delegating the handling of invalid sequences to
 * {@code SliceUtf8.fixInvalidUtf8}.
 *
 * @param slice the bytes to decode
 * @param replacementCharacter either empty or exactly one code point; when it holds one code
 *        point, that code point is passed to {@code fixInvalidUtf8} as the replacement,
 *        otherwise an empty {@code OptionalInt} is passed
 * @return the result of {@code SliceUtf8.fixInvalidUtf8}
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} when
 *         {@code replacementCharacter} has more than one code point or cannot be decoded
 */
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType("varchar(x)") Slice replacementCharacter) {
    int count = countCodePoints(replacementCharacter);
    if (count > 1) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Replacement character string must be empty or a single character");
    }
    OptionalInt replacementCodePoint;
    if (count == 1) {
        try {
            replacementCodePoint = OptionalInt.of(getCodePointAt(replacementCharacter, 0));
        }
        catch (InvalidUtf8Exception e) {
            // keep the decoder failure as the cause so the original diagnostics are not lost
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character", e);
        }
    }
    else {
        replacementCodePoint = OptionalInt.empty();
    }
    return SliceUtf8.fixInvalidUtf8(slice, replacementCodePoint);
}
use of io.airlift.slice.SliceUtf8.offsetOfCodePoint in project presto by prestodb.
the class StringFunctions method splitPart.
/**
 * Splits {@code string} on {@code delimiter} and returns the field at {@code index}
 * (counting from one).
 *
 * <p>With an empty delimiter every code point is its own field, so the result is the
 * {@code index}-th code point of {@code string}.
 *
 * @param string the text to split
 * @param delimiter the separator; may be empty
 * @param index one-based field number; checked to be greater than zero
 * @return the requested field, or {@code null} when {@code index} is beyond the last field
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} when, for an empty
 *         delimiter, the selected code point runs past the end of {@code string}
 */
@SqlNullable
@Description("splits a string by a delimiter and returns the specified field (counting from one)")
@ScalarFunction
@LiteralParameters({ "x", "y" })
@SqlType("varchar(x)")
public static Slice splitPart(@SqlType("varchar(x)") Slice string, @SqlType("varchar(y)") Slice delimiter, @SqlType(StandardTypes.BIGINT) long index) {
    checkCondition(index > 0, INVALID_FUNCTION_ARGUMENT, "Index must be greater than zero");
    // Empty delimiter? Then every character will be a split
    if (delimiter.length() == 0) {
        // A string of N bytes holds at most N code points, so an index above the byte length
        // can never be satisfied. Checking the long value first keeps indexes larger than
        // Integer.MAX_VALUE from failing in toIntExact with an ArithmeticException; they get
        // the same "index too big" answer as on the non-empty-delimiter path.
        if (index > string.length()) {
            return null;
        }
        int startCodePoint = toIntExact(index);
        int indexStart = offsetOfCodePoint(string, startCodePoint - 1);
        if (indexStart < 0) {
            // index too big
            return null;
        }
        int length = lengthOfCodePoint(string, indexStart);
        if (indexStart + length > string.length()) {
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding");
        }
        return string.slice(indexStart, length);
    }

    int matchCount = 0;
    int previousIndex = 0;
    while (previousIndex < string.length()) {
        int matchIndex = string.indexOf(delimiter, previousIndex);
        // No match
        if (matchIndex < 0) {
            break;
        }
        // Reached the requested part?
        if (++matchCount == index) {
            return string.slice(previousIndex, matchIndex - previousIndex);
        }
        // Continue searching after the delimiter
        previousIndex = matchIndex + delimiter.length();
    }

    if (matchCount == index - 1) {
        // returns last section of the split
        return string.slice(previousIndex, string.length() - previousIndex);
    }

    // index is too big, null is returned
    return null;
}
use of io.airlift.slice.SliceUtf8.offsetOfCodePoint in project presto by prestodb.
the class StringFunctions method substr.
/**
 * Returns the substring of {@code utf8} that starts at code point position {@code start} and
 * spans at most {@code length} code points.
 *
 * <p>A positive {@code start} is one-based from the beginning of the string; a negative
 * {@code start} is relative to the end of the string. Both {@code start} and {@code length}
 * are saturated to the {@code int} range before use.
 *
 * @param utf8 the source string
 * @param start one-based start position, or a negative position counted from the end
 * @param length maximum number of code points to return
 * @return the selected substring, or the empty slice when {@code start} is zero,
 *         {@code length} is not positive, {@code utf8} is empty, or {@code start} lies
 *         outside the string
 */
@Description("substring of given length starting at an index")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice substr(@SqlType("varchar(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start, @SqlType(StandardTypes.BIGINT) long length) {
    if (start == 0 || (length <= 0) || (utf8.length() == 0)) {
        return Slices.EMPTY_SLICE;
    }

    int startCodePoint = Ints.saturatedCast(start);
    int lengthCodePoints = Ints.saturatedCast(length);

    if (startCodePoint > 0) {
        int indexStart = offsetOfCodePoint(utf8, startCodePoint - 1);
        if (indexStart < 0) {
            // start is past the end of the string
            return Slices.EMPTY_SLICE;
        }
        int indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints);
        if (indexEnd < 0) {
            // after end of string
            indexEnd = utf8.length();
        }
        return utf8.slice(indexStart, indexEnd - indexStart);
    }

    // negative start is relative to end of string
    int codePoints = countCodePoints(utf8);
    startCodePoint += codePoints;

    // before beginning of string
    if (startCodePoint < 0) {
        return Slices.EMPTY_SLICE;
    }

    int indexStart = offsetOfCodePoint(utf8, startCodePoint);
    int indexEnd;
    // Here 0 <= startCodePoint < codePoints, so the subtraction cannot overflow. The additive
    // form (startCodePoint + lengthCodePoints) wraps negative when length saturates to
    // Integer.MAX_VALUE, which would pick the wrong branch and yield a negative slice length.
    if (lengthCodePoints < codePoints - startCodePoint) {
        indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints);
    }
    else {
        indexEnd = utf8.length();
    }
    return utf8.slice(indexStart, indexEnd - indexStart);
}
Aggregations