use of io.airlift.slice.SliceUtf8.tryGetCodePointAt in project presto by prestodb.
the class StringFunctions method hammingDistance.
/**
 * Computes the Hamming distance between two strings, comparing them one code point at a time.
 *
 * @param left first string
 * @param right second string
 * @return the number of aligned positions at which the decoded values differ
 *
 * The call is rejected through {@code checkCondition} with INVALID_FUNCTION_ARGUMENT
 * unless both inputs are consumed completely by the same number of steps.
 */
@Description("computes Hamming distance between two strings")
@ScalarFunction
@LiteralParameters({ "x", "y" })
@SqlType(StandardTypes.BIGINT)
public static long hammingDistance(@SqlType("varchar(x)") Slice left, @SqlType("varchar(y)") Slice right) {
    final int leftSize = left.length();
    final int rightSize = right.length();

    int leftCursor = 0;
    int rightCursor = 0;
    int mismatches = 0;

    while (leftCursor < leftSize && rightCursor < rightSize) {
        int leftValue = tryGetCodePointAt(left, leftCursor);
        int rightValue = tryGetCodePointAt(right, rightCursor);

        // A negative value marks a sequence that could not be decoded. Two such
        // sequences are counted as equal only when their negative values match.
        if (leftValue != rightValue) {
            mismatches++;
        }

        // Advance by the encoded length of a decoded code point, or by the
        // magnitude of the negative value when decoding failed.
        if (leftValue >= 0) {
            leftCursor += lengthOfCodePoint(leftValue);
        }
        else {
            leftCursor -= leftValue;
        }

        if (rightValue >= 0) {
            rightCursor += lengthOfCodePoint(rightValue);
        }
        else {
            rightCursor -= rightValue;
        }
    }

    // Both cursors must land exactly on the end of their input; otherwise one
    // string had more positions than the other.
    boolean bothFullyConsumed = leftCursor == leftSize && rightCursor == rightSize;
    checkCondition(bothFullyConsumed, INVALID_FUNCTION_ARGUMENT, "The input strings to hamming_distance function must have the same length");

    return mismatches;
}
use of io.airlift.slice.SliceUtf8.tryGetCodePointAt in project presto by prestodb.
the class StringFunctions method pad.
/**
 * Pads or truncates {@code text} so that it is {@code targetLength} code points long.
 *
 * @param text the string to pad; returned unchanged when it already has the target length
 * @param targetLength desired length in code points, must be within [0..Integer.MAX_VALUE]
 * @param padString the string whose code points are repeated to fill the gap; must not be empty
 * @param paddingOffset byte offset in the result at which the padding starts; the existing
 *        text is written at {@code (paddingOffset + paddingBytes) % resultBytes}
 * @return the padded or truncated string
 */
private static Slice pad(Slice text, long targetLength, Slice padString, int paddingOffset) {
    checkCondition(
            0 <= targetLength && targetLength <= Integer.MAX_VALUE,
            INVALID_FUNCTION_ARGUMENT,
            "Target length must be in the range [0.." + Integer.MAX_VALUE + "]");
    checkCondition(padString.length() > 0, INVALID_FUNCTION_ARGUMENT, "Padding string must not be empty");

    int currentCodePoints = countCodePoints(text);
    int desiredCodePoints = (int) targetLength;

    // Already the right length: hand back the input as is.
    if (currentCodePoints == desiredCodePoints) {
        return text;
    }

    // Longer than requested: keep only the leading code points.
    if (currentCodePoints > desiredCodePoints) {
        return SliceUtf8.substring(text, 0, desiredCodePoints);
    }

    // Byte width of every code point in the pad string.
    int padCodePoints = countCodePoints(padString);
    int[] padCodePointSizes = new int[padCodePoints];
    for (int index = 0; index < padCodePoints; index++) {
        int start = offsetOfCodePoint(padString, index);
        padCodePointSizes[index] = lengthOfCodePointSafe(padString, start);
    }

    // Bytes needed for the missing code points, cycling through the pad string.
    int missingCodePoints = desiredCodePoints - currentCodePoints;
    int paddingBytes = 0;
    for (int n = 0; n < missingCodePoints; n++) {
        paddingBytes += padCodePointSizes[n % padCodePoints];
    }

    int resultBytes = text.length() + paddingBytes;
    Slice result = Slices.allocate(resultBytes);

    // Place the existing text; the modulo wraps the start back to 0 when the
    // padding region ends exactly at the end of the buffer.
    int textStart = (paddingOffset + paddingBytes) % resultBytes;
    result.setBytes(textStart, text);

    // Write as many complete copies of the pad string as fit.
    int wholeCopies = paddingBytes / padString.length();
    int cursor = paddingOffset;
    for (int copy = 0; copy < wholeCopies; copy++) {
        result.setBytes(cursor, padString);
        cursor += padString.length();
    }

    // Fill what is left of the padding region with a prefix of the pad string.
    int tailBytes = paddingOffset + paddingBytes - cursor;
    result.setBytes(cursor, padString.getBytes(0, tailBytes));
    return result;
}
use of io.airlift.slice.SliceUtf8.tryGetCodePointAt in project presto by prestodb.
the class StringFunctions method fromUtf8.
/**
 * Decodes UTF-8 encoded bytes into a string, substituting invalid sequences.
 *
 * @param slice the bytes to decode
 * @param replacementCharacter empty, or exactly one character used in place of each
 *        invalid sequence; when empty, no replacement code point is passed on
 * @return the result of {@code SliceUtf8.fixInvalidUtf8} for the given bytes and replacement
 * @throws PrestoException with INVALID_FUNCTION_ARGUMENT when the replacement holds more
 *         than one code point or cannot be decoded as UTF-8
 */
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType("varchar(x)") Slice replacementCharacter) {
    int count = countCodePoints(replacementCharacter);
    if (count > 1) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Replacement character string must be empty or a single character");
    }

    OptionalInt replacementCodePoint;
    if (count == 1) {
        try {
            replacementCodePoint = OptionalInt.of(getCodePointAt(replacementCharacter, 0));
        }
        catch (InvalidUtf8Exception e) {
            // Keep the decoding failure as the cause so the root problem stays visible in stack traces.
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character", e);
        }
    }
    else {
        replacementCodePoint = OptionalInt.empty();
    }

    return SliceUtf8.fixInvalidUtf8(slice, replacementCodePoint);
}
use of io.airlift.slice.SliceUtf8.tryGetCodePointAt in project presto by prestodb.
the class StringFunctions method safeCountCodePoints.
/**
 * Counts the code points in {@code slice}, rejecting input that cannot be decoded.
 *
 * @param slice the bytes to scan from start to end
 * @return the number of code points decoded
 * @throws PrestoException with INVALID_FUNCTION_ARGUMENT as soon as a position
 *         yields a negative result from {@code tryGetCodePointAt}
 */
private static int safeCountCodePoints(Slice slice) {
    int total = 0;
    int offset = 0;

    while (offset < slice.length()) {
        int decoded = tryGetCodePointAt(slice, offset);
        if (decoded < 0) {
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
        }

        // Step over the bytes that make up the code point just decoded.
        offset += lengthOfCodePoint(decoded);
        total++;
    }

    return total;
}
use of io.airlift.slice.SliceUtf8.tryGetCodePointAt in project presto by prestodb.
the class CharacterStringCasts method codePointsToSliceUtf8.
/**
 * Encodes a sequence of code points into a single slice.
 *
 * @param codePoints the code points to write, in order
 * @return a slice backed by a new byte array sized to the summed
 *         {@code lengthOfCodePoint} of all entries, with each code point written in turn
 */
private static Slice codePointsToSliceUtf8(List<Integer> codePoints) {
    // First pass: work out how many bytes the encoded form needs.
    int totalBytes = 0;
    for (int value : codePoints) {
        totalBytes += lengthOfCodePoint(value);
    }

    byte[] backing = new byte[totalBytes];
    Slice encoded = Slices.wrappedBuffer(backing);

    // Second pass: write each code point directly after the previous one.
    int writeAt = 0;
    for (int value : codePoints) {
        setCodePointAt(value, encoded, writeAt);
        writeAt += lengthOfCodePoint(value);
    }

    return encoded;
}
Aggregations