Use of org.omegat.util.Token in project omegat by omegat-org.
Class FindMatches, method processEntry.
/**
 * Scores one candidate entry against the segment currently being matched
 * and, if it remains competitive, hands it to {@code addNearString}.
 * <p>
 * Up to three similarity values are computed in sequence: against the
 * stemmed tokens, against the non-stemmed tokens, and against all tokens
 * (numbers, tags, etc.). The same total penalty is subtracted from each
 * value. After every step {@code haveChanceToAdd} is consulted with the
 * values known so far ({@code Integer.MAX_VALUE} standing in for the ones
 * not yet computed), and the method returns early when it answers
 * {@code false}, skipping the remaining tokenization work.
 *
 * @param key
 *            key of the candidate entry, passed through to the result
 * @param source
 *            source text of the candidate entry
 * @param translation
 *            translation of the candidate entry
 * @param comesFrom
 *            origin of the candidate, passed through to the result
 * @param fuzzy
 *            when {@code true}, {@code PENALTY_FOR_FUZZY} is subtracted
 *            from every similarity value
 * @param penalty
 *            caller-supplied penalty subtracted from every similarity value
 * @param tmxName
 *            passed through to the result
 * @param creator
 *            passed through to the result
 * @param creationDate
 *            passed through to the result
 * @param changer
 *            passed through to the result
 * @param changedDate
 *            passed through to the result
 * @param props
 *            passed through to the result
 */
protected void processEntry(final EntryKey key, final String source, final String translation,
        NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int penalty, final String tmxName,
        final String creator, final long creationDate, final String changer, final long changedDate,
        final List<TMXProp> props) {
    // Every similarity value is reduced by the same amount, so accumulate it once.
    int totalPenalty = penalty;
    if (fuzzy) {
        totalPenalty += PENALTY_FOR_FUZZY;
    }

    // Strip the removable part before tokenizing.
    String strippedSource = source;
    if (removePattern != null) {
        Matcher remover = removePattern.matcher(source);
        StringBuilder strippedOut = new StringBuilder();
        while (remover.find()) {
            strippedOut.append(remover.group());
        }
        strippedSource = remover.replaceAll("");
        // If the stripped-out text differs from removedText, apply a penalty;
        // otherwise different strings would get a 100% match.
        if (!strippedOut.toString().equals(removedText)) {
            totalPenalty += PENALTY_FOR_REMOVED;
        }
    }

    // First percentage: with stemming if possible.
    int similarityStem = FuzzyMatcher.calcSimilarity(distance, strTokensStem, tokenizeStem(strippedSource))
            - totalPenalty;
    if (!haveChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
        return;
    }

    // Second percentage: without stemming.
    int similarityNoStem = FuzzyMatcher.calcSimilarity(distance, strTokensNoStem,
            tokenizeNoStem(strippedSource)) - totalPenalty;
    if (!haveChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
        return;
    }

    // Third percentage: with numbers, tags, etc.
    int simAdjusted = FuzzyMatcher.calcSimilarity(distance, strTokensAll, tokenizeAll(strippedSource))
            - totalPenalty;
    if (!haveChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
        return;
    }

    // The original (unstripped) source is what gets recorded.
    addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem, simAdjusted,
            null, tmxName, creator, creationDate, changer, changedDate, props);
}
Use of org.omegat.util.Token in project omegat by omegat-org.
Class EditorUtils, method doChangeCase.
/**
 * Converts the case of every case-changeable token of {@code input}.
 * <p>
 * With {@link CHANGE_CASE_TO#CYCLE} the tokens are first classified (lower,
 * upper, title, mixed, or ambiguous between title and upper) and the
 * concrete target case is chosen by {@code determineTargetCase}; the
 * intended cycle is UPPER, then LOWER, then SENTENCE, then TITLE, then
 * UPPER again. If no token carries any case at all, the input is returned
 * unchanged.
 *
 * @param input
 *            The string to change
 * @param toWhat
 *            The case to change to, or {@link CHANGE_CASE_TO#CYCLE}
 * @param locale
 *            The locale of the input string
 * @param tokenizer
 *            A tokenizer for the input string language
 * @return The modified string
 */
public static String doChangeCase(String input, CHANGE_CASE_TO toWhat, Locale locale, ITokenizer tokenizer) {
    Token[] tokens = tokenizer.tokenizeVerbatim(input);

    CHANGE_CASE_TO target = toWhat;
    if (target == CHANGE_CASE_TO.CYCLE) {
        int lowerCount = 0;
        int upperCount = 0;
        int titleCount = 0;
        // Single cased letter and the like: could be title, could be upper.
        int ambiguousCount = 0;
        int mixedCount = 0;

        for (Token tok : tokens) {
            String text = tok.getTextFromString(input);
            if (!canChangeTokenCase(text)) {
                continue;
            }
            if (StringUtil.isLowerCase(text)) {
                lowerCount++;
                continue;
            }
            boolean looksTitle = StringUtil.isTitleCase(text);
            boolean looksUpper = StringUtil.isUpperCase(text);
            if (looksTitle && looksUpper) {
                ambiguousCount++;
            } else if (looksTitle) {
                titleCount++;
            } else if (looksUpper) {
                upperCount++;
            } else if (StringUtil.isMixedCase(text)) {
                mixedCount++;
            }
            // Anything else is treated as caseless text (e.g. CJK
            // ideographs or symbols only) and is not counted.
        }

        boolean anyCasedToken = lowerCount != 0 || titleCount != 0 || upperCount != 0 || mixedCount != 0
                || ambiguousCount != 0;
        if (!anyCasedToken) {
            // Nothing whose case could be cycled.
            return input;
        }
        target = determineTargetCase(lowerCount, upperCount, titleCount, mixedCount, ambiguousCount);
    }

    StringBuilder out = new StringBuilder(input);
    // Running difference between positions in 'out' and positions in 'input';
    // a case conversion may change a token's length.
    int shift = 0;
    for (Token tok : tokens) {
        String text = tok.getTextFromString(input);
        if (!canChangeTokenCase(text)) {
            continue;
        }

        String replacement;
        if (target == CHANGE_CASE_TO.LOWER) {
            replacement = text.toLowerCase(locale);
        } else if (target == CHANGE_CASE_TO.UPPER) {
            replacement = text.toUpperCase(locale);
        } else if (target == CHANGE_CASE_TO.TITLE || target == CHANGE_CASE_TO.SENTENCE) {
            replacement = StringUtil.toTitleCase(text, locale);
            if (target == CHANGE_CASE_TO.SENTENCE) {
                // Sentence case: only the first changeable token is
                // title-cased, every following one is lower-cased.
                target = CHANGE_CASE_TO.LOWER;
            }
        } else {
            replacement = text;
        }

        int start = tok.getOffset() + shift;
        out.replace(start, start + tok.getLength(), replacement);
        shift += replacement.length() - tok.getLength();
    }
    return out.toString();
}
Use of org.omegat.util.Token in project omegat by omegat-org.
Class GlossarySearcher, method getCjkMatchingTokens.
/**
 * Finds every occurrence of a CJK term inside the full text by plain
 * substring search.
 * <p>
 * The search only applies when a project is loaded, the project's source
 * language is not space-delimited, and {@code StringUtil.isCJK(term)} is
 * true; otherwise an empty list is returned. Occurrences may overlap, since
 * the search resumes one character after the previous hit.
 *
 * @param fullText
 *            text to search in
 * @param term
 *            term to look for
 * @return one single-element token array per occurrence, in order of
 *         appearance; an empty list when the search does not apply or the
 *         term does not occur
 */
private static List<Token[]> getCjkMatchingTokens(String fullText, String term) {
    IProject project = Core.getProject();
    boolean searchApplies = project.isProjectLoaded()
            && !project.getProjectProperties().getSourceLanguage().isSpaceDelimited()
            && StringUtil.isCJK(term);
    if (!searchApplies) {
        return Collections.emptyList();
    }

    int pos = fullText.indexOf(term);
    if (pos < 0) {
        return Collections.emptyList();
    }

    List<Token[]> hits = new ArrayList<>();
    do {
        hits.add(new Token[] { new Token(term, pos) });
        pos = fullText.indexOf(term, pos + 1);
    } while (pos >= 0);
    return hits;
}
Use of org.omegat.util.Token in project omegat by omegat-org.
Class TransTipsMarker, method getMarksForTokens.
/**
 * Converts groups of matched tokens into source-side marks carrying the
 * TransTips underliner and the given tooltip.
 * <p>
 * A token is folded into the previously produced mark (extending that mark
 * to the token's end) whenever {@code canCloseSpan} accepts the stretch of
 * {@code srcText} between the previous mark's end and the token's start;
 * otherwise the token gets a mark of its own.
 * <p>
 * Note that {@code tokens} is sorted in place, keyed on the offset of each
 * group's first element as it stands at that moment; the groups themselves
 * are sorted by offset only afterwards, while being processed.
 *
 * @param tokens
 *            token groups to mark; reordered by this call
 * @param srcText
 *            the source text the token offsets refer to
 * @param tooltip
 *            tooltip text assigned to every resulting mark
 * @return the marks in production order; an empty list when either
 *         {@code tokens} or {@code srcText} is empty
 */
private static List<Mark> getMarksForTokens(List<Token[]> tokens, String srcText, String tooltip) {
    if (tokens.isEmpty() || srcText.isEmpty()) {
        return Collections.emptyList();
    }

    tokens.sort(Comparator.comparingInt(group -> group[0].getOffset()));

    List<Mark> marks = new ArrayList<>(tokens.size());
    for (Token[] group : tokens) {
        if (group.length >= 2) {
            Arrays.sort(group, Comparator.comparingInt(Token::getOffset));
        }
        for (Token tok : group) {
            int lastIndex = marks.size() - 1;
            Mark previous = lastIndex < 0 ? null : marks.get(lastIndex);
            int tokStart = tok.getOffset();
            int tokEnd = tokStart + tok.getLength();

            boolean mergeWithPrevious = previous != null
                    && canCloseSpan(srcText, previous.endOffset, tokStart);
            int markStart = mergeWithPrevious ? previous.startOffset : tokStart;

            Mark mark = new Mark(Mark.ENTRY_PART.SOURCE, markStart, tokEnd);
            mark.painter = TRANSTIPS_UNDERLINER;
            mark.toolTipText = tooltip;

            if (mergeWithPrevious) {
                // The widened mark takes the place of the one it absorbed.
                marks.set(lastIndex, mark);
            } else {
                marks.add(mark);
            }
        }
    }
    return marks;
}
Use of org.omegat.util.Token in project omegat by omegat-org.
Class BaseTokenizer, method tokenizeByCodePoint.
/**
 * Splits a string into one token per Unicode code point.
 * <p>
 * Each token holds the text of a single code point (one or two
 * {@code char}s) together with the {@code char} index at which that code
 * point starts in {@code strOrig}.
 *
 * @param strOrig
 *            the string to split
 * @return the tokens in order of appearance; an empty array for an empty
 *         string
 */
protected Token[] tokenizeByCodePoint(String strOrig) {
    // Iteration scheme follows example 1-5 of
    // http://www.ibm.com/developerworks/library/j-unicode/#1-5
    // which appeared to be faster than 1-6 here (possibly because the
    // strings are short).
    final int charLength = strOrig.length();
    Token[] result = new Token[strOrig.codePointCount(0, charLength)];

    int charIndex = 0;
    int tokenIndex = 0;
    while (charIndex < charLength) {
        int codePoint = strOrig.codePointAt(charIndex);
        result[tokenIndex] = new Token(new String(Character.toChars(codePoint)), charIndex);
        tokenIndex++;
        // Supplementary code points occupy two chars, all others one.
        charIndex += Character.charCount(codePoint);
    }
    return result;
}
Aggregations