Search in sources :

Example 6 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class FindMatches method processEntry.

/**
 * Compare one candidate entry with the original entry and, if it stays
 * viable, hand it over to {@code addNearString}.
 * <p>
 * Three similarity percentages are computed one after another: on stemmed
 * tokens, on unstemmed tokens, and on all tokens. The same total penalty is
 * subtracted from each of them. After every step {@code haveChanceToAdd} is
 * consulted, and the method returns early as soon as it answers
 * {@code false}, so the later tokenizations are skipped for rejected
 * candidates.
 * <p>
 * When {@code removePattern} is set, every match of it is stripped from the
 * candidate source before tokenizing. If the concatenated stripped text
 * differs from {@code removedText}, {@code PENALTY_FOR_REMOVED} is added to
 * the total penalty.
 *
 * @param key
 *            key of the candidate entry, passed through to {@code addNearString}
 * @param source
 *            source text of the candidate entry; the unstripped text is
 *            what gets passed to {@code addNearString}
 * @param translation
 *            translation of the candidate entry
 * @param comesFrom
 *            origin of the candidate entry
 * @param fuzzy
 *            when {@code true}, {@code PENALTY_FOR_FUZZY} is subtracted
 *            from every similarity value
 * @param penalty
 *            caller-supplied amount subtracted from every similarity value
 * @param tmxName
 *            passed through to {@code addNearString}
 * @param creator
 *            passed through to {@code addNearString}
 * @param creationDate
 *            passed through to {@code addNearString}
 * @param changer
 *            passed through to {@code addNearString}
 * @param changedDate
 *            passed through to {@code addNearString}
 * @param props
 *            passed through to {@code addNearString}
 */
protected void processEntry(final EntryKey key, final String source, final String translation, NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int penalty, final String tmxName, final String creator, final long creationDate, final String changer, final long changedDate, final List<TMXProp> props) {
    // Every similarity value is reduced by the same amount, so accumulate it once.
    int totalPenalty = penalty;
    if (fuzzy) {
        totalPenalty += PENALTY_FOR_FUZZY;
    }

    // Strip the part that has to be removed prior to tokenizing.
    String strippedSource = source;
    if (removePattern != null) {
        Matcher remover = removePattern.matcher(source);
        StringBuilder strippedParts = new StringBuilder();
        while (remover.find()) {
            strippedParts.append(remover.group());
        }
        strippedSource = remover.replaceAll("");
        // A different 'removed' part must cost something, otherwise
        // different strings would get a 100% match.
        if (!strippedParts.toString().equals(removedText)) {
            totalPenalty += PENALTY_FOR_REMOVED;
        }
    }

    // First percent value - with stemming if possible.
    Token[] stemmedTokens = tokenizeStem(strippedSource);
    final int similarityStem = FuzzyMatcher.calcSimilarity(distance, strTokensStem, stemmedTokens) - totalPenalty;
    if (!haveChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
        return;
    }

    // Second percent value - without stemming.
    Token[] plainTokens = tokenizeNoStem(strippedSource);
    final int similarityNoStem = FuzzyMatcher.calcSimilarity(distance, strTokensNoStem, plainTokens) - totalPenalty;
    if (!haveChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
        return;
    }

    // Third percent value - with numbers, tags, etc.
    Token[] allTokens = tokenizeAll(strippedSource);
    final int simAdjusted = FuzzyMatcher.calcSimilarity(distance, strTokensAll, allTokens) - totalPenalty;
    if (!haveChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
        return;
    }

    addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem, simAdjusted, null, tmxName, creator, creationDate, changer, changedDate, props);
}
Also used : Matcher(java.util.regex.Matcher) FuzzyMatcher(org.omegat.core.matching.FuzzyMatcher) Token(org.omegat.util.Token) NearString(org.omegat.core.matching.NearString)

Example 7 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class EditorUtils method doChangeCase.

/**
 * Change the case of the input string to the indicated case.
 * <p>
 * The input is tokenized verbatim and only tokens accepted by
 * {@code canChangeTokenCase} are rewritten; everything else is copied
 * unchanged. When toWhat is {@link CHANGE_CASE_TO#CYCLE}, the current case
 * of the tokens is tallied and {@code determineTargetCase} picks the next
 * case, so that repeated calls go UPPER, LOWER, SENTENCE, TITLE and back
 * to UPPER. If no token has a recognizable case, the input is returned
 * as is.
 *
 * @param input
 *            The string to change
 * @param toWhat
 *            The case to change to, or {@link CHANGE_CASE_TO#CYCLE}
 * @param locale
 *            The locale of the input string
 * @param tokenizer
 *            A tokenizer for the input string language
 * @return The modified string
 */
public static String doChangeCase(String input, CHANGE_CASE_TO toWhat, Locale locale, ITokenizer tokenizer) {
    // tokenize the selection
    Token[] tokenList = tokenizer.tokenizeVerbatim(input);

    CHANGE_CASE_TO target = toWhat;
    if (target == CHANGE_CASE_TO.CYCLE) {
        int lowerCount = 0;
        int upperCount = 0;
        int titleCount = 0;
        // A token that is both title and upper case could be either.
        int ambiguousCount = 0;
        int mixedCount = 0;
        for (Token tok : tokenList) {
            String word = tok.getTextFromString(input);
            if (!canChangeTokenCase(word)) {
                continue;
            }
            if (StringUtil.isLowerCase(word)) {
                lowerCount++;
                continue;
            }
            boolean looksTitle = StringUtil.isTitleCase(word);
            boolean looksUpper = StringUtil.isUpperCase(word);
            if (looksTitle && looksUpper) {
                ambiguousCount++;
            } else if (looksTitle) {
                titleCount++;
            } else if (looksUpper) {
                upperCount++;
            } else if (StringUtil.isMixedCase(word)) {
                mixedCount++;
            }
            // Anything else is ignored: it should be caseless text
            // such as CJK ideographs or symbols only.
        }
        boolean anyCasedToken = lowerCount != 0 || titleCount != 0 || upperCount != 0 || mixedCount != 0 || ambiguousCount != 0;
        if (!anyCasedToken) {
            // nothing to do here
            return input;
        }
        target = determineTargetCase(lowerCount, upperCount, titleCount, mixedCount, ambiguousCount);
    }

    StringBuilder rewritten = new StringBuilder(input);
    // Running difference between positions in the input and in the
    // buffer, caused by replacements whose length differs from the token.
    int shift = 0;
    for (Token tok : tokenList) {
        String original = tok.getTextFromString(input);
        if (!canChangeTokenCase(original)) {
            continue;
        }
        String converted;
        if (target == CHANGE_CASE_TO.LOWER) {
            converted = original.toLowerCase(locale);
        } else if (target == CHANGE_CASE_TO.UPPER) {
            converted = original.toUpperCase(locale);
        } else if (target == CHANGE_CASE_TO.TITLE) {
            converted = StringUtil.toTitleCase(original, locale);
        } else if (target == CHANGE_CASE_TO.SENTENCE) {
            // Only the first changeable token is title-cased; every
            // following token is lower-cased.
            converted = StringUtil.toTitleCase(original, locale);
            target = CHANGE_CASE_TO.LOWER;
        } else {
            converted = original;
        }
        // replace this token
        int start = tok.getOffset() + shift;
        rewritten.replace(start, start + tok.getLength(), converted);
        shift += converted.length() - tok.getLength();
    }
    return rewritten.toString();
}
Also used : Token(org.omegat.util.Token)

Example 8 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class GlossarySearcher method getCjkMatchingTokens.

/**
 * Find every occurrence of a CJK term in the text by plain substring search.
 * <p>
 * The search only runs when a project is loaded, its source language is
 * not space-delimited, and {@code StringUtil.isCJK(term)} holds. The search
 * restarts one char after each hit, so overlapping occurrences are all
 * reported.
 *
 * @param fullText
 *            the text to search in
 * @param term
 *            the term to look for
 * @return one single-element token array per occurrence, each token built
 *         from the term and the index where it was found; an empty list
 *         when the preconditions fail or the term does not occur
 */
private static List<Token[]> getCjkMatchingTokens(String fullText, String term) {
    // This is a CJK word and our source language is not space-delimited, so include if
    // word appears anywhere in source string.
    IProject project = Core.getProject();
    boolean sourceNotSpaceDelimited = project.isProjectLoaded()
            && !project.getProjectProperties().getSourceLanguage().isSpaceDelimited();
    if (!sourceNotSpaceDelimited || !StringUtil.isCJK(term)) {
        return Collections.emptyList();
    }

    int pos = fullText.indexOf(term);
    if (pos < 0) {
        return Collections.emptyList();
    }
    List<Token[]> hits = new ArrayList<>();
    do {
        hits.add(new Token[] { new Token(term, pos) });
        pos = fullText.indexOf(term, pos + 1);
    } while (pos >= 0);
    return hits;
}
Also used : ArrayList(java.util.ArrayList) Token(org.omegat.util.Token) IProject(org.omegat.core.data.IProject)

Example 9 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class TransTipsMarker method getMarksForTokens.

/**
 * Build source-side marks for the given token groups.
 * <p>
 * The groups are sorted in place by the offset of their first token, and
 * each group with more than one token is sorted in place by offset. Tokens
 * are then visited in that order: when {@code canCloseSpan} accepts the gap
 * between the previous mark's end and the current token's start, the
 * previous mark is replaced by one extended to the current token's end;
 * otherwise a new mark is appended. Every created mark gets the
 * {@code TRANSTIPS_UNDERLINER} painter and the given tooltip.
 *
 * @param tokens
 *            token groups to mark; this list and its arrays are reordered
 * @param srcText
 *            the source text the token offsets refer to
 * @param tooltip
 *            tooltip text assigned to every mark
 * @return the resulting marks, or an empty list when there are no tokens
 *         or the source text is empty
 */
private static List<Mark> getMarksForTokens(List<Token[]> tokens, String srcText, String tooltip) {
    if (tokens.isEmpty() || srcText.isEmpty()) {
        return Collections.emptyList();
    }
    List<Mark> marks = new ArrayList<>(tokens.size());
    // NOTE(review): the sort key is group[0] as it stands before the
    // per-group sort below runs - confirm groups arrive with their
    // earliest token first.
    tokens.sort(Comparator.comparingInt((Token[] group) -> group[0].getOffset()));
    for (Token[] group : tokens) {
        if (group.length > 1) {
            Arrays.sort(group, Comparator.comparingInt(Token::getOffset));
        }
        for (Token tok : group) {
            int lastIndex = marks.size() - 1;
            Mark previous = lastIndex >= 0 ? marks.get(lastIndex) : null;
            int start = tok.getOffset();
            int end = start + tok.getLength();
            // If the gap to the previous mark can be closed, combine
            // both into a single mark.
            boolean extendPrevious = previous != null && canCloseSpan(srcText, previous.endOffset, start);
            Mark mark = new Mark(Mark.ENTRY_PART.SOURCE, extendPrevious ? previous.startOffset : start, end);
            if (extendPrevious) {
                marks.set(lastIndex, mark);
            } else {
                marks.add(mark);
            }
            mark.painter = TRANSTIPS_UNDERLINER;
            mark.toolTipText = tooltip;
        }
    }
    return marks;
}
Also used : UnderlineFactory(org.omegat.gui.editor.UnderlineFactory) Arrays(java.util.Arrays) SourceTextEntry(org.omegat.core.data.SourceTextEntry) Token(org.omegat.util.Token) Styles(org.omegat.util.gui.Styles) ArrayList(java.util.ArrayList) List(java.util.List) Mark(org.omegat.gui.editor.mark.Mark) Core(org.omegat.core.Core) IMarker(org.omegat.gui.editor.mark.IMarker) HighlightPainter(javax.swing.text.Highlighter.HighlightPainter) Comparator(java.util.Comparator) Collections(java.util.Collections) ArrayList(java.util.ArrayList) Mark(org.omegat.gui.editor.mark.Mark) Token(org.omegat.util.Token)

Example 10 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class BaseTokenizer method tokenizeByCodePoint.

/**
 * Split a string into one token per Unicode code point.
 * <p>
 * Each token is built from the text of a single code point (one or two
 * chars) and the char index at which that code point starts in
 * {@code strOrig}.
 *
 * @param strOrig
 *            the string to split
 * @return the tokens in string order; the array length equals the number
 *         of code points in {@code strOrig}
 */
protected Token[] tokenizeByCodePoint(String strOrig) {
    // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
    // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
    final int charLength = strOrig.length();
    Token[] tokens = new Token[strOrig.codePointCount(0, charLength)];
    int charIndex = 0;
    int tokenIndex = 0;
    while (charIndex < charLength) {
        int codePoint = strOrig.codePointAt(charIndex);
        tokens[tokenIndex++] = new Token(new String(Character.toChars(codePoint)), charIndex);
        // Advance by one or two chars depending on the code point.
        charIndex += Character.charCount(codePoint);
    }
    return tokens;
}
Also used : Token(org.omegat.util.Token)

Aggregations

Token (org.omegat.util.Token): 19; ArrayList (java.util.ArrayList): 8; NearString (org.omegat.core.matching.NearString): 3; List (java.util.List): 2; ITokenizer (org.omegat.tokenizer.ITokenizer): 2; Point (java.awt.Point): 1; IOException (java.io.IOException): 1; BreakIterator (java.text.BreakIterator): 1; Arrays (java.util.Arrays): 1; Collections (java.util.Collections): 1; Comparator (java.util.Comparator): 1; HashSet (java.util.HashSet): 1; Matcher (java.util.regex.Matcher): 1; JMenuItem (javax.swing.JMenuItem): 1; HighlightPainter (javax.swing.text.Highlighter.HighlightPainter): 1; StyledDocument (javax.swing.text.StyledDocument): 1; TokenStream (org.apache.lucene.analysis.TokenStream): 1; CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 1; OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 1; Test (org.junit.Test): 1